您的位置:首页 > 数据库 > Redis

GO语言_用redis作为url队列的爬虫

2017-05-02 14:36 387 查看
// Copyright 2016 laosj Author @songtianyi. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package main

import (
"github.com/songtianyi/laosj/downloader"
"github.com/songtianyi/laosj/spider"
"github.com/songtianyi/rrframework/connector/redis"
"github.com/songtianyi/rrframework/logs"
"github.com/songtianyi/rrframework/storage"
"regexp"
"strconv"
"sync"
)

// main crawls the mzitu.com "share" comment pages, extracts the image URLs
// they contain, and pushes each URL onto the Redis list named by
// d.SourceQueue.
//
// Flow:
//  1. Start the downloader in a background goroutine (presumably it consumes
//     d.SourceQueue from Redis and writes files to d.Store; confirm against
//     the downloader package).
//  2. Load the first share page and read the pagination link texts to
//     determine how many comment pages exist.
//  3. Fetch every comment page concurrently and collect the HTML of each
//     matching <p> element.
//  4. Extract the src attribute from every collected fragment and RPush it
//     onto the Redis queue.
//  5. Call d.WaitCloser() before returning (presumably this blocks until the
//     downloader has finished; confirm).
func main() {
	d := &downloader.Downloader{
		ConcurrencyLimit: 10,
		UrlChannelFactor: 10,
		RedisConnStr:     "127.0.0.1:6379",
		SourceQueue:      "DATA:IMAGE:MZITU:XINGGAN",
		Store:            rrstorage.CreateLocalDiskStorage("/Users/deer_mac/Desktop/自拍/"),
	}
	// Run the downloader alongside the crawl so it can work while URLs are
	// still being produced.
	go d.Start()

	// step1: find total pages
	s, err := spider.CreateSpiderFromUrl("http://www.mzitu.com/share")
	if err != nil {
		logs.Error(err)
		return
	}
	rs, err := s.GetText("div.main>div.main-content>div.postlist>div>div.pagenavi-cm>a")
	if err != nil {
		// Best effort: report the problem but keep going with whatever
		// pagination text was returned.
		logs.Error(err)
	}
	// NOTE(review): the first argument looks like a default/lower bound for
	// the page count; confirm against spider.FindMaxFromSliceString.
	maxPage := spider.FindMaxFromSliceString(1, rs)

	// step2: for every page, find all img tags
	var (
		wg        sync.WaitGroup
		mu        sync.Mutex // guards fragments, which every page goroutine appends to
		fragments []string
	)
	for i := 1; i <= maxPage; i++ {
		wg.Add(1)
		go func(page int) {
			defer wg.Done()
			ns, err := spider.CreateSpiderFromUrl(s.Url + "/comment-page-" + strconv.Itoa(page) + "#comments/")
			if err != nil {
				logs.Error(err)
				return
			}
			t, err := ns.GetHtml("div.main>div.main-content>div.postlist>div>ul>li>div>p")
			if err != nil {
				// Report the failure; anything that was still returned is
				// kept below.
				logs.Error(err)
			}
			mu.Lock()
			fragments = append(fragments, t...)
			mu.Unlock()
		}(i)
	}
	wg.Wait()

	// The client constructor returns the error first, unlike most Go APIs.
	err, rc := rrredis.GetRedisClient(d.RedisConnStr)
	if err != nil {
		logs.Error(err)
		return
	}

	// parse url
	// Compile once: the pattern does not change between fragments.
	srcRe := regexp.MustCompile(`src="(\S+)"`)
	for _, fragment := range fragments {
		m := srcRe.FindStringSubmatch(fragment)
		if len(m) < 2 {
			// A <p> without an image must not abort the whole run; indexing
			// the nil match result here would panic.
			logs.Error("no image src found in fragment: " + fragment)
			continue
		}
		if _, err := rc.RPush(d.SourceQueue, m[1]); err != nil {
			logs.Error(err)
			return
		}
	}
	d.WaitCloser()
}


运行前需要先启动 Redis 服务(程序默认连接 127.0.0.1:6379),然后直接运行本程序即可。
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: