GO语言_用redis作为url队列的爬虫
2017-05-02 14:36
387 查看
// Copyright 2016 laosj Author @songtianyi. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package main import ( "github.com/songtianyi/laosj/downloader" "github.com/songtianyi/laosj/spider" "github.com/songtianyi/rrframework/connector/redis" "github.com/songtianyi/rrframework/logs" "github.com/songtianyi/rrframework/storage" "regexp" "strconv" "sync" ) func main() { d := &downloader.Downloader{ ConcurrencyLimit: 10, UrlChannelFactor: 10, RedisConnStr: "127.0.0.1:6379", SourceQueue: "DATA:IMAGE:MZITU:XINGGAN", Store: rrstorage.CreateLocalDiskStorage("/Users/deer_mac/Desktop/自拍/"), } go func() { d.Start() }() // step1: find total pages s, err := spider.CreateSpiderFromUrl("http://www.mzitu.com/share") if err != nil { logs.Error(err) return } rs, _ := s.GetText("div.main>div.main-content>div.postlist>div>div.pagenavi-cm>a") max := spider.FindMaxFromSliceString(1, rs) // step2: for every page, find all img tags var wg sync.WaitGroup var mu sync.Mutex step2 := make([]string, 0) for i := 1; i <= max; i++ { wg.Add(1) go func(ix int) { defer wg.Done() ns, err := spider.CreateSpiderFromUrl(s.Url + "/comment-page-" + strconv.Itoa(ix) + "#comments/") if err != nil { logs.Error(err) return } t, _ := ns.GetHtml("div.main>div.main-content>div.postlist>div>ul>li>div>p") mu.Lock() step2 = append(step2, t...) 
mu.Unlock() }(i) } wg.Wait() err, rc := rrredis.GetRedisClient(d.RedisConnStr) if err != nil { logs.Error(err) return } // parse url for _, v := range step2 { re := regexp.MustCompile("src=\"(\\S+)\"") url := re.FindStringSubmatch(v)[1] key := d.SourceQueue if _, err := rc.RPush(key, url); err != nil { logs.Error(err) return } } d.WaitCloser() }
运行前需要先启动本地的 Redis 服务（程序默认连接 127.0.0.1:6379），然后直接运行该程序即可开始抓取。
相关文章推荐
- Go语言 爬虫1-网络请求
- golang--Redis最佳的Go语言驱动
- go语言实现一个简单的http客户端抓取远程url的方法
- 实现一个go语言的简单爬虫来爬取CSDN博文(一)
- 使用GO语言开发 Redis数据监控程序
- 网络爬虫中,URL队列(URL Frontier)的设计与实现
- golang--Redis最佳的Go语言驱动
- Go语言操作redis用法实例
- redis作为消息队列的使用
- Go语言开发的网站模板爬虫 Lea Web Template Spider
- [转]Redis作为消息队列与RabbitMQ的性能对比
- Go语言 简单的爬虫示例(2)——编码转换
- go语言实现爬虫采集联想词
- CrawlUrl --- 使用Berkeley DB爬虫队列实例
- Go语言 爬虫2-编码转换
- 开源JAVA爬虫crawler4j源码分析 - 4 URL管理、URL队列
- Redis作为消息队列服务场景应用案例
- go语言函数作为参数传递
- 如何在 Go 语言中使用 Redis 连接池-Radix.v2