爬虫代码实现八-多线程爬虫
2017-01-19 13:05
411 查看
package com.dajiangtai.djt_spider.start;
import java.util.List;
import java.util.Queue;
import java.util.concurrent.ConcurrentLinkedDeque;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang.StringUtils;
import com.dajiangtai.djt_spider.entity.Page;
import com.dajiangtai.djt_spider.service.IDownLoadService;
import com.dajiangtai.djt_spider.service.IProcessService;
import com.dajiangtai.djt_spider.service.IRepositoryService;
import com.dajiangtai.djt_spider.service.IStoreService;
import com.dajiangtai.djt_spider.service.impl.ConsoleStoreService;
import com.dajiangtai.djt_spider.service.impl.HttpClientDownLoadService;
import com.dajiangtai.djt_spider.service.impl.QueueRepositoryService;
import com.dajiangtai.djt_spider.service.impl.YOUKUProcessService1;
import com.dajiangtai.djt_spider.util.LoadPropertyUtil;
import com.dajiangtai.djt_spider.util.ThreadUtil;
/**
 * Entry point of the TV-series crawler.
 *
 * <p>Wires together the download, parse, store and URL-repository services,
 * seeds the repository with a list-page URL and then crawls in an endless
 * loop, handing each URL to a fixed-size thread pool.
 *
 * @author Administrator
 */
public class StartDSJCount {
    // URLs starting with this prefix are treated as list pages (queued with high priority).
    private static final String LIST_URL_PREFIX = "http://tv.youku.com/search/index";
    // Pages whose own URL starts with this prefix are treated as detail pages and get stored.
    private static final String DETAIL_URL_PREFIX = "http://www.youku.com/show_page";

    // Page download service.
    private IDownLoadService downLoadService;
    // Page parsing service.
    private IProcessService processService;
    // Data storage service.
    private IStoreService storeService;
    // URL repository. Per the original author's note it replaces an earlier single,
    // priority-less ConcurrentLinkedDeque and offers high/low priority queues.
    private IRepositoryService repositoryService;
    // Fixed-size worker pool; the pool size is read from the "threadNum" config entry.
    private ExecutorService newFixedThreadPool = Executors.newFixedThreadPool(Integer.parseInt(LoadPropertyUtil.getConfig("threadNum")));

    /**
     * Builds a crawler with the default service implementations, seeds it with
     * the start list-page URL and starts crawling. Does not return under normal
     * operation because {@link #startSpider()} loops forever.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        StartDSJCount dsj = new StartDSJCount();
        dsj.setDownLoadService(new HttpClientDownLoadService());
        dsj.setProcessService(new YOUKUProcessService1());
        dsj.setStoreService(new ConsoleStoreService());
        dsj.setRepositoryService(new QueueRepositoryService());
        // Start URL: a list page, so it goes into the high-priority queue.
        String url = "http://tv.youku.com/search/index/_page40177_comdid_40177";
        dsj.repositoryService.addHighLevel(url);
        // Start crawling.
        dsj.startSpider();
    }

    /**
     * Runs the crawl loop: repeatedly polls the repository for the next URL and
     * submits it to the thread pool; when no URL is available, prints a notice
     * and sleeps for the "millions_5" config interval before polling again.
     * This method never returns normally.
     */
    public void startSpider() {
        while (true) {
            // Per the original author's note, poll() serves the high-priority
            // queue before the low-priority one.
            final String url = repositoryService.poll();
            if (StringUtils.isNotBlank(url)) {
                newFixedThreadPool.execute(new Runnable() {
                    @Override
                    public void run() {
                        StartDSJCount.this.crawl(url);
                    }
                });
            } else {
                System.out.println("队列中的电视剧url解析完毕,请等待!");
                ThreadUtil.sleep(Long.parseLong(LoadPropertyUtil.getConfig("millions_5")));
            }
        }
    }

    /**
     * Downloads and parses one URL, queues the URLs found on it, and stores the
     * page when it is a detail page. A failure on one URL is reported and does
     * not propagate, so the throttle delay below is applied in every case.
     */
    private void crawl(String url) {
        System.out.println("当前第"+Thread.currentThread().getId()+"个线程");
        try {
            Page page = downloadPage(url);
            if (page == null) {
                // Nothing to parse; previously this ended in a NullPointerException.
                System.err.println("Download returned no page for url: " + url);
            } else {
                processPage(page);
                enqueueDiscoveredUrls(page.getUrlList());
                String pageUrl = page.getUrl();
                // Only detail pages carry data worth storing.
                if (pageUrl != null && pageUrl.startsWith(DETAIL_URL_PREFIX)) {
                    storePageInfo(page);
                }
            }
        } catch (RuntimeException e) {
            // Keep the stack trace visible, as the pool's default handler would.
            System.err.println("Failed to crawl url: " + url);
            e.printStackTrace();
        }
        // Pause this worker to lower the request rate against the site
        // (interval comes from the "millions_3" config entry).
        ThreadUtil.sleep(Long.parseLong(LoadPropertyUtil.getConfig("millions_3")));
    }

    /**
     * Routes each discovered URL to the high-priority queue when it is a list
     * page, otherwise to the low-priority queue. Null lists and blank entries
     * are ignored.
     */
    private void enqueueDiscoveredUrls(List<String> urlList) {
        if (urlList == null) {
            return;
        }
        for (String eachurl : urlList) {
            if (StringUtils.isBlank(eachurl)) {
                continue;
            }
            if (eachurl.startsWith(LIST_URL_PREFIX)) {
                repositoryService.addHighLevel(eachurl);
            } else {
                repositoryService.addLowLevel(eachurl);
            }
        }
    }

    /**
     * Downloads a page through the configured download service.
     *
     * @param url the URL to download
     * @return whatever the download service returns for {@code url}
     */
    public Page downloadPage(String url) {
        return this.downLoadService.download(url);
    }

    /**
     * Parses a page through the configured process service.
     *
     * @param page the page to parse
     */
    public void processPage(Page page) {
        this.processService.process(page);
    }

    /**
     * Stores a page's information through the configured store service.
     *
     * @param page the page to store
     */
    public void storePageInfo(Page page) {
        this.storeService.store(page);
    }

    public IDownLoadService getDownLoadService() {
        return downLoadService;
    }

    public void setDownLoadService(IDownLoadService downLoadService) {
        this.downLoadService = downLoadService;
    }

    public IProcessService getProcessService() {
        return processService;
    }

    public void setProcessService(IProcessService processService) {
        this.processService = processService;
    }

    public IStoreService getStoreService() {
        return storeService;
    }

    public void setStoreService(IStoreService storeService) {
        this.storeService = storeService;
    }

    public IRepositoryService getRepositoryService() {
        return repositoryService;
    }

    public void setRepositoryService(IRepositoryService repositoryService) {
        this.repositoryService = repositoryService;
    }
}
import java.util.List;
import java.util.Queue;
import java.util.concurrent.ConcurrentLinkedDeque;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang.StringUtils;
import com.dajiangtai.djt_spider.entity.Page;
import com.dajiangtai.djt_spider.service.IDownLoadService;
import com.dajiangtai.djt_spider.service.IProcessService;
import com.dajiangtai.djt_spider.service.IRepositoryService;
import com.dajiangtai.djt_spider.service.IStoreService;
import com.dajiangtai.djt_spider.service.impl.ConsoleStoreService;
import com.dajiangtai.djt_spider.service.impl.HttpClientDownLoadService;
import com.dajiangtai.djt_spider.service.impl.QueueRepositoryService;
import com.dajiangtai.djt_spider.service.impl.YOUKUProcessService1;
import com.dajiangtai.djt_spider.util.LoadPropertyUtil;
import com.dajiangtai.djt_spider.util.ThreadUtil;
/**
 * Entry point of the TV-series crawler.
 *
 * <p>Wires together the download, parse, store and URL-repository services,
 * seeds the repository with a list-page URL and then crawls in an endless
 * loop, handing each URL to a fixed-size thread pool.
 *
 * @author Administrator
 */
public class StartDSJCount {
    // URLs starting with this prefix are treated as list pages (queued with high priority).
    private static final String LIST_URL_PREFIX = "http://tv.youku.com/search/index";
    // Pages whose own URL starts with this prefix are treated as detail pages and get stored.
    private static final String DETAIL_URL_PREFIX = "http://www.youku.com/show_page";

    // Page download service.
    private IDownLoadService downLoadService;
    // Page parsing service.
    private IProcessService processService;
    // Data storage service.
    private IStoreService storeService;
    // URL repository. Per the original author's note it replaces an earlier single,
    // priority-less ConcurrentLinkedDeque and offers high/low priority queues.
    private IRepositoryService repositoryService;
    // Fixed-size worker pool; the pool size is read from the "threadNum" config entry.
    private ExecutorService newFixedThreadPool = Executors.newFixedThreadPool(Integer.parseInt(LoadPropertyUtil.getConfig("threadNum")));

    /**
     * Builds a crawler with the default service implementations, seeds it with
     * the start list-page URL and starts crawling. Does not return under normal
     * operation because {@link #startSpider()} loops forever.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        StartDSJCount dsj = new StartDSJCount();
        dsj.setDownLoadService(new HttpClientDownLoadService());
        dsj.setProcessService(new YOUKUProcessService1());
        dsj.setStoreService(new ConsoleStoreService());
        dsj.setRepositoryService(new QueueRepositoryService());
        // Start URL: a list page, so it goes into the high-priority queue.
        String url = "http://tv.youku.com/search/index/_page40177_comdid_40177";
        dsj.repositoryService.addHighLevel(url);
        // Start crawling.
        dsj.startSpider();
    }

    /**
     * Runs the crawl loop: repeatedly polls the repository for the next URL and
     * submits it to the thread pool; when no URL is available, prints a notice
     * and sleeps for the "millions_5" config interval before polling again.
     * This method never returns normally.
     */
    public void startSpider() {
        while (true) {
            // Per the original author's note, poll() serves the high-priority
            // queue before the low-priority one.
            final String url = repositoryService.poll();
            if (StringUtils.isNotBlank(url)) {
                newFixedThreadPool.execute(new Runnable() {
                    @Override
                    public void run() {
                        StartDSJCount.this.crawl(url);
                    }
                });
            } else {
                System.out.println("队列中的电视剧url解析完毕,请等待!");
                ThreadUtil.sleep(Long.parseLong(LoadPropertyUtil.getConfig("millions_5")));
            }
        }
    }

    /**
     * Downloads and parses one URL, queues the URLs found on it, and stores the
     * page when it is a detail page. A failure on one URL is reported and does
     * not propagate, so the throttle delay below is applied in every case.
     */
    private void crawl(String url) {
        System.out.println("当前第"+Thread.currentThread().getId()+"个线程");
        try {
            Page page = downloadPage(url);
            if (page == null) {
                // Nothing to parse; previously this ended in a NullPointerException.
                System.err.println("Download returned no page for url: " + url);
            } else {
                processPage(page);
                enqueueDiscoveredUrls(page.getUrlList());
                String pageUrl = page.getUrl();
                // Only detail pages carry data worth storing.
                if (pageUrl != null && pageUrl.startsWith(DETAIL_URL_PREFIX)) {
                    storePageInfo(page);
                }
            }
        } catch (RuntimeException e) {
            // Keep the stack trace visible, as the pool's default handler would.
            System.err.println("Failed to crawl url: " + url);
            e.printStackTrace();
        }
        // Pause this worker to lower the request rate against the site
        // (interval comes from the "millions_3" config entry).
        ThreadUtil.sleep(Long.parseLong(LoadPropertyUtil.getConfig("millions_3")));
    }

    /**
     * Routes each discovered URL to the high-priority queue when it is a list
     * page, otherwise to the low-priority queue. Null lists and blank entries
     * are ignored.
     */
    private void enqueueDiscoveredUrls(List<String> urlList) {
        if (urlList == null) {
            return;
        }
        for (String eachurl : urlList) {
            if (StringUtils.isBlank(eachurl)) {
                continue;
            }
            if (eachurl.startsWith(LIST_URL_PREFIX)) {
                repositoryService.addHighLevel(eachurl);
            } else {
                repositoryService.addLowLevel(eachurl);
            }
        }
    }

    /**
     * Downloads a page through the configured download service.
     *
     * @param url the URL to download
     * @return whatever the download service returns for {@code url}
     */
    public Page downloadPage(String url) {
        return this.downLoadService.download(url);
    }

    /**
     * Parses a page through the configured process service.
     *
     * @param page the page to parse
     */
    public void processPage(Page page) {
        this.processService.process(page);
    }

    /**
     * Stores a page's information through the configured store service.
     *
     * @param page the page to store
     */
    public void storePageInfo(Page page) {
        this.storeService.store(page);
    }

    public IDownLoadService getDownLoadService() {
        return downLoadService;
    }

    public void setDownLoadService(IDownLoadService downLoadService) {
        this.downLoadService = downLoadService;
    }

    public IProcessService getProcessService() {
        return processService;
    }

    public void setProcessService(IProcessService processService) {
        this.processService = processService;
    }

    public IStoreService getStoreService() {
        return storeService;
    }

    public void setStoreService(IStoreService storeService) {
        this.storeService = storeService;
    }

    public IRepositoryService getRepositoryService() {
        return repositoryService;
    }

    public void setRepositoryService(IRepositoryService repositoryService) {
        this.repositoryService = repositoryService;
    }
}
相关文章推荐
- python爬虫:短代码实现多线程爬虫
- python网络爬虫——基本概念及代码实现1
- [搜片神器]之DHT网络爬虫的代码实现方法
- python实现爬虫统计学校BBS男女比例之多线程爬虫(二)
- Python多线程、异步+多进程爬虫实现代码
- Python多线程、异步+多进程爬虫实现代码
- 实现蜘蛛捕捉的PHP代码 (WEB爬虫)
- python实现简单爬虫功能代码
- 【单机版】一个小爬虫+PageRank代码实现
- JAVA多线程网络爬虫的代码实现 推荐
- java爬虫学习日记2-宽度优先爬虫代码实现
- python3简单爬虫实现代码
- Python 实现网络爬虫 抓取静态网页【代码】
- Python实现爬取知乎神回复简单爬虫代码分享
- 基于Java HttpClient和Htmlparser实现网络爬虫代码
- Python天气预报采集器实现代码(网页爬虫)
- 网络爬虫讲解及java代码实现
- 小说爬虫之JAVA代码的实现(附代码)
- python实现爬虫统计学校BBS男女比例之多线程爬虫(二)
- java爬虫学习日记2-宽度优先爬虫代码实现