使用node.js抓取有路网图书信息(原创)
2014-01-31 18:09
597 查看
之前写过使用python抓取有路网图书信息,见http://www.cnblogs.com/dyf6372/p/3529703.html。
最近想学习一下Node.js,所以拿它来练练手,顺便比较一下两者在http抓取上的性能:采用事件驱动的Node.js比python要好一些。代码如下(刚学,尚未优化):
var http = require('http');
var iconv = require('iconv-lite');
var url = require('./gb2312_url_encode.js');

// Number of results per search page. The original page-count arithmetic
// divided by 20, so that value is kept here.
var SEARCH_PAGE_SIZE = 20;

/**
 * Build the http.request options for a GET against www.youlu.net.
 *
 * @param {string} path - Request path including any query string.
 * @returns {Object} Options object accepted by http.request.
 */
function getHtmlOptions(path) {
    return {
        hostname: 'www.youlu.net',
        port: 80,
        path: path,
        method: 'GET',
        headers: {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36'
        }
    };
}

/**
 * GET a page, decode the body as gb2312 and hand the text to a callback.
 *
 * Non-200 responses and request errors are logged and the callback is not
 * invoked.
 *
 * @param {string} path - Request path including any query string.
 * @param {function(string)} onHtml - Called with the decoded page text.
 */
function fetchPage(path, onHtml) {
    var req = http.request(getHtmlOptions(path), function (res) {
        if (res.statusCode != 200) {
            console.log("请求" + path + "发生错误了:请求返回非200,返回码:" + res.statusCode + "\n");
            // Discard the body so the underlying socket is released.
            res.resume();
            return;
        }
        var chunks = [];
        var size = 0;
        res.on('data', function (chunk) {
            chunks.push(chunk);
            size += chunk.length;
        });
        res.on('end', function () {
            // Decode only after all chunks are joined: a multi-byte gb2312
            // character may be split across two chunks.
            onHtml(iconv.decode(Buffer.concat(chunks, size), 'gb2312'));
        });
    });
    req.on('error', function (e) {
        console.log('problem with request: ' + e.message);
    });
    req.end();
}

/**
 * Extract the total book count from a search result page.
 *
 * @param {string} str - Decoded HTML of the search result page.
 * @returns {string} The digits following the "共有图书数量" marker, or '0'
 *     when the marker or the digits are not present.
 */
function getAllNumber(str) {
    var marker = str.match(/共有图书数量\r\n\s*[0-9]*/);
    if (marker == null) {
        return '0';
    }
    var digits = marker[0].match(/[0-9]{1,}/);
    return digits == null ? '0' : digits[0];
}

/**
 * Find the book links on one result page and start a detail request for
 * each distinct book id that carries an alt text.
 *
 * @param {string} str - Decoded HTML of one result page.
 * @returns {string[]} The distinct book ids for which a detail request was
 *     started (empty when the page contains no candidate links).
 */
function getDetailList(str) {
    // Each candidate runs from a quoted "/<digits>" path to the last double
    // quote on the same line ('.' does not cross line breaks).
    var candidates = str.match(/\"\/\d+.*"/g);
    var result_array = [];
    if (candidates == null) {
        return result_array;
    }
    for (var i = 0; i < candidates.length; i++) {
        var candidate = candidates[i];
        var book_id = candidate.match(/\d+/)[0];
        if (result_array.indexOf(book_id) >= 0) {
            continue;
        }
        // Stop at the first closing quote so attributes that follow alt on
        // the same line do not leak into the book name.
        var alt = candidate.match(/alt="([^"]*)"/);
        if (alt == null) {
            // Not recorded as seen: a later link for the same id may still
            // carry the alt text.
            continue;
        }
        result_array.push(book_id);
        searchDetail(book_id, alt[1]);
    }
    return result_array;
}

/**
 * Read the stock count from a book detail page and print the book when the
 * count is positive. Pages without the expected script call are skipped.
 *
 * @param {string} str - Decoded HTML of the detail page.
 * @param {string} book_name - Book title taken from the result page.
 * @param {string} url - Absolute URL of the detail page (this parameter
 *     shadows the module-level `url` encoder inside this function).
 */
function getRealBookNumber(str, book_name, url) {
    // The parentheses in the pattern are a capture group, not literal
    // parentheses; match[0] is the rest of the line from the call name on.
    var call_match = str.match(/startRequestBookBuyLink(.*)/);
    if (call_match == null) {
        return;
    }
    var fields = call_match[0].split(',');
    if (fields.length < 4) {
        return;
    }
    // Drops two leading characters and one trailing character from the 4th
    // comma-separated field -- presumably a space plus surrounding quotes;
    // verify against the live page markup.
    var num = fields[3];
    num = num.substring(2, num.length - 1);
    if (num > 0) {
        console.log("------------------------------------");
        console.log(book_name);
        console.log("数目:" + num + " url:" + url);
    }
}

/**
 * Run the initial search, print the totals and request every result page.
 *
 * @param {string} query_book_name - URL-encoded book name to search for.
 */
function firstSearch(query_book_name) {
    var path = '/search/result/default.aspx?isbn=&publisherName=&author=&bookName=' + query_book_name;
    fetchPage(path, function (html) {
        var allNumber = getAllNumber(html);
        if (allNumber > 0) {
            // Round up: a partially filled last page still has to be
            // fetched, but an exact multiple must not add an empty page.
            var allPage = Math.ceil(Number(allNumber) / SEARCH_PAGE_SIZE);
            console.log("该图书数目为:" + allNumber);
            console.log("共有页数:" + allPage);
            for (var i = 1; i <= allPage; i++) {
                searchEachPage(query_book_name, i, allNumber);
            }
        } else {
            console.log("该图书数目为0." + "\n");
        }
    });
}

/**
 * Request one page of search results and process the links it contains.
 *
 * @param {string} query_book_name - URL-encoded book name.
 * @param {number} pageIndex - 1-based page index.
 * @param {string|number} rowCount - Total result count, sent back to the
 *     server as the rowCount query parameter.
 */
function searchEachPage(query_book_name, pageIndex, rowCount) {
    var path = "/search/result/?isbn=&publisherName=&author=&bookName=" + query_book_name +
        "&pageIndex=" + pageIndex + "&rowCount=" + rowCount + "&searchIn=";
    fetchPage(path, function (html) {
        getDetailList(html);
    });
}

/**
 * Request a book's detail page and report its stock count.
 *
 * @param {string} detail_number - Numeric book id used as the URL path.
 * @param {string} book_name - Book title to print alongside the count.
 */
function searchDetail(detail_number, book_name) {
    fetchPage("/" + detail_number, function (html) {
        getRealBookNumber(html, book_name, "http://www.youlu.net/" + detail_number);
    });
}

var query_book_name = url.URLEncode('java');
firstSearch(query_book_name);
相关文章推荐
- 使用python抓取有路网图书信息(原创)
- Node.js 切近实战(三) 之图书管理系统(图书信息录入)
- [js高手之路]Node.js实现简易的爬虫-抓取博客所有文章列表信息
- NodeJS + PhantomJS 抓取页面信息以及截图
- [js高手之路]Node.js实现简易的爬虫-抓取博客所有文章列表信息
- (原创)node.js入门之二:mysql的使用-Mac环境开发
- 使用node.js 获取客户端信息代码分享
- NodeJS + PhantomJS 抓取页面信息以及截图
- 使用node.js 获取客户端信息代码分享
- node.js学习笔记——学生信息管理的实现(把功能模块化)使用官方提供的http模块实现
- Node.js学习之网络爬虫(使用cheerio抓取网页数据)
- NodeJS + PhantomJS 抓取页面信息以及截图
- 使用node.js cheerio抓取网页数据
- NodeJS + PhantomJS 抓取页面信息以及截图
- 使用node.js cheerio抓取网页数据
- 使用node.js的Crypto模块Hmac算法对信息进行认证
- Node.js抓取网页信息并展示(cheerio网络爬虫)
- 使用Jsoup抓取京东图书分类页面图书信息
- nodejs + request + cheerio 抓取页面指定的信息
- 使用htmlparser简单抓取京东图书信息存入数据库的小例子