您的位置:首页 > Web前端 > Node.js

nodejs 采集新闻数据

2014-02-10 09:37 211 查看
使用nodejs采集新华新闻数据

代码段(以下依次包含两个文件:服务入口脚本,以及被它通过 require('./findData') 引用的 findData 模块):

var express = require('express');
var $ = require('jQuery');
var app = express();
var colors = require('colors');
var message_list = require('./zui/message_list');
var findData = require('./findData');

// Global cross-origin setup: every request, whatever its method or path,
// gets the headers below before control passes to the next handler.
app.all('*', function (req, res, next) {
  // [name, value] pairs, applied in this order.
  var responseHeaders = [
    ["Access-Control-Allow-Origin", "*"],
    ["Access-Control-Allow-Headers", "Origin, X-Requested-With, Content-Type, Accept"],
    ["Access-Control-Allow-Methods", "PUT,POST,GET,DELETE,OPTIONS"],
    ["X-Powered-By", ' 3.2.1'],
    ["Content-Type", "application/json;charset=utf-8"]
  ];
  responseHeaders.forEach(function (pair) {
    res.header(pair[0], pair[1]);
  });
  next();
});

// Home page list data: findData.indexData is handed the request/response
// pair and is responsible for writing the reply.
app.get('/getIndexData', function handleIndexData(request, response) {
  findData.indexData(request, response);
});

/*
 * List data with an optional range taken from the path.
 *   /getList            no range, the whole list
 *   /getList/0          everything from index 0 onwards
 *   /getList/0,9        indexes 0..8 (the end index is exclusive)
 * Every returned item gets orderby set to the string "1".
 *
 * The pattern no longer makes the final "t" optional (the original
 * "getList?" also matched "/getLis").
 */
app.get(/^\/getList(?:\/(\d+)(?:,(\d+))?)?/, function (req, res) {
  // Read the captures by index instead of testing req.params.length:
  // that works whether req.params is an array or a plain object keyed by
  // capture position. A group that did not take part in the match is
  // undefined, which findData.list treats as "not supplied".
  var rawStart = req.params[0];
  var rawEnd = req.params[1];
  var start = typeof rawStart == "undefined" ? undefined : parseInt(rawStart, 10);
  var end = typeof rawEnd == "undefined" ? undefined : parseInt(rawEnd, 10);

  findData.list(req, res, start, end, function (data) {
    for (var i = 0, j = data.length; i < j; i++) {
      data[i].orderby = "1";
    }
    // findData.list sends whatever this callback returns.
    return data;
  });
});

// In-memory cache of the decorated news list. It stays undefined until the
// first /getNewsList request has fetched the data through findData.list.
var newsListData = undefined;

/*
 * GET /getNewsList?pageNum=N&pageSize=M
 * Responds with one page of the cached news list as JSON. The first request
 * fills the cache; later requests are answered from it.
 */
app.get('/getNewsList', function (req, res) {
  if (typeof newsListData != "undefined") {
    getNewsList(req, res);
    return;
  }

  findData.list(req, res, undefined, undefined, function (data) {
    for (var i = 0, j = data.length; i < j; i++) {
      data[i].orderby = Math.round(Math.random() * 6);
      data[i].summary = "暂无摘要";
      data[i].updateUserName = "admin";
    }
    newsListData = data;
    // findData.list sends whatever this callback returns. Returning the
    // page (instead of also calling getNewsList here, as before) keeps the
    // request from being answered twice.
    return sliceNewsPage(req);
  });
});

/*
 * Pick the requested page out of the cached list.
 * pageNum and pageSize are coerced with Number(), matching the implicit
 * coercion of the original arithmetic. When a value is missing or not
 * numeric, pageNum falls back to 1 and pageSize to the whole list, so a
 * bare /getNewsList returns everything instead of an empty array.
 */
function sliceNewsPage(req) {
  var pageNum = Number(req.query.pageNum);
  var pageSize = Number(req.query.pageSize);
  if (isNaN(pageNum)) {
    pageNum = 1;
  }
  if (isNaN(pageSize)) {
    pageSize = newsListData.length;
  }
  return newsListData.slice((pageNum - 1) * pageSize, pageNum * pageSize);
}

// Answer a request from the cached list (the cache must already be filled).
function getNewsList(req, res) {
  res.send(JSON.stringify(sliceNewsPage(req)));
}

// Start accepting connections, then print a start-up notice. The .green
// property on the string presumably comes from the colors module required
// at the top of the file.
var listenPort = 3000;
app.listen(listenPort);
console.log("nodejs was start".green);


var http = require('http'),
fs = require('fs'),
jquery = fs.readFileSync("lib/jquery.min.js", "utf-8"),
jsdom = require('jsdom');

/**
 * Fetch a page and hand the caller a jQuery object bound to its DOM.
 *
 * The request is sent to the local proxy at 127.0.0.1:7070 with the full
 * target URL as the request path. The response body is collected as UTF-8
 * text, loaded into jsdom together with the jQuery source read at module
 * load, and the resulting window's $ is passed to the callback.
 *
 * @param {string} url - Absolute URL of the page to fetch.
 * @param {function} [callback] - Called as callback(errors, $) once jsdom is
 *   done. $ is undefined when jsdom produced no window.
 */
function get(url, callback) {
  // Go through the proxy
  var opt = {
    host: '127.0.0.1',
    port: '7070',
    method: 'get', // HTTP method used for the request
    path: url
  };

  // Collect the response body, then build the DOM
  var req = http.request(opt, function (res) {
    res.setEncoding('utf8');
    var chunks = [];
    res.on('data', function (chunk) {
      chunks.push(chunk);
    }).on('end', function () {
      jsdom.env({
        html: chunks.join(''),
        src: [jquery],
        done: function (errors, window) {
          try {
            // Guard against a missing window — presumably possible when
            // jsdom fails outright; verify against the jsdom.env docs.
            callback && callback(errors, window ? window.$ : undefined);
          } finally {
            // Release the window's resources even when the callback throws;
            // otherwise memory use grows quickly.
            if (window) {
              window.close();
            }
          }
        }
      });
    });
  }).on('error', function (e) {
    // NOTE(review): the callback is not invoked here, so the caller's HTTP
    // response is never completed when the proxy request fails. Reporting
    // the error needs the callers to cope with a missing $ first.
    console.log("Got error: " + e.message);
  });
  req.end();
}

//导出list
exports.indexData = function(request, response, start, end) {
get('http://www.news.cn/edu/index.htm', function(errors, $) {
var $list = $(".list");
var data = [];
$list.each(function(index) {
var $this = $(this),
$li = $this.find("li"),
li_data = [];
$li.each(function(index) {
var $a = $(this).find("a");
var href=$a.attr("href").replace("http://news.xinhuanet.com/edu/","");
var temp_href="/"+href.replace(/[^c]*/,"");
var date=href.replace(temp_href,"").replace("/","-");
if(date.length>10){
date="2013-01-04";
}

li_data.push({
title: $a.text(),
href: $a.attr("href"),
date:date,
id: index
});
});
data.push({
id: index,
list: li_data
});
});

if (typeof start != "undefined" && typeof end == "undefined") {
data = data.slice(start);
} else if (typeof start != "undefined" && typeof end != "undefined") {
data = data.slice(start, end);
}
response.send(JSON.stringify(data));
});
}

//导出list
exports.list = function(request, response, start, end,callback) {
get('http://www.news.cn/edu/index.htm', function(errors, $) {
var $li = $(".list li");
var data = [];
$li.each(function(index) {
var $a = $(this).find("a");
var href=$a.attr("href").replace("http://news.xinhuanet.com/edu/","");
var temp_href="/"+href.replace(/[^c]*/,"");
var date=href.replace(temp_href,"").replace("/","-");
if(date.length>10){
date="2013-01-04";
}

data.push({
id: index,
title: $a.text(),
attachment: $a.attr("href"),
updateTime:date
});
});

var newData=data;
if (typeof start != "undefined" && typeof end == "undefined") {
newData = data.slice(start);
} else if (typeof start != "undefined" && typeof end != "undefined") {
newData = data.slice(start, end);
}
if(typeof callback=="function"){
newData=callback(newData,data.length);
}
//response.header("Access-Control-Allow-Origin", "*");   //设置跨域访问
response.send(JSON.stringify(newData));
});
}


访问地址:

- 主页分组数据:http://localhost:3000/getIndexData
- 指定范围的列表(下标 0 到 3,共 4 条):http://localhost:3000/getList/0,4
- 分页列表:http://localhost:3000/getNewsList?pageNum=1&pageSize=10


over
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: