您的位置:首页 > 其它

金融界货币型基金净值采集爬虫开发教程

2018-03-15 10:53 381 查看
金融界的货币型基金数据比较多,基金数据的参数项比较全,参考价值高。

本文再分享一份用JavaScript采集“金融界货币型基金净值”的爬虫源码,并做简单的解析。先看源码:

/**
金融界货币型基金净值采集爬虫源码
建议给爬虫配置代理IP,可有效解决金融界网站的反爬问题
**/

/**
 * Crawler configuration for the money-market fund net-value list served
 * from fund.jrj.com.cn.
 *
 * A single repeated field, "infos", selects every entry of
 * `$.currencyFundNetValueList`; each child field reads the property of the
 * same name from that entry through a JsonPath selector.
 */
var configs = (function() {
  // Builds one child field whose JsonPath selector is "$.<name>".
  function jsonPathField(name, alias) {
    return {
      name: name,
      alias: alias,
      selectorType: SelectorType.JsonPath,
      selector: "$." + name
    };
  }

  var fundColumns = [
    jsonPathField("fundCode", "基金代码"),                          // fund code
    jsonPathField("fundSName", "基金简称"),                         // fund short name
    jsonPathField("tenthouUnitIncm", "(今日)每万份基金净收益"),     // today: net income per 10,000 units
    jsonPathField("yearYld", "(今日)7日年化收益率"),                // today: 7-day annualized yield
    jsonPathField("tenthouUnitIncmPre", "(昨日)每万份基金净收益"),  // yesterday: net income per 10,000 units
    jsonPathField("yearYldPre", "(昨日)7日年化收益率"),             // yesterday: 7-day annualized yield
    jsonPathField("initEstabDate", "成立日期"),                     // establishment date
    jsonPathField("fundManager", "基金经理"),                       // fund manager
    jsonPathField("innerCode", "innerCode"),
    jsonPathField("manaCode", "manaCode")
  ];

  return {
    domains: ["fund.jrj.com.cn"],
    contentUrlRegexes: [
      /http[\w:\/]+fund\.jrj\.com\.cn\/.*/
    ],
    helperUrlRegexes: [""],
    interval: 10000,
    timeout: 60000,
    autoFindUrls: false,
    fields: [
      {
        name: "infos",
        selectorType: SelectorType.JsonPath,
        selector: "$.currencyFundNetValueList",
        repeated: true,
        children: fundColumns
      }
    ]
  };
})();

/**
 * Seeds the crawl with page 1 of the money-market fund net-value list.
 *
 * @param site  Object handed in by the crawler runtime; only its
 *              addScanUrl() method is used here.
 */
configs.initCrawl = function(site) {
  // The query string must contain "&currentPage=1". The listing previously
  // read "¤tPage" because "&curren" had been decoded as the HTML entity for
  // "¤", so the page parameter was never sent under its real name.
  // The trailing "_" parameter carries the current timestamp (presumably a
  // cache-buster expected by the endpoint).
  var scanUrl = "http://fund.jrj.com.cn/json/netvaluelist/currency?&manaCode=0&pageSize=20&currentPage=1&sortType=13&order=1&obj=netvaluelist&_=" + new Date().getTime();
  site.addScanUrl(scanUrl);
};

/**
 * Rewrites the downloaded payload into plain JSON before fields are extracted.
 *
 * The response is searched for `netvaluelist={...}`. The captured object is
 * parsed, every entry of its `currencyFundNetValueList` that is itself a
 * JSON-encoded string is decoded into an object, and the result is written
 * back to `page.raw` as a JSON string.
 *
 * @param page  Downloaded page; `page.raw` is read and possibly replaced.
 * @param site  Unused here.
 * @returns     The same page object. `page.raw` is left untouched when it is
 *              empty, when the pattern is not found, or when parsing fails.
 */
configs.afterDownloadPage = function(page, site) {
  // Pause after every download, before any processing.
  msleep(10000);
  if (!page.raw) return page;

  var match = /netvaluelist=(\{.*\})/.exec(page.raw);
  if (!match || !match[1]) return page;

  try {
    var payload = JSON.parse(match[1]);
    var rawEntries = payload.currencyFundNetValueList;
    var entries = [];
    // Walk the list by index instead of for...in, so that only real array
    // elements are visited. A missing or non-array list yields an empty list.
    if (Array.isArray(rawEntries)) {
      for (var i = 0; i < rawEntries.length; i++) {
        var entry = rawEntries[i];
        // Entries arrive as JSON text with escaped quotes; anything that is
        // already decoded is passed through instead of aborting the page.
        if (typeof entry === "string") {
          entry = JSON.parse(entry.replace(/\\"/g, '"'));
        }
        entries.push(entry);
      }
    }
    payload.currencyFundNetValueList = entries;
    page.raw = JSON.stringify(payload);
  }
  catch (err) {
    // Best effort: keep the original body and let later stages deal with it.
    console.log("Failed to parse page.raw! err: " + err);
  }
  return page;
};

/**
 * Queues the next result page after the current one has been processed.
 *
 * Reads `currentPage` and `pageNum` from the JSON in `page.raw` and, while
 * the current page is below the total, adds the URL of the following page.
 *
 * @param page     Page whose `raw` body is expected to be a JSON string.
 * @param content  Unused here.
 * @param site     Object handed in by the crawler runtime; only its
 *                 addUrl() method is used here.
 * @returns        Always false.
 */
configs.onProcessContentPage = function(page, content, site) {
  var jsonData;
  try {
    jsonData = JSON.parse(page.raw);
  }
  catch (err) {
    console.log("Failed to parse page.raw! err: " + err);
    return false;
  }
  // JSON.parse can legitimately return null (body "null"); nothing to page through.
  if (!jsonData) return false;

  // Explicit radix: without it, older engines read values such as "08" as octal.
  var curPageNum = parseInt(jsonData.currentPage, 10);
  var totalPageNum = parseInt(jsonData.pageNum, 10);
  // Comparisons against NaN are always false, so a missing page number would
  // otherwise slip through and queue a "currentPage=NaN" URL.
  if (isNaN(curPageNum) || isNaN(totalPageNum)) return false;
  if (curPageNum >= totalPageNum) return false;

  // "&currentPage=" had been corrupted to "¤tPage=" by HTML-entity decoding
  // ("&curren" -> "¤"), which made every follow-up request ask for the same page.
  var nextUrl = "http://fund.jrj.com.cn/json/netvaluelist/currency?&manaCode=0&pageSize=20&currentPage=" + (curPageNum + 1) + "&sortType=13&order=1&obj=netvaluelist&_=" + new Date().getTime();
  site.addUrl(nextUrl);
  return false;
};

/**
 * Post-processes a single extracted field value.
 *
 * Appends "%" to the two 7-day annualized yield fields ("infos.yearYld" and
 * "infos.yearYldPre") when they hold a truthy value; every other value,
 * including falsy ones, is returned unchanged.
 *
 * @param fieldName  Dotted field name, e.g. "infos.yearYld".
 * @param data       Extracted value.
 * @param page       Unused here.
 * @param site       Unused here.
 * @returns          The value, with "%" appended for non-empty yield fields.
 */
configs.afterExtractField = function(fieldName, data, page, site) {
  var isYieldField = fieldName == "infos.yearYld" || fieldName == "infos.yearYldPre";
  return (data && isYieldField) ? data + "%" : data;
};

// Build the crawler from the configuration and hooks defined above, then start it.
// NOTE(review): Crawler is not defined in this file — presumably a global
// supplied by the hosting crawler platform; confirm before running elsewhere.
var crawler = new Crawler(configs);
crawler.start();


本文使用Chrome浏览器分析金融界货币型基金净值的网页,按“F12”可打开浏览器“开发者工具”,对网页请求进行仔细分析。



通过查看网页源码并分析,我们不难得出一个结论:所有基金数据都是通过AJAX获取到的



按“F5”刷新网页,在浏览器开发者工具“Network”中寻找获取基金数据的js请求即可。



总结:爬虫不难,重点在于分析出“基金数据是如何获取到的”。网上有专门支持JavaScript开发的爬虫平台(比如,神箭手大数据平台等),直接复制并运行代码即可。
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: