您的位置:首页 > 编程语言 > Java开发

Java 爬虫 Demo

2015-09-23 16:10 387 查看
package com.iminer.crawlers.gsdata;

import java.io.IOException;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.List;
import net.sf.json.JSONArray;
import net.sf.json.JSONObject;
import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException;
import com.gargoylesoftware.htmlunit.Page;
import com.gargoylesoftware.htmlunit.UnexpectedPage;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlElement;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import com.iminer.crawlers.CrawlerUtils4MusicAndMovie;

/**
 * Demo crawler for the gsdata.cn ranking pages.
 *
 * <p>Flow: load the ranking entrance page, read the group ids from its
 * {@code group-items} list, request the first ranking page of every group as JSON,
 * and for each returned row open the detail page and print the text of its
 * {@code li_2} list items. All output goes to standard output.
 *
 * <p>NOTE(review): the {@link WebClient} obtained from
 * {@code CrawlerUtils4MusicAndMovie.getClient()} is never closed here; whether that
 * matters depends on how the helper manages the client - confirm.
 */
public class Test {

    /** Ranking-list URL template; {@code @GID@} is replaced with a group id. */
    static String url = "http://www.gsdata.cn/index.php/rank/ranks?gid=@GID@&date=2015-09-22&page=1";

    /** Detail-page URL prefix; a row's {@code nickname_id} is appended to it. */
    static String detailurl = "http://www.gsdata.cn/index.php/rank/single?id=";

    /**
     * Runs the crawl once.
     *
     * @param args unused
     * @throws Exception if a page cannot be fetched or the ranking response cannot be parsed
     */
    public static void main(String[] args) throws Exception {
        WebClient webClient = CrawlerUtils4MusicAndMovie.getClient();
        String entranceUrl = "http://www.gsdata.cn/index.php/rank/detail?gid=0";
        HtmlPage entrancePage = webClient.getPage(entranceUrl);
        List<String> gids = collectGroupIds(entrancePage);

        // Set once instead of on every iteration. It is added only after the entrance
        // page has been fetched, so that first request is still sent without it.
        webClient.addRequestHeader("X-Requested-With", "XMLHttpRequest");

        for (String gid : gids) {
            crawlGroup(webClient, gid);
        }
    }

    /** Reads the {@code data-gid} attribute of every {@code group-items} entry, skipping entries without one. */
    private static List<String> collectGroupIds(HtmlPage entrancePage) {
        List<String> gids = new ArrayList<String>();
        for (Object node : entrancePage.getByXPath("//ul[@class='group-items']/li")) {
            String gid = ((HtmlElement) node).getAttribute("data-gid");
            // An empty value would build a ranking URL with "gid=", so such entries are skipped.
            if (gid == null || gid.isEmpty()) {
                continue;
            }
            gids.add(gid);
        }
        return gids;
    }

    /** Fetches the ranking JSON of one group, prints its total and visits the detail page of every row. */
    private static void crawlGroup(WebClient webClient, String gid) throws Exception {
        String rankUrl = url.replace("@GID@", gid);

        // Use the Page interface rather than a concrete page class: the class HtmlUnit
        // instantiates depends on the response content type, and only the raw body is needed.
        Page rankPage = webClient.getPage(rankUrl);
        JSONObject response = JSONObject.fromObject(rankPage.getWebResponse().getContentAsString());
        JSONObject data = response.getJSONObject("data");

        System.out.println(data.getString("total"));

        JSONArray rows = data.getJSONArray("rows");
        for (Object row : rows) {
            String id = ((JSONObject) row).getString("nickname_id");
            printDetail(webClient, detailurl + id);

            // Statistics data is not fetched yet; its endpoint looks like:
            // http://www.gsdata.cn/index.php/rank/singleStatistic?id=52
        }
    }

    /** Prints the detail URL, then the text content of every {@code li_2} item on that page. */
    private static void printDetail(WebClient webClient, String detailPageUrl) throws Exception {
        System.out.println(detailPageUrl);

        // Visit the detail page.
        HtmlPage detailPage = webClient.getPage(detailPageUrl);
        for (Object node : detailPage.getByXPath("//li[@class='li_2']")) {
            System.out.println(((HtmlElement) node).getTextContent());
        }
    }

}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: