java 爬虫Demo
2015-09-23 16:10
387 查看
package com.iminer.crawlers.gsdata; import java.io.IOException; import java.net.MalformedURLException; import java.util.ArrayList; import java.util.List; import net.sf.json.JSONArray; import net.sf.json.JSONObject; import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException; import com.gargoylesoftware.htmlunit.Page; import com.gargoylesoftware.htmlunit.UnexpectedPage; import com.gargoylesoftware.htmlunit.WebClient; import com.gargoylesoftware.htmlunit.html.HtmlElement; import com.gargoylesoftware.htmlunit.html.HtmlPage; import com.iminer.crawlers.CrawlerUtils4MusicAndMovie; public class Test { static String url = "http://www.gsdata.cn/index.php/rank/ranks?gid=@GID@&date=2015-09-22&page=1"; static String detailurl = "http://www.gsdata.cn/index.php/rank/single?id="; public static void main(String[] args) throws Exception { WebClient webClient = CrawlerUtils4MusicAndMovie.getClient(); String entranceUrl ="http://www.gsdata.cn/index.php/rank/detail?gid=0"; HtmlPage page = webClient.getPage(entranceUrl); List<?> byXPath = page.getByXPath("//ul[@class='group-items']/li"); List<String> gids = new ArrayList<String>(); for (Object object : byXPath) { HtmlElement oElement = (HtmlElement) object; String attribute = oElement.getAttribute("data-gid"); gids.add(attribute); } for (String gid : gids) { String tempUrl = url.replace("@GID@", gid); webClient.addRequestHeader("X-Requested-With", "XMLHttpRequest"); UnexpectedPage spage = webClient.getPage(tempUrl); JSONObject jsonObject = JSONObject.fromObject(spage.getWebResponse().getContentAsString()); String total = jsonObject.getJSONObject("data").getString("total"); System.out.println(total); JSONArray jsonArray = jsonObject.getJSONObject("data").getJSONArray("rows"); for (Object object : jsonArray) { JSONObject jsonObject2 = (JSONObject) object; String id = jsonObject2.getString("nickname_id"); String temptempurl = detailurl + id; System.out.println(temptempurl); //详细页的访问 HtmlPage page2 = webClient.getPage(temptempurl); List<?> byXPath2 = page2.getByXPath("//li[@class='li_2']"); for (Object object2 : byXPath2) { HtmlElement element = (HtmlElement) object2; System.out.println(element.getTextContent()); } //访问统计数据 //http://www.gsdata.cn/index.php/rank/singleStatistic?id=52 } } } }
相关文章推荐
- spring容器对象的生命周期
- Java遍历解析URL类型字符串中参数
- JavaBean使用示例
- spring中bean的scope
- spring容器创建对象的时机
- Java-DES算法加密解密工具类
- Java IO流的学习(1)
- Eclipse TypeScript 安装
- spring容器创建对象的3种方式(bean的实例化)
- JDK的安装与设置
- java 堆、栈与数据类型
- Eclipse相关设置
- Springmvc+mybatis 问题总结(续ing)
- java学习笔记之WeakHashMap 、IdentityHashMap、EnumMap
- Java正则表达式入门
- javaEE__sevlet
- test5.21
- Java IO流
- C4.5决策树--Java
- Java Session