数据爬取:爬取淘宝及国美在线搜索建议词
2016-11-30 19:06
288 查看
分为两种形式的抓取:
1.基于首字母的四轮次抓取 如: a ,aa ,aaa,aaaa
2.基于汉语拼音的三轮次抓取:附拼音表
链接:http://pan.baidu.com/s/1eS5Kdmq 密码:n9pb
使用的框架为webmagic
淘宝爬取:
国美抓取:
1.基于首字母的四轮次抓取 如: a ,aa ,aaa,aaaa
2.基于汉语拼音的三轮次抓取:附拼音表
链接:http://pan.baidu.com/s/1eS5Kdmq 密码:n9pb
使用的框架为webmagic
淘宝爬取:
public class TaobaoPinyinSuggestWordPageProcessor implements PageProcessor{ // 部分一:抓取网站的相关配置,包括编码、抓取间隔、重试次数等 private Site site = Site. me(). setUserAgent("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36ss"). setRetryTimes(5).setSleepTime(10); @Override // process是定制爬虫逻辑的核心接口,在这里编写抽取逻辑 public void process(Page page) { //["童装","40827952"] JSONArray a = JSON.parseArray(JSON.parseObject(page.getJson().toString()).get("result").toString()); for (Object aa : a) { String replace = aa.toString().replace("[", "").replace("]", "").replace("\"", ""); String substring = replace.substring(0,replace.indexOf(",")); try { IOUtils.write(substring.getBytes(), new FileOutputStream(new File(fileName),true)); IOUtils.write(IOUtils.LINE_SEPARATOR.getBytes(), new FileOutputStream(new File(fileName),true)); } catch (FileNotFoundException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } } if(a.size()>9){ String url = page.getUrl().toString(); //https://suggest.taobao.com/sug?code=utf-8&q= int index = "https://suggest.taobao.com/sug?code=utf-8&q=".length(); String query = url.substring(index, url.length()); char[] chars = query.toCharArray(); int num = 0; for (char c : chars) { if(c>=65 && c <= 90){ num++; } } if(num <3){ List<String> temp = new ArrayList<String>(); for (String add : speeeds) { temp.add(url+add); } page.addTargetRequests(temp); } } } @Override public Site getSite() { return site; } private static String fileName ; private static List<String> speeeds; public static void main(String[] args) throws IOException { if(args.length>1 && args.length<3){ fileName = args[0]; speeeds = FileUtils.readLines(new File(args[1])); }else{ fileName = "E:\\temp\\temp_pinyin_suggest.txt"; speeeds = FileUtils.readLines(new File("E:\\temp\\pinyin.txt")); } long one = System.currentTimeMillis(); for (String q : speeeds) { long temp = System.currentTimeMillis(); String url 
="https://suggest.taobao.com/sug?code=utf-8&q="+q; Spider.create(new TaobaoPinyinSuggestWordPageProcessor()).addPipeline(new ConsolePipeline()).addUrl(url). thread(7).run(); System.out.println("the speeed is : "+q+" end and time is :" + (System.currentTimeMillis() -temp) + " ms"); } long two = System.currentTimeMillis(); System.out.println("one end and time is :" + (two -one) + " ms"); } }
public class TaobaoSuggestWordPageProcessor implements PageProcessor{ // 部分一:抓取网站的相关配置,包括编码、抓取间隔、重试次数等 private Site site = Site. me(). setUserAgent("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36ss"). setRetryTimes(5).setSleepTime(100); @Override // process是定制爬虫逻辑的核心接口,在这里编写抽取逻辑 public void process(Page page) { //["童装","40827952"] JSONArray a = JSON.parseArray(JSON.parseObject(page.getJson().toString()).get("result").toString()); for (Object aa : a) { String replace = aa.toString().replace("[", "").replace("]", "").replace("\"", ""); String substring = replace.substring(0,replace.indexOf(",")); try { IOUtils.write(substring.getBytes(), new FileOutputStream(new File(fileName),true)); IOUtils.write(IOUtils.LINE_SEPARATOR.getBytes(), new FileOutputStream(new File(fileName),true)); } catch (FileNotFoundException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } } if(a.size()>9){ String url = page.getUrl().toString(); //https://suggest.taobao.com/sug?code=utf-8&q= int index = "https://suggest.taobao.com/sug?code=utf-8&q=".length(); String query = url.substring(index, url.length()); if(query.length() <4){ String[] speeeds = {"q","w","e","r","t","y","u","i","o","p","a", "s","d","f","g","h","j","k","l", "z","x","c","v","b","n","m"}; List<String> temp = new ArrayList<String>(); for (String add : speeeds) { temp.add(url+add); } page.addTargetRequests(temp); } } } @Override public Site getSite() { return site; } private static String fileName ; public static void main(String[] args) throws IOException { if(args.length>0 && args.length<2){ fileName = args[0]; }else{ fileName = "E:\\temp\\temp_tb_suggest.txt"; } String[] speeeds = {"q","w","e","r","t","y","u","i","o","p","a", "s","d","f","g","h","j","k","l", "z","x","c","v","b","n","m"}; long one = System.currentTimeMillis(); for (String q : speeeds) { long temp = System.currentTimeMillis(); String url 
="https://suggest.taobao.com/sug?code=utf-8&q="+q; Spider.create(new TaobaoSuggestWordPageProcessor()).addPipeline(new ConsolePipeline()).addUrl(url). thread(7).run(); System.out.println("the speeed is : "+q+" end and time is :" + (System.currentTimeMillis() -temp) + " ms"); } long two = System.currentTimeMillis(); System.out.println("one end and time is :" + (two -one) + " ms"); } }
国美抓取:
public class GomeSuggestWordPageProcessor implements PageProcessor{ // 部分一:抓取网站的相关配置,包括编码、抓取间隔、重试次数等 private Site site = Site. me(). setUserAgent("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36ss"). setRetryTimes(5).setSleepTime(100); @Override // process是定制爬虫逻辑的核心接口,在这里编写抽取逻辑 public void process(Page page) { //["童装","40827952"] JSONArray a = JSON.parseArray(page.getJson().toString()); ArrayList<String> temp = new ArrayList<String>(); for (Object aa : a) { if(aa.toString().indexOf("{\"cat\":")==-1){ String replace = aa.toString().replace("[", "").replace("]", "").replace("\"", ""); String substring = replace.substring(0,replace.indexOf(",")); temp.add(substring); } } for (int i = 0; i < temp.size(); i++) { try { IOUtils.write(temp.get(i).getBytes(), new FileOutputStream(new File(fileName),true)); IOUtils.write(IOUtils.LINE_SEPARATOR.getBytes(), new FileOutputStream(new File(fileName),true)); } catch (FileNotFoundException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } } if(temp.size()>8){ String url = page.getUrl().toString(); //https://suggest.taobao.com/sug?code=utf-8&q= int index = "http://api.search.gome.com.cn/p/suggest?from=headSearch&query=".length(); String query = url.substring(index, url.length()); if(query.length() <4){ String[] speeeds = {"q","w","e","r","t","y","u","i","o","p","a", "s","d","f","g","h","j","k","l", "z","x","c","v","b","n","m"}; List<String> temps = new ArrayList<String>(); for (String add : speeeds) { temps.add(url+add); } page.addTargetRequests(temps); } } } @Override public Site getSite() { return site; } private static String fileName ; public static void main(String[] args) throws IOException { if(args.length>0 && args.length<2){ fileName = args[0]; }else{ fileName = "E:\\temp\\temp_gome_suggest.txt"; } String[] speeeds = {"q","w","e","r","t","y","u","i","o","p","a", "s","d","f","g","h","j","k","l", "z","x","c","v","b","n","m"}; long one = 
System.currentTimeMillis(); for (String q : speeeds) { long temp = System.currentTimeMillis(); String url ="http://api.search.gome.com.cn/p/suggest?from=headSearch&query="+q; Spider.create(new GomeSuggestWordPageProcessor()).addPipeline(new ConsolePipeline()).addUrl(url). thread(7).run(); System.out.println("the speeed is : "+q+" end and time is :" + (System.currentTimeMillis() -temp) + " ms"); } long two = System.currentTimeMillis(); System.out.println("one end and time is :" + (two -one) + " ms"); } }
public class GomePinyinSuggestWordPageProcessor implements PageProcessor{ // 部分一:抓取网站的相关配置,包括编码、抓取间隔、重试次数等 private Site site = Site. me(). setUserAgent("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36ss"). setRetryTimes(5).setSleepTime(50); @Override // process是定制爬虫逻辑的核心接口,在这里编写抽取逻辑 public void process(Page page) { //["童装","40827952"] JSONArray a = JSON.parseArray(page.getJson().toString()); ArrayList<String> temp = new ArrayList<String>(); for (Object aa : a) { if(aa.toString().indexOf("{\"cat\":")==-1){ String replace = aa.toString().replace("[", "").replace("]", "").replace("\"", ""); String substring = replace.substring(0,replace.indexOf(",")); temp.add(substring); } } for (int i = 0; i < temp.size(); i++) { try { IOUtils.write(temp.get(i).getBytes(), new FileOutputStream(new File(fileName),true)); IOUtils.write(IOUtils.LINE_SEPARATOR.getBytes(), new FileOutputStream(new File(fileName),true)); } catch (FileNotFoundException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } } if(temp.size()>8){ String url = page.getUrl().toString(); //https://suggest.taobao.com/sug?code=utf-8&q= int index = "http://api.search.gome.com.cn/p/suggest?from=headSearch&query=".length(); String query = url.substring(index, url.length()); char[] chars = query.toCharArray(); int num = 0; for (char c : chars) { if(c>=65 && c <= 90){ num++; } } if(num <3){ List<String> temps = new ArrayList<String>(); for (String add : speeeds) { temps.add(url+add); } page.addTargetRequests(temps); } } } @Override public Site getSite() { return site; } private static String fileName ; private static List<String> speeeds; public static void main(String[] args) throws IOException { if(args.length>1 && args.length<3){ fileName = args[0]; speeeds = FileUtils.readLines(new File(args[1])); }else{ fileName = "E:\\temp\\temp_pinyin_gome_suggest.txt"; speeeds = FileUtils.readLines(new File("E:\\temp\\pinyin.txt")); } long one = 
System.currentTimeMillis(); for (String q : speeeds) { long temp = System.currentTimeMillis(); String url ="http://api.search.gome.com.cn/p/suggest?from=headSearch&query="+q; Spider.create(new GomePinyinSuggestWordPageProcessor()).addPipeline(new ConsolePipeline()).addUrl(url). thread(5).run(); System.out.println("the speeed is : "+q+" end and time is :" + (System.currentTimeMillis() -temp) + " ms"); } long two = System.currentTimeMillis(); System.out.println("one end and time is :" + (two -one) + " ms"); } }
相关文章推荐
- Expression构建DataTable to Entity 映射委托 sqlserver 数据库里面金额类型为什么不建议用float,实例告诉你为什么不能。 sql server 多行数据合并成一列 C# 字符串大写转小写,小写转大写,数字保留,其他除外 从0开始用U盘制作启动盘装Windows10系统(联想R720笔记本)并永久激活方法 纯CSS打造淘宝导航菜单栏 C# Winform
- 《程序员的第一年》---------- 【抓取网页数据】定时查寻淘宝搜索结果并用excel记录下来(HttpWebRequest与正则等的使用)
- 1.MVC框架复习 2.Ajax加强 3.搜索建议 4,三级联动 5、刷新分页 6、Ajax局部动态更新数据
- 1.MVC框架复习 2.Ajax加强 3.搜索建议 4,三级联动 5、刷新分页 6、Ajax局部动态更新数据
- DataSet接收XML数据并按条件搜索
- 构、搜索、性能等技术角度分析了电子商务网站重点要解决的几个问题,并给出一些建议和方案
- ASP ACCESS 数据高级搜索 By Stabx
- [导入]ASP ACCESS 数据高级搜索 By Stabx
- 从sql server中随即搜索出几条数据
- 建议dudu为博客园加上MSDN资源库搜索功能
- DataSet接收XML数据并按条件搜索
- 全网软件:高级信息搜索数据采集软件 InfoSeek & FastSeek
- Google首页新增“搜索建议”
- (原创)一组基于Lucene的cache和非cache搜索测试数据
- 为了让搜索引擎更好地搜索到你的网站,建议在中加入:
- win2000进程虚拟内存数据搜索与修改
- 地图图层所应包含的数据表(建议)
- 近日探得用C++将二进制数据存储到XML文件的方法,因在研究时,未得到网上其他同仁的帮助(网上搜索了半天没有相关资料,只有DOTNET的),在这里不敢独享,给别的同仁在搜索时能够搜索到相关资料,也算是绵薄之力! )
- C#根据条件搜索Infopath生成的XML数据