您的位置:首页 > 其它

数据爬取:爬取淘宝及国美在线搜索建议词

2016-11-30 19:06 288 查看
分为两种形式的抓取:
1. 基于首字母的四轮次抓取,如:a、aa、aaa、aaaa
2. 基于汉语拼音的三轮次抓取(附拼音表):

链接:http://pan.baidu.com/s/1eS5Kdmq 密码:n9pb

使用的框架为webmagic

淘宝爬取:
/**
 * WebMagic page processor that collects Taobao search-suggestion words.
 *
 * <p>Every suggestion returned for a query is appended to an output file, one word per line.
 * When a response holds more than nine suggestions, the query is extended with each seed
 * string read from the seed file and the resulting URLs are queued, as long as the query
 * contains fewer than three uppercase ASCII letters.
 */
public class TaobaoPinyinSuggestWordPageProcessor implements PageProcessor {

    /** Prefix of every suggestion request; the query text follows it directly. */
    private static final String SUGGEST_URL = "https://suggest.taobao.com/sug?code=utf-8&q=";

    /**
     * Serializes appends to the output file. {@code main} starts each spider with several
     * threads, so {@link #process(Page)} may be invoked concurrently.
     */
    private static final Object WRITE_LOCK = new Object();

    /** Path of the file the suggestion words are appended to; set by {@code main}. */
    private static String fileName;

    /** Seed strings, one per line of the seed file; set by {@code main}. */
    private static List<String> speeeds;

    // Part one: crawl configuration for the site (user agent, retry count, sleep time).
    // NOTE(review): the trailing "ss" after "Safari/537.36" looks like a typo; it is kept
    // unchanged here - confirm whether it is intentional.
    private final Site site = Site.me()
            .setUserAgent("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36ss")
            .setRetryTimes(5)
            .setSleepTime(10);

    /**
     * Core hook for the crawler logic: extracts the suggestion words from the JSON response,
     * appends them to the output file and queues follow-up queries.
     *
     * @param page the fetched suggestion response
     */
    @Override
    public void process(Page page) {
        // Sample entry of the "result" array: ["童装","40827952"]
        Object result = JSON.parseObject(page.getJson().toString()).get("result");
        if (result == null) {
            // No "result" key in the response: nothing to record or expand.
            return;
        }
        JSONArray suggestions = JSON.parseArray(result.toString());

        StringBuilder batch = new StringBuilder();
        for (Object entry : suggestions) {
            batch.append(firstField(entry)).append(IOUtils.LINE_SEPARATOR);
        }
        appendToOutput(batch.toString());

        if (suggestions.size() > 9) {
            String url = page.getUrl().toString();
            String query = url.substring(SUGGEST_URL.length());

            // Presumably every seed in the pinyin file contributes one uppercase letter, so
            // this count is the number of seeds already appended - confirm against the file.
            int upperCaseCount = 0;
            for (char c : query.toCharArray()) {
                if (c >= 'A' && c <= 'Z') {
                    upperCaseCount++;
                }
            }

            if (upperCaseCount < 3) {
                List<String> nextUrls = new ArrayList<String>(speeeds.size());
                for (String seed : speeeds) {
                    nextUrls.add(url + seed);
                }
                page.addTargetRequests(nextUrls);
            }
        }
    }

    /**
     * Flattens a suggestion entry to text and returns the part before the first comma.
     * An entry without a comma is returned whole instead of raising
     * {@link StringIndexOutOfBoundsException}.
     */
    private static String firstField(Object entry) {
        String flat = entry.toString().replace("[", "").replace("]", "").replace("\"", "");
        int comma = flat.indexOf(',');
        return comma < 0 ? flat : flat.substring(0, comma);
    }

    /**
     * Appends {@code text} to the output file with a single write, then closes the stream.
     * Failures are reported on stderr and otherwise ignored so one bad write does not stop
     * the crawl.
     */
    private static void appendToOutput(String text) {
        if (text.isEmpty()) {
            return;
        }
        synchronized (WRITE_LOCK) {
            try (FileOutputStream out = new FileOutputStream(new File(fileName), true)) {
                // Explicit UTF-8 so the file does not depend on the platform default charset.
                IOUtils.write(text, out, "UTF-8");
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Runs one spider per seed string and prints the elapsed time of each run.
     *
     * @param args exactly two arguments (output file, seed file); any other count falls back
     *             to the built-in paths
     * @throws IOException if the seed file cannot be read
     */
    public static void main(String[] args) throws IOException {
        if (args.length == 2) {
            fileName = args[0];
            speeeds = FileUtils.readLines(new File(args[1]));
        } else {
            fileName = "E:\\temp\\temp_pinyin_suggest.txt";
            speeeds = FileUtils.readLines(new File("E:\\temp\\pinyin.txt"));
        }

        long start = System.currentTimeMillis();
        for (String q : speeeds) {
            long seedStart = System.currentTimeMillis();
            String url = SUGGEST_URL + q;
            Spider.create(new TaobaoPinyinSuggestWordPageProcessor())
                    .addPipeline(new ConsolePipeline())
                    .addUrl(url)
                    .thread(7)
                    .run();
            System.out.println("the speeed is : " + q + "  end and time is :" + (System.currentTimeMillis() - seedStart) + " ms");
        }
        long end = System.currentTimeMillis();
        System.out.println("one end and time is :" + (end - start) + " ms");
    }

}

/**
 * WebMagic page processor that collects Taobao search-suggestion words by letter prefix.
 *
 * <p>Every suggestion returned for a query is appended to an output file, one word per line.
 * When a response holds more than nine suggestions and the query is shorter than four
 * characters, the query is extended with each letter a-z and the resulting URLs are queued.
 */
public class TaobaoSuggestWordPageProcessor implements PageProcessor {

    /** Prefix of every suggestion request; the query text follows it directly. */
    private static final String SUGGEST_URL = "https://suggest.taobao.com/sug?code=utf-8&q=";

    /** Letters used both as the initial queries and as suffixes when a query is extended. */
    private static final String[] LETTERS = {"q","w","e","r","t","y","u","i","o","p","a",
            "s","d","f","g","h","j","k","l",
            "z","x","c","v","b","n","m"};

    /**
     * Serializes appends to the output file. {@code main} starts each spider with several
     * threads, so {@link #process(Page)} may be invoked concurrently.
     */
    private static final Object WRITE_LOCK = new Object();

    /** Path of the file the suggestion words are appended to; set by {@code main}. */
    private static String fileName;

    // Part one: crawl configuration for the site (user agent, retry count, sleep time).
    // NOTE(review): the trailing "ss" after "Safari/537.36" looks like a typo; it is kept
    // unchanged here - confirm whether it is intentional.
    private final Site site = Site.me()
            .setUserAgent("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36ss")
            .setRetryTimes(5)
            .setSleepTime(100);

    /**
     * Core hook for the crawler logic: extracts the suggestion words from the JSON response,
     * appends them to the output file and queues follow-up queries.
     *
     * @param page the fetched suggestion response
     */
    @Override
    public void process(Page page) {
        // Sample entry of the "result" array: ["童装","40827952"]
        Object result = JSON.parseObject(page.getJson().toString()).get("result");
        if (result == null) {
            // No "result" key in the response: nothing to record or expand.
            return;
        }
        JSONArray suggestions = JSON.parseArray(result.toString());

        StringBuilder batch = new StringBuilder();
        for (Object entry : suggestions) {
            batch.append(firstField(entry)).append(IOUtils.LINE_SEPARATOR);
        }
        appendToOutput(batch.toString());

        if (suggestions.size() <= 9) {
            return;
        }

        String url = page.getUrl().toString();
        String query = url.substring(SUGGEST_URL.length());
        if (query.length() < 4) {
            List<String> nextUrls = new ArrayList<String>(LETTERS.length);
            for (String letter : LETTERS) {
                nextUrls.add(url + letter);
            }
            page.addTargetRequests(nextUrls);
        }
    }

    /**
     * Flattens a suggestion entry to text and returns the part before the first comma.
     * An entry without a comma is returned whole instead of raising
     * {@link StringIndexOutOfBoundsException}.
     */
    private static String firstField(Object entry) {
        String flat = entry.toString().replace("[", "").replace("]", "").replace("\"", "");
        int comma = flat.indexOf(',');
        return comma < 0 ? flat : flat.substring(0, comma);
    }

    /**
     * Appends {@code text} to the output file with a single write, then closes the stream.
     * Failures are reported on stderr and otherwise ignored so one bad write does not stop
     * the crawl.
     */
    private static void appendToOutput(String text) {
        if (text.isEmpty()) {
            return;
        }
        synchronized (WRITE_LOCK) {
            try (FileOutputStream out = new FileOutputStream(new File(fileName), true)) {
                // Explicit UTF-8 so the file does not depend on the platform default charset.
                IOUtils.write(text, out, "UTF-8");
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Runs one spider per letter a-z and prints the elapsed time of each run.
     *
     * @param args exactly one argument (output file); any other count falls back to the
     *             built-in path
     * @throws IOException declared by the original signature and kept for compatibility
     */
    public static void main(String[] args) throws IOException {
        if (args.length == 1) {
            fileName = args[0];
        } else {
            fileName = "E:\\temp\\temp_tb_suggest.txt";
        }

        long start = System.currentTimeMillis();
        for (String q : LETTERS) {
            long seedStart = System.currentTimeMillis();
            String url = SUGGEST_URL + q;
            Spider.create(new TaobaoSuggestWordPageProcessor())
                    .addPipeline(new ConsolePipeline())
                    .addUrl(url)
                    .thread(7)
                    .run();
            System.out.println("the speeed is : " + q + "  end and time is :" + (System.currentTimeMillis() - seedStart) + " ms");
        }
        long end = System.currentTimeMillis();
        System.out.println("one end and time is :" + (end - start) + " ms");
    }

}

国美抓取:

/**
 * WebMagic page processor that collects Gome search-suggestion words by letter prefix.
 *
 * <p>The response is parsed as a JSON array. Entries whose text contains {@code {"cat":} are
 * skipped; for every other entry the text before the first comma is appended to an output
 * file, one word per line. When more than eight words were kept and the query is shorter
 * than four characters, the query is extended with each letter a-z and the resulting URLs
 * are queued.
 */
public class GomeSuggestWordPageProcessor implements PageProcessor {

    /** Prefix of every suggestion request; the query text follows it directly. */
    private static final String SUGGEST_URL = "http://api.search.gome.com.cn/p/suggest?from=headSearch&query=";

    /** Letters used both as the initial queries and as suffixes when a query is extended. */
    private static final String[] LETTERS = {"q","w","e","r","t","y","u","i","o","p","a",
            "s","d","f","g","h","j","k","l",
            "z","x","c","v","b","n","m"};

    /**
     * Serializes appends to the output file. {@code main} starts each spider with several
     * threads, so {@link #process(Page)} may be invoked concurrently.
     */
    private static final Object WRITE_LOCK = new Object();

    /** Path of the file the suggestion words are appended to; set by {@code main}. */
    private static String fileName;

    // Part one: crawl configuration for the site (user agent, retry count, sleep time).
    // NOTE(review): the trailing "ss" after "Safari/537.36" looks like a typo; it is kept
    // unchanged here - confirm whether it is intentional.
    private final Site site = Site.me()
            .setUserAgent("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36ss")
            .setRetryTimes(5)
            .setSleepTime(100);

    /**
     * Core hook for the crawler logic: extracts the suggestion words from the JSON response,
     * appends them to the output file and queues follow-up queries.
     *
     * @param page the fetched suggestion response
     */
    @Override
    public void process(Page page) {
        JSONArray entries = JSON.parseArray(page.getJson().toString());

        List<String> words = new ArrayList<String>();
        for (Object entry : entries) {
            String text = entry.toString();
            // Entries carrying a "cat" object are not suggestion words; skip them.
            if (!text.contains("{\"cat\":")) {
                words.add(firstField(text));
            }
        }

        StringBuilder batch = new StringBuilder();
        for (String word : words) {
            batch.append(word).append(IOUtils.LINE_SEPARATOR);
        }
        appendToOutput(batch.toString());

        if (words.size() <= 8) {
            return;
        }

        String url = page.getUrl().toString();
        String query = url.substring(SUGGEST_URL.length());
        if (query.length() < 4) {
            List<String> nextUrls = new ArrayList<String>(LETTERS.length);
            for (String letter : LETTERS) {
                nextUrls.add(url + letter);
            }
            page.addTargetRequests(nextUrls);
        }
    }

    /**
     * Strips brackets and double quotes from an entry's text and returns the part before the
     * first comma. Text without a comma is returned whole instead of raising
     * {@link StringIndexOutOfBoundsException}.
     */
    private static String firstField(String entryText) {
        String flat = entryText.replace("[", "").replace("]", "").replace("\"", "");
        int comma = flat.indexOf(',');
        return comma < 0 ? flat : flat.substring(0, comma);
    }

    /**
     * Appends {@code text} to the output file with a single write, then closes the stream.
     * Failures are reported on stderr and otherwise ignored so one bad write does not stop
     * the crawl.
     */
    private static void appendToOutput(String text) {
        if (text.isEmpty()) {
            return;
        }
        synchronized (WRITE_LOCK) {
            try (FileOutputStream out = new FileOutputStream(new File(fileName), true)) {
                // Explicit UTF-8 so the file does not depend on the platform default charset.
                IOUtils.write(text, out, "UTF-8");
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Runs one spider per letter a-z and prints the elapsed time of each run.
     *
     * @param args exactly one argument (output file); any other count falls back to the
     *             built-in path
     * @throws IOException declared by the original signature and kept for compatibility
     */
    public static void main(String[] args) throws IOException {
        if (args.length == 1) {
            fileName = args[0];
        } else {
            fileName = "E:\\temp\\temp_gome_suggest.txt";
        }

        long start = System.currentTimeMillis();
        for (String q : LETTERS) {
            long seedStart = System.currentTimeMillis();
            String url = SUGGEST_URL + q;
            Spider.create(new GomeSuggestWordPageProcessor())
                    .addPipeline(new ConsolePipeline())
                    .addUrl(url)
                    .thread(7)
                    .run();
            System.out.println("the speeed is : " + q + "  end and time is :" + (System.currentTimeMillis() - seedStart) + " ms");
        }
        long end = System.currentTimeMillis();
        System.out.println("one end and time is :" + (end - start) + " ms");
    }

}

/**
 * WebMagic page processor that collects Gome search-suggestion words, expanding each query
 * with seed strings read from a seed file.
 *
 * <p>The response is parsed as a JSON array. Entries whose text contains {@code {"cat":} are
 * skipped; for every other entry the text before the first comma is appended to an output
 * file, one word per line. When more than eight words were kept and the query contains fewer
 * than three uppercase ASCII letters, the query is extended with each seed string and the
 * resulting URLs are queued.
 */
public class GomePinyinSuggestWordPageProcessor implements PageProcessor {

    /** Prefix of every suggestion request; the query text follows it directly. */
    private static final String SUGGEST_URL = "http://api.search.gome.com.cn/p/suggest?from=headSearch&query=";

    /**
     * Serializes appends to the output file. {@code main} starts each spider with several
     * threads, so {@link #process(Page)} may be invoked concurrently.
     */
    private static final Object WRITE_LOCK = new Object();

    /** Path of the file the suggestion words are appended to; set by {@code main}. */
    private static String fileName;

    /** Seed strings, one per line of the seed file; set by {@code main}. */
    private static List<String> speeeds;

    // Part one: crawl configuration for the site (user agent, retry count, sleep time).
    // NOTE(review): the trailing "ss" after "Safari/537.36" looks like a typo; it is kept
    // unchanged here - confirm whether it is intentional.
    private final Site site = Site.me()
            .setUserAgent("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36ss")
            .setRetryTimes(5)
            .setSleepTime(50);

    /**
     * Core hook for the crawler logic: extracts the suggestion words from the JSON response,
     * appends them to the output file and queues follow-up queries.
     *
     * @param page the fetched suggestion response
     */
    @Override
    public void process(Page page) {
        JSONArray entries = JSON.parseArray(page.getJson().toString());

        List<String> words = new ArrayList<String>();
        for (Object entry : entries) {
            String text = entry.toString();
            // Entries carrying a "cat" object are not suggestion words; skip them.
            if (!text.contains("{\"cat\":")) {
                words.add(firstField(text));
            }
        }

        StringBuilder batch = new StringBuilder();
        for (String word : words) {
            batch.append(word).append(IOUtils.LINE_SEPARATOR);
        }
        appendToOutput(batch.toString());

        if (words.size() <= 8) {
            return;
        }

        String url = page.getUrl().toString();
        String query = url.substring(SUGGEST_URL.length());

        // Presumably every seed in the pinyin file contributes one uppercase letter, so this
        // count is the number of seeds already appended - confirm against the file.
        int upperCaseCount = 0;
        for (char c : query.toCharArray()) {
            if (c >= 'A' && c <= 'Z') {
                upperCaseCount++;
            }
        }

        if (upperCaseCount < 3) {
            List<String> nextUrls = new ArrayList<String>(speeeds.size());
            for (String seed : speeeds) {
                nextUrls.add(url + seed);
            }
            page.addTargetRequests(nextUrls);
        }
    }

    /**
     * Strips brackets and double quotes from an entry's text and returns the part before the
     * first comma. Text without a comma is returned whole instead of raising
     * {@link StringIndexOutOfBoundsException}.
     */
    private static String firstField(String entryText) {
        String flat = entryText.replace("[", "").replace("]", "").replace("\"", "");
        int comma = flat.indexOf(',');
        return comma < 0 ? flat : flat.substring(0, comma);
    }

    /**
     * Appends {@code text} to the output file with a single write, then closes the stream.
     * Failures are reported on stderr and otherwise ignored so one bad write does not stop
     * the crawl.
     */
    private static void appendToOutput(String text) {
        if (text.isEmpty()) {
            return;
        }
        synchronized (WRITE_LOCK) {
            try (FileOutputStream out = new FileOutputStream(new File(fileName), true)) {
                // Explicit UTF-8 so the file does not depend on the platform default charset.
                IOUtils.write(text, out, "UTF-8");
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Runs one spider per seed string and prints the elapsed time of each run.
     *
     * @param args exactly two arguments (output file, seed file); any other count falls back
     *             to the built-in paths
     * @throws IOException if the seed file cannot be read
     */
    public static void main(String[] args) throws IOException {
        if (args.length == 2) {
            fileName = args[0];
            speeeds = FileUtils.readLines(new File(args[1]));
        } else {
            fileName = "E:\\temp\\temp_pinyin_gome_suggest.txt";
            speeeds = FileUtils.readLines(new File("E:\\temp\\pinyin.txt"));
        }

        long start = System.currentTimeMillis();
        for (String q : speeeds) {
            long seedStart = System.currentTimeMillis();
            String url = SUGGEST_URL + q;
            Spider.create(new GomePinyinSuggestWordPageProcessor())
                    .addPipeline(new ConsolePipeline())
                    .addUrl(url)
                    .thread(5)
                    .run();
            System.out.println("the speeed is : " + q + "  end and time is :" + (System.currentTimeMillis() - seedStart) + " ms");
        }
        long end = System.currentTimeMillis();
        System.out.println("one end and time is :" + (end - start) + " ms");
    }

}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
相关文章推荐