您的位置:首页 > Web前端 > JavaScript

Jsoup爬数据+设置代理IP

2016-08-13 00:25 141 查看
本文利用Jsoup工具从代理列表网站上抓取代理IP,然后在访问目标地址时动态切换HTTP代理(并非真正改变本机IP),从而通过不同的代理IP进行远程访问。

主要工作类:

/**
 * Scrapes a list of HTTP proxies with Jsoup and then requests every article URL
 * through each of those proxies.
 *
 * <p>The work is fanned out over a thread pool: one {@code MyRun} task per article URL,
 * each of which calls {@link #visit(String, String)} once per proxy.
 */
public class Test {

    /** Page whose {@code tr.odd} table rows are scraped for proxy entries. */
    private static final String PROXY_LIST_URL = "http://www.xicidaili.com/nt";

    /** Text file read by {@link #parse()}; each line is handed to a task as the URL to visit. */
    private static final String ARTICLE_LIST_PATH = "/Users/tianjia/Documents/article";

    /** User-Agent header sent with every request. */
    private static final String USER_AGENT = "Mozilla";

    /**
     * Program entry point; simply delegates to {@link #parse()}.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        parse();
    }

    /**
     * Fetches the proxy list, reads the article URLs from disk and submits one task per
     * article to a cached thread pool.
     *
     * <p>If the proxy list cannot be downloaded the failure is reported and nothing is
     * submitted: the tasks cannot do anything useful without proxies.
     */
    public static void parse() {
        List<String> proxies;
        try {
            proxies = getHtml();
        } catch (IOException e) {
            e.printStackTrace();
            return;
        }

        List<String> articles = FileUtil.getListFromFile(ARTICLE_LIST_PATH);
        ExecutorService executorService = Executors.newCachedThreadPool();
        try {
            for (String article : articles) {
                executorService.execute(new MyRun(article, proxies));
            }
        } finally {
            // Already-submitted tasks still run to completion; this only stops new
            // submissions and lets the pool threads terminate once the work is done.
            executorService.shutdown();
        }
    }

    /**
     * Downloads the proxy list page and extracts one {@code "host:port"} string per row.
     *
     * <p>For every {@code tr.odd} row the text of the second and third child elements is
     * joined with a colon. Rows with fewer than three children are skipped.
     *
     * @return the scraped {@code "host:port"} entries, possibly empty, never {@code null}
     * @throws IOException if the page cannot be fetched
     */
    private static List<String> getHtml() throws IOException {
        Document doc = Jsoup.connect(PROXY_LIST_URL)
                .userAgent(USER_AGENT)
                .get();

        List<String> list = new ArrayList<String>();
        for (Element row : doc.select("tr.odd")) {
            if (row.children().size() < 3) {
                continue;
            }
            list.add(row.child(1).text() + ":" + row.child(2).text());
        }
        return list;
    }

    /**
     * Requests {@code url} through the given HTTP proxy and discards the response.
     *
     * <p>The proxy is configured on this single connection rather than through the
     * JVM-wide {@code http.proxyHost}/{@code http.proxyPort} system properties. This method
     * is called concurrently from many pool threads, and global properties would let one
     * thread's proxy be overwritten by another before the request is sent.
     *
     * <p>A malformed proxy entry or a failed request is reported on standard error and
     * otherwise ignored, so the caller can move on to the next proxy.
     *
     * @param ip  proxy address in {@code "host:port"} form
     * @param url address to fetch through the proxy
     */
    public static void visit(String ip, String url) {
        String[] parts = (ip == null) ? new String[0] : ip.split(":");
        if (parts.length < 2 || parts[0].trim().isEmpty()) {
            System.err.println("Skipping malformed proxy entry: " + ip);
            return;
        }

        String host = parts[0].trim();
        int port;
        try {
            port = Integer.parseInt(parts[1].trim());
        } catch (NumberFormatException e) {
            System.err.println("Skipping proxy with non-numeric port: " + ip);
            return;
        }
        if (port < 0 || port > 65535) {
            System.err.println("Skipping proxy with out-of-range port: " + ip);
            return;
        }

        try {
            Jsoup.connect(url)
                    .proxy(host, port)
                    .userAgent(USER_AGENT)
                    .get();
        } catch (IOException e) {
            // Scraped proxies are frequently dead; report and carry on.
            e.printStackTrace();
        }
    }

}


自定义线程类:

/**
 * Task that visits a single URL once through every proxy in a list, pausing one second
 * after each visit.
 */
public class MyRun implements Runnable {

    /** Pause after each visit, in milliseconds. */
    private static final long PAUSE_MILLIS = 1000;

    /** Proxy entries passed one by one to {@code Test.visit}. */
    private final List<String> list;

    /** URL passed to {@code Test.visit} on every iteration. */
    private final String urlString;

    /**
     * @param url  the URL to visit
     * @param list the proxy entries to visit it through
     */
    public MyRun(String url, List<String> list) {
        this.list = list;
        this.urlString = url;
    }

    /**
     * Calls {@code Test.visit} for each proxy entry in order, sleeping after each call.
     *
     * <p>If the thread is interrupted while sleeping, the interrupt flag is restored and
     * the task stops early so that the owning executor can cancel it.
     */
    @Override
    public void run() {
        for (String proxy : list) {
            Test.visit(proxy, urlString);
            try {
                Thread.sleep(PAUSE_MILLIS);
            } catch (InterruptedException e) {
                // Preserve the interrupt for whoever owns this thread and stop working.
                Thread.currentThread().interrupt();
                return;
            }
        }
    }

}


文件操作类:

/**
 * File helpers.
 */
public class FileUtil {

    /**
     * Reads a text file line by line into a list, echoing every line to standard output.
     *
     * <p>The file is decoded with the platform default charset. The reader is closed before
     * the method returns, including when reading fails part-way through.
     *
     * @param path path of the file to read
     * @return the lines read, in file order; if the file cannot be opened or read the
     *         error is printed and the lines read so far (possibly none) are returned
     */
    public static List<String> getListFromFile(String path) {
        List<String> list = new ArrayList<>();
        // try-with-resources closes the reader (and the underlying stream) on every path.
        try (BufferedReader br =
                new BufferedReader(new InputStreamReader(new FileInputStream(path)))) {
            String data;
            while ((data = br.readLine()) != null) {
                System.out.println(data);
                list.add(data);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return list;
    }
}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: