您的位置:首页 > 理论基础 > 计算机网络

JAVA使用HttpClient实现爬虫技术

2018-03-27 16:01 387 查看
1. pom文件中加入httpClient依赖包
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.3.1</version>
</dependency>
2. 创建一个调用httpClient的工具类
/**
 * Small wrapper around a pooled Apache HttpClient for issuing GET and POST requests.
 */
public class HttpClientUtil {

// Client used to execute every request; assigned in createCloseableHttpClient().
private CloseableHttpClient closeableHttpClient;

// Per-request settings applied to each HttpGet/HttpPost; assigned in createRequestConfig().
private RequestConfig requestConfig;

// Maximum total number of connections in the pool (passed to setMaxTotal)
private int maxTotal = 10;

// Maximum number of connections per route (passed to setDefaultMaxPerRoute)
private int defaultMaxPerRoute = 5;

// Connect timeout (passed to setConnectTimeout; milliseconds per the HttpClient API)
private int connectTimeOut = 2000;

// Socket timeout, i.e. the longest wait for data (passed to setSocketTimeout)
private int socketTimeout = 10000;

// Whether a connection is checked for staleness before it is used
private boolean staleConnectionCheckEnabled = true;

// Longest wait for obtaining a connection from the pool (passed to setConnectionRequestTimeout)
private int connectionRequestTimeOut = 500;

/**
 * Builds the pooled HTTP client and the request configuration that every
 * request issued through this instance uses.
 */
public HttpClientUtil() {
    this.createCloseableHttpClient();
    this.createRequestConfig();
}

/**
 * Creates the {@link CloseableHttpClient} used for all requests, backed by a
 * pooling connection manager limited by {@code maxTotal} and
 * {@code defaultMaxPerRoute}.
 */
private void createCloseableHttpClient() {
    PoolingHttpClientConnectionManager pool = new PoolingHttpClientConnectionManager();
    pool.setMaxTotal(maxTotal);
    pool.setDefaultMaxPerRoute(defaultMaxPerRoute);

    this.closeableHttpClient = HttpClientBuilder.create()
            .setConnectionManager(pool)
            .build();
}

/**
 * Creates the {@link RequestConfig} applied to every request from the
 * timeout and stale-check fields of this instance.
 */
private void createRequestConfig() {
    this.requestConfig = RequestConfig.custom()
            .setConnectTimeout(connectTimeOut)
            .setSocketTimeout(socketTimeout)
            .setStaleConnectionCheckEnabled(staleConnectionCheckEnabled)
            .setConnectionRequestTimeout(connectionRequestTimeOut)
            .build();
}

/**
 * Sends a GET request without extra query parameters.
 *
 * @param url URL to request
 * @return the response body decoded as UTF-8 when the status code is 200,
 *         otherwise {@code null}
 * @throws Exception if the request cannot be created or executed, or the
 *         response body cannot be read
 */
public String doGet(String url) throws Exception {
    HttpGet httpGet = new HttpGet(url);
    httpGet.setConfig(requestConfig);

    // The response must be closed on every path. Previously a non-200 response
    // was never consumed or closed, so its pooled connection stayed leased and
    // the pool eventually ran out of connections.
    try (CloseableHttpResponse response = closeableHttpClient.execute(httpGet)) {
        if (response.getStatusLine().getStatusCode() == 200) {
            return EntityUtils.toString(response.getEntity(), "UTF-8");
        }
        return null;
    }
}

/**
 * Sends a GET request with the given query parameters appended to the URL.
 *
 * @param url base URL to request
 * @param map query parameters to append; may be {@code null} for none. A
 *            {@code null} value is sent as an empty string.
 * @return the response body decoded as UTF-8 when the status code is 200,
 *         otherwise {@code null}
 * @throws Exception if the URL is malformed or the request fails
 */
public String doGet(String url, Map<String, Object> map) throws Exception {
    URIBuilder uriBuilder = new URIBuilder(url);
    if (map != null) {
        for (Map.Entry<String, Object> entry : map.entrySet()) {
            Object value = entry.getValue();
            // Guard against null values, which used to fail with a
            // NullPointerException on toString().
            uriBuilder.addParameter(entry.getKey(), value == null ? "" : value.toString());
        }
    }
    return this.doGet(uriBuilder.build().toString());
}

/**
 * Sends a POST request with the given parameters as a URL-encoded form body.
 *
 * @param url URL to post to
 * @param map form parameters; may be {@code null} to send no body. A
 *            {@code null} value is sent as an empty string.
 * @return the response body decoded as UTF-8 when the status code is 200,
 *         otherwise {@code null}
 * @throws Exception if the request cannot be created or executed, or the
 *         response body cannot be read
 */
public String doPost(String url, Map<String, Object> map) throws Exception {
    HttpPost httpPost = new HttpPost(url);
    httpPost.setConfig(requestConfig);
    if (map != null) {
        List<NameValuePair> pairList = new ArrayList<>();
        for (Map.Entry<String, Object> entry : map.entrySet()) {
            Object value = entry.getValue();
            // Guard against null values, which used to fail with a
            // NullPointerException on toString().
            pairList.add(new BasicNameValuePair(entry.getKey(), value == null ? "" : value.toString()));
        }
        // Encode the form explicitly as UTF-8. Without a charset the entity falls
        // back to the library default (ISO-8859-1), which corrupts non-Latin
        // parameter values, while the response is decoded as UTF-8 below.
        httpPost.setEntity(new UrlEncodedFormEntity(pairList, "UTF-8"));
    }

    // Close the response on every path so a non-200 answer does not leak its
    // pooled connection.
    try (CloseableHttpResponse response = closeableHttpClient.execute(httpPost)) {
        if (response.getStatusLine().getStatusCode() == 200) {
            return EntityUtils.toString(response.getEntity(), "UTF-8");
        }
        return null;
    }
}

/**
 * Sends a POST request without any form parameters.
 *
 * @param url URL to post to
 * @return whatever {@code doPost(url, null)} returns
 * @throws Exception if the underlying POST fails
 */
public String doPost(String url) throws Exception {
    // Delegate to the parameterised overload with "no parameters".
    final Map<String, Object> noParameters = null;
    return doPost(url, noParameters);
}
3. 利用JUnit进行单元测试
@Test
public void test1() {

String url = "https://blog.csdn.net/javalixy/article/details/76284524";
HttpClientUtil clientUtil = new HttpClientUtil();
try {
String result = clientUtil.doGet(url);
parseHtml(result);
} catch (Exception e) {
e.printStackTrace();
}
运行后可以得到返回的HTML页面。
4.使用开源框架Jsoup进行html页面的解析
4.1  加入jsoup依赖
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.7.3</version>
</dependency>
4.2 jsoup解析页面
/**
 * Parses an HTML page and prints how many {@code link[href]}, {@code span}
 * and {@code img} elements it contains, followed by one line per element.
 *
 * @param result HTML source of the page; {@code null} is reported and skipped
 */
private void parseHtml(String result) {
    // Jsoup.parse rejects null input; skip it explicitly instead of failing
    // with an unrelated-looking exception.
    if (result == null) {
        System.out.println("No HTML content to parse");
        return;
    }

    // NOTE: the document is parsed without a base URI, so the "abs:" lookups
    // below only yield a value for URLs that are already absolute.
    Document document = Jsoup.parse(result);
    Elements linkElements = document.select("link[href]");
    Elements textElements = document.select("span");
    Elements imgElements = document.select("img");

    System.out.println(String.format("LinkElements: (%d)", linkElements.size()));
    System.out.println(String.format("TextElements: (%d)", textElements.size()));
    System.out.println(String.format("ImgElements: (%d)", imgElements.size()));

    // NOTE(review): the selector matches <link> elements while the label says
    // "a"; confirm whether "a[href]" was intended.
    for (Element link : linkElements) {
        print(" * a: <%s> (%s)", link.attr("abs:href"), trim(link.text(), 35));
    }
    for (Element text : textElements) {
        // "class" is not a URL attribute, so the "abs:" prefix always produced
        // an empty string; read the plain attribute instead.
        print("* text: <%s> (%s)", text.attr("class"), trim(text.text(), 35));
    }
    for (Element img : imgElements) {
        // Label image rows as images (previously mislabelled as "text").
        print("* img: <%s> (%s)", img.attr("abs:src"), trim(img.text(), 35));
    }
}

/**
 * Formats a message with {@link String#format(String, Object...)} and writes
 * it to standard output followed by a line break.
 *
 * @param str format string
 * @param msg arguments referenced by the format string
 */
private void print(String str, Object... msg) {
    String line = String.format(str, msg);
    System.out.println(line);
}

/**
 * Shortens a string to at most {@code width} characters. A string that is
 * too long is cut to {@code width - 1} characters and terminated with
 * {@code "."} to mark the truncation.
 *
 * @param str   text to shorten; must not be {@code null}
 * @param width maximum length of the result; values of zero or less yield
 *              an empty string
 * @return {@code str} itself when it already fits, otherwise the truncated text
 */
private static String trim(String str, int width) {
    if (width <= 0) {
        return "";
    }
    if (str.length() <= width) {
        return str;
    }
    // Keep width - 1 characters so that the result including the trailing "."
    // is exactly width long (the previous width + 1 overshot the limit by two).
    return str.substring(0, width - 1) + ".";
}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签:  java