JAVA使用HttpClient实现爬虫技术
2018-03-27 16:01
387 查看
1. pom文件中加入httpClient依赖包
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.3.1</version>
</dependency>
2. 创建一个调用httpClient的工具类
public class HttpClientUtil {
// Shared HTTP client; assigned by createCloseableHttpClient().
private CloseableHttpClient closeableHttpClient;
// Request settings applied to each request via setConfig(); assigned by createRequestConfig().
private RequestConfig requestConfig;
// Maximum total number of connections in the pool (passed to setMaxTotal).
private int maxTotal = 10;
// Default maximum number of connections per route (passed to setDefaultMaxPerRoute).
private int defaultMaxPerRoute = 5;
// Connection timeout passed to setConnectTimeout (presumably milliseconds -- confirm against the HttpClient docs).
private int connectTimeOut = 2000;
// Maximum time allowed for data transfer, passed to setSocketTimeout.
private int socketTimeout = 10000;
// Whether a connection is tested for usability before it is used.
private boolean staleConnectionCheckEnabled = true;
// Maximum time to wait for a connection from the pool (passed to setConnectionRequestTimeout).
private int connectionRequestTimeOut = 500;
/**
 * Creates the utility and prepares both the request configuration and the
 * pooled HTTP client, so the instance is ready for requests immediately.
 */
public HttpClientUtil() {
    // The two initializers touch disjoint fields, so their order does not matter.
    this.createRequestConfig();
    this.createCloseableHttpClient();
}
/**
 * Builds the {@link CloseableHttpClient} used by every request, backed by a
 * pooling connection manager sized from {@code maxTotal} and
 * {@code defaultMaxPerRoute}.
 */
private void createCloseableHttpClient() {
    PoolingHttpClientConnectionManager pool = new PoolingHttpClientConnectionManager();
    pool.setMaxTotal(maxTotal);
    pool.setDefaultMaxPerRoute(defaultMaxPerRoute);
    this.closeableHttpClient = HttpClientBuilder.create()
            .setConnectionManager(pool)
            .build();
}
/**
 * Builds the {@link RequestConfig} that is attached to every request,
 * using the timeout and stale-check settings held in this object's fields.
 */
private void createRequestConfig() {
    this.requestConfig = RequestConfig.custom()
            .setConnectTimeout(connectTimeOut)
            .setSocketTimeout(socketTimeout)
            .setStaleConnectionCheckEnabled(staleConnectionCheckEnabled)
            .setConnectionRequestTimeout(connectionRequestTimeOut)
            .build();
}
/**
 * Performs a GET request without query parameters.
 *
 * @param url the address to request
 * @return the response body decoded as UTF-8 when the status code is 200;
 *         {@code null} for any other status code
 * @throws Exception if the request cannot be executed or the body cannot be read
 */
public String doGet(String url) throws Exception {
    // Build the request object for the target address.
    HttpGet httpGet = new HttpGet(url);
    // Apply the shared timeout settings.
    httpGet.setConfig(requestConfig);
    // Always close the response: without this, every non-200 reply keeps its
    // pooled connection checked out and the small pool is soon exhausted.
    try (CloseableHttpResponse response = closeableHttpClient.execute(httpGet)) {
        if (response.getStatusLine().getStatusCode() == 200) {
            return EntityUtils.toString(response.getEntity(), "UTF-8");
        }
        return null;
    }
}
/**
 * Performs a GET request, appending the given entries to the URL as query
 * parameters.
 *
 * @param url the base address to request
 * @param map query parameters (each value is converted with {@code toString()});
 *            may be {@code null}, in which case the URL is used unchanged
 * @return whatever {@link #doGet(String)} returns for the resulting URL
 * @throws Exception if the URL cannot be built or the request fails
 */
public String doGet(String url, Map<String, Object> map) throws Exception {
    URIBuilder uriBuilder = new URIBuilder(url);
    if (map != null) {
        for (Map.Entry<String, Object> param : map.entrySet()) {
            String name = param.getKey();
            String value = param.getValue().toString();
            uriBuilder.addParameter(name, value);
        }
    }
    String fullUrl = uriBuilder.build().toString();
    return doGet(fullUrl);
}
/**
 * Performs a POST request whose body is the given entries encoded as a
 * URL-encoded form.
 *
 * @param url the address to request
 * @param map form parameters (each value is converted with {@code toString()});
 *            may be {@code null}, in which case no body is sent
 * @return the response body decoded as UTF-8 when the status code is 200;
 *         {@code null} for any other status code
 * @throws Exception if the request cannot be executed or the body cannot be read
 */
public String doPost(String url, Map<String, Object> map) throws Exception {
    HttpPost httpPost = new HttpPost(url);
    httpPost.setConfig(requestConfig);
    if (map != null) {
        List<NameValuePair> pairList = new ArrayList<>();
        for (Map.Entry<String, Object> entry : map.entrySet()) {
            pairList.add(new BasicNameValuePair(entry.getKey(), entry.getValue().toString()));
        }
        // Encode the form explicitly as UTF-8. The single-argument constructor
        // falls back to ISO-8859-1, which corrupts non-Latin parameter values
        // and is inconsistent with the UTF-8 decoding of the response below.
        UrlEncodedFormEntity entity = new UrlEncodedFormEntity(pairList, "UTF-8");
        httpPost.setEntity(entity);
    }
    // Always close the response: without this, every non-200 reply keeps its
    // pooled connection checked out and the small pool is soon exhausted.
    try (CloseableHttpResponse response = closeableHttpClient.execute(httpPost)) {
        if (response.getStatusLine().getStatusCode() == 200) {
            return EntityUtils.toString(response.getEntity(), "UTF-8");
        }
        return null;
    }
}
/**
 * Performs a POST request without any form parameters.
 *
 * @param url the address to request
 * @return whatever {@link #doPost(String, Map)} returns when called with no parameters
 * @throws Exception if the request fails
 */
public String doPost(String url) throws Exception {
    // A null map tells the two-argument overload to send no request body.
    final Map<String, Object> noParameters = null;
    return doPost(url, noParameters);
}
3. 利用JUnit进行单元测试
@Test
public void test1() {
String url = "https://blog.csdn.net/javalixy/article/details/76284524";
HttpClientUtil clientUtil = new HttpClientUtil();
try {
String result = clientUtil.doGet(url);
parseHtml(result);
} catch (Exception e) {
e.printStackTrace();
}
可以得到返回的Html页面
4.使用开源框架Jsoup进行html页面的解析
4.1 加入jsoup依赖
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.7.3</version>
</dependency>
4.2 jsoup解析页面
/**
 * Parses an HTML page with jsoup and prints a summary of its
 * {@code link[href]}, {@code span} and {@code img} elements: first the
 * number of matches for each selector, then one line per element.
 *
 * @param result the HTML source to parse
 */
private void parseHtml(String result) {
    Document document = Jsoup.parse(result);
    Elements linkElements = document.select("link[href]");
    Elements textElements = document.select("span");
    Elements imgElements = document.select("img");
    System.out.println(String.format("LinkElements: (%d)", linkElements.size()));
    System.out.println(String.format("TextElements: (%d)", textElements.size()));
    System.out.println(String.format("ImgElements: (%d)", imgElements.size()));
    // NOTE(review): the document is parsed without a base URI, so "abs:"
    // lookups can only return URLs that are already absolute in the page.
    for (Element link : linkElements) {
        print(" * a: <%s> (%s)", link.attr("abs:href"), trim(link.text(), 35));
    }
    for (Element text : textElements) {
        // "class" is not a URL attribute; the "abs:" prefix made this print an
        // empty value, so read the attribute directly.
        print("* text: <%s> (%s)", text.attr("class"), trim(text.text(), 35));
    }
    for (Element img : imgElements) {
        // Label images as images rather than reusing the span label.
        print("* img: <%s> (%s)", img.attr("abs:src"), trim(img.text(), 35));
    }
}
/**
 * Formats a message with {@link String#format(String, Object...)} and writes
 * it to standard output followed by a line break.
 *
 * @param str the format string
 * @param msg the arguments referenced by the format string
 */
private void print(String str, Object... msg) {
    String line = String.format(str, msg);
    System.out.println(line);
}
/**
 * Shortens a string for display so that it occupies at most {@code width}
 * characters. A string longer than {@code width} is cut to its first
 * {@code width - 1} characters followed by {@code "."}; shorter strings are
 * returned unchanged. For a {@code width} below 1 a shortened string is just
 * {@code "."}.
 *
 * @param str   the text to shorten; must not be {@code null}
 * @param width the maximum number of characters to return
 * @return {@code str} itself, or its shortened form ending in {@code "."}
 */
private static String trim(String str, int width) {
    if (str.length() <= width) {
        return str;
    }
    // Keep width - 1 characters so that the trailing "." fits inside the
    // limit; the previous width + 1 produced width + 2 characters.
    int keep = Math.max(width - 1, 0);
    return str.substring(0, keep) + ".";
}
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.3.1</version>
</dependency>
2. 创建一个调用httpClient的工具类
public class HttpClientUtil {
// Shared HTTP client; assigned by createCloseableHttpClient().
private CloseableHttpClient closeableHttpClient;
// Request settings applied to each request via setConfig(); assigned by createRequestConfig().
private RequestConfig requestConfig;
// Maximum total number of connections in the pool (passed to setMaxTotal).
private int maxTotal = 10;
// Default maximum number of connections per route (passed to setDefaultMaxPerRoute).
private int defaultMaxPerRoute = 5;
// Connection timeout passed to setConnectTimeout (presumably milliseconds -- confirm against the HttpClient docs).
private int connectTimeOut = 2000;
// Maximum time allowed for data transfer, passed to setSocketTimeout.
private int socketTimeout = 10000;
// Whether a connection is tested for usability before it is used.
private boolean staleConnectionCheckEnabled = true;
// Maximum time to wait for a connection from the pool (passed to setConnectionRequestTimeout).
private int connectionRequestTimeOut = 500;
/**
 * Creates the utility and prepares both the request configuration and the
 * pooled HTTP client, so the instance is ready for requests immediately.
 */
public HttpClientUtil() {
    // The two initializers touch disjoint fields, so their order does not matter.
    this.createRequestConfig();
    this.createCloseableHttpClient();
}
/**
 * Builds the {@link CloseableHttpClient} used by every request, backed by a
 * pooling connection manager sized from {@code maxTotal} and
 * {@code defaultMaxPerRoute}.
 */
private void createCloseableHttpClient() {
    PoolingHttpClientConnectionManager pool = new PoolingHttpClientConnectionManager();
    pool.setMaxTotal(maxTotal);
    pool.setDefaultMaxPerRoute(defaultMaxPerRoute);
    this.closeableHttpClient = HttpClientBuilder.create()
            .setConnectionManager(pool)
            .build();
}
/**
 * Builds the {@link RequestConfig} that is attached to every request,
 * using the timeout and stale-check settings held in this object's fields.
 */
private void createRequestConfig() {
    this.requestConfig = RequestConfig.custom()
            .setConnectTimeout(connectTimeOut)
            .setSocketTimeout(socketTimeout)
            .setStaleConnectionCheckEnabled(staleConnectionCheckEnabled)
            .setConnectionRequestTimeout(connectionRequestTimeOut)
            .build();
}
/**
 * Performs a GET request without query parameters.
 *
 * @param url the address to request
 * @return the response body decoded as UTF-8 when the status code is 200;
 *         {@code null} for any other status code
 * @throws Exception if the request cannot be executed or the body cannot be read
 */
public String doGet(String url) throws Exception {
    // Build the request object for the target address.
    HttpGet httpGet = new HttpGet(url);
    // Apply the shared timeout settings.
    httpGet.setConfig(requestConfig);
    // Always close the response: without this, every non-200 reply keeps its
    // pooled connection checked out and the small pool is soon exhausted.
    try (CloseableHttpResponse response = closeableHttpClient.execute(httpGet)) {
        if (response.getStatusLine().getStatusCode() == 200) {
            return EntityUtils.toString(response.getEntity(), "UTF-8");
        }
        return null;
    }
}
/**
 * Performs a GET request, appending the given entries to the URL as query
 * parameters.
 *
 * @param url the base address to request
 * @param map query parameters (each value is converted with {@code toString()});
 *            may be {@code null}, in which case the URL is used unchanged
 * @return whatever {@link #doGet(String)} returns for the resulting URL
 * @throws Exception if the URL cannot be built or the request fails
 */
public String doGet(String url, Map<String, Object> map) throws Exception {
    URIBuilder uriBuilder = new URIBuilder(url);
    if (map != null) {
        for (Map.Entry<String, Object> param : map.entrySet()) {
            String name = param.getKey();
            String value = param.getValue().toString();
            uriBuilder.addParameter(name, value);
        }
    }
    String fullUrl = uriBuilder.build().toString();
    return doGet(fullUrl);
}
/**
 * Performs a POST request whose body is the given entries encoded as a
 * URL-encoded form.
 *
 * @param url the address to request
 * @param map form parameters (each value is converted with {@code toString()});
 *            may be {@code null}, in which case no body is sent
 * @return the response body decoded as UTF-8 when the status code is 200;
 *         {@code null} for any other status code
 * @throws Exception if the request cannot be executed or the body cannot be read
 */
public String doPost(String url, Map<String, Object> map) throws Exception {
    HttpPost httpPost = new HttpPost(url);
    httpPost.setConfig(requestConfig);
    if (map != null) {
        List<NameValuePair> pairList = new ArrayList<>();
        for (Map.Entry<String, Object> entry : map.entrySet()) {
            pairList.add(new BasicNameValuePair(entry.getKey(), entry.getValue().toString()));
        }
        // Encode the form explicitly as UTF-8. The single-argument constructor
        // falls back to ISO-8859-1, which corrupts non-Latin parameter values
        // and is inconsistent with the UTF-8 decoding of the response below.
        UrlEncodedFormEntity entity = new UrlEncodedFormEntity(pairList, "UTF-8");
        httpPost.setEntity(entity);
    }
    // Always close the response: without this, every non-200 reply keeps its
    // pooled connection checked out and the small pool is soon exhausted.
    try (CloseableHttpResponse response = closeableHttpClient.execute(httpPost)) {
        if (response.getStatusLine().getStatusCode() == 200) {
            return EntityUtils.toString(response.getEntity(), "UTF-8");
        }
        return null;
    }
}
/**
 * Performs a POST request without any form parameters.
 *
 * @param url the address to request
 * @return whatever {@link #doPost(String, Map)} returns when called with no parameters
 * @throws Exception if the request fails
 */
public String doPost(String url) throws Exception {
    // A null map tells the two-argument overload to send no request body.
    final Map<String, Object> noParameters = null;
    return doPost(url, noParameters);
}
3. 利用JUnit进行单元测试
@Test
public void test1() {
String url = "https://blog.csdn.net/javalixy/article/details/76284524";
HttpClientUtil clientUtil = new HttpClientUtil();
try {
String result = clientUtil.doGet(url);
parseHtml(result);
} catch (Exception e) {
e.printStackTrace();
}
可以得到返回的Html页面
4.使用开源框架Jsoup进行html页面的解析
4.1 加入jsoup依赖
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.7.3</version>
</dependency>
4.2 jsoup解析页面
/**
 * Parses an HTML page with jsoup and prints a summary of its
 * {@code link[href]}, {@code span} and {@code img} elements: first the
 * number of matches for each selector, then one line per element.
 *
 * @param result the HTML source to parse
 */
private void parseHtml(String result) {
    Document document = Jsoup.parse(result);
    Elements linkElements = document.select("link[href]");
    Elements textElements = document.select("span");
    Elements imgElements = document.select("img");
    System.out.println(String.format("LinkElements: (%d)", linkElements.size()));
    System.out.println(String.format("TextElements: (%d)", textElements.size()));
    System.out.println(String.format("ImgElements: (%d)", imgElements.size()));
    // NOTE(review): the document is parsed without a base URI, so "abs:"
    // lookups can only return URLs that are already absolute in the page.
    for (Element link : linkElements) {
        print(" * a: <%s> (%s)", link.attr("abs:href"), trim(link.text(), 35));
    }
    for (Element text : textElements) {
        // "class" is not a URL attribute; the "abs:" prefix made this print an
        // empty value, so read the attribute directly.
        print("* text: <%s> (%s)", text.attr("class"), trim(text.text(), 35));
    }
    for (Element img : imgElements) {
        // Label images as images rather than reusing the span label.
        print("* img: <%s> (%s)", img.attr("abs:src"), trim(img.text(), 35));
    }
}
/**
 * Formats a message with {@link String#format(String, Object...)} and writes
 * it to standard output followed by a line break.
 *
 * @param str the format string
 * @param msg the arguments referenced by the format string
 */
private void print(String str, Object... msg) {
    String line = String.format(str, msg);
    System.out.println(line);
}
/**
 * Shortens a string for display so that it occupies at most {@code width}
 * characters. A string longer than {@code width} is cut to its first
 * {@code width - 1} characters followed by {@code "."}; shorter strings are
 * returned unchanged. For a {@code width} below 1 a shortened string is just
 * {@code "."}.
 *
 * @param str   the text to shorten; must not be {@code null}
 * @param width the maximum number of characters to return
 * @return {@code str} itself, or its shortened form ending in {@code "."}
 */
private static String trim(String str, int width) {
    if (str.length() <= width) {
        return str;
    }
    // Keep width - 1 characters so that the trailing "." fits inside the
    // limit; the previous width + 1 produced width + 2 characters.
    int keep = Math.max(width - 1, 0);
    return str.substring(0, keep) + ".";
}
相关文章推荐
- [Java]使用HttpClient实现一个简单爬虫,抓取煎蛋妹子图
- 使用JNI技术实现JAVA程序调用dll
- 使用JNI技术实现JAVA程序调用dll
- 使用JNI技术实现JAVA程序调用dll
- Chap5:使用JNI技术实现java程序调用第三方dll(c/c++)文件的功能
- 最近在做实验希望实现基于JNI技术在Java中使用 Slex.dll
- 使用 HttpClient 和 HtmlParser 实现简易爬虫
- 使用Java技术在Cocoon中实现商业逻辑
- 使用HttpClient和HtmlParser实现网络爬虫
- 使用 HttpClient 和 HtmlParser 实现简易爬虫
- [Java] 知乎下巴第5集:使用HttpClient工具包和宽度爬虫
- JAVA 使用HttpClient 实现简单发送HTTP请求 无返回结果
- 使用 HttpClient 和 HtmlParser 实现简易爬虫
- 使用 HttpClient 和 HtmlParser 实现简易爬虫
- 使用JAVA技术实现新一代OSS/BSS
- 使用 HttpClient 和 HtmlParser 实现简易爬虫
- 使用 HttpClient 和 HtmlParser 实现简易爬虫
- 使用 HttpClient 和 HtmlParser 实现简易爬虫
- 使用 HttpClient 和 HtmlParser 实现简易爬虫
- 使用JNI技术实现JAVA程序调用dll