扒网页内容
2016-04-30 13:06
387 查看
近来，师弟们开始学习 Java Web，于是我让他们模仿糗事百科（糗百）网站来练习，并写了下面这段代码，让他们能快速从糗百抓取几页笑话来充场面。代码如下（删减了一些内容，例如 import 语句）：
/**
* 弄下糗事百科笑话示例
* @author gmr
*
*/
public class GetQSBK {
private static HttpClient httpClient = new DefaultHttpClient();
public static void main(String[] args) throws Exception {
Pattern pattern = Pattern
.compile("<div class=\"content\">([\\s\\S]*?)</div");
Matcher matcher = null;
int count = 0;
while (count++ < 3) {
/*
* 生成get请求对象
*/
HttpGet httpGet = new HttpGet(
"http://www.qiushibaike.com/8hr/page/" + count
+ "/?s=4873294");
StringBuilder str = null;
/*
* 设置请求头
*/
httpGet.setHeader("Accept",
"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");
httpGet.setHeader("Accept-Encoding", "deflate, sdch");
httpGet.setHeader("Cache-Control", "max-age=0");
httpGet.setHeader("Accept-Language", "zh-CN,zh;q=0.8");
httpGet.setHeader("Connection", "keep-alive");
httpGet.setHeader("Host", "www.qiushibaike.com");
httpGet.setHeader("Upgrade-Insecure-Requests", "1");
httpGet.setHeader(
"User-Agent",
"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36");
str = new StringBuilder(
showResponseResult(httpClient.execute(httpGet)));
/*
* 正则解析html标签
*/
matcher = pattern.matcher(str.toString());
System.out.println("**************************");
System.out.println("这是第" + count + "页");
while (matcher.find()) {
System.out.println(matcher.group(1);//打印笑话内容
}
}
}
/**
* 获取内容实体
*
* @param response
* @return
* @throws Exception
*/
private static String showResponseResult(HttpResponse response)
throws Exception {
if (null == response) {
return null;
}
Header[] headers = response.getAllHeaders();
for (Header header : headers) {
if (header.getName().equals("Set-Cookie")) {
cookie = header.getValue();
}
}
HttpEntity httpEntity = response.getEntity();
InputStream inputStream = httpEntity.getContent();
ByteArrayOutputStream byteOut = new ByteArrayOutputStream();
byte[] b = new byte[1024];
int len = 0;
while (-1 != (len = inputStream.read(b))) {
byteOut.write(b, 0, len);
}
String data = byteOut.toString();
return data;
}
}
/**
* 弄下糗事百科笑话示例
* @author gmr
*
*/
public class GetQSBK {
private static HttpClient httpClient = new DefaultHttpClient();
public static void main(String[] args) throws Exception {
Pattern pattern = Pattern
.compile("<div class=\"content\">([\\s\\S]*?)</div");
Matcher matcher = null;
int count = 0;
while (count++ < 3) {
/*
* 生成get请求对象
*/
HttpGet httpGet = new HttpGet(
"http://www.qiushibaike.com/8hr/page/" + count
+ "/?s=4873294");
StringBuilder str = null;
/*
* 设置请求头
*/
httpGet.setHeader("Accept",
"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");
httpGet.setHeader("Accept-Encoding", "deflate, sdch");
httpGet.setHeader("Cache-Control", "max-age=0");
httpGet.setHeader("Accept-Language", "zh-CN,zh;q=0.8");
httpGet.setHeader("Connection", "keep-alive");
httpGet.setHeader("Host", "www.qiushibaike.com");
httpGet.setHeader("Upgrade-Insecure-Requests", "1");
httpGet.setHeader(
"User-Agent",
"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36");
str = new StringBuilder(
showResponseResult(httpClient.execute(httpGet)));
/*
* 正则解析html标签
*/
matcher = pattern.matcher(str.toString());
System.out.println("**************************");
System.out.println("这是第" + count + "页");
while (matcher.find()) {
System.out.println(matcher.group(1);//打印笑话内容
}
}
}
/**
* 获取内容实体
*
* @param response
* @return
* @throws Exception
*/
private static String showResponseResult(HttpResponse response)
throws Exception {
if (null == response) {
return null;
}
Header[] headers = response.getAllHeaders();
for (Header header : headers) {
if (header.getName().equals("Set-Cookie")) {
cookie = header.getValue();
}
}
HttpEntity httpEntity = response.getEntity();
InputStream inputStream = httpEntity.getContent();
ByteArrayOutputStream byteOut = new ByteArrayOutputStream();
byte[] b = new byte[1024];
int len = 0;
while (-1 != (len = inputStream.read(b))) {
byteOut.write(b, 0, len);
}
String data = byteOut.toString();
return data;
}
}
相关文章推荐
- RPC failed; result=22, HTTP code = 411
- HTTP Header 属性列表
- nginx中http核心模块的配置指令2
- nginx中http核心模块的配置指令3
- nginx中http核心模块的配置指令4
- nginx中http的fastcgi模块的配置指令1
- 如何在 Linux 中快速地通过 HTTP 提供文件访问服务
- 深入HTTP head的使用详解
- Ruby程序中发送基于HTTP协议的请求的简单示例
- ASP 中使用 HTTP 协议发送参数详解
- C#基于socket模拟http请求的方法
- http www安全必备知识
- asp HTTP 500错误 常见问题分析
- http代理相关知识分析
- 在Node.js中使用HTTP上传文件的方法
- php错误提示failed to open stream: HTTP request failed!的完美解决方法
- php提示Failed to write session data错误的解决方法