抓取CSDN指定用户的博文
2014-04-11 09:57
309 查看
HTTP请求类:
package com.blog.collection;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.net.URL;
import java.net.URLConnection;
import java.util.List;
import java.util.Map;
/**
 * Small collection of static helpers that fetch a URL through
 * {@link URLConnection} and return the response body as a string.
 *
 * <p>All methods report failures by printing the exception and returning
 * whatever part of the body was read so far (the empty string when nothing
 * was read); they never throw.
 *
 * <p>Response bodies are decoded with the charset announced in the
 * {@code Content-Type} header, falling back to UTF-8, instead of the platform
 * default charset, so the result does not depend on the machine the code
 * runs on.
 */
public class HttpRequest {

    /** Charset used when the response declares none, and for POST bodies. */
    private static final String DEFAULT_CHARSET = "UTF-8";

    /**
     * Sends a GET request to the given URL.
     *
     * @param url
     *            the URL to request
     * @param param
     *            the query string, in the form {@code name1=value1&name2=value2};
     *            when {@code null} or empty no {@code "?"} is appended to
     *            {@code url}
     * @return the response body with every line terminated by {@code "\n"},
     *         or the part read before a failure occurred
     */
    public static String sendGet(String url, String param) {
        StringBuilder result = new StringBuilder();
        BufferedReader in = null;
        try {
            // Do not produce "url?" or "url?null" when there is no query.
            boolean hasQuery = param != null && param.length() > 0;
            String urlNameString = hasQuery ? url + "?" + param : url;
            URLConnection connection = new URL(urlNameString).openConnection();
            // Common request headers.
            // NOTE(review): "Content-Encoding" describes a request body, which a
            // GET does not have; it is kept only because the original sent it.
            connection.setRequestProperty("accept", "*/*");
            connection.setRequestProperty("connection", "Keep-Alive");
            connection.setRequestProperty("Cache-Control", "public, no-store, max-age=60");
            connection.setRequestProperty("Content-Encoding", " gzip");
            connection.setRequestProperty("user-agent"," Mozilla/5.0 (Windows NT 6.1; rv:29.0) Gecko/20100101 Firefox/29.0");
            connection.connect();
            in = openReader(connection);
            appendLines(in, result, "\n");
        } catch (Exception e) {
            System.out.println("发送GET请求出现异常!" + e);
            e.printStackTrace();
        } finally {
            closeQuietly(in);
        }
        return result.toString();
    }

    /**
     * Sends a GET request to the given URL without setting any request
     * headers.
     *
     * @param url
     *            the complete URL to request
     * @return the response body with every line terminated by {@code "\n"},
     *         or the part read before a failure occurred
     */
    public static String send(String url){
        StringBuilder result = new StringBuilder();
        BufferedReader in = null;
        try {
            URLConnection connection = new URL(url).openConnection();
            connection.connect();
            in = openReader(connection);
            appendLines(in, result, "\n");
        } catch (Exception e) {
            System.out.println("发送GET请求出现异常!" + e);
            e.printStackTrace();
        } finally {
            closeQuietly(in);
        }
        return result.toString();
    }

    /**
     * Sends a POST request to the given URL.
     *
     * @param url
     *            the URL to request
     * @param param
     *            the request body, in the form {@code name1=value1&name2=value2};
     *            it is written UTF-8 encoded
     * @return the response body with its lines concatenated <em>without</em>
     *         separators (unlike the GET helpers), or the part read before a
     *         failure occurred
     */
    public static String sendPost(String url, String param) {
        PrintWriter out = null;
        BufferedReader in = null;
        StringBuilder result = new StringBuilder();
        try {
            URLConnection conn = new URL(url).openConnection();
            // Common request headers.
            conn.setRequestProperty("accept", "*/*");
            conn.setRequestProperty("connection", "Keep-Alive");
            conn.setRequestProperty("user-agent",
                    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1;SV1)");
            // Both flags are required for a request that sends a body and
            // reads a response.
            conn.setDoOutput(true);
            conn.setDoInput(true);
            out = new PrintWriter(
                    new java.io.OutputStreamWriter(conn.getOutputStream(), DEFAULT_CHARSET));
            out.print(param);
            // PrintWriter never throws; checkError() flushes and reports
            // whether the write actually failed.
            if (out.checkError()) {
                throw new IOException("failed to write POST body to " + url);
            }
            in = openReader(conn);
            appendLines(in, result, "");
        } catch (Exception e) {
            System.out.println("发送 POST 请求出现异常!"+e);
            e.printStackTrace();
        } finally {
            closeQuietly(out);
            closeQuietly(in);
        }
        return result.toString();
    }

    /** Opens a buffered reader on the response body using the response charset. */
    private static BufferedReader openReader(URLConnection connection) throws IOException {
        return new BufferedReader(
                new InputStreamReader(connection.getInputStream(), charsetOf(connection)));
    }

    /** Appends every remaining line of {@code in}, each followed by {@code lineEnd}. */
    private static void appendLines(BufferedReader in, StringBuilder target, String lineEnd)
            throws IOException {
        String line;
        while ((line = in.readLine()) != null) {
            target.append(line).append(lineEnd);
        }
    }

    /**
     * Returns the charset named by the {@code charset=} parameter of the
     * response Content-Type, or UTF-8 when it is missing or unsupported.
     */
    private static String charsetOf(URLConnection connection) {
        String contentType = connection.getContentType();
        if (contentType == null) {
            return DEFAULT_CHARSET;
        }
        // Charset names are case-insensitive, so work on the lowered copy.
        String lowered = contentType.toLowerCase(java.util.Locale.ENGLISH);
        String key = "charset=";
        int start = lowered.indexOf(key);
        if (start < 0) {
            return DEFAULT_CHARSET;
        }
        String name = lowered.substring(start + key.length());
        int end = name.indexOf(';');
        if (end >= 0) {
            name = name.substring(0, end);
        }
        name = name.trim();
        if (name.length() >= 2 && name.startsWith("\"") && name.endsWith("\"")) {
            name = name.substring(1, name.length() - 1);
        }
        try {
            if (name.length() > 0 && java.nio.charset.Charset.isSupported(name)) {
                return name;
            }
        } catch (IllegalArgumentException ignored) {
            // Syntactically illegal charset name in the header: use the default.
        }
        return DEFAULT_CHARSET;
    }

    /** Closes {@code resource} if it is non-null, printing any close failure. */
    private static void closeQuietly(java.io.Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
处理类:
package com.blog.collection;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.blog.model.Blog;
/**
 * Crawls the posts of one CSDN blog user: it walks the user's article-list
 * pages, collects the post URLs, downloads every post and extracts title,
 * body and category with regular expressions. Each extracted post is handed
 * to the configured {@link Progress}, if any.
 */
public class CollectionHandler {

    /** Scheme and host that the relative post links are resolved against. */
    private static final String BLOG_HOST = "http://blog.csdn.net";

    /** Total page count shown in the pager of the summary view. */
    private static final Pattern PAGE_COUNT = Pattern.compile(
            "(?<=<div[\\s\\S]{0,10}id=\"papelist\"[\\s\\S]{0,10}class=\"pagelist\">[\\s\\S]{1,100}共)\\d+(?=页</span>)");

    /** Relative link of a post on an article-list page. */
    private static final Pattern POST_LINK = Pattern.compile(
            "(?<=<span[\\s\\S]{0,10}class=\"link_title\"><a[\\s\\S]{0,10}\")[\\s\\S]*?(?=\">)");

    /** Title of a post on the post page. */
    private static final Pattern POST_TITLE = Pattern.compile(
            "(?<=<span class=\"link_title\"><a[\\s\\S]{0,1000}?>)[\\s\\S]*?(?=</a></span>)");

    /** HTML body of a post on the post page. */
    private static final Pattern POST_CONTENT = Pattern.compile(
            "(?<=<div[\\s\\S]{0,100}id=\"article_content\"[\\s\\S]{0,100}class=\"article_content\">)[\\s\\S]*?(?=</div>[\\s\\S]{0,100}<!--)");

    /** First category link text of a post on the post page. */
    private static final Pattern POST_CATEGORY = Pattern.compile(
            "(?<=<span[\\s\\S]{0,100}class=\"link_categories\">[\\s\\S]{0,1000}<a[\\s\\S]{0,200}?>)[\\s\\S]*?(?=</a>)");

    private Progress progress;

    public void setProgress(Progress progress) {
        this.progress = progress;
    }

    public Progress getProgress() {
        return progress;
    }

    /**
     * Collects and processes every post of the given user.
     *
     * @param user
     *            the CSDN user name as it appears in the blog URL
     */
    public void go(String user){
        System.out.println("加载中...");
        String listBase = BLOG_HOST + "/" + user + "/article/list/";
        String content = HttpRequest.sendGet(listBase + 1, "");
        // Page count as shown in the summary view; 0 when no pager is found.
        int pageCount = parsePageCount(firstMatch(PAGE_COUNT, content));
        List<String> urls = new ArrayList<String>();
        // Page 1 is already downloaded, so it is parsed from the text.
        getUrls(content, urls, null);
        for (int page = 2; page <= pageCount; page++) {
            getUrls(null, urls, listBase + page);
        }
        System.out.println("数量:"+urls.size());
        for (String postUrl : urls) {
            System.out.println(postUrl);
            handler(postUrl);
        }
        System.out.println("处理完成");
    }

    /**
     * Extracts the post links of one article-list page and appends them, as
     * absolute URLs, to {@code urls}.
     *
     * @param text
     *            the already downloaded page, or {@code null} to download
     *            {@code url} instead
     * @param urls
     *            the list the found post URLs are appended to
     * @param url
     *            the list page to download when {@code text} is {@code null}
     */
    public void getUrls(String text,List<String> urls,String url){
        String content = (text == null) ? HttpRequest.sendGet(url, "") : text;
        Matcher links = POST_LINK.matcher(content);
        while (links.find()) {
            urls.add(BLOG_HOST + links.group());
        }
    }

    /**
     * Downloads one post, extracts its title, body and category into a
     * {@link Blog} and passes it to the configured {@link Progress}.
     *
     * @param url
     *            the absolute URL of the post
     */
    public void handler(String url){
        Blog blog = new Blog();
        String content = HttpRequest.sendGet(url, "");

        // Title: drop the line breaks of the surrounding markup and trim the
        // padding, but keep the spaces that are part of the title itself.
        String title = firstMatch(POST_TITLE, content).replace("\n", "").trim();
        System.out.println("标题");
        System.out.println(title);
        blog.setTitle(title);

        // Post body.
        System.out.println("博文");
        blog.setContent(firstMatch(POST_CONTENT, content));

        // Category.
        System.out.println("分类");
        String type = firstMatch(POST_CATEGORY, content);
        blog.setTags(type);
        System.out.println(type);

        if (this.progress != null) {
            progress.handler(blog, type);
        }
    }

    /**
     * Returns the first match of {@code regex} in {@code content}.
     *
     * @param content
     *            the text to search; {@code null} is treated as no match
     * @param regex
     *            the regular expression to search for
     * @return the matched text, or the empty string when nothing matches
     */
    public String matcher(String content,String regex){
        return firstMatch(Pattern.compile(regex), content);
    }

    /** Returns the first match of {@code pattern} in {@code content}, or "". */
    private static String firstMatch(Pattern pattern, String content) {
        if (content == null) {
            return "";
        }
        Matcher m = pattern.matcher(content);
        return m.find() ? m.group(0) : "";
    }

    /** Parses the pager's page count; empty or out-of-range input yields 0. */
    private static int parsePageCount(String digits) {
        if (digits.length() == 0) {
            return 0;
        }
        try {
            return Integer.parseInt(digits);
        } catch (NumberFormatException e) {
            // Digit run too long for an int: treat as "no pager found".
            return 0;
        }
    }
}
package com.blog.collection;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.net.URL;
import java.net.URLConnection;
import java.util.List;
import java.util.Map;
/**
 * Small collection of static helpers that fetch a URL through
 * {@link URLConnection} and return the response body as a string.
 *
 * <p>All methods report failures by printing the exception and returning
 * whatever part of the body was read so far (the empty string when nothing
 * was read); they never throw.
 *
 * <p>Response bodies are decoded with the charset announced in the
 * {@code Content-Type} header, falling back to UTF-8, instead of the platform
 * default charset, so the result does not depend on the machine the code
 * runs on.
 */
public class HttpRequest {

    /** Charset used when the response declares none, and for POST bodies. */
    private static final String DEFAULT_CHARSET = "UTF-8";

    /**
     * Sends a GET request to the given URL.
     *
     * @param url
     *            the URL to request
     * @param param
     *            the query string, in the form {@code name1=value1&name2=value2};
     *            when {@code null} or empty no {@code "?"} is appended to
     *            {@code url}
     * @return the response body with every line terminated by {@code "\n"},
     *         or the part read before a failure occurred
     */
    public static String sendGet(String url, String param) {
        StringBuilder result = new StringBuilder();
        BufferedReader in = null;
        try {
            // Do not produce "url?" or "url?null" when there is no query.
            boolean hasQuery = param != null && param.length() > 0;
            String urlNameString = hasQuery ? url + "?" + param : url;
            URLConnection connection = new URL(urlNameString).openConnection();
            // Common request headers.
            // NOTE(review): "Content-Encoding" describes a request body, which a
            // GET does not have; it is kept only because the original sent it.
            connection.setRequestProperty("accept", "*/*");
            connection.setRequestProperty("connection", "Keep-Alive");
            connection.setRequestProperty("Cache-Control", "public, no-store, max-age=60");
            connection.setRequestProperty("Content-Encoding", " gzip");
            connection.setRequestProperty("user-agent"," Mozilla/5.0 (Windows NT 6.1; rv:29.0) Gecko/20100101 Firefox/29.0");
            connection.connect();
            in = openReader(connection);
            appendLines(in, result, "\n");
        } catch (Exception e) {
            System.out.println("发送GET请求出现异常!" + e);
            e.printStackTrace();
        } finally {
            closeQuietly(in);
        }
        return result.toString();
    }

    /**
     * Sends a GET request to the given URL without setting any request
     * headers.
     *
     * @param url
     *            the complete URL to request
     * @return the response body with every line terminated by {@code "\n"},
     *         or the part read before a failure occurred
     */
    public static String send(String url){
        StringBuilder result = new StringBuilder();
        BufferedReader in = null;
        try {
            URLConnection connection = new URL(url).openConnection();
            connection.connect();
            in = openReader(connection);
            appendLines(in, result, "\n");
        } catch (Exception e) {
            System.out.println("发送GET请求出现异常!" + e);
            e.printStackTrace();
        } finally {
            closeQuietly(in);
        }
        return result.toString();
    }

    /**
     * Sends a POST request to the given URL.
     *
     * @param url
     *            the URL to request
     * @param param
     *            the request body, in the form {@code name1=value1&name2=value2};
     *            it is written UTF-8 encoded
     * @return the response body with its lines concatenated <em>without</em>
     *         separators (unlike the GET helpers), or the part read before a
     *         failure occurred
     */
    public static String sendPost(String url, String param) {
        PrintWriter out = null;
        BufferedReader in = null;
        StringBuilder result = new StringBuilder();
        try {
            URLConnection conn = new URL(url).openConnection();
            // Common request headers.
            conn.setRequestProperty("accept", "*/*");
            conn.setRequestProperty("connection", "Keep-Alive");
            conn.setRequestProperty("user-agent",
                    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1;SV1)");
            // Both flags are required for a request that sends a body and
            // reads a response.
            conn.setDoOutput(true);
            conn.setDoInput(true);
            out = new PrintWriter(
                    new java.io.OutputStreamWriter(conn.getOutputStream(), DEFAULT_CHARSET));
            out.print(param);
            // PrintWriter never throws; checkError() flushes and reports
            // whether the write actually failed.
            if (out.checkError()) {
                throw new IOException("failed to write POST body to " + url);
            }
            in = openReader(conn);
            appendLines(in, result, "");
        } catch (Exception e) {
            System.out.println("发送 POST 请求出现异常!"+e);
            e.printStackTrace();
        } finally {
            closeQuietly(out);
            closeQuietly(in);
        }
        return result.toString();
    }

    /** Opens a buffered reader on the response body using the response charset. */
    private static BufferedReader openReader(URLConnection connection) throws IOException {
        return new BufferedReader(
                new InputStreamReader(connection.getInputStream(), charsetOf(connection)));
    }

    /** Appends every remaining line of {@code in}, each followed by {@code lineEnd}. */
    private static void appendLines(BufferedReader in, StringBuilder target, String lineEnd)
            throws IOException {
        String line;
        while ((line = in.readLine()) != null) {
            target.append(line).append(lineEnd);
        }
    }

    /**
     * Returns the charset named by the {@code charset=} parameter of the
     * response Content-Type, or UTF-8 when it is missing or unsupported.
     */
    private static String charsetOf(URLConnection connection) {
        String contentType = connection.getContentType();
        if (contentType == null) {
            return DEFAULT_CHARSET;
        }
        // Charset names are case-insensitive, so work on the lowered copy.
        String lowered = contentType.toLowerCase(java.util.Locale.ENGLISH);
        String key = "charset=";
        int start = lowered.indexOf(key);
        if (start < 0) {
            return DEFAULT_CHARSET;
        }
        String name = lowered.substring(start + key.length());
        int end = name.indexOf(';');
        if (end >= 0) {
            name = name.substring(0, end);
        }
        name = name.trim();
        if (name.length() >= 2 && name.startsWith("\"") && name.endsWith("\"")) {
            name = name.substring(1, name.length() - 1);
        }
        try {
            if (name.length() > 0 && java.nio.charset.Charset.isSupported(name)) {
                return name;
            }
        } catch (IllegalArgumentException ignored) {
            // Syntactically illegal charset name in the header: use the default.
        }
        return DEFAULT_CHARSET;
    }

    /** Closes {@code resource} if it is non-null, printing any close failure. */
    private static void closeQuietly(java.io.Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
处理类:
package com.blog.collection;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.blog.model.Blog;
/**
 * Crawls the posts of one CSDN blog user: it walks the user's article-list
 * pages, collects the post URLs, downloads every post and extracts title,
 * body and category with regular expressions. Each extracted post is handed
 * to the configured {@link Progress}, if any.
 */
public class CollectionHandler {

    /** Scheme and host that the relative post links are resolved against. */
    private static final String BLOG_HOST = "http://blog.csdn.net";

    /** Total page count shown in the pager of the summary view. */
    private static final Pattern PAGE_COUNT = Pattern.compile(
            "(?<=<div[\\s\\S]{0,10}id=\"papelist\"[\\s\\S]{0,10}class=\"pagelist\">[\\s\\S]{1,100}共)\\d+(?=页</span>)");

    /** Relative link of a post on an article-list page. */
    private static final Pattern POST_LINK = Pattern.compile(
            "(?<=<span[\\s\\S]{0,10}class=\"link_title\"><a[\\s\\S]{0,10}\")[\\s\\S]*?(?=\">)");

    /** Title of a post on the post page. */
    private static final Pattern POST_TITLE = Pattern.compile(
            "(?<=<span class=\"link_title\"><a[\\s\\S]{0,1000}?>)[\\s\\S]*?(?=</a></span>)");

    /** HTML body of a post on the post page. */
    private static final Pattern POST_CONTENT = Pattern.compile(
            "(?<=<div[\\s\\S]{0,100}id=\"article_content\"[\\s\\S]{0,100}class=\"article_content\">)[\\s\\S]*?(?=</div>[\\s\\S]{0,100}<!--)");

    /** First category link text of a post on the post page. */
    private static final Pattern POST_CATEGORY = Pattern.compile(
            "(?<=<span[\\s\\S]{0,100}class=\"link_categories\">[\\s\\S]{0,1000}<a[\\s\\S]{0,200}?>)[\\s\\S]*?(?=</a>)");

    private Progress progress;

    public void setProgress(Progress progress) {
        this.progress = progress;
    }

    public Progress getProgress() {
        return progress;
    }

    /**
     * Collects and processes every post of the given user.
     *
     * @param user
     *            the CSDN user name as it appears in the blog URL
     */
    public void go(String user){
        System.out.println("加载中...");
        String listBase = BLOG_HOST + "/" + user + "/article/list/";
        String content = HttpRequest.sendGet(listBase + 1, "");
        // Page count as shown in the summary view; 0 when no pager is found.
        int pageCount = parsePageCount(firstMatch(PAGE_COUNT, content));
        List<String> urls = new ArrayList<String>();
        // Page 1 is already downloaded, so it is parsed from the text.
        getUrls(content, urls, null);
        for (int page = 2; page <= pageCount; page++) {
            getUrls(null, urls, listBase + page);
        }
        System.out.println("数量:"+urls.size());
        for (String postUrl : urls) {
            System.out.println(postUrl);
            handler(postUrl);
        }
        System.out.println("处理完成");
    }

    /**
     * Extracts the post links of one article-list page and appends them, as
     * absolute URLs, to {@code urls}.
     *
     * @param text
     *            the already downloaded page, or {@code null} to download
     *            {@code url} instead
     * @param urls
     *            the list the found post URLs are appended to
     * @param url
     *            the list page to download when {@code text} is {@code null}
     */
    public void getUrls(String text,List<String> urls,String url){
        String content = (text == null) ? HttpRequest.sendGet(url, "") : text;
        Matcher links = POST_LINK.matcher(content);
        while (links.find()) {
            urls.add(BLOG_HOST + links.group());
        }
    }

    /**
     * Downloads one post, extracts its title, body and category into a
     * {@link Blog} and passes it to the configured {@link Progress}.
     *
     * @param url
     *            the absolute URL of the post
     */
    public void handler(String url){
        Blog blog = new Blog();
        String content = HttpRequest.sendGet(url, "");

        // Title: drop the line breaks of the surrounding markup and trim the
        // padding, but keep the spaces that are part of the title itself.
        String title = firstMatch(POST_TITLE, content).replace("\n", "").trim();
        System.out.println("标题");
        System.out.println(title);
        blog.setTitle(title);

        // Post body.
        System.out.println("博文");
        blog.setContent(firstMatch(POST_CONTENT, content));

        // Category.
        System.out.println("分类");
        String type = firstMatch(POST_CATEGORY, content);
        blog.setTags(type);
        System.out.println(type);

        if (this.progress != null) {
            progress.handler(blog, type);
        }
    }

    /**
     * Returns the first match of {@code regex} in {@code content}.
     *
     * @param content
     *            the text to search; {@code null} is treated as no match
     * @param regex
     *            the regular expression to search for
     * @return the matched text, or the empty string when nothing matches
     */
    public String matcher(String content,String regex){
        return firstMatch(Pattern.compile(regex), content);
    }

    /** Returns the first match of {@code pattern} in {@code content}, or "". */
    private static String firstMatch(Pattern pattern, String content) {
        if (content == null) {
            return "";
        }
        Matcher m = pattern.matcher(content);
        return m.find() ? m.group(0) : "";
    }

    /** Parses the pager's page count; empty or out-of-range input yields 0. */
    private static int parsePageCount(String digits) {
        if (digits.length() == 0) {
            return 0;
        }
        try {
            return Integer.parseInt(digits);
        } catch (NumberFormatException e) {
            // Digit run too long for an int: treat as "no pager found".
            return 0;
        }
    }
}
相关文章推荐
- java研发爬虫,抓取知乎,CSDN用户信息
- python抓取CSDN博客首页的所有博文,对标题分词存入mongodb中
- php蜘蛛正常抓取,用户跳转指定页面
- Python简单抓取CSDN博文列表并写入SQL Server数据库
- 抓取CSDN个人的用户访问量并且发邮件
- CSDN-Markdown更轻松地记录你的技术博文,感谢CSDN热心博友的分享!
- CSDN 用户登录
- Fedora开机自动登录指定用户(root或普通用户)
- Oracle限制用户只能从指定IP登录
- python抓取网页图片并放到指定文件夹
- 在Windows Server 2008 R2中批量更新指定OU下的所有用户口令
- JavaScript的变量提升(转自CSDN_blog sunxing007用户)
- 如何给VSFTP增加用户,只能访问指定目录
- Sharepoint学习笔记—Ribbon系列-- 7. 在Ribbon中替换指定控件(针对用户自定义Tab)
- SharePoint【Ribbon系列】-- 07.在Ribbon中替换指定控件(针对用户自定义Tab)
- [ 活动 ] CSDN 用户体验有奖调查 / 幸运者将获得200 C币
- 给CSDN提个用户登陆建议--添加扫码登录功能
- sqlserver 创建用户只能访问指定视图
- Centos7 修改mysql指定用户的密码
- Python3 urllib抓取指定URL的内容