您的位置:首页 > 其它

抓取csdn指定用户的博文

2014-04-11 09:57 309 查看
http请求类:

package com.blog.collection;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.net.URL;
import java.net.URLConnection;
import java.util.List;
import java.util.Map;

public class HttpRequest {
/**
* 向指定URL发送GET方法的请求
*
* @param url
* 发送请求的URL
* @param param
* 请求参数,请求参数应该是 name1=value1&name2=value2 的形式。
* @return URL 所代表远程资源的响应结果
*/
public static String sendGet(String url, String param) {
String result = "";
BufferedReader in = null;
try {
String urlNameString = url + "?" + param;
URL realUrl = new URL(urlNameString);
// 打开和URL之间的连接
URLConnection connection = realUrl.openConnection();
// 设置通用的请求属性
connection.setRequestProperty("accept", "*/*");
connection.setRequestProperty("connection", "Keep-Alive");
connection.setRequestProperty("Cache-Control", "public, no-store, max-age=60");
connection.setRequestProperty("Content-Encoding", " gzip");
connection.setRequestProperty("user-agent"," Mozilla/5.0 (Windows NT 6.1; rv:29.0) Gecko/20100101 Firefox/29.0");
// 建立实际的连接
connection.connect();
// 获取所有响应头字段
// Map<String, List<String>> map = connection.getHeaderFields();
// 遍历所有的响应头字段
// for (String key : map.keySet()) {
// System.out.println(key + "--->" + map.get(key));
// }
// 定义 BufferedReader输入流来读取URL的响应
in = new BufferedReader(new InputStreamReader(
connection.getInputStream()));
String line;
while ((line = in.readLine()) != null) {
result += line+"\n";
}
} catch (Exception e) {
System.out.println("发送GET请求出现异常!" + e);
e.printStackTrace();
}
// 使用finally块来关闭输入流
finally {
try {
if (in != null) {
in.close();
}
} catch (Exception e2) {
e2.printStackTrace();
}
}
return result;
}

public static String send(String url){
String result = "";
BufferedReader in = null;
try {
String urlNameString = url;
URL realUrl = new URL(urlNameString);
// 打开和URL之间的连接
URLConnection connection = realUrl.openConnection();
// 建立实际的连接
connection.connect();
in = new BufferedReader(new InputStreamReader(
connection.getInputStream()));
String line;
while ((line = in.readLine()) != null) {
result += line+"\n";
}
} catch (Exception e) {
System.out.println("发送GET请求出现异常!" + e);
e.printStackTrace();
}
// 使用finally块来关闭输入流
finally {
try {
if (in != null) {
in.close();
}
} catch (Exception e2) {
e2.printStackTrace();
}
}
return result;
}

/**
* 向指定 URL 发送POST方法的请求
*
* @param url
* 发送请求的 URL
* @param param
* 请求参数,请求参数应该是 name1=value1&name2=value2 的形式。
* @return 所代表远程资源的响应结果
*/
public static String sendPost(String url, String param) {
PrintWriter out = null;
BufferedReader in = null;
String result = "";
try {
URL realUrl = new URL(url);
// 打开和URL之间的连接
URLConnection conn = realUrl.openConnection();
// 设置通用的请求属性
conn.setRequestProperty("accept", "*/*");
conn.setRequestProperty("connection", "Keep-Alive");
conn.setRequestProperty("user-agent",
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1;SV1)");
// 发送POST请求必须设置如下两行
conn.setDoOutput(true);
conn.setDoInput(true);
// 获取URLConnection对象对应的输出流
out = new PrintWriter(conn.getOutputStream());
// 发送请求参数
out.print(param);
// flush输出流的缓冲
out.flush();
// 定义BufferedReader输入流来读取URL的响应
in = new BufferedReader(
new InputStreamReader(conn.getInputStream()));
String line;
while ((line = in.readLine()) != null) {
result += line;
}
} catch (Exception e) {
System.out.println("发送 POST 请求出现异常!"+e);
e.printStackTrace();
}
//使用finally块来关闭输出流、输入流
finally{
try{
if(out!=null){
out.close();
}
if(in!=null){
in.close();
}
}
catch(IOException ex){
ex.printStackTrace();
}
}
return result;
}
}

处理类:
package com.blog.collection;

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.blog.model.Blog;

public class CollectionHandler {

private Progress progress;
public void setProgress(Progress progress) {
this.progress = progress;
}
public Progress getProgress() {
return progress;
}
public void go(String user){
HttpRequest request=new HttpRequest();
System.out.println("加载中...");
String content=request.sendGet("http://blog.csdn.net/"+user+"/article/list/1", "");
//获取页码-摘要视图
String count=matcher(content, "(?<=<div[\\s\\S]{0,10}id=\"papelist\"[\\s\\S]{0,10}class=\"pagelist\">[\\s\\S]{1,100}共)\\d+(?=页</span>)");
Integer code=count.equals("")?0:Integer.parseInt(count);
List<String> urls=new ArrayList<String>();
getUrls(content, urls, null);
for(int i=2;i<=code;i++){
getUrls(null,urls, "http://blog.csdn.net/"+user+"/article/list/"+i);
}
System.out.println("数量:"+urls.size());
for (String string : urls) {
System.out.println(string);
handler(string);
}
System.out.println("处理完成");
}

public void getUrls(String text,List<String> urls,String url){
HttpRequest request=new HttpRequest();

String content=null;
if(text==null){
content=request.sendGet(url, "");
}else{
content=text;
}
String regex="(?<=<span[\\s\\S]{0,10}class=\"link_title\"><a[\\s\\S]{0,10}\")[\\s\\S]*?(?=\">)";
Pattern pattern = Pattern.compile(regex);
Matcher matcher = pattern.matcher(content);
while(matcher.find()){
urls.add("http://blog.csdn.net"+matcher.group());
}
}

/**
* 处理博文
* @param url
*/
public void handler(String url){
Blog blog=new Blog();
HttpRequest request=new HttpRequest();
String content=request.sendGet(url, "");
//System.out.println(content);
String regex = "(?<=<span class=\"link_title\"><a[\\s\\S]{0,1000}?>)[\\s\\S]*?(?=</a></span>)";
//标题
String title=matcher(content, regex).replaceAll("\n", "").replaceAll(" ", "");
System.out.println("标题");
System.out.println(title);
blog.setTitle(title);
//文章内容
regex="(?<=<div[\\s\\S]{0,100}id=\"article_content\"[\\s\\S]{0,100}class=\"article_content\">)[\\s\\S]*?(?=</div>[\\s\\S]{0,100}<!--)";
System.out.println("博文");
String text=matcher(content, regex);
blog.setContent(text);
//分类
regex="(?<=<span[\\s\\S]{0,100}class=\"link_categories\">[\\s\\S]{0,1000}<a[\\s\\S]{0,200}?>)[\\s\\S]*?(?=</a>)";
System.out.println("分类");
String type=matcher(content, regex);
blog.setTags(type);
System.out.println(type);
if(this.progress!=null){
progress.handler(blog, type);
}
}

public String matcher(String content,String regex){
Pattern pattern = Pattern.compile(regex);
Matcher matcher = pattern.matcher(content);
if (matcher.find()) {
String group = matcher.group(0);
return group;
}
return "";
}
}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签:  csdn