您的位置：首页 > 理论基础 > 计算机网络

Java 中利用开源 HttpClient 包抓取网页

2013-04-19 21:42 435 查看

1.用到的jar包：

commons-logging.jar

commons-httpclient.jar(此包的版本为3.1)

commons-codec.jar

2.源码：

package test;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.NameValuePair;
import org.apache.commons.httpclient.methods.PostMethod;

/**
 * Minimal page fetcher built on Commons HttpClient 3.x: sends a POST request to a URL and
 * saves the response body as an {@code .html} file in the {@code page} directory on drive
 * {@code d:}.
 */
public class Test2 {
    /** Client instance shared by every call to {@link #downloadPage(String)}. */
    private static final HttpClient httpclient = new HttpClient();

    // Configure a proxy server (disabled; uncomment to route requests through it).
    /*static {
    httpclient.getHostConfiguration().setProxy("192.168.0.1", 8080);
    }*/

    /**
     * POSTs to {@code path} and, when the server answers 200 OK, writes the response body to
     * {@code d:\page\<name>.html}, where {@code <name>} is derived from the URL.
     *
     * <p>The status code is printed to standard output, and a success message to standard
     * error, exactly as before.
     *
     * @param path absolute URL of the page to fetch
     * @return {@code true} if the status was 200 and the body was saved, {@code false} for any
     *     other status code
     * @throws HttpException if the HTTP exchange fails at the protocol level
     * @throws IOException if the transfer fails, the output directory cannot be created, or
     *     the file cannot be written
     */
    public static boolean downloadPage(String path) throws HttpException, IOException {
        PostMethod postmethod = new PostMethod(path);
        try {
            // Form parameters sent with the POST request.
            NameValuePair[] postData = new NameValuePair[1];
            postData[0] = new NameValuePair("name", "google");
            //postData[1]=new NameValuePair("password","000");
            postmethod.addParameters(postData);

            int statusCode = httpclient.executeMethod(postmethod);
            System.out.print(statusCode);
            if (statusCode != HttpStatus.SC_OK) {
                return false;
            }

            File file = new File("d:" + File.separator + "page" + File.separator
                    + toFileName(path) + ".html");

            // FileOutputStream does not create missing parent directories, so do it here.
            // The second isDirectory() check tolerates another process creating it first.
            File dir = file.getParentFile();
            if (dir != null && !dir.mkdirs() && !dir.isDirectory()) {
                throw new IOException("Cannot create output directory: " + dir);
            }

            // May be null when the response carries no body; an empty file is written then.
            InputStream input = postmethod.getResponseBodyAsStream();
            try {
                // Opening the stream truncates an existing file, so no explicit delete is needed.
                OutputStream output = new FileOutputStream(file);
                try {
                    if (input != null) {
                        byte[] buffer = new byte[8192];
                        int len;
                        // read() signals end of stream with -1.
                        while ((len = input.read(buffer)) != -1) {
                            output.write(buffer, 0, len);
                        }
                    }
                } finally {
                    output.close();
                }
            } finally {
                if (input != null) {
                    input.close();
                }
            }

            System.err.println("抓取页面成功");
            return true;
        } finally {
            // HttpClient 3.x requires the connection to be released explicitly, on every path.
            postmethod.releaseConnection();
        }
    }

    /**
     * Derives a file name from a URL: drops the leading {@code scheme://} if present, removes
     * every {@code /}, and replaces characters that Windows forbids in file names with
     * {@code _}.
     */
    private static String toFileName(String path) {
        String name = path;
        int schemeEnd = name.indexOf("://");
        if (schemeEnd >= 0) {
            name = name.substring(schemeEnd + 3);
        }
        name = name.replaceAll("/", "");
        return name.replaceAll("[\\\\:*?\"<>|]", "_");
    }

    /** Fetches a sample page and prints the stack trace of any failure. */
    public static void main(String[] args) {
        try {
            Test2.downloadPage("http://lietu.com/");
        } catch (HttpException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

内容来自用户分享和网络整理，不保证内容的准确性，如有侵权内容，可联系管理员处理

标签： Java httpclient 网页抓取

相关文章推荐

新的分享

章节导航