java中利用开源HttpClient包抓取网页
2013-04-19 21:42
435 查看
1.用到的jar包:
commons-logging.jar
commons-httpclient.jar(此包的版本为3.1)
commons-codec.jar
2.源码:
package test;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.NameValuePair;
import org.apache.commons.httpclient.methods.PostMethod;
/**
 * Demo that POSTs to a URL with Commons HttpClient 3.1 and saves the response body
 * as an HTML file under {@code d:\page}.
 */
public class Test2 {
    /** Client instance shared by every request issued from this class. */
    private static final HttpClient httpclient = new HttpClient();

    // To send requests through a proxy server, enable this initializer:
    /*static {
    httpclient.getHostConfiguration().setProxy("192.168.0.1", 8080);
    }*/

    /**
     * Sends a POST request to {@code path} and, when the server answers 200 OK,
     * writes the response body to {@code d:\page\<name>.html}, where {@code <name>}
     * is derived from the URL (see {@link #toFileName(String)}).
     *
     * <p>The status code is printed to standard output, and a success message is
     * printed to standard error once the file has been written.
     *
     * @param path absolute URL of the page to fetch, e.g. {@code http://lietu.com/}
     * @return {@code true} if the server answered 200 OK and the body was saved,
     *         {@code false} for any other status code
     * @throws HttpException if an HTTP protocol error occurs
     * @throws IOException if the request fails, the target directory cannot be
     *         created, or the file cannot be written
     */
    public static boolean downloadPage(String path) throws HttpException, IOException {
        PostMethod postmethod = new PostMethod(path);
        try {
            // Form fields sent with the request; grow the array to add more
            // (for example a "password" field).
            NameValuePair[] postData = new NameValuePair[1];
            postData[0] = new NameValuePair("name", "google");
            postmethod.addParameters(postData);

            int statusCode = httpclient.executeMethod(postmethod);
            System.out.print(statusCode);
            if (statusCode != HttpStatus.SC_OK) {
                return false;
            }

            File file = new File("d:" + File.separator + "page" + File.separator
                    + toFileName(path) + ".html");
            // FileOutputStream creates or truncates the file itself but does not
            // create missing parent directories, so make sure the folder exists.
            File dir = file.getParentFile();
            if (dir != null && !dir.mkdirs() && !dir.isDirectory()) {
                throw new IOException("Cannot create directory " + dir);
            }

            // May be null when the response carries no body; an empty file is
            // written in that case.
            InputStream input = postmethod.getResponseBodyAsStream();
            try {
                OutputStream output = new FileOutputStream(file);
                try {
                    if (input != null) {
                        byte[] buffer = new byte[1024];
                        int len;
                        // read() signals end of stream with -1.
                        while ((len = input.read(buffer)) != -1) {
                            output.write(buffer, 0, len);
                        }
                    }
                } finally {
                    output.close();
                }
            } finally {
                if (input != null) {
                    input.close();
                }
            }

            System.err.println("抓取页面成功");
            return true;
        } finally {
            // HttpClient 3.x requires the connection to be released on every path,
            // otherwise it is never returned to the connection manager.
            postmethod.releaseConnection();
        }
    }

    /**
     * Derives a file name from a URL: drops the scheme prefix (anything up to and
     * including {@code "://"}), removes every {@code '/'}, and replaces characters
     * that Windows forbids in file names with {@code '_'}.
     */
    private static String toFileName(String path) {
        int schemeEnd = path.indexOf("://");
        String rest = schemeEnd >= 0 ? path.substring(schemeEnd + 3) : path;
        return rest.replace("/", "").replaceAll("[\\\\:*?\"<>|]", "_");
    }

    /** Fetches a sample page and prints the stack trace of any failure. */
    public static void main(String[] args) {
        try {
            Test2.downloadPage("http://lietu.com/");
        } catch (HttpException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
1.用到的jar包:
commons-logging.jar
commons-httpclient.jar(此包的版本为3.1)
commons-codec.jar
2.源码:
package test;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.NameValuePair;
import org.apache.commons.httpclient.methods.PostMethod;
/**
 * Demo that POSTs to a URL with Commons HttpClient 3.1 and saves the response body
 * as an HTML file under {@code d:\page}.
 */
public class Test2 {
    /** Client instance shared by every request issued from this class. */
    private static final HttpClient httpclient = new HttpClient();

    // To send requests through a proxy server, enable this initializer:
    /*static {
    httpclient.getHostConfiguration().setProxy("192.168.0.1", 8080);
    }*/

    /**
     * Sends a POST request to {@code path} and, when the server answers 200 OK,
     * writes the response body to {@code d:\page\<name>.html}, where {@code <name>}
     * is derived from the URL (see {@link #toFileName(String)}).
     *
     * <p>The status code is printed to standard output, and a success message is
     * printed to standard error once the file has been written.
     *
     * @param path absolute URL of the page to fetch, e.g. {@code http://lietu.com/}
     * @return {@code true} if the server answered 200 OK and the body was saved,
     *         {@code false} for any other status code
     * @throws HttpException if an HTTP protocol error occurs
     * @throws IOException if the request fails, the target directory cannot be
     *         created, or the file cannot be written
     */
    public static boolean downloadPage(String path) throws HttpException, IOException {
        PostMethod postmethod = new PostMethod(path);
        try {
            // Form fields sent with the request; grow the array to add more
            // (for example a "password" field).
            NameValuePair[] postData = new NameValuePair[1];
            postData[0] = new NameValuePair("name", "google");
            postmethod.addParameters(postData);

            int statusCode = httpclient.executeMethod(postmethod);
            System.out.print(statusCode);
            if (statusCode != HttpStatus.SC_OK) {
                return false;
            }

            File file = new File("d:" + File.separator + "page" + File.separator
                    + toFileName(path) + ".html");
            // FileOutputStream creates or truncates the file itself but does not
            // create missing parent directories, so make sure the folder exists.
            File dir = file.getParentFile();
            if (dir != null && !dir.mkdirs() && !dir.isDirectory()) {
                throw new IOException("Cannot create directory " + dir);
            }

            // May be null when the response carries no body; an empty file is
            // written in that case.
            InputStream input = postmethod.getResponseBodyAsStream();
            try {
                OutputStream output = new FileOutputStream(file);
                try {
                    if (input != null) {
                        byte[] buffer = new byte[1024];
                        int len;
                        // read() signals end of stream with -1.
                        while ((len = input.read(buffer)) != -1) {
                            output.write(buffer, 0, len);
                        }
                    }
                } finally {
                    output.close();
                }
            } finally {
                if (input != null) {
                    input.close();
                }
            }

            System.err.println("抓取页面成功");
            return true;
        } finally {
            // HttpClient 3.x requires the connection to be released on every path,
            // otherwise it is never returned to the connection manager.
            postmethod.releaseConnection();
        }
    }

    /**
     * Derives a file name from a URL: drops the scheme prefix (anything up to and
     * including {@code "://"}), removes every {@code '/'}, and replaces characters
     * that Windows forbids in file names with {@code '_'}.
     */
    private static String toFileName(String path) {
        int schemeEnd = path.indexOf("://");
        String rest = schemeEnd >= 0 ? path.substring(schemeEnd + 3) : path;
        return rest.replace("/", "").replaceAll("[\\\\:*?\"<>|]", "_");
    }

    /** Fetches a sample page and prints the stack trace of any failure. */
    public static void main(String[] args) {
        try {
            Test2.downloadPage("http://lietu.com/");
        } catch (HttpException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
相关文章推荐
- 利用httpclient开源工具抓取网页的源码,并且控制台打印出来工具类的编写
- Java基础:利用HttpClient获取网页内容
- 利用HttpClient 获取网页数据java代码模版
- java httpclient 抓取网页 POST GET
- Java基础:利用HttpClient获取网页内容
- 利用httpclient抓取网页内容
- Java网页数据抓取实例(httpclient4.2.1+jsoup1.7.2)
- 利用httpclient抓取网页内容
- HttpClient(二)-- 模拟浏览器抓取网页
- JAVA抓取网页的图片,JAVA利用正则…
- 利用HttpClient抓取话费详单等信息
- java利用url实现网页内容的抓取
- Java利用httpasyncclient进行异步HTTP请求
- Java、C#双语版HttpHelper类(解决网页抓取乱码问题)
- java利用url实现网页内容的抓取
- HttpClient抓取网页文件方法
- httpclient抓取https网页数据
- 使用 Apache HttpClient 工具模拟百度蜘蛛或浏览器抓取和解压gzip网页
- java 利用httpclient 3.1 和 httpclient4.1.2发送post请求
- Java HttpURLConnection 抓取网页内容 解析gzip格式输入流数据并转换为String格式字符串