您的位置：首页 > 理论基础 > 计算机网络

Ending、网络爬虫-HttpClient系列

2017-06-14 16:51 197 查看

HttpClient（我也是不太理解的）

提问一下

先说明一下：我是跟着《自己动手写网络爬虫》这本书学习的，不过示例代码一直运行出错。起初感觉可能是 post 方式用得不对，但换成 get 之后仍然是同样的错误。代码如下，希望大神帮忙解答。

package cn.sitron.worm.text;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.NameValuePair;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.methods.PostMethod;

/**
 * Example of downloading a web page with Commons HttpClient 3.x.
 *
 * <p>Required jars: commons-httpclient.jar, commons-logging.jar, commons-codec.jar.
 *
 * @author Ending、
 */
public class RetrivePage {
    /** Client shared by all downloads; it is configured once in the static initializer. */
    private static HttpClient httpClient = new HttpClient();

    static {
        // Route every request through this proxy (IP address and port).
        // NOTE(review): this is a hard-coded proxy address; if it is unreachable every
        // request fails with a connection error, so verify it or remove this line.
        httpClient.getHostConfiguration().setProxy("118.144.176.5", 3128);
    }

    /**
     * Downloads the resource at {@code path} with an HTTP GET and saves the response body
     * to a local file (relative to the working directory) named after the last path
     * segment of the URL.
     *
     * <p>The book's original example used a {@code PostMethod} with name/password
     * parameters; a plain GET is what is needed to fetch a page.
     *
     * @param path absolute URL of the page to download
     * @return {@code true} if the server answered 200 and the body was written to disk;
     *         {@code false} for any other status code or when the response has no body
     * @throws HttpException if an HTTP protocol error occurs
     * @throws IOException if the connection fails or the file cannot be written
     */
    public static boolean downloadPage(String path) throws HttpException, IOException {
        GetMethod getMethod = new GetMethod(path);
        try {
            // Execute the request; only status 200 is handled, for simplicity.
            int statusCode = httpClient.executeMethod(getMethod);
            if (statusCode != HttpStatus.SC_OK) {
                return false;
            }

            InputStream input = getMethod.getResponseBodyAsStream();
            if (input == null) {
                // No response body available: nothing to save.
                return false;
            }
            try {
                OutputStream output = new FileOutputStream(fileNameFor(path));
                try {
                    // Copy until end-of-stream (-1). Testing for "> 0" would stop at the
                    // first zero byte and truncate binary content.
                    byte[] buffer = new byte[8192];
                    int count;
                    while ((count = input.read(buffer)) != -1) {
                        output.write(buffer, 0, count);
                    }
                } finally {
                    output.close();
                }
            } finally {
                input.close();
            }

            // Report success once per download rather than once per byte.
            System.out.println("SUCCESS");
            return true;
        } finally {
            // Always hand the connection back to the client, even on failure.
            getMethod.releaseConnection();
        }
    }

    /**
     * Derives the local file name from the text after the last '/' of the URL.
     * A URL ending in '/' (for example "http://www.baidu.com/") has no such text, and an
     * empty file name cannot be opened, so "index.html" is used instead.
     */
    private static String fileNameFor(String path) {
        String fileName = path.substring(path.lastIndexOf('/') + 1);
        if (fileName.length() == 0) {
            return "index.html";
        }
        return fileName;
    }

    /**
     * Test driver: downloads the Baidu home page and reports the outcome.
     */
    public static void main(String[] args) {
        try {
            boolean saved = RetrivePage.downloadPage("http://www.baidu.com/");
            if (!saved) {
                System.err.println("Download failed: server did not return 200 with a body");
            }
        } catch (Exception e) {
            // Never swallow the exception silently: it is the only clue to what went wrong.
            System.err.println("Download failed: " + e);
            e.printStackTrace();
        }
    }
}

案例环节

这个例子也是照着 HttpClient 手册敲的，不过 post 方式依旧有同样的问题。post 方式用到了代理服务器，为了获取可用的代理 IP 地址和端口，我使用了代理公布器（下载方式：下载链接），然而等了 10 分钟只得到一条可用的数据。

接下来我换了一种新的方式抓取网页，代码如下：

package cn.sitron.worm.text;

import java.io.IOException;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpMethod;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.methods.PostMethod;

/**
* HttpClient example: GET and POST methods
* @author Ending、
*
*/
public class EasyPage {
//If the proxy server uses HTTPS, change the page URL to https as well
private static final String PATH="http://java.sun.com";
/**
 * Fetches {@code PATH} with a GET request, prints the status line and the response
 * body, then runs the POST example.
 *
 * @param args unused
 * @throws IOException if either request fails at the transport or protocol level
 */
public static void main(String[] args) throws IOException {
    HttpClient httpClient = new HttpClient();
    // Use the GET method.
    HttpMethod method = new GetMethod(PATH);
    try {
        httpClient.executeMethod(method);
        // Print the status line returned by the server.
        System.out.println(method.getStatusLine());
        // Print the response body.
        System.out.println(method.getResponseBodyAsString());
    } finally {
        // Release the connection even if the request or the body read fails.
        method.releaseConnection();
    }
    System.out.println("----------------------------------");
    System.out.println("----------------------------------");
    EasyPage.post();
}

/**
 * Sends a POST request to {@code PATH} through the proxy and prints the outcome.
 *
 * <p>When the server answers with a redirect (301, 302, 303 or 307) the redirect is
 * reported on standard error together with the {@code Location} target, if the server
 * sent one, instead of printing a body. This is the situation in which HttpClient logs
 * "Redirect requested but followRedirects is disabled". For any other status the
 * response body is printed.
 *
 * @throws IOException if the connection fails
 * @throws HttpException if an HTTP protocol error occurs
 */
public static void post() throws IOException, HttpException {
    HttpClient client = new HttpClient();
    // NOTE(review): hard-coded proxy address; verify that it is reachable.
    client.getHostConfiguration().setProxy("118.144.176.5", 3128);
    HttpMethod method = new PostMethod(PATH);
    try {
        // Execute exactly once: running the same method a second time would send the
        // POST twice and make the printed status line belong to a different response
        // than the one inspected below.
        int statusCode = client.executeMethod(method);
        System.out.println(method.getStatusLine());

        boolean redirected = statusCode == HttpStatus.SC_MOVED_PERMANENTLY
                || statusCode == HttpStatus.SC_MOVED_TEMPORARILY
                || statusCode == HttpStatus.SC_SEE_OTHER
                || statusCode == HttpStatus.SC_TEMPORARY_REDIRECT;
        if (redirected) {
            System.err.println("信息: Redirect requested but followRedirects is disabled");
            System.err.println("信息: 重定向请求，followRedirects拒绝");
            // Show where the server wants the client to go, so the caller can retry there.
            if (method.getResponseHeader("Location") != null) {
                System.err.println("Location: " + method.getResponseHeader("Location").getValue());
            }
        } else {
            System.out.println(method.getResponseBodyAsString());
        }
    } finally {
        // Release the connection even when the request fails.
        method.releaseConnection();
    }
}
}

对于 get 方式，运行时是没问题的，但是 post 方式还是有问题。

以上是下午上班时突然想到的抓取功能，还有很多不足，希望大神留言分享经验；比我还要小白（比我还不懂）的朋友也可以留言，我也可以出些基础的文档。

内容来自用户分享和网络整理，不保证内容的准确性，如有侵权内容，可联系管理员处理

标签： java 网络爬虫代理服务器 class

相关文章推荐

新的分享

章节导航