httpclient/jsoup模拟登陆人人网
2013-09-12 18:01
453 查看
HttpClient(DefaultHttpClient)代表了一个会话,在同一个会话中,HttpClient对cookie自动进行管理(当然,也可以在程序中进行控制)。
在同一个会话中,当使用post或是get发起一个新的请求时,一般需要先对前一个请求调用abort()方法,否则会抛出异常。
有些网站登录成功后会重定向(302, 303),比如这里的人人网。如果发出的是post请求,需要从响应头中取出location,并再次向网站发送请求,以获取最终数据。
抓取程序不要运行得过于频繁,大部分站点都有防刷机制。人人网访问过于频繁会锁账号。
代码如下:
import java.io.IOException; import java.io.InterruptedIOException; import java.io.UnsupportedEncodingException; import java.net.ConnectException; import java.net.UnknownHostException; import java.util.ArrayList; import java.util.List; import javax.net.ssl.SSLException; import org.apache.http.Header; import org.apache.http.HttpEntity; import org.apache.http.HttpEntityEnclosingRequest; import org.apache.http.HttpRequest; import org.apache.http.HttpResponse; import org.apache.http.HttpStatus; import org.apache.http.NameValuePair; import org.apache.http.client.ClientProtocolException; import org.apache.http.client.HttpRequestRetryHandler; import org.apache.http.client.entity.UrlEncodedFormEntity; import org.apache.http.client.methods.HttpGet; import org.apache.http.client.methods.HttpPost; import org.apache.http.impl.client.DefaultHttpClient; import org.apache.http.message.BasicNameValuePair; import org.apache.http.protocol.ExecutionContext; import org.apache.http.protocol.HttpContext; import org.apache.http.util.EntityUtils; public class ClientConnect { DefaultHttpClient httpclient = new DefaultHttpClient(); HttpPost httppost = new HttpPost(); SaveFiile savefile = new SaveFiile(); HttpResponse response; HttpEntity entity; HttpRequestRetryHandler retryhandler; String username=" ... ", password=" ..." 
; //你自己的登陆名称和密码进行初始化 String url; public ClientConnect(String url) { this.url = url; } private boolean login() { List<NameValuePair> formparams = new ArrayList<NameValuePair>(); formparams.add(new BasicNameValuePair("email", username)); formparams.add(new BasicNameValuePair("password", password)); formparams.add(new BasicNameValuePair("origURL", "http://www.renren.com/home")); formparams.add(new BasicNameValuePair("domain", "renren.com")); formparams.add(new BasicNameValuePair("key_id", "1")); formparams.add(new BasicNameValuePair("captcha_type", "web_login")); UrlEncodedFormEntity Urlentity; try { Urlentity = new UrlEncodedFormEntity(formparams, "UTF-8"); httppost = new HttpPost(url); httppost.setEntity(Urlentity); response = httpclient.execute(httppost); this.retryconnection(); // 调用连接错误处理函数 } catch (UnsupportedEncodingException e) { // TODO Auto-generated catch block e.printStackTrace(); return false; } catch (ClientProtocolException e) { // TODO Auto-generated catch block e.printStackTrace(); return false; } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); return false; } finally { httppost.abort(); } return true; } // 登陆后页面会重定向.因此需从响应头中取出定向后的url. 
private String getRedirectLocation() { Header locationHeader = response.getFirstHeader("Location"); if (locationHeader == null) { return null; } return locationHeader.getValue(); } // 获取从定向后的html内容 private String getText(String redirection) { HttpGet httpget = new HttpGet(redirection); retryconnection(); String html = ""; try { response = httpclient.execute(httpget); int statu = response.getStatusLine().getStatusCode(); if (statu == HttpStatus.SC_OK) { entity = response.getEntity(); if (entity != null) { html = EntityUtils.toString(entity); } } } catch (ClientProtocolException e1) { // TODO Auto-generated catch block e1.printStackTrace(); } catch (IOException e2) { e2.printStackTrace(); } finally { httpclient.getConnectionManager().shutdown(); } return html; } private void retryconnection() { // to set request retry handler retryhandler = new HttpRequestRetryHandler() { @Override public boolean retryRequest(IOException exception, int exceutionCount, HttpContext context) { // TODO Auto-generated method stub if (exceutionCount >= 5) { // Do not retry if over max retry count return false; } if (exception instanceof InterruptedIOException) { // Timeout return false; } if (exception instanceof UnknownHostException) { // unknown host return false; } if (exception instanceof ConnectException) { // connection refused return false; } if (exception instanceof SSLException) { // SSL handshake excetion return false; } HttpRequest request = (HttpRequest) context .getAttribute(ExecutionContext.HTTP_REQUEST); boolean idempotent = !(request instanceof HttpEntityEnclosingRequest); if (idempotent) { // Retry if the request is considered idempotent return true; } return false; } }; httpclient.setHttpRequestRetryHandler(retryhandler); } public void clientConnetion() { String html = ""; if (login()) { String redirection = getRedirectLocation(); if (redirection != null) { // System.out.println(getText(redirection)); html = getText(redirection); savefile.saveToFile(url, html); } } } }
import java.io.BufferedWriter;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;

/**
 * Writes fetched HTML to a file in the working directory, named after the URL
 * the content belongs to.
 */
public class SaveFiile {

    /** Files are always written as UTF-8 instead of the platform default charset. */
    private static final String FILE_CHARSET = "UTF-8";

    /**
     * Saves {@code htmlentity} to a file whose name is derived from {@code url}.
     *
     * <p>The name is the URL without its scheme and without a leading
     * {@code www.}, with every {@code /} replaced by {@code .} and {@code .html}
     * appended; e.g. {@code http://www.renren.com/PLogin.do} becomes
     * {@code renren.com.PLogin.do.html}. I/O failures are printed to standard
     * error and not propagated.
     *
     * @param url        URL the content was fetched for; used only to build the file name
     * @param htmlentity text to write
     */
    public void saveToFile(String url, String htmlentity) {
        String filename = toFileName(url);
        BufferedWriter writer = null;
        try {
            writer = new BufferedWriter(
                    new OutputStreamWriter(new FileOutputStream(filename), FILE_CHARSET));
            writer.write(htmlentity);
            // Close here (it flushes) so a failed flush is reported, not hidden.
            writer.close();
            System.out.println("file save successfully!");
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeQuietly(writer);
        }
    }

    /** Strips the scheme and a leading "www." from the URL and flattens its path into a file name. */
    private static String toFileName(String url) {
        String name = url;
        int schemeEnd = name.indexOf("://");
        if (schemeEnd >= 0) {
            name = name.substring(schemeEnd + 3);
        }
        if (name.startsWith("www.")) {
            name = name.substring(4);
        }
        return name.replace("/", ".") + ".html";
    }

    /** Closes the writer if it was opened; closing an already-closed writer has no effect. */
    private static void closeQuietly(BufferedWriter writer) {
        if (writer == null) {
            return;
        }
        try {
            writer.close();
        } catch (IOException ignored) {
            // Best effort: the primary failure, if any, has already been reported.
        }
    }
}
public class LoadRenRen { /** * @param args */ public static void main(String[] args) { // TODO Auto-generated method stub ClientConnect clientconnect = new ClientConnect( "http://www.renren.com/PLogin.do"); clientconnect.clientConnetion(); } }
相关文章推荐
- 安卓HttpClient+Jsoup+Httpwatch模拟登陆正方教务获取信息
- HttpClient+Jsoup模拟登陆,解析HTML,信息筛选(广工图书馆)
- C#HttpClient或使用CookieContainer模拟登陆后HttpRequest不发送cookie的解决方法及原因
- Java httpClient 正方教务管理系统模拟登陆,爬取学生成绩和培养计划
- HttpClient使用之模拟登录人人网(Post请求)
- httpclient模拟登陆操作实现
- (转)HttpClient 模拟登陆,保持会话并进行后续操作
- 05_HttpClient_模拟登陆
- apache + httpclient4 + jsoup 进行模拟浏览器url访问
- HTTPclient模拟登陆交大图书馆---图书馆客户端
- httpclient 模拟登陆
- PHP HttpClient模拟登陆
- HttpClient 模拟登陆,保持会话并进行后续操作
- HttpClient 模拟登陆百度 2015.10.21 JAVA
- HttpClient 模拟登陆知乎
- HttpClient 模拟登陆,保持会话并进行后续操作
- java httpclient 模拟登陆京东
- HttpClient4使用(模拟登陆)
- httpclient模拟登陆,不支持https的问题
- HttpClient 模拟登陆百度 2015.10.21 JAVA