您的位置:首页 > 编程语言 > Java开发

java-爬虫部分:关于京东模拟登陆的两种实现

2014-02-27 18:33 656 查看
最近要做一个爬虫,需要网站数据,先拿京东开刀。

因为我是java开发的,所以最开始的时候,想到了httpClient和htmlunit两个东东,于是开始做实验。

网上很久以前流传着一个登陆人人网的例子,我就拿过来照搬了一下,发现不灵,后来才发现是自己没理解人家的精髓。然后用htmlunit去模拟,发现京东的js比较复杂,一位多年爬虫经验的哥们告诉我说htmlunit对js支持的不好,有些网站就是不灵的。没办法,自己想吧。

(1)打开京东的登陆页面,看他的源码,发现是执行了一个ajax,具体链接是:https://passport.jd.com/uc/loginService?uuid=f5c0dd5a-762c-4230-b8c0-f70589b7dbdb&ReturnUrl=http://order.jd.com/center/list.action&r=0.66408410689742&loginname=username&nloginpwd=xxxxxx&loginpwd=xxxxxx&machineNet=&machineCpu=&machineDisk=&authcode=&saHrhnkIIX=GXgVo

每次刷新页面,uuid和最后一个参数都是不一样的。然后在火狐打开登陆页,把参数拼在一起后,直接在火狐中访问该链接,没问题,登陆成功;但是在火狐打开登陆页,把参数拼起来后,在IE中访问却不能登陆。OK,看来是在cookie里存了一些东西,后面用来做验证了。

基于以上分析,做了第一套代码:

核心代码如下:

package com.lkb.test;

import java.util.ArrayList;

import java.util.HashMap;

import java.util.Iterator;

import java.util.List;

import java.util.Map;

import org.apache.http.HttpResponse;

import org.apache.http.client.ResponseHandler;

import org.apache.http.client.entity.UrlEncodedFormEntity;

import org.apache.http.client.methods.HttpGet;

import org.apache.http.client.methods.HttpPost;

import org.apache.http.impl.client.BasicResponseHandler;

import org.apache.http.impl.client.DefaultHttpClient;

import org.apache.http.message.BasicNameValuePair;

import org.apache.http.message.BufferedHeader;

import org.apache.http.protocol.HTTP;

/**
 * Simulates a JD login with Apache HttpClient and prints the order-list page.
 *
 * <p>Flow: download the login page, scrape two hidden form fields from its HTML
 * (the {@code uuid} field and one further hidden input whose name is read from
 * the page), POST them together with the credentials to the login service, then
 * fetch the order page with the same {@link DefaultHttpClient} instance.
 */
public class JD {

    // The configuration items
    private static final String USER_NAME = "xxx";
    private static final String PASSWORD = "yyy";
    private static final String REDIRECT_URL = "http://order.jd.com/center/list.action";
    private static final String LOGIN_PAGE_URL = "http://passport.jd.com/uc/login";

    // Don't change the following URL
    private static final String LOGIN_SERVICE_URL = "https://passport.jd.com/uc/loginService";

    // HTML fragments that immediately precede the values scraped from the login page.
    // They are passed to String.split (a regex) and contain no regex metacharacters.
    private static final String UUID_MARKER = "name=\"uuid\" value=\"";
    private static final String HIDDEN_INPUT_MARKER =
            "<span class=\"clr\"></span><input type=\"hidden\" name=\"";

    // Response of the most recent login POST; null until login() has succeeded.
    private HttpResponse response;

    // The HttpClient is used in one session
    private final DefaultHttpClient httpclient = new DefaultHttpClient();

    /**
     * Downloads the login page and extracts the hidden form fields required by the
     * login service.
     *
     * @return a map holding the {@code uuid} entry plus one further name/value pair
     *         taken from the hidden input that follows the uuid field
     * @throws IllegalStateException if the page cannot be downloaded or does not
     *         contain the expected markup
     */
    public Map<String, String> getParams() {
        String page = getText(LOGIN_PAGE_URL);
        if (page == null) {
            throw new IllegalStateException("Could not download login page: " + LOGIN_PAGE_URL);
        }

        Map<String, String> params = new HashMap<String, String>();

        // Everything after the uuid marker; the uuid value runs up to the closing "/>.
        String afterUuidMarker = part(page.split(UUID_MARKER), 1, "uuid field");
        String uuid = part(afterUuidMarker.split("\"/>"), 0, "uuid value");
        params.put("uuid", uuid);
        System.out.println(uuid);

        // The hidden input looks like: KEY" value="VALUE" /> -- splitting its attribute
        // text on double quotes yields KEY at index 0 and VALUE at index 2.
        String afterInputMarker =
                part(afterUuidMarker.split(HIDDEN_INPUT_MARKER), 1, "hidden input");
        String attributes = part(afterInputMarker.split("/>"), 0, "hidden input attributes").trim();
        String[] quoted = attributes.split("\"");
        params.put(part(quoted, 0, "hidden input name"), part(quoted, 2, "hidden input value"));

        return params;
    }

    /**
     * Returns {@code parts[index]}, or fails with a descriptive message when the
     * login page did not contain the expected fragment.
     */
    private static String part(String[] parts, int index, String what) {
        if (index >= parts.length) {
            throw new IllegalStateException("Login page format not recognised: missing " + what);
        }
        return parts[index];
    }

    /**
     * Posts the credentials and the scraped hidden fields to the login service and
     * stores the response in {@link #response}.
     *
     * @return {@code true} if the POST was executed; {@code false} if scraping the
     *         parameters or executing the request threw an exception
     */
    private boolean login() {
        HttpPost httpost = new HttpPost(LOGIN_SERVICE_URL);
        try {
            // All the parameters post to the web site
            List<BasicNameValuePair> nvps = new ArrayList<BasicNameValuePair>();
            nvps.add(new BasicNameValuePair("ReturnUrl", REDIRECT_URL));
            nvps.add(new BasicNameValuePair("loginname", USER_NAME));
            nvps.add(new BasicNameValuePair("nloginpwd", PASSWORD));
            nvps.add(new BasicNameValuePair("loginpwd", PASSWORD));
            for (Map.Entry<String, String> entry : getParams().entrySet()) {
                nvps.add(new BasicNameValuePair(entry.getKey(), entry.getValue()));
            }

            httpost.setEntity(new UrlEncodedFormEntity(nvps, HTTP.UTF_8));
            response = httpclient.execute(httpost);
            return true;
        } catch (Exception e) {
            e.printStackTrace();
            return false;
        } finally {
            httpost.abort();
        }
    }

    /**
     * Reads the {@code Location} header of the stored login response.
     *
     * @return the header value, or {@code null} if there is no response yet or the
     *         response carries no {@code Location} header
     */
    private String getRedirectLocation() {
        if (response == null) {
            return null;
        }
        // Use the Header interface: the concrete header class is an implementation
        // detail of HttpClient, so a downcast could fail with ClassCastException.
        org.apache.http.Header locationHeader = response.getFirstHeader("Location");
        return locationHeader == null ? null : locationHeader.getValue();
    }

    /**
     * Performs a GET request with the shared client and returns the response body.
     *
     * @param redirectLocation the URL to fetch
     * @return the body as text, or {@code null} if the request failed
     */
    private String getText(String redirectLocation) {
        HttpGet httpget = new HttpGet(redirectLocation);
        ResponseHandler<String> responseHandler = new BasicResponseHandler();
        try {
            return httpclient.execute(httpget, responseHandler);
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        } finally {
            httpget.abort();
        }
    }

    /**
     * Logs in and, on success, prints the order page followed by the page named in
     * the login response's {@code Location} header, if there is one.
     */
    public void printText() {
        if (!login()) {
            return;
        }
        System.out.println(getText(REDIRECT_URL));
        String redirectLocation = getRedirectLocation();
        if (redirectLocation != null) {
            System.out.println(getText(redirectLocation));
        }
    }

    public static void main(String[] args) {
        JD jd = new JD();
        jd.printText();
    }

}

验证码解决:

/**
 * Fetches the captcha image and stores it on disk.
 *
 * <p>The image is written to a per-day directory whose path is the configured
 * {@code authcodePath} value with today's date ({@code yyyyMMdd}) appended.
 *
 * @param httpclient client used to execute the request
 * @param check      URL of the captcha image
 * @param picName    file name to store the image under
 * @return the file the image was written to, or {@code null} if the request
 *         failed or the response had no body
 */
public File getMarkFile(DefaultHttpClient httpclient, String check, final String picName) {
    ResponseHandler<File> responseHandler = new ResponseHandler<File>() {
        @Override
        public File handleResponse(final HttpResponse response)
                throws ClientProtocolException, IOException {
            int status = response.getStatusLine().getStatusCode();
            if (status < 200 || status >= 300) {
                throw new ClientProtocolException("Unexpected response status:" + status);
            }

            // NOTE(review): nothing in this method writes to ~/.jd; the directory is
            // still created in case other code expects it to exist -- confirm.
            File homeDir = new File(System.getProperty("user.home") + File.separator + ".jd");
            if (!homeDir.exists()) {
                homeDir.mkdirs();
            }

            String day = new java.text.SimpleDateFormat("yyyyMMdd").format(new Date());
            String authcodePath = InfoUtil.getInstance().getInfo("road", "authcodePath");

            // Create the per-day directory if it does not exist yet. mkdirs() also
            // creates missing parents; a failure is reported instead of surfacing
            // later as a confusing "file not found".
            File dayDir = new File(authcodePath + day);
            if (!dayDir.isDirectory() && !dayDir.mkdirs() && !dayDir.isDirectory()) {
                throw new IOException("Cannot create directory: " + dayDir);
            }

            HttpEntity entity = response.getEntity();
            if (entity == null) {
                return null;
            }
            // Let File join directory and name so the platform separator is used.
            File target = new File(dayDir, picName);
            return filePutContents(target.getPath(), entity.getContent());
        }
    };

    HttpGet httpGet = new HttpGet(check);
    File f = null;
    try {
        f = httpclient.execute(httpGet, responseHandler);
    } catch (ClientProtocolException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        httpGet.abort();
    }
    return f;
}

/**
 * Copies a stream (the PNG image) into a file.
 *
 * <p>I/O errors are printed and not rethrown, so the returned file may be missing
 * or incomplete after a failure. The input stream is not closed here.
 *
 * @param fileName path of the file to write
 * @param is       stream to copy from
 * @return a {@link File} for {@code fileName}, returned whether or not the copy succeeded
 */
private File filePutContents(String fileName, InputStream is) {
    File file = new File(fileName);
    OutputStream os = null;
    try {
        os = new FileOutputStream(file);
        byte[] buffer = new byte[4 * 1024];
        int len;
        while ((len = is.read(buffer)) != -1) {
            os.write(buffer, 0, len);
        }
    } catch (IOException e) {
        // Also covers FileNotFoundException from opening the output file.
        e.printStackTrace();
    } finally {
        // os is still null when the output file could not be opened; closing it
        // unconditionally would throw NullPointerException and hide the real error.
        if (os != null) {
            try {
                os.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
    return file;
}

(2)后来在实践的过程中又在想,如果每个网站都这么复杂,如果人家要是改了实现方式怎么办,于是又找到了Selenium 2,发现这个东东是个好东东,可以实现模拟登陆,但是有一个缺点是要弹出浏览器页面,因为刚开始试验这个,所以还不熟悉。还有一点是你的操作之间需要设置sleep时间,不然会出问题。关于这一点还需要大家帮我改进一下,核心代码如下:

package com.lkb;

import org.openqa.selenium.By;

import org.openqa.selenium.WebDriver;

import org.openqa.selenium.WebDriver.Navigation;

import org.openqa.selenium.WebElement;

import org.openqa.selenium.firefox.FirefoxDriver;

/**
 * Logs in to JD through a Firefox window driven by Selenium WebDriver and prints
 * the HTML source of the order-list page.
 *
 * <p>Each browser action is separated by a fixed pause.
 * NOTE(review): fixed sleeps are slow and timing-dependent; Selenium's explicit
 * waits would likely be a more robust replacement -- confirm against the
 * Selenium version in use.
 */
public class JDTest {

    public static void main(String[] args) {
        new JDTest().connection();
    }

    /**
     * Opens the login page, fills in user name and password, submits the form and
     * prints the source of the order page. The browser is left open afterwards.
     */
    public void connection() {
        WebDriver driver = new FirefoxDriver();
        waitForSeconds(5);

        Navigation navigation = driver.navigate();
        navigation.to("https://passport.360buy.com/new/login.aspx");
        waitForSeconds(4);

        WebElement loginName = driver.findElement(By.id("loginname"));
        waitForSeconds(5);
        loginName.sendKeys(Constant.USERNAME);
        waitForSeconds(5);

        WebElement loginPwd = driver.findElement(By.id("nloginpwd"));
        waitForSeconds(5);
        loginPwd.sendKeys(Constant.password);
        waitForSeconds(5);

        WebElement loginButton = driver.findElement(By.id("loginsubmit"));
        waitForSeconds(5);
        loginButton.click();
        waitForSecond();

        navigation.to("http://order.jd.com/center/list.action");
        System.out.println(driver.getPageSource());
    }

    /** Pauses by calling {@link #waitForSecond()} the given number of times. */
    private void waitForSeconds(int seconds) {
        for (int i = 0; i < seconds; i++) {
            waitForSecond();
        }
    }

    /**
     * Sleeps for one second. If the thread is interrupted, the stack trace is
     * printed and the interrupt flag is restored so callers can still observe it.
     */
    public void waitForSecond() {
        try {
            Thread.sleep(1000);
        } catch (InterruptedException e) {
            e.printStackTrace();
            // Catching InterruptedException clears the flag; set it again.
            Thread.currentThread().interrupt();
        }
    }

}

以上的jar包和源码大家需要的话,可以联系我,QQ:369768231

对爬虫感兴趣的同学,请加我的Q群:101526096

后续还要做验证码的解决方案,有做过或者即将做的,也请加入Q群,一起讨论下。

开源才能进步,希望大家互相帮助,互相进步。

如果大家想找工作的话,可以联系我哈。我们公司招人,需要两种人

(1)技术大牛,我们的架构是hbase+hadoop那一套,以及自定义爬虫,所以需要牛逼的人进来解决各种问题,如果你觉得自己很牛逼,请联系我!

(2)java基础好,为人实在,热爱工作,对自己负责任,有创业精神的人。也可以联系我,我们这边有华尔街大牛,谷歌大牛,北大清华各种博士,对你自己是一个很好的提升!

办公地点在:北京中关村微软大厦。

公司网站:www.quantgroup.cn
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: