您的位置:首页 > 理论基础 > 计算机网络

java解析html 内有萌妹子~Jsoup+Httpclient

2015-04-04 23:34 429 查看
如何入门 Python 爬虫?

昨天在知乎看到python版的爬虫抓淘宝的萌妹子,答主这福利放的好,看的我手痒痒,今天放假闲来无事,做了个java版的,主要用到了httpclient和jsoup,好了,不多说,上代码:

要看萌妹子,首先得观察一下淘宝MM的请求,淘宝MM chrome右键审查元素,切换到network,随便找个页码点击一下,看到发出了一个请求



参数有好几个,这里主要注意currentPage和pageSize两个参数,从字面上来理解这两个参数的含义,大家应该没什么问题的

这个请求返回的格式大致如下



这里最主要的参数是userId,用于追妹子→_→

分析的差不多了,开始追~

引入相关jar的依赖

<dependency>
<groupId>commons-beanutils</groupId>
<artifactId>commons-beanutils</artifactId>
<version>1.9.2</version>
</dependency>
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.8.1</version>
</dependency>


package com.yz.xiaomapi.apache.httpclient;

import java.io.BufferedInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import net.sf.json.JSONArray;
import net.sf.json.JSONObject;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.NameValuePair;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.message.BasicNameValuePair;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.junit.Test;

/**
* @author <a href="mailto:yaozhen421524@163.com">arvin.yao</a>
* 2015年4月4日 下午6:59:45
*/
/**
 * Crawler demo: pages through the Taobao model ("Taobao MM") search API with
 * Apache HttpClient, parses the JSON result for each model's {@code userId},
 * fetches her profile page with Jsoup, and downloads every absolute-URL
 * {@code <img>} on the page into a per-model directory under
 * {@code E:/dev/resources/meizi/}.
 *
 * <p>Not thread-safe; intended to be run as a one-off JUnit "test".
 *
 * @author <a href="mailto:yaozhen421524@163.com">arvin.yao</a>
 * 2015-04-04 18:59:45
 */
public class JsoupTest {

    @Test
    public void testJsoUp() {
        String postUrl = "http://mm.taobao.com/tstar/search/tstar_model.do";
        String getUrl = "http://mm.taobao.com/self/aiShow.htm";
        Integer currentPage = 1;
        Integer totalPage = 1; // updated from the first response
        String localFileBasePath = "E:/dev/resources/meizi/";
        HttpClient client = HttpClientBuilder.create().build();
        while (currentPage <= totalPage) {
            HttpPost post = new HttpPost(postUrl);
            List<NameValuePair> params = new ArrayList<NameValuePair>();
            params.add(new BasicNameValuePair("_input_charset", "utf-8"));
            params.add(new BasicNameValuePair("viewFlag", "A"));
            params.add(new BasicNameValuePair("sortType", "total_favor_num"));
            params.add(new BasicNameValuePair("searchRegion", "city:"));
            params.add(new BasicNameValuePair("currentPage", currentPage.toString()));
            params.add(new BasicNameValuePair("pageSize", "100"));
            try {
                // FIX: the endpoint declares _input_charset=utf-8, but the original
                // encoded the form with the platform default charset.
                HttpEntity formEntity = new UrlEncodedFormEntity(params, "UTF-8");
                post.setEntity(formEntity);
                // POST for the current page of model summaries.
                HttpResponse response = client.execute(post);
                currentPage++; // advance regardless of status, as before
                if (response.getStatusLine().getStatusCode() == HttpStatus.SC_OK) {
                    InputStream is = response.getEntity().getContent();
                    String result = inStream2String(is);
                    JSONObject jsonObject = JSONObject.fromObject(result);
                    JSONObject dataObject = jsonObject.getJSONObject("data");
                    // NOTE(review): totalPage is read from the top-level object,
                    // not from "data" — preserved from the original; confirm
                    // against the live response shape.
                    totalPage = (Integer) jsonObject.get("totalPage");
                    JSONArray searchDOList = dataObject.getJSONArray("searchDOList");
                    @SuppressWarnings("unchecked")
                    Iterator<JSONObject> iterator = searchDOList.iterator();
                    while (iterator.hasNext()) {
                        JSONObject searchDO = (JSONObject) iterator.next();
                        Integer userId = (Integer) searchDO.get("userId");
                        // Fetch the model's profile page by userId.
                        Document document =
                                Jsoup.connect(getUrl).data("userId", userId.toString()).get();
                        if (null == document) {
                            continue;
                        }
                        String realName = extractRealName(document);
                        File mzDir = new File(localFileBasePath + realName + userId);
                        if (!mzDir.exists()) {
                            mzDir.mkdirs();
                        }
                        saveImages(document, mzDir);
                    }
                }
            } catch (UnsupportedEncodingException e) {
                e.printStackTrace();
            } catch (ClientProtocolException e) {
                e.printStackTrace();
            } catch (IllegalStateException e) {
                e.printStackTrace();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Extracts the model's display name from her profile page.
     *
     * @return the text of the last matching name link, or "" when none match
     *         (same "last match wins" behavior as the original loop).
     */
    private String extractRealName(Document document) {
        String realName = "";
        Elements links = document.select(".mm-p-model-info-left-top > dl > dd > a");
        for (Element link : links) {
            realName = link.text();
        }
        return realName;
    }

    /**
     * Downloads every absolute (http...) {@code <img>} source on the page into
     * {@code mzDir}. Stops at the well-known placeholder image, which Taobao
     * serves when the profile does not exist.
     *
     * @throws IOException on a network or file error for any image
     */
    private void saveImages(Document document, File mzDir) throws IOException {
        for (Element img : document.select("img")) {
            String src = img.attr("src");
            // Relative URLs (no "http") cannot be fetched directly — skip.
            if (src.indexOf("http") == -1) {
                continue;
            }
            String imgName = src.substring(src.lastIndexOf("/") + 1);
            if ("T1zA8tXg4EXXXXXXXX-596-176.png".equals(imgName)) {
                // Placeholder: this model's page is gone — stop scanning her images.
                break;
            }
            // FIX: build the target with File(parent, child) instead of mixing
            // "/" and a hard-coded "\\" separator.
            downloadImage(src, new File(mzDir, imgName));
        }
    }

    /**
     * Streams one image URL into {@code target}. A 404 (FileNotFoundException
     * from openStream) is logged and skipped rather than aborting the run.
     *
     * @throws IOException on any other network or file error
     */
    private void downloadImage(String src, File target) throws IOException {
        URL url = new URL(src);
        InputStream inputStream;
        try {
            inputStream = url.openStream();
        } catch (FileNotFoundException e) {
            System.out.println("文件不存在:" + src);
            return;
        }
        System.out.println("开始下载:" + src);
        BufferedInputStream in = new BufferedInputStream(inputStream);
        OutputStream out = null;
        try {
            out = new FileOutputStream(target);
            byte[] buffer = new byte[1024];
            int len;
            while ((len = in.read(buffer)) > 0) {
                out.write(buffer, 0, len);
            }
            out.flush();
        } finally {
            // FIX: the original leaked both streams whenever read/write threw.
            in.close();
            if (out != null) {
                out.close();
            }
        }
        System.out.println("下载完成:" + src);
        System.out.println("保存路径:" + target.getAbsolutePath());
    }

    /**
     * Reads the entire stream, closes it, and decodes the bytes as UTF-8.
     *
     * <p>FIX: the original decoded with the platform default charset (garbling
     * the UTF-8 JSON on e.g. GBK Windows JVMs) and never closed the stream,
     * which also keeps the HTTP connection from being released.
     */
    private String inStream2String(InputStream is) throws IOException {
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        try {
            byte[] buf = new byte[1024];
            int len;
            while ((len = is.read(buf)) != -1) {
                baos.write(buf, 0, len);
            }
        } finally {
            is.close();
        }
        return new String(baos.toByteArray(), "UTF-8");
    }

}




下载下来的妹子图格式大概是这样子的,就是这样~

使用 jsoup 对 HTML 文档进行解析和操作

使用jsoup解析html主要参考了这里,写的蛮不错的~感谢

小麻批

最近闲着无聊在弄一个妹子图片的小网站,正好抓点资源,大家有什么好的网站推荐下啊~
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: