您的位置:首页 > 理论基础 > 计算机网络

java解析html 内有萌妹子~Jsoup+Httpclient

2015-04-04 23:34 429 查看
如何入门 Python 爬虫?

昨天在知乎看到python版的爬虫抓淘宝的萌妹子,答主这福利放的好,看的我手痒痒,今天放假闲来无事,做了个java版的,主要用到了httpclient和jsoup,好了,不多说,上代码:

要看萌妹子,首先得观察一下淘宝MM的请求,淘宝MM chrome右键审查元素,切换到network,随便找个页码点击一下,看到发出了一个请求



参数有好几个,这里主要注意currentPage和pageSize两个参数,从字面上来理解这两个参数的含义,大家应该没什么问题的

这个请求返回的格式大致如下



这里最主要的参数是userId,用于追妹子→_→

分析的差不多了,开始追~

引入相关jar的依赖

<dependency>
<groupId>commons-beanutils</groupId>
<artifactId>commons-beanutils</artifactId>
<version>1.9.2</version>
</dependency>
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.8.1</version>
</dependency>


package com.yz.xiaomapi.apache.httpclient;

import java.io.BufferedInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import net.sf.json.JSONArray;
import net.sf.json.JSONObject;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.NameValuePair;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.message.BasicNameValuePair;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.junit.Test;

/**
* @author <a href="mailto:yaozhen421524@163.com">arvin.yao</a>
* 2015年4月4日 下午6:59:45
*/
/**
 * Crawler demo: pages through the Taobao model ("Taobao MM") search API with
 * Apache HttpClient, parses the JSON result for each model's {@code userId},
 * fetches her profile page with Jsoup, and downloads every absolute-URL
 * {@code <img>} on the page into a per-model directory under
 * {@code E:/dev/resources/meizi/}.
 *
 * <p>Not thread-safe; intended to be run as a one-off JUnit "test".
 *
 * @author <a href="mailto:yaozhen421524@163.com">arvin.yao</a>
 * 2015-04-04 18:59:45
 */
public class JsoupTest {

    @Test
    public void testJsoUp() {
        String postUrl = "http://mm.taobao.com/tstar/search/tstar_model.do";
        String getUrl = "http://mm.taobao.com/self/aiShow.htm";
        Integer currentPage = 1;
        Integer totalPage = 1; // updated from the first response
        String localFileBasePath = "E:/dev/resources/meizi/";
        HttpClient client = HttpClientBuilder.create().build();
        while (currentPage <= totalPage) {
            HttpPost post = new HttpPost(postUrl);
            List<NameValuePair> params = new ArrayList<NameValuePair>();
            params.add(new BasicNameValuePair("_input_charset", "utf-8"));
            params.add(new BasicNameValuePair("viewFlag", "A"));
            params.add(new BasicNameValuePair("sortType", "total_favor_num"));
            params.add(new BasicNameValuePair("searchRegion", "city:"));
            params.add(new BasicNameValuePair("currentPage", currentPage.toString()));
            params.add(new BasicNameValuePair("pageSize", "100"));
            try {
                // FIX: the endpoint declares _input_charset=utf-8, but the original
                // encoded the form with the platform default charset.
                HttpEntity formEntity = new UrlEncodedFormEntity(params, "UTF-8");
                post.setEntity(formEntity);
                // POST for the current page of model summaries.
                HttpResponse response = client.execute(post);
                currentPage++; // advance regardless of status, as before
                if (response.getStatusLine().getStatusCode() == HttpStatus.SC_OK) {
                    InputStream is = response.getEntity().getContent();
                    String result = inStream2String(is);
                    JSONObject jsonObject = JSONObject.fromObject(result);
                    JSONObject dataObject = jsonObject.getJSONObject("data");
                    // NOTE(review): totalPage is read from the top-level object,
                    // not from "data" — preserved from the original; confirm
                    // against the live response shape.
                    totalPage = (Integer) jsonObject.get("totalPage");
                    JSONArray searchDOList = dataObject.getJSONArray("searchDOList");
                    @SuppressWarnings("unchecked")
                    Iterator<JSONObject> iterator = searchDOList.iterator();
                    while (iterator.hasNext()) {
                        JSONObject searchDO = (JSONObject) iterator.next();
                        Integer userId = (Integer) searchDO.get("userId");
                        // Fetch the model's profile page by userId.
                        Document document =
                                Jsoup.connect(getUrl).data("userId", userId.toString()).get();
                        if (null == document) {
                            continue;
                        }
                        String realName = extractRealName(document);
                        File mzDir = new File(localFileBasePath + realName + userId);
                        if (!mzDir.exists()) {
                            mzDir.mkdirs();
                        }
                        saveImages(document, mzDir);
                    }
                }
            } catch (UnsupportedEncodingException e) {
                e.printStackTrace();
            } catch (ClientProtocolException e) {
                e.printStackTrace();
            } catch (IllegalStateException e) {
                e.printStackTrace();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Extracts the model's display name from her profile page.
     *
     * @return the text of the last matching name link, or "" when none match
     *         (same "last match wins" behavior as the original loop).
     */
    private String extractRealName(Document document) {
        String realName = "";
        Elements links = document.select(".mm-p-model-info-left-top > dl > dd > a");
        for (Element link : links) {
            realName = link.text();
        }
        return realName;
    }

    /**
     * Downloads every absolute (http...) {@code <img>} source on the page into
     * {@code mzDir}. Stops at the well-known placeholder image, which Taobao
     * serves when the profile does not exist.
     *
     * @throws IOException on a network or file error for any image
     */
    private void saveImages(Document document, File mzDir) throws IOException {
        for (Element img : document.select("img")) {
            String src = img.attr("src");
            // Relative URLs (no "http") cannot be fetched directly — skip.
            if (src.indexOf("http") == -1) {
                continue;
            }
            String imgName = src.substring(src.lastIndexOf("/") + 1);
            if ("T1zA8tXg4EXXXXXXXX-596-176.png".equals(imgName)) {
                // Placeholder: this model's page is gone — stop scanning her images.
                break;
            }
            // FIX: build the target with File(parent, child) instead of mixing
            // "/" and a hard-coded "\\" separator.
            downloadImage(src, new File(mzDir, imgName));
        }
    }

    /**
     * Streams one image URL into {@code target}. A 404 (FileNotFoundException
     * from openStream) is logged and skipped rather than aborting the run.
     *
     * @throws IOException on any other network or file error
     */
    private void downloadImage(String src, File target) throws IOException {
        URL url = new URL(src);
        InputStream inputStream;
        try {
            inputStream = url.openStream();
        } catch (FileNotFoundException e) {
            System.out.println("文件不存在:" + src);
            return;
        }
        System.out.println("开始下载:" + src);
        BufferedInputStream in = new BufferedInputStream(inputStream);
        OutputStream out = null;
        try {
            out = new FileOutputStream(target);
            byte[] buffer = new byte[1024];
            int len;
            while ((len = in.read(buffer)) > 0) {
                out.write(buffer, 0, len);
            }
            out.flush();
        } finally {
            // FIX: the original leaked both streams whenever read/write threw.
            in.close();
            if (out != null) {
                out.close();
            }
        }
        System.out.println("下载完成:" + src);
        System.out.println("保存路径:" + target.getAbsolutePath());
    }

    /**
     * Reads the entire stream, closes it, and decodes the bytes as UTF-8.
     *
     * <p>FIX: the original decoded with the platform default charset (garbling
     * the UTF-8 JSON on e.g. GBK Windows JVMs) and never closed the stream,
     * which also keeps the HTTP connection from being released.
     */
    private String inStream2String(InputStream is) throws IOException {
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        try {
            byte[] buf = new byte[1024];
            int len;
            while ((len = is.read(buf)) != -1) {
                baos.write(buf, 0, len);
            }
        } finally {
            is.close();
        }
        return new String(baos.toByteArray(), "UTF-8");
    }

}




下载下来的妹子图格式大概是这样子的,就是这样~

使用 jsoup 对 HTML 文档进行解析和操作

使用jsoup解析html主要参考了这里,写的蛮不错的~感谢

小麻批

最近闲着无聊在弄一个妹子图片的小网站,正好抓点资源,大家有什么好的网站推荐下啊~
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: