您的位置:首页 > 编程语言 > Java开发

Java爬虫-利用jsoup(抓安居客房源)

2017-09-25 15:51 232 查看
利用Java的jsoup写了一个小爬虫,自己测试着玩,抓取安居客的房源信息。

主要是解析html代码元素。同理可以解析其他网页html。

抓取次数多了IP会被踢出,然后需要进行验证。

可以翻墙或者动态IP去抓取数据。我这边就是翻墙去测试的。

package com.fangejia.web.admin.jsoup;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Small jsoup demo crawler that prints house listings scraped from Anjuke (Chengdu site).
 *
 * <p>Created by Author: lihongliang on 2017/3/1.
 * Email: 425631071@qq.com
 */
public class jsoupTest {

    /** Separator line printed after each processed listing (followed by the page number). */
    private static final String SEPARATOR =
            "=============================================================================================";

    /**
     * Runs the crawler once and dumps any failure to stderr.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        try {
            new jsoupTest().AnjukeRentHouse();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Fetches page 1 of the Chengdu listing index, and for every {@code <li>} inside the
     * {@code houselist-mod-new} element prints the list-level fields and then follows the
     * listing link to print a few fields of the detail page.
     *
     * <p>Note: despite the method name, the URL requested is the {@code /sale/} path.
     *
     * @throws Exception propagated from the jsoup fetch calls when a page cannot be loaded
     */
    public void AnjukeRentHouse() throws Exception {
        int pageNumber = 1;
        Document houseDoc = Jsoup.connect("http://chengdu.anjuke.com/sale/p" + pageNumber).get();

        // The site's HTML markup may have changed since this was written; the id lookup
        // returns null in that case and we report a failure instead of crashing.
        Element houselistModel = houseDoc.getElementById("houselist-mod-new");
        if (houselistModel == null) {
            System.out.println("数据返回为空,获取数据失败!");
            return;
        }

        for (Element houseList : houselistModel.getElementsByTag("li")) {
            String detailUrl = printListItem(houseList);
            printDetail(detailUrl);
            System.out.println(SEPARATOR + pageNumber);
        }
    }

    /**
     * Prints the list-level fields of one listing entry.
     *
     * @param houseList the {@code <li>} element of a single listing
     * @return the value of the first {@code href} found on an {@code <a>} inside the entry,
     *         or an empty string when there is none
     */
    private static String printListItem(Element houseList) {
        String imageSrc = houseList.getElementsByTag("img").attr("src");
        System.out.println("房源列表 图片地址 :  " + imageSrc);

        Elements links = houseList.getElementsByTag("a");
        String title = links.attr("title");
        String href = links.attr("href");
        // Fix: the title line previously printed the href instead of the title.
        System.out.println("房源列表 标题 : " + title);
        System.out.println("房源列表 跳转地址 :  " + href);

        // Look the class up once; entries with fewer than two items print an empty value
        // rather than throwing IndexOutOfBoundsException.
        Elements detailsItems = houseList.getElementsByClass("details-item");
        System.out.println("房源列表 item one : " + textAt(detailsItems, 0));
        System.out.println("房源列表 item two : " + textAt(detailsItems, 1));

        String brokerNameText = houseList.getElementsByClass("broker-name").text();
        System.out.println("房源列表 姓名: " + brokerNameText);

        return href;
    }

    /**
     * Fetches a listing's detail page and prints its long title and basic-info text.
     * Does nothing when {@code detailUrl} is empty, since there is no page to fetch.
     *
     * @param detailUrl link taken from the list entry
     * @throws Exception propagated from the jsoup fetch call
     */
    private static void printDetail(String detailUrl) throws Exception {
        if (detailUrl == null || detailUrl.isEmpty()) {
            return;
        }

        Document houseDetailDoc = Jsoup.connect(detailUrl).get();
        Element content = houseDetailDoc.getElementById("content");
        if (content == null) {
            // Detail page has no #content element (layout change or verification page).
            System.out.println("数据返回为空,获取数据失败!");
            return;
        }

        System.out.println("房源详情title : " + content.getElementsByClass("long-title").text());

        Elements wrapperLfClearfix = content.getElementsByClass("wrapper-lf clearfix");
        if (wrapperLfClearfix.isEmpty()) {
            return;
        }
        Elements basicInfoClearfix =
                wrapperLfClearfix.get(0).getElementsByClass("basic-info clearfix");
        if (basicInfoClearfix.isEmpty()) {
            return;
        }

        // NOTE(review): "light info-tag" is passed as a TAG name; a tag name containing a
        // space does not look like it can match anything, so this likely always prints an
        // empty line. Presumably a class lookup was intended — confirm against the page markup.
        System.out.println(basicInfoClearfix.get(0).getElementsByTag("light info-tag").text());
    }

    /**
     * Returns the text of the element at {@code index}, or an empty string when the
     * collection has no element at that position.
     */
    private static String textAt(Elements elements, int index) {
        return index < elements.size() ? elements.get(index).text() : "";
    }

}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息