您的位置:首页 > 编程语言 > Java开发

一个针对51job、用于过滤培训类招聘信息的Java爬虫脚本,供大家参考。

2017-01-10 21:52 483 查看
//在51job搜索后再用此脚本过滤

package test;

import java.io.FileOutputStream;

import org.jsoup.Jsoup;

import org.jsoup.nodes.Document;

import org.jsoup.nodes.Element;

import org.jsoup.parser.Tag;

import org.jsoup.select.Elements;

import org.junit.Test;

/**
 * Crawls a 51job search-result listing and keeps only the postings that survive two filters:
 * the listing row must not mention any entry of {@link #companys}, and the text of the
 * posting's detail page must not mention any entry of {@link #keywords}.
 *
 * <p>Surviving postings are accumulated in {@link #buffer} as HTML links and written to a
 * file by {@link #saveDetail()}.
 */
public class demo {

    // URL of the first search-result page to crawl.
    public static String startUrl = "http://search.51job.com/jobsearch/search_result.php?fromJs=1&jobarea=060000%2C00&funtype=0000&industrytype=00&keyword=Java&keywordtype=2&lang=c&stype=2&postchannel=0000&fromType=1&confirmdate=9";
    // A listing row whose text contains any of these substrings is skipped.
    public static String[] companys = { "中软","达内" };
    // A posting whose detail text contains any of these substrings is rejected.
    public static String[] keywords = { "培训", "实训" ,"零基础"};
    // Collects the HTML links of every posting that passed both filters.
    public static StringBuffer buffer=new StringBuffer();

    /**
     * Entry point: crawls from {@link #startUrl} and then writes the collected links to disk.
     *
     * @param args unused
     * @throws Exception if writing the result file fails
     */
    public static void main(String[] args) throws Exception {
        test(startUrl);
        saveDetail();
    }

    /**
     * Crawls the listing page at {@code url} and every following page reachable through the
     * "next page" link, appending each accepted posting to {@link #buffer}.
     *
     * <p>Pages are walked in a loop rather than by recursion so that a long result set cannot
     * overflow the stack, and a page that has already been visited stops the crawl instead of
     * looping forever. This method is not a JUnit test (it is static and takes an argument),
     * so it carries no {@code @Test} annotation.
     *
     * @param url address of the first listing page
     * @throws Exception declared for backward compatibility with existing callers
     */
    public static void test(String url) throws Exception {
        java.util.Set<String> visited = new java.util.HashSet<String>();
        String pageUrl = url;
        while (true) {
            if (!visited.add(pageUrl)) {
                System.err.println("Listing page already visited, stopping: " + pageUrl);
                return;
            }
            Document listing = fetch(pageUrl);
            if (listing == null) {
                // Best effort: a page that cannot be loaded ends the crawl, but results
                // gathered so far stay in the buffer.
                return;
            }
            collectJobs(listing);

            pageUrl = nextPageUrl(listing);
            if ("".equals(pageUrl)) {
                System.out.println("浏览结束");
                return;
            }
        }
    }

    /**
     * Decides whether the posting at {@code jobUrl} passes the keyword filter.
     *
     * <p>NOTE(review): when the detail page has no element with class {@code tCompany_main}
     * the extracted text is empty and the posting is accepted; confirm this is intended.
     *
     * @param jobUrl address of the posting's detail page
     * @return {@code true} if the page loaded and its detail text contains none of
     *         {@link #keywords}; {@code false} if the URL is null or empty, the page could
     *         not be loaded, or a keyword was found
     * @throws Exception declared for backward compatibility with existing callers
     */
    public static boolean checkDetail(String jobUrl) throws Exception {
        if (jobUrl == null || "".equals(jobUrl)) {
            return false;
        }
        Document jobDocument = fetch(jobUrl);
        if (jobDocument == null) {
            return false;
        }
        String detail = jobDocument.getElementsByClass("tCompany_main").text();
        return !containsAny(detail, keywords);
    }

    /**
     * Writes the collected links to {@code d://test.html}.
     *
     * @throws Exception if the file cannot be opened or written
     */
    public static void saveDetail() throws Exception {
        saveDetail("d://test.html");
    }

    /**
     * Writes the collected links to the file at {@code path}, replacing any existing content.
     *
     * <p>The bytes are always UTF-8 and the output starts with a matching charset declaration,
     * so the non-ASCII text renders correctly whatever the platform default charset is.
     *
     * @param path destination file
     * @throws Exception if the file cannot be opened or written
     */
    public static void saveDetail(String path) throws Exception {
        String html = "<meta charset='UTF-8'>" + buffer.toString();
        // try-with-resources closes the stream even when write() throws.
        try (FileOutputStream fos = new FileOutputStream(path)) {
            fos.write(html.getBytes(java.nio.charset.StandardCharsets.UTF_8));
        }
    }

    // Loads and parses the page at url; reports the failure and returns null if that fails.
    private static Document fetch(String url) {
        try {
            return Jsoup.connect(url).get();
        } catch (Exception e) {
            // Broad catch kept on purpose: one bad URL or network error must not abort the run.
            System.err.println("Failed to load " + url + ": " + e);
            return null;
        }
    }

    // Returns the href found in the second element with class "bk", or "" when there is none.
    private static String nextPageUrl(Document listing) {
        Elements pagers = listing.getElementsByClass("bk");
        if (pagers.size() < 2) {
            return "";
        }
        return pagers.get(1).getAllElements().attr("href");
    }

    // Filters every "el" row of the first "dw_table" element and records the accepted ones.
    private static void collectJobs(Document listing) throws Exception {
        Elements tables = listing.getElementsByClass("dw_table");
        if (tables.isEmpty()) {
            System.err.println("No job table found on listing page");
            return;
        }
        for (Element row : tables.get(0).getElementsByClass("el")) {
            // The whole row text is matched against the company filter and reused as link text.
            String rowText = row.text();
            if (containsAny(rowText, companys)) {
                System.out.println("namepass:" + rowText);
                continue;
            }
            // The posting link is the first <a> inside the row's first "t1" element.
            Elements titleCells = row.getElementsByClass("t1");
            if (titleCells.isEmpty()) {
                continue;
            }
            String jobUrl = titleCells.get(0).getElementsByTag("a").attr("href");
            if (checkDetail(jobUrl)) {
                // Scraped text is untrusted, so escape it before embedding it in HTML.
                buffer.append("<a target='_blank' href='" + escapeHtml(jobUrl) + "'>"
                        + escapeHtml(rowText) + "</a><br/><br/>");
            } else {
                System.out.println("contentpass:" + rowText);
            }
        }
    }

    // Returns true if text contains at least one of the given substrings.
    private static boolean containsAny(String text, String[] needles) {
        for (String needle : needles) {
            if (text.contains(needle)) {
                return true;
            }
        }
        return false;
    }

    // Escapes the characters that are significant in HTML text and quoted attribute values.
    private static String escapeHtml(String text) {
        StringBuilder out = new StringBuilder(text.length() + 16);
        for (int i = 0; i < text.length(); i++) {
            char c = text.charAt(i);
            switch (c) {
                case '&':
                    out.append("&amp;");
                    break;
                case '<':
                    out.append("&lt;");
                    break;
                case '>':
                    out.append("&gt;");
                    break;
                case '"':
                    out.append("&quot;");
                    break;
                case '\'':
                    out.append("&#39;");
                    break;
                default:
                    out.append(c);
            }
        }
        return out.toString();
    }

}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: