用Jsoup写一个简单的爬虫,并把从网页上面爬下的数据保存到数据库中
2017-08-10 19:01
603 查看
今天研究了Jsoup的一些基本用法,来与大家一块分享一下。我是先把从网页上面爬取的数据做成了一个对象,然后把对象存进HashMap中,最后通过JDBC再保存到数据库中。
今天要进行爬虫的网站是智联招聘。每一条招聘信息都可以看成是一个对象。那么就要有一个自定义的JavaBean类。
其中要用到两个jar包,分别是 :jsoup-1.10.3.jar 和 mysql-connector-java-5.1.39.jar 两个包。
要访问数据库,所以要用一个数据库工具类:
今天要进行爬虫的网站是智联招聘。每一条招聘信息都可以看成是一个对象。那么就要有一个自定义的JavaBean类。
其中要用到两个jar包,分别是 :jsoup-1.10.3.jar 和 mysql-connector-java-5.1.39.jar 两个包。
package javabeen; import java.util.Date; /** * 工作自定义类 * * @author xml */ public class Job { private String position;// 职位 private String company;// 公司名 private String compensation;// 薪资 private String workplace;// 工作地点 private String date;// 发布日期 private String education;// 学历 private String experience;// 工作经验 private String type;// 职位类别 private String number;// 工作人数 private String jobdescription;// 职位描述 private String comdescription;// 公司描述 public Job(String position, String company, String compensation, String workplace, String date, String education, String experience, String type, String number, String jobdescription, String comdescription) { super(); this.position = position; this.company = company; this.compensation = compensation; this.workplace = workplace; this.date = date; this.education = education; this.experience = experience; this.type = type; this.number = number; this.jobdescription = jobdescription; this.comdescription = comdescription; } public Job() { super(); } /** * @return the position */ public String getPosition() { return position; } /** * @param position the position to set */ public void setPosition(String position) { this.position = position; } /** * @return the company */ public String getCompany() { return company; } /** * @param company the company to set */ public void setCompany(String company) { this.company = company; } /** * @return the compensation */ public String getCompensation() { return compensation; } /** * @param compensation the compensation to set */ public void setCompensation(String compensation) { this.compensation = compensation; } /** * @return the workplace */ public String getWorkplace() { return workplace; } /** * @param workplace the workplace to set */ public void setWorkplace(String workplace) { this.workplace = workplace; } /** * @return the date */ public String getDate() { return date; } /** * @param date the date to set */ public void setDate(String date) { this.date = date; } /** * @return the education 
*/ public String getEducation() { return education; } /** * @param education the education to set */ public void setEducation(String education) { this.education = education; } /** * @return the experience */ public String getExperience() { return experience; } /** * @param experience the experience to set */ public void setExperience(String experience) { this.experience = experience; } /** * @return the type */ public String getType() { return type; } /** * @param type the type to set */ public void setType(String type) { this.type = type; } /** * @return the number */ public String getNumber() { return number; } /** * @param number the number to set */ public void setNumber(String number) { this.number = number; } /** * @return the jobdescription */ public String getJobdescription() { return jobdescription; } /** * @param jobdescription the jobdescription to set */ public void setJobdescription(String jobdescription) { this.jobdescription = jobdescription; } /** * @return the comdescription */ public String getComdescription() { return comdescription; } /** * @param comdescription the comdescription to set */ public void setComdescription(String comdescription) { this.comdescription = comdescription; } /* (non-Javadoc) * @see java.lang.Object#toString() */ @Override public String toString() { return "Job [position=" + position + ", company=" + company + ", compensation=" + compensation + ", workplace=" + workplace + ", date=" + date + ", education=" + education + ", experience=" + experience + ", type=" + type + ", number=" + number + ", jobdescription=" + jobdescription + ", comdescription=" + comdescription + "]"; } }然后我们进行爬虫操作
package control;

import java.io.IOException;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.HashMap;
import java.util.ListIterator;
import java.util.Set;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import javabeen.Job;
import utils.datautils;

/**
 * Simple Jsoup crawler: fetches one Zhilian Zhaopin search-result page,
 * parses each result row into a {@link Job}, collects them in a HashMap,
 * then inserts the scraped fields into MySQL via JDBC.
 *
 * Fixes vs. original:
 * - PreparedStatement/Connection are now closed via try-with-resources
 *   (originally ps was never closed and conn leaked on SQLException).
 * - Removed a dead per-row HTTP fetch of the detail page whose result was
 *   never used and whose selector ("ul#terminal-ul clearfix") was broken —
 *   a class selector would be "ul#terminal-ul.clearfix".
 * - Removed the duplicate link5 lookup and the no-op link.text() call.
 */
public class Spider {

    // Search-result page: city = Zhengzhou (URL-encoded), keyword = java, page 3.
    static String url = "http://sou.zhaopin.com/jobs/searchresult.ashx?jl=%E9%83%91%E5%B7%9E&kw=java&sm=0&p=3";
    // Running key for the map of scraped jobs.
    static int i = 0;

    /**
     * Downloads the result page, parses every job row, and persists the
     * scraped fields to the {@code job} table.
     *
     * @throws IOException if the HTTP fetch fails
     */
    public static void body1() throws IOException {
        Document doc = Jsoup.connect(url).get();
        Element element = doc.select("div#newlist_list_content_table").first();
        // Each posting is rendered as one <table> inside the result container.
        Elements tables = element.select("table");
        HashMap<Integer, Job> hMap = new HashMap<Integer, Job>();
        // Start at index 1 to skip the header table.
        ListIterator<Element> listIter = tables.listIterator(1);
        while (listIter.hasNext()) {
            Element table = listIter.next();
            Element link = table.select("tr>td.zwmc").select("a").first();      // job title link
            Element link1 = table.select("tr>td.gsmc").select("a").first();     // company link
            Element link2 = table.select("tr>td.zwyx").first();                 // salary cell
            Element link3 = table.select("tr>td.gzdd").first();                 // location cell
            Element link4 = table.select("tr>td.gxsj").select("span").first();  // publish date

            Job job = new Job();
            i++;
            job.setPosition(link.text());
            job.setCompany(link1.text());
            job.setCompensation(link2.text());
            job.setWorkplace(link3.text());
            job.setDate(link4.text());
            hMap.put(i, job);
        }

        // Persist every scraped row; resources close even when the insert fails.
        Set<Integer> keys = hMap.keySet();
        for (Integer key : keys) {
            Job value = hMap.get(key);
            String sql = "insert into job(position,company,compensation,workplace,date)values(?,?,?,?,?)";
            try (Connection conn = datautils.getConnection();
                    PreparedStatement ps = conn.prepareStatement(sql)) {
                ps.setString(1, value.getPosition());
                ps.setString(2, value.getCompany());
                ps.setString(3, value.getCompensation());
                ps.setString(4, value.getWorkplace());
                ps.setString(5, value.getDate());
                ps.executeUpdate();
            } catch (SQLException e) {
                e.printStackTrace();
                System.out.println("数据库访问失败");
            }
            System.out.println(key + "," + value.toString());
        }
    }

    public static void main(String[] args) throws IOException, ClassNotFoundException, SQLException {
        body1();
    }
}
要访问数据库,所以要用一个数据库工具类:
package utils;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;

/**
 * JDBC helper: registers the MySQL driver once on class load and hands out
 * new connections to the configured database.
 *
 * NOTE(review): credentials are hard-coded here (as in the original article
 * code) — move them to configuration for anything beyond a demo.
 */
public class datautils {

    public static final String Driver = "com.mysql.jdbc.Driver";
    public static final String url = "jdbc:mysql://localhost:3306/nyb?useSSL=true";
    public static final String user = "root";
    public static final String password = "123456";

    static {
        // Load the driver class exactly once, when this class is first touched.
        try {
            Class.forName(Driver);
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        }
    }

    /**
     * Opens a fresh connection; the caller is responsible for closing it.
     *
     * @return a new {@link Connection} to the configured database
     * @throws SQLException if the connection cannot be established
     */
    public static Connection getConnection() throws SQLException {
        return DriverManager.getConnection(url, user, password);
    }
}
相关文章推荐
- java简单实现爬虫、jsoup实现网页抓取、POI实现数据导出Excel
- 一个简单java爬虫爬取网页中邮箱并保存
- crawler4j 爬虫网页数据并保存到数据库中
- 一个简单java爬虫爬取网页中邮箱并保存
- 一个简单java爬虫爬取网页中邮箱并保存
- Nodejs实现简单爬虫,将爬到的数据以json数据格式保存到MySQL数据库中
- Jsoup抓取网页数据完成一个简易的Android新闻APP
- ASP.NET网络爬虫小研究 HtmlAgilityPack基础,爬取数据保存在数据库中再显示再自己的网页中
- 创建一个数据库且保存原来旧数据库数据的快捷方法
- 使用正则表达式写一个网页爬虫案例获取指定文档中的邮件地址保存到自己指定的文件夹中
- 利用Jsoup解析网页,抓取数据的简单应用
- ASP.NET网络爬虫小研究 HtmlAgilityPack基础,爬取数据保存在数据库中再显示再自己的网页中
- 一个简单的java读取网页图片并保存图片的程序
- 将爬取的网页数据保存到数据库时报错不能提交JPA,Caused by: java.sql.SQLException: Incorrect string value: '\xF0\x9F\x98\xB6 \xE2...' for column 'content' at row 1
- 【C#】对异步请求处理程序IHttpAsyncHandler的理解和分享一个易用性封装 【手记】走近科学之为什么明明实现了IEnumerable<T>的类型却不能调用LINQ扩展方法 【手记】手机网页弹出层后屏蔽底层的滑动响应 【手记】ASP.NET提示“未能创建类型”处理 【Web】一个非常简单的移动web消息框 【手记】解决EXCEL跑SQL遇“查询无法运行或数据库表无法打开...”
- Orcale中一个简单的存储过程刷数据库数据
- 一个简单的批量更新oracle 数据库中 最近的服务商名称的数据
- 如何通过jsoup网络爬虫工具爬取网页数据,并通过jxl工具导出到excel
- ASP.NET网络爬虫小研究 HtmlAgilityPack基础,爬取数据保存在数据库中再显示再自己的网页中
- 简单网路爬虫(JSoup) + SSH + Mysql保存...