用Jsoup写一个简单的爬虫,并把从网页上面爬下的数据保存到数据库中
2017-08-10 19:01
603 查看
今天研究了Jsoup的一些基本用法,来与大家一块分享一下。我是先把从网页上面爬取的数据做成了一个对象,然后把对象存进HashMap中,最后通过JDBC再保存到数据库中。
今天要进行爬虫的网站是智联招聘。每一条招聘信息都可以看成是一个对象。那么就要有一个自定义的JavaBean类。
其中要用到两个jar包,分别是 :jsoup-1.10.3.jar 和 mysql-connector-java-5.1.39.jar 两个包。
要访问数据库,所以要用一个数据库工具类:
今天要进行爬虫的网站是智联招聘。每一条招聘信息都可以看成是一个对象。那么就要有一个自定义的JavaBean类。
其中要用到两个jar包,分别是 :jsoup-1.10.3.jar 和 mysql-connector-java-5.1.39.jar 两个包。
package javabeen; import java.util.Date; /** * 工作自定义类 * * @author xml */ public class Job { private String position;// 职位 private String company;// 公司名 private String compensation;// 薪资 private String workplace;// 工作地点 private String date;// 发布日期 private String education;// 学历 private String experience;// 工作经验 private String type;// 职位类别 private String number;// 工作人数 private String jobdescription;// 职位描述 private String comdescription;// 公司描述 public Job(String position, String company, String compensation, String workplace, String date, String education, String experience, String type, String number, String jobdescription, String comdescription) { super(); this.position = position; this.company = company; this.compensation = compensation; this.workplace = workplace; this.date = date; this.education = education; this.experience = experience; this.type = type; this.number = number; this.jobdescription = jobdescription; this.comdescription = comdescription; } public Job() { super(); } /** * @return the position */ public String getPosition() { return position; } /** * @param position the position to set */ public void setPosition(String position) { this.position = position; } /** * @return the company */ public String getCompany() { return company; } /** * @param company the company to set */ public void setCompany(String company) { this.company = company; } /** * @return the compensation */ public String getCompensation() { return compensation; } /** * @param compensation the compensation to set */ public void setCompensation(String compensation) { this.compensation = compensation; } /** * @return the workplace */ public String getWorkplace() { return workplace; } /** * @param workplace the workplace to set */ public void setWorkplace(String workplace) { this.workplace = workplace; } /** * @return the date */ public String getDate() { return date; } /** * @param date the date to set */ public void setDate(String date) { this.date = date; } /** * @return the education 
*/ public String getEducation() { return education; } /** * @param education the education to set */ public void setEducation(String education) { this.education = education; } /** * @return the experience */ public String getExperience() { return experience; } /** * @param experience the experience to set */ public void setExperience(String experience) { this.experience = experience; } /** * @return the type */ public String getType() { return type; } /** * @param type the type to set */ public void setType(String type) { this.type = type; } /** * @return the number */ public String getNumber() { return number; } /** * @param number the number to set */ public void setNumber(String number) { this.number = number; } /** * @return the jobdescription */ public String getJobdescription() { return jobdescription; } /** * @param jobdescription the jobdescription to set */ public void setJobdescription(String jobdescription) { this.jobdescription = jobdescription; } /** * @return the comdescription */ public String getComdescription() { return comdescription; } /** * @param comdescription the comdescription to set */ public void setComdescription(String comdescription) { this.comdescription = comdescription; } /* (non-Javadoc) * @see java.lang.Object#toString() */ @Override public String toString() { return "Job [position=" + position + ", company=" + company + ", compensation=" + compensation + ", workplace=" + workplace + ", date=" + date + ", education=" + education + ", experience=" + experience + ", type=" + type + ", number=" + number + ", jobdescription=" + jobdescription + ", comdescription=" + comdescription + "]"; } }然后我们进行爬虫操作
package control;

import java.io.IOException;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.HashMap;
import java.util.ListIterator;
import java.util.Set;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import javabeen.Job;
import utils.datautils;

/**
 * Simple Jsoup crawler: fetches one Zhilian Zhaopin search-result page,
 * parses each result row into a {@link Job}, collects them in a HashMap,
 * then inserts the scraped fields into MySQL via JDBC.
 *
 * Fixes vs. original:
 * - PreparedStatement/Connection are now closed via try-with-resources
 *   (originally ps was never closed and conn leaked on SQLException).
 * - Removed a dead per-row HTTP fetch of the detail page whose result was
 *   never used and whose selector ("ul#terminal-ul clearfix") was broken —
 *   a class selector would be "ul#terminal-ul.clearfix".
 * - Removed the duplicate link5 lookup and the no-op link.text() call.
 */
public class Spider {

    // Search-result page: city = Zhengzhou (URL-encoded), keyword = java, page 3.
    static String url = "http://sou.zhaopin.com/jobs/searchresult.ashx?jl=%E9%83%91%E5%B7%9E&kw=java&sm=0&p=3";
    // Running key for the map of scraped jobs.
    static int i = 0;

    /**
     * Downloads the result page, parses every job row, and persists the
     * scraped fields to the {@code job} table.
     *
     * @throws IOException if the HTTP fetch fails
     */
    public static void body1() throws IOException {
        Document doc = Jsoup.connect(url).get();
        Element element = doc.select("div#newlist_list_content_table").first();
        // Each posting is rendered as one <table> inside the result container.
        Elements tables = element.select("table");
        HashMap<Integer, Job> hMap = new HashMap<Integer, Job>();
        // Start at index 1 to skip the header table.
        ListIterator<Element> listIter = tables.listIterator(1);
        while (listIter.hasNext()) {
            Element table = listIter.next();
            Element link = table.select("tr>td.zwmc").select("a").first();      // job title link
            Element link1 = table.select("tr>td.gsmc").select("a").first();     // company link
            Element link2 = table.select("tr>td.zwyx").first();                 // salary cell
            Element link3 = table.select("tr>td.gzdd").first();                 // location cell
            Element link4 = table.select("tr>td.gxsj").select("span").first();  // publish date

            Job job = new Job();
            i++;
            job.setPosition(link.text());
            job.setCompany(link1.text());
            job.setCompensation(link2.text());
            job.setWorkplace(link3.text());
            job.setDate(link4.text());
            hMap.put(i, job);
        }

        // Persist every scraped row; resources close even when the insert fails.
        Set<Integer> keys = hMap.keySet();
        for (Integer key : keys) {
            Job value = hMap.get(key);
            String sql = "insert into job(position,company,compensation,workplace,date)values(?,?,?,?,?)";
            try (Connection conn = datautils.getConnection();
                    PreparedStatement ps = conn.prepareStatement(sql)) {
                ps.setString(1, value.getPosition());
                ps.setString(2, value.getCompany());
                ps.setString(3, value.getCompensation());
                ps.setString(4, value.getWorkplace());
                ps.setString(5, value.getDate());
                ps.executeUpdate();
            } catch (SQLException e) {
                e.printStackTrace();
                System.out.println("数据库访问失败");
            }
            System.out.println(key + "," + value.toString());
        }
    }

    public static void main(String[] args) throws IOException, ClassNotFoundException, SQLException {
        body1();
    }
}
要访问数据库,所以要用一个数据库工具类:
package utils;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;

/**
 * JDBC helper: registers the MySQL driver once on class load and hands out
 * new connections to the configured database.
 *
 * NOTE(review): credentials are hard-coded here (as in the original article
 * code) — move them to configuration for anything beyond a demo.
 */
public class datautils {

    public static final String Driver = "com.mysql.jdbc.Driver";
    public static final String url = "jdbc:mysql://localhost:3306/nyb?useSSL=true";
    public static final String user = "root";
    public static final String password = "123456";

    static {
        // Load the driver class exactly once, when this class is first touched.
        try {
            Class.forName(Driver);
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        }
    }

    /**
     * Opens a fresh connection; the caller is responsible for closing it.
     *
     * @return a new {@link Connection} to the configured database
     * @throws SQLException if the connection cannot be established
     */
    public static Connection getConnection() throws SQLException {
        return DriverManager.getConnection(url, user, password);
    }
}
相关文章推荐
- java简单实现爬虫、jsoup实现网页抓取、POI实现数据导出Excel
- 一个简单java爬虫爬取网页中邮箱并保存
- crawler4j 爬虫网页数据并保存到数据库中
- 一个简单java爬虫爬取网页中邮箱并保存
- 一个简单java爬虫爬取网页中邮箱并保存
- Nodejs实现简单爬虫,将爬到的数据以json数据格式保存到MySQL数据库中
- Jsoup抓取网页数据完成一个简易的Android新闻APP
- ASP.NET网络爬虫小研究 HtmlAgilityPack基础,爬取数据保存在数据库中再显示再自己的网页中
- 创建一个数据库且保存原来旧数据库数据的快捷方法
- 使用正则表达式写一个网页爬虫案例获取指定文档中的邮件地址保存到自己指定的文件夹中
- 利用Jsoup解析网页,抓取数据的简单应用
- ASP.NET网络爬虫小研究 HtmlAgilityPack基础,爬取数据保存在数据库中再显示再自己的网页中
- 一个简单的java读取网页图片并保存图片的程序
- 将爬取的网页数据保存到数据库时报错不能提交JPA,Caused by: java.sql.SQLException: Incorrect string value: '\xF0\x9F\x98\xB6 \xE2...' for column 'content' at row 1
- 【C#】对异步请求处理程序IHttpAsyncHandler的理解和分享一个易用性封装 【手记】走近科学之为什么明明实现了IEnumerable<T>的类型却不能调用LINQ扩展方法 【手记】手机网页弹出层后屏蔽底层的滑动响应 【手记】ASP.NET提示“未能创建类型”处理 【Web】一个非常简单的移动web消息框 【手记】解决EXCEL跑SQL遇“查询无法运行或数据库表无法打开...”
- Orcale中一个简单的存储过程刷数据库数据
- 一个简单的批量更新oracle 数据库中 最近的服务商名称的数据
- 如何通过jsoup网络爬虫工具爬取网页数据,并通过jxl工具导出到excel
- ASP.NET网络爬虫小研究 HtmlAgilityPack基础,爬取数据保存在数据库中再显示再自己的网页中
- 简单网路爬虫(JSoup) + SSH + Mysql保存...