Python爬虫-爬取51job.com 招聘信息并写入文件和数据库mysql
2017-10-17 09:10
861 查看
项目代码地址: ===欢迎fork 、star ===
https://github.com/kangvcar/pyproject/blob/master/Spiders/Spider_51Job.py
#!/usr/bin/env python # -*- coding: utf-8 -*- # @Date : 2017-10-13 11:40:41 # @Author : kangvcar (kangvcar@126.com) # @Link : http://www.github.com/kangvcar/ # @Version : $Id$ import urllib2 from bs4 import BeautifulSoup import re import MySQLdb class Item(object): '''定义Item类''' JobName = None CompanyName = None WorkPlace = None Salary = None Time = None class getJobInfo(object): """get www.51job.com Info""" def __init__(self, job): self.job = job self.baseurl = 'http://search.51job.com/list/030200,000000,0000,00,9,99,' self.url = self.baseurl + self.job + ',2,1.html' self.firstPage = self.getPage(self.url) self.urls = self.getUrls(self.firstPage) self.items = self.spider(self.urls) # self.pipelines2file(self.items) self.pipelines2mysql(self.items) def getPage(self, url): ''' 获取网页源代码 ''' response = urllib2.urlopen(url) html = response.read() return html def getUrls(self, firstPage): ''' 获取urls列表 ''' s = '共(.*?)页'.decode('utf-8') defPage = re.compile(s) # print self.firstPage.decode('gbk') fullPage = re.search(defPage, self.firstPage.decode('gbk')).group(1) # print fullPage # fullPage = 65 urls = [] ul = self.url.split(',') for page in range(1, int(fullPage)+1): ul[-1] = str(page) + '.html' url = ','.join(ul) urls.append(url) return urls def spider(self, urls): ''' 爬取item类定义的信息''' items = [] for url in urls: html = self.getPage(url) soup = BeautifulSoup(html, 'lxml') divs1 = soup.find('div', attrs={'id':'resultList'}) divlists = divs1.find_all('div', attrs={'class': 'el'}) for div in divlists: try: item = Item() print unicode(div.find('p', attrs={'class':'t1'}).get_text().strip()) item.JobName = unicode(div.find('p', attrs={'class':'t1'}).get_text().strip()) print unicode(div.find('span', attrs={'class':'t2'}).a.get_text()) item.CompanyName = unicode(div.find('span', attrs={'class':'t2'}).a.get_text()) print unicode(div.find('span', attrs={'class':'t3'}).get_text()) item.WorkPlace = unicode(div.find('span', attrs={'class':'t3'}).get_text()) print 
unicode(div.find('span', attrs={'class':'t4'}).get_text()) item.Salary = unicode(div.find('span', attrs={'class':'t4'}).get_text()) print unicode(div.find('span', attrs={'class':'t5'}).get_text()) item.Time = unicode(div.find('span', attrs={'class':'t5'}).get_text()) except: pass continue else: items.append(item) return items def pipelines2file(self, items): ''' 把爬取到的数据存储到文件''' # Job = self.url.split(',')[-3] fileName = self.job + u'的招聘信息.txt' # fileName = u'51Job招聘信息.txt' with open(fileName, 'w') as fp: fp.write('%-35s \t| %-30s \t| %-20s \t| %20s \t| %20s \n' %('职位名', '公司名', '工作地点', '薪资', '发布时间')) for item in items: fp.write('%-40s \t| %-40s \t| %-20s \t| %20s \t| %20s \n' %(item.JobName.encode('utf-8'), item.CompanyName.encode('utf-8'), item.WorkPlace.encode('utf-8'), item.Salary.encode('utf-8'), item.Time.encode('utf-8'))) def pipelines2mysql(self, items): ''' 把爬取到的数据存储到数据库''' conn = MySQLdb.connect( host='192.168.10.10', port=3306, user='crawl123', passwd='crawl123', db='scrapyDB', charset = 'utf8') cur = conn.cursor() for item in items: JobName = item.JobName.encode('utf-8') CompanyName = item.CompanyName.encode('utf-8') WorkPlace = item.WorkPlace.encode('utf-8') Salary = item.Salary.encode('utf-8') Time1 = item.Time.encode('utf-8') cur.execute("INSERT INTO pythonjobs(JobName, CompanyName, WorkPlace, Salary, Time1) values(%s,%s,%s,%s,%s)", (JobName,CompanyName,WorkPlace,Salary,Time1)) cur.close() conn.commit() conn.close() JI = getJobInfo('python') #创建数据表命令 #create database scrapyDB character set 'utf8' collate 'utf8_general_Ci' #create table jobs( id int auto_increment, JobName char(60), CompanyName char(60), WorkPlace char(60), Salary char(60), Time1 char(60), primary key(id))engine=InnoDB DEFAULT CHARSET=utf8;
相关文章推荐
- Python爬虫-爬取xixizhan.com站点的所有电影列表并写入文件和数据库mysql
- python简单应用!用爬虫来采集天猫所有优惠券信息,写入本地文件
- Python爬虫小实践:寻找失踪人口,爬取失踪儿童信息并写成csv文件,方便存入数据库
- Python爬虫-爬取 ygdy8.com 站点的所有电影并写入文件和数据库
- Python爬虫小实践:寻找失踪人口,爬取失踪儿童信息并写成csv文件,方便存入数据库
- Python爬虫小实践:寻找失踪人口,爬取失踪儿童信息并写成csv文件,方便存入数据库
- Python爬虫-爬取集思录的金融信息,并写入文件和检测数据变化发送邮件通知
- [python爬虫] 招聘信息定时系统 (一).BeautifulSoup爬取信息并存储MySQL
- vc2008: 控制台程序将数据写入MySql(数据库信息从配置文件读取)
- [python爬虫] 招聘信息定时系统 (二).调用pyinstaller包生成exe文件
- Python爬虫框架Scrapy实战之批量抓取招聘信息
- 自学.net(3)把数据库连接信息写入App.config配置文件
- [python和大数据-1]利用爬虫登录知乎进行BFS搜索抓取用户信息本地mysql分析【PART1】
- 【MySQL.Connector】Python下读取数据库信息
- php mysql 数据库写入与读取取文件
- MySQL学习----使用 MySQL 数据库和表----05获得数据库和表的信息及数据库物理物理文件简介
- python实现搜索本地文件信息写入文件的方法
- Python MYSQL - tiny ETL tool - 文件操作和数据库操作
- Python豆瓣爬虫,指定文件行数写入到文件中
- Python爬虫框架Scrapy实战之定向批量获取职位招聘信息