
Python Web Scraper: Crawling 51job.com Job Listings and Writing Them to a File and a MySQL Database

Project source code (forks and stars welcome):

https://github.com/kangvcar/pyproject/blob/master/Spiders/Spider_51Job.py
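The script targets Python 2: it uses urllib2 for HTTP requests, BeautifulSoup with the lxml parser for HTML parsing, and MySQLdb for database access. If you want to run it yourself, the dependencies can usually be installed with pip; the package names below are the common PyPI ones and are an assumption, not something stated in the original post:

pip install beautifulsoup4 lxml MySQL-python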

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Date    : 2017-10-13 11:40:41
# @Author  : kangvcar (kangvcar@126.com)
# @Link    : http://www.github.com/kangvcar/
# @Version : $Id$

import urllib2
from bs4 import BeautifulSoup
import re
import MySQLdb

class Item(object):
    '''Container for a single job posting.'''
    JobName = None
    CompanyName = None
    WorkPlace = None
    Salary = None
    Time = None

class getJobInfo(object):
    """get www.51job.com Info"""
    def __init__(self, job):
        self.job = job
        self.baseurl = 'http://search.51job.com/list/030200,000000,0000,00,9,99,'
        self.url = self.baseurl + self.job + ',2,1.html'
        self.firstPage = self.getPage(self.url)
        self.urls = self.getUrls(self.firstPage)
        self.items = self.spider(self.urls)
        # self.pipelines2file(self.items)
        self.pipelines2mysql(self.items)

    def getPage(self, url):
        '''Fetch the HTML source of a page.'''
        response = urllib2.urlopen(url)
        html = response.read()
        return html

    def getUrls(self, firstPage):
        '''Build the list of result-page URLs from the total page count.'''
        # The result page states the total number of pages as '共N页' ("N pages in total").
        s = '共(.*?)页'.decode('utf-8')
        defPage = re.compile(s)
        # print self.firstPage.decode('gbk')
        fullPage = re.search(defPage, self.firstPage.decode('gbk')).group(1)
        # print fullPage
        # fullPage = 65
        urls = []
        ul = self.url.split(',')
        for page in range(1, int(fullPage) + 1):
            ul[-1] = str(page) + '.html'
            url = ','.join(ul)
            urls.append(url)
        return urls

    def spider(self, urls):
        '''Scrape the fields defined in Item from every result page.'''
        items = []
        for url in urls:
            html = self.getPage(url)
            soup = BeautifulSoup(html, 'lxml')
            divs1 = soup.find('div', attrs={'id': 'resultList'})
            divlists = divs1.find_all('div', attrs={'class': 'el'})
            for div in divlists:
                try:
                    item = Item()
                    item.JobName = unicode(div.find('p', attrs={'class': 't1'}).get_text().strip())
                    print item.JobName
                    item.CompanyName = unicode(div.find('span', attrs={'class': 't2'}).a.get_text())
                    print item.CompanyName
                    item.WorkPlace = unicode(div.find('span', attrs={'class': 't3'}).get_text())
                    print item.WorkPlace
                    item.Salary = unicode(div.find('span', attrs={'class': 't4'}).get_text())
                    print item.Salary
                    item.Time = unicode(div.find('span', attrs={'class': 't5'}).get_text())
                    print item.Time
                except Exception:
                    # Skip rows (e.g. the header row) that lack one of the expected fields.
                    continue
                else:
                    items.append(item)
        return items

    def pipelines2file(self, items):
        '''Write the scraped items to a text file.'''
        # Job = self.url.split(',')[-3]
        # Output file is named '<keyword>的招聘信息.txt' (job listings for the keyword).
        fileName = self.job + u'的招聘信息.txt'
        # fileName = u'51Job招聘信息.txt'
        with open(fileName, 'w') as fp:
            # Column headers: job title, company, location, salary, post date.
            fp.write('%-35s \t| %-30s \t| %-20s \t| %20s \t| %20s \n' % ('职位名', '公司名', '工作地点', '薪资', '发布时间'))
            for item in items:
                fp.write('%-40s \t| %-40s \t| %-20s \t| %20s \t| %20s \n' % (
                    item.JobName.encode('utf-8'), item.CompanyName.encode('utf-8'),
                    item.WorkPlace.encode('utf-8'), item.Salary.encode('utf-8'),
                    item.Time.encode('utf-8')))

    def pipelines2mysql(self, items):
        '''Insert the scraped items into a MySQL database.'''
        conn = MySQLdb.connect(
            host='192.168.10.10',
            port=3306,
            user='crawl123',
            passwd='crawl123',
            db='scrapyDB',
            charset='utf8')
        cur = conn.cursor()
        for item in items:
            JobName = item.JobName.encode('utf-8')
            CompanyName = item.CompanyName.encode('utf-8')
            WorkPlace = item.WorkPlace.encode('utf-8')
            Salary = item.Salary.encode('utf-8')
            Time1 = item.Time.encode('utf-8')
            cur.execute("INSERT INTO pythonjobs(JobName, CompanyName, WorkPlace, Salary, Time1) VALUES (%s, %s, %s, %s, %s)",
                        (JobName, CompanyName, WorkPlace, Salary, Time1))
        cur.close()
        conn.commit()
        conn.close()

JI = getJobInfo('python')  # kick off the crawl for the keyword 'python'

# SQL for creating the database and the pythonjobs table:
# create database scrapyDB character set 'utf8' collate 'utf8_general_ci';
# create table pythonjobs(id int auto_increment, JobName char(60), CompanyName char(60), WorkPlace char(60), Salary char(60), Time1 char(60), primary key(id)) engine=InnoDB DEFAULT CHARSET=utf8;
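After a run, the contents of the pythonjobs table can be spot-checked with a short query. The snippet below is a minimal sketch, not part of the original script; it simply reuses the same MySQLdb connection settings as pipelines2mysql and assumes the database above already exists.

import MySQLdb

# Reuse the connection settings from pipelines2mysql (assumed unchanged).
conn = MySQLdb.connect(host='192.168.10.10', port=3306, user='crawl123',
                       passwd='crawl123', db='scrapyDB', charset='utf8')
cur = conn.cursor()
cur.execute("SELECT JobName, CompanyName, Salary FROM pythonjobs LIMIT 5")
for JobName, CompanyName, Salary in cur.fetchall():
    print JobName, CompanyName, Salary  # print a few rows to confirm the insert worked
cur.close()
conn.close()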