scrapy爬取智联招聘中python工程师的招聘信息
2016-04-06 19:29
639 查看
# -*- coding:utf-8 -*-
# Scrapy spider that crawls python-engineer job postings from zhaopin.com.
# For every posting linked from a search-result page it extracts the pay,
# the job requirements and the company address, appends them to a local
# text file, stores a document in MongoDB, then follows the next page.
import codecs
import re

import pymongo
import scrapy
from scrapy.http import Request


class DmozSpider(scrapy.Spider):
    name = "dmoz"
    allowed_domains = ["zhaopin.com"]
    start_urls = [
        "http://sou.zhaopin.com/jobs/searchresult.ashx?jl=%E5%8C%97%E4%BA%AC&kw=python&sm=0&sg=3e5d326a89a44a8594d2904a3a207b9e&p=1"
    ]

    # Patterns compiled once instead of rebuilt on every response.
    # NOTE: the original code called re.sub(pattern, repl, s, re.S), which
    # passes re.S (== 16) as the positional *count* argument, not as flags —
    # so DOTALL was silently never applied. Compiling with re.S fixes that.
    _TAG_RE = re.compile(r'<.*?>', re.S)        # strip HTML tags
    _WS_RE = re.compile(r'\s*', re.S)           # drop all whitespace
    _COMPANY_RE = re.compile(r'<h2>\s*(.*?)\s*<a', re.S)

    def parse2(self, response):
        """Parse a single job-posting page.

        Appends the extracted fields (pay, requirements, company address,
        posting URL) to the local dump file and inserts one document into
        the ``test.employee`` MongoDB collection.
        """
        # Defaults so the MongoDB insert never references an unbound name
        # when one of the extractions below fails (the original bare
        # ``except: pass`` blocks could leave these undefined and crash
        # with NameError at insert time).
        pay = ''
        require_data_middle = ''
        company_data = ''

        # ``with`` guarantees the file is closed even if parsing raises
        # (the original could leak the handle on an early exception).
        with codecs.open(r'F:\test\tutorial\a', 'a', 'utf-8') as f:
            # Pay / benefits: one <strong> per <li> of the terminal list.
            try:
                pay_nodes = response.xpath(
                    '//body/div[@class="terminalpage clearfix"]'
                    '/div[@class="terminalpage-left"]'
                    '/ul[@class="terminal-ul clearfix"]/li')
                parts = []
                for sel in pay_nodes:
                    strong = sel.xpath('strong').extract()
                    parts.append(self._TAG_RE.sub('', strong[0]) + ' ')
                pay = ''.join(parts)
                f.write(pay + '\n')
            except (IndexError, AttributeError):
                pass  # best effort: posting pages vary in structure

            # Job requirements: every <p> of the first tab box, with tags
            # and all whitespace removed.
            try:
                require_nodes = response.xpath(
                    '//body/div[@class="terminalpage clearfix"]'
                    '/div[@class="terminalpage-left"]'
                    '/div[@class="terminalpage-main clearfix"]'
                    '/div[@class="tab-cont-box"]/div[1]/p').extract()
                require_data_middle = ''.join(
                    self._WS_RE.sub('', self._TAG_RE.sub('', fragment))
                    for fragment in require_nodes)
                f.write(require_data_middle + '\n')
            except (IndexError, AttributeError):
                pass

            # Company address: text of the <h2> before its trailing <a>.
            try:
                company_nodes = response.xpath(
                    '//body/div[@class="terminalpage clearfix"]'
                    '/div[@class="terminalpage-left"]'
                    '/div[@class="terminalpage-main clearfix"]'
                    '/div[@class="tab-cont-box"]'
                    '/div[@class="tab-inner-cont"]/h2').extract()
                company_data = self._COMPANY_RE.search(
                    company_nodes[0]).group(1)
                f.write(company_data + '\n')
            except (IndexError, AttributeError):
                pass

            # URL of the posting itself. The original sliced str(response)
            # (i.e. '<200 url>') with [5:-1]; response.url is the reliable
            # equivalent.
            company_url = response.url
            f.write(company_url + '\n\n')

        # One short-lived connection per item keeps the original behaviour;
        # a Scrapy item pipeline would be the idiomatic home for this.
        conn = pymongo.MongoClient('192.168.1.5', 27017)
        try:
            # insert_one replaces the deprecated Collection.insert.
            conn.test.employee.insert_one({
                'url': company_url,
                'pay': pay,
                'require_data': require_data_middle,
                'company_data': company_data,
            })
        finally:
            conn.close()  # the original leaked the connection

    def parse(self, response):
        """Yield one request per job link on this search-result page,
        then follow the link to the next result page."""
        job_links = response.xpath(
            '//div[@id="newlist_list_content_table"]'
            '//td[@class="zwmc"]//a/@href').extract()
        for company_url in job_links:
            yield Request(company_url, callback=self.parse2)

        next_page = response.xpath(
            '//body/div[@class="main"]/div[@class="search_newlist_main"]'
            '/div[@class="newlist_main"]/form[@name="frmMain"]'
            '/div[@class="clearfix"]/div[@class="newlist_wrap fl"]'
            '/div[@class="pagesDown"]/ul/li[@class="pagesDown-pos"]'
            '/a/@href').extract()
        # The last result page has no "next" link; the original raised
        # IndexError on next_page_url[0] there.
        if next_page:
            yield Request(next_page[0], callback=self.parse)
相关文章推荐
- Python3写爬虫(四)多线程实现数据爬取
- install and upgrade scrapy
- install scrapy with pip and easy_install
- Scrapy的架构介绍
- 爬虫笔记
- 基于C#实现网页爬虫
- Nodejs爬虫进阶教程之异步并发控制
- PHP+HTML+JavaScript+Css实现简单爬虫开发
- python使用scrapy解析js示例
- 如何优雅地使用c语言编写爬虫
- Python基于scrapy采集数据时使用代理服务器的方法
- PHP实现简单爬虫的方法
- NodeJS制作爬虫全过程(续)
- php实现简单爬虫的开发
- node.js基础模块http、网页分析工具cherrio实现爬虫
- PHP爬虫之百万级别知乎用户数据爬取与分析
- 一个PHP实现的轻量级简单爬虫
- nodejs爬虫抓取数据乱码问题总结
- 基于Node.js的强大爬虫 能直接发布抓取的文章哦