Crawling the College Students' "Internet+" Innovation and Entrepreneurship Competition Site with Scrapy
2017-10-14 15:56
I recently scraped the project section of cy.ncss.org.cn. A first look at the site showed that the data we need is served from http://cy.ncss.org.cn/search/projectlist?name=&industryCode=&typeCode=&wasBindUniTechnology=-9&investStageCode=&provinceCode=&pageIndex=0&pageSize=15. Because the results are spread across a large number of pages, the crawler has to paginate, which I implemented with simple string splitting and replacement on the pageIndex parameter, as sketched below.
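The paging trick is pure string surgery on the listing URL: pull the current pageIndex out with split(), add one, and substitute the new value back in with replace(). A minimal standalone sketch of the idea (the URL is the one above; nothing here is Scrapy-specific):

```python
list_url = ('http://cy.ncss.org.cn/search/projectlist?name=&industryCode='
            '&typeCode=&wasBindUniTechnology=-9&investStageCode='
            '&provinceCode=&pageIndex=0&pageSize=15')

# The current page number sits between "pageIndex=" and the next "&"
page_num = int(list_url.split("pageIndex=")[1].split("&")[0])

# Swap the old parameter for the incremented one
next_page_url = list_url.replace("pageIndex=%d" % page_num,
                                 "pageIndex=%d" % (page_num + 1))
print(next_page_url)  # ...&pageIndex=1&pageSize=15
```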
Below is the spider code:
```python
# -*- coding: utf-8 -*-
import scrapy

from internetplus.items import InternetplusItem


class XmSpiderSpider(scrapy.Spider):
    name = "xm_spider"
    allowed_domains = ["cy.ncss.org.cn"]  # domain only, no trailing slash
    start_urls = ['http://cy.ncss.org.cn/search/projectlist?name=&industryCode=&typeCode=&wasBindUniTechnology=-9&investStageCode=&provinceCode=&pageIndex=0&pageSize=15']

    def parse(self, response):
        # Collect the detail-page URL of every project on the listing page
        eve_href = response.xpath('//div[@class="search-list-item"]')
        for in_href in eve_href:
            href = 'http://cy.ncss.org.cn' + in_href.xpath('div[2]/a/@href').extract()[0]
            yield scrapy.Request(url=href, callback=self.content_page, dont_filter=True)

        # Pagination: read the current pageIndex out of the URL and bump it
        list_url = response.url
        page_num = int(list_url.split("pageIndex=")[1].split("&")[0])
        print(page_num)
        if page_num < 17328:
            new_page_num = page_num + 1
            page_str = "pageIndex=" + str(page_num)
            new_page_str = "pageIndex=" + str(new_page_num)
            next_page_url = list_url.replace(page_str, new_page_str)
            yield scrapy.Request(url=next_page_url, callback=self.parse, dont_filter=True)

    def content_page(self, response):
        # Pull the fields we need from the project detail page
        items = InternetplusItem()
        items['title'] = response.xpath('//div[@class="project-title"]/h1/text()').extract()
        items['tags'] = response.xpath('//span[@class="tag-text"]//text()').extract()
        items['types'] = response.xpath('//p[@class="project-label"]/span/text()').extract()
        items['address'] = response.xpath('//p[@class="project-label mt10"]/span/text()').extract()
        items['now'] = response.xpath('//div[@class=" "]/div[2]/p/text()').extract()
        yield items
```
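Before launching the full crawl, the XPath selectors can be sanity-checked interactively; `scrapy shell` fetches a page and drops you into a Python prompt with `response` already bound:

```
scrapy shell "http://cy.ncss.org.cn/search/projectlist?name=&industryCode=&typeCode=&wasBindUniTechnology=-9&investStageCode=&provinceCode=&pageIndex=0&pageSize=15"
>>> response.xpath('//div[@class="search-list-item"]').extract_first()
```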
Here is the items definition:
```python
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class InternetplusItem(scrapy.Item):
    # Fields scraped from each project detail page
    title = scrapy.Field()
    tags = scrapy.Field()
    types = scrapy.Field()
    address = scrapy.Field()
    now = scrapy.Field()
```
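One thing worth knowing: `scrapy.Item` subclasses behave like dicts, so fields declared with `scrapy.Field()` are read and written by key. That is what lets the spider assign `items['title'] = ...` and the pipeline below call `dict(item)`:

```python
item = InternetplusItem()
item['title'] = [u'Sample project']  # set a declared field by key
print(dict(item))                    # {'title': [u'Sample project']}
```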
The pipeline code:
```python
# -*- coding: utf-8 -*-

# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

import pymongo
from scrapy.conf import settings


class InternetplusPipeline(object):
    def __init__(self):
        # Open the MongoDB collection configured in settings.py
        host = settings['MONGODB_HOST']
        port = settings['MONGODB_PORT']
        dbName = settings['MONGODB_DBNAME']
        client = pymongo.MongoClient(host=host, port=port)
        tdb = client[dbName]
        self.post = tdb[settings['MONGODB_DOCNAME']]

    def process_item(self, item, spider):
        # Convert the scrapy Item to a plain dict and store it
        dcwzj = dict(item)
        self.post.insert(dcwzj)
        return item
```
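A compatibility note: `collection.insert()` was deprecated in pymongo 3 and removed in pymongo 4, and the `scrapy.conf` module has likewise been removed in later Scrapy releases. On a current stack the storage call would become:

```python
self.post.insert_one(dcwzj)  # pymongo >= 3 replacement for insert()
```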
Finally, the settings:
```python
# -*- coding: utf-8 -*-

# Scrapy settings for the internetplus project
# See http://doc.scrapy.org/en/latest/topics/settings.html

BOT_NAME = 'internetplus'

SPIDER_MODULES = ['internetplus.spiders']
NEWSPIDER_MODULE = 'internetplus.spiders'

# MongoDB connection used by InternetplusPipeline
MONGODB_HOST = '127.0.0.1'
MONGODB_PORT = 27017
MONGODB_DBNAME = 'dcw'
MONGODB_DOCNAME = 'zj'

# Ignore robots.txt rules
ROBOTSTXT_OBEY = False

# Register the MongoDB pipeline
ITEM_PIPELINES = {
    'internetplus.pipelines.InternetplusPipeline': 300,
}

# The remaining auto-generated settings (concurrency, download delay,
# cookies, middlewares, AutoThrottle, HTTP cache, ...) are left commented
# out at their defaults.
```
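With the pipeline registered in ITEM_PIPELINES, the crawl is started from the project root:

```
scrapy crawl xm_spider
```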
Finally, the scraped data.
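The stored items can be inspected directly with pymongo, using the database and collection names from the settings above ('dcw' and 'zj'); assuming pymongo >= 3.7 for count_documents:

```python
import pymongo

client = pymongo.MongoClient('127.0.0.1', 27017)
collection = client['dcw']['zj']

print(collection.count_documents({}))  # number of projects stored
print(collection.find_one())           # one sample document
```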