
Using Scrapy to crawl the college students' "Internet Plus" technology entrepreneurship competition site

2017-10-14 15:56
I recently crawled the project section of cy.ncss.org.cn. Analyzing the site first, I found that the data we need is served from http://cy.ncss.org.cn/search/projectlist?name=&industryCode=&typeCode=&wasBindUniTechnology=-9&investStageCode=&provinceCode=&pageIndex=0&pageSize=15. Since the results span a large number of pages, the spider has to paginate; page turning is implemented with simple string splitting/slicing on the pageIndex parameter in the URL.
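The page-turning idea itself is just string manipulation on the pageIndex query parameter. As a minimal standalone sketch (plain Python, outside Scrapy) of what the spider below does:

list_url = ("http://cy.ncss.org.cn/search/projectlist?name=&industryCode=&typeCode="
            "&wasBindUniTechnology=-9&investStageCode=&provinceCode=&pageIndex=0&pageSize=15")

# pull the current pageIndex out of the query string ...
page_num = int(list_url.split("pageIndex=")[1].split("&")[0])

# ... and swap it for the incremented value to get the next list page
next_page_url = list_url.replace("pageIndex=" + str(page_num),
                                 "pageIndex=" + str(page_num + 1))
print(next_page_url)   # ...pageIndex=1&pageSize=15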

Below is the spider code.

# -*- coding: utf-8 -*-
import scrapy

from internetplus.items import InternetplusItem


class XmSpiderSpider(scrapy.Spider):
    name = "xm_spider"
    allowed_domains = ["cy.ncss.org.cn"]
    start_urls = ['http://cy.ncss.org.cn/search/projectlist?name=&industryCode=&typeCode=&wasBindUniTechnology=-9&investStageCode=&provinceCode=&pageIndex=0&pageSize=15']

    def parse(self, response):
        # Collect the detail-page URL of every project on the current list page
        eve_href = response.xpath('//div[@class="search-list-item"]')
        for in_href in eve_href:
            href = 'http://cy.ncss.org.cn' + in_href.xpath('div[2]/a/@href').extract()[0]
            yield scrapy.Request(url=href, callback=self.content_page, dont_filter=True)

        # Pagination: read the current pageIndex from the URL, bump it by one
        # and request the next list page until the last page is reached
        list_url = response.url
        page_num = int(list_url.split("pageIndex=")[1].split("&")[0])
        print(page_num)
        if page_num < 17328:
            new_page_num = page_num + 1
            page_str = "pageIndex=" + str(page_num)
            new_page_str = "pageIndex=" + str(new_page_num)
            next_page_url = list_url.replace(page_str, new_page_str)
            yield scrapy.Request(url=next_page_url, callback=self.parse, dont_filter=True)

    def content_page(self, response):
        # Parse one project detail page into an item
        items = InternetplusItem()
        items['title'] = response.xpath('//div[@class="project-title"]/h1/text()').extract()
        items['tags'] = response.xpath('//span[@class="tag-text"]//text()').extract()
        items['types'] = response.xpath('//p[@class="project-label"]/span/text()').extract()
        items['address'] = response.xpath('//p[@class="project-label mt10"]/span/text()').extract()
        items['now'] = response.xpath('//div[@class=" "]/div[2]/p/text()').extract()
        yield items
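The spider is started from the project directory with scrapy crawl xm_spider; adding -o projects.json dumps the scraped items to a JSON file, which is a quick way to sanity-check the XPaths before relying on the MongoDB pipeline below.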


Here is the items definition.

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class InternetplusItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    tags = scrapy.Field()
    types = scrapy.Field()
    address = scrapy.Field()
    now = scrapy.Field()


The pipeline code.

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

import pymongo
from scrapy.conf import settings


class InternetplusPipeline(object):

    def __init__(self):
        # Read the MongoDB connection parameters defined in settings.py
        host = settings['MONGODB_HOST']
        port = settings['MONGODB_PORT']
        dbName = settings['MONGODB_DBNAME']
        client = pymongo.MongoClient(host=host, port=port)
        tdb = client[dbName]
        self.post = tdb[settings['MONGODB_DOCNAME']]

    def process_item(self, item, spider):
        # Convert the item to a plain dict and store it in MongoDB
        dcwzj = dict(item)
        self.post.insert(dcwzj)
        return item
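Note that from scrapy.conf import settings is a legacy import (deprecated in the Scrapy 1.x releases in favour of reading settings through the crawler). A sketch of the same pipeline written against the non-deprecated API, using from_crawler to read the same MONGODB_* settings and open_spider/close_spider to manage the connection, could look like this:

import pymongo


class InternetplusPipeline(object):
    """Same behaviour as above, but reads settings via from_crawler."""

    def __init__(self, host, port, db_name, doc_name):
        self.host = host
        self.port = port
        self.db_name = db_name
        self.doc_name = doc_name

    @classmethod
    def from_crawler(cls, crawler):
        s = crawler.settings
        return cls(s.get('MONGODB_HOST'), s.getint('MONGODB_PORT'),
                   s.get('MONGODB_DBNAME'), s.get('MONGODB_DOCNAME'))

    def open_spider(self, spider):
        # Open one MongoDB connection when the spider starts
        self.client = pymongo.MongoClient(host=self.host, port=self.port)
        self.post = self.client[self.db_name][self.doc_name]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        # insert_one is the non-deprecated pymongo 3 call
        self.post.insert_one(dict(item))
        return item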


Finally, the settings.

# -*- coding: utf-8 -*-

# Scrapy settings for internetplus project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'internetplus'

SPIDER_MODULES = ['internetplus.spiders']
NEWSPIDER_MODULE = 'internetplus.spiders'

MONGODB_HOST = '127.0.0.1'
MONGODB_PORT = 27017
MONGODB_DBNAME = 'dcw'
MONGODB_DOCNAME = 'zj'
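# The four MONGODB_* keys above are custom settings read by InternetplusPipeline in pipelines.py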

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'internetplus (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'internetplus.middlewares.InternetplusSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'internetplus.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'internetplus.pipelines.InternetplusPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'


Finally, the crawled data ends up in the MongoDB collection configured above.
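To take a quick look at the result, the stored documents can be read back with a few lines of pymongo, using the same host, port, database and collection names defined in settings.py:

import pymongo

client = pymongo.MongoClient('127.0.0.1', 27017)
collection = client['dcw']['zj']           # MONGODB_DBNAME / MONGODB_DOCNAME from settings.py

print(collection.count())                  # total number of scraped projects
for doc in collection.find().limit(3):     # peek at a few stored documents
    print(doc.get('title'), doc.get('address'))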