
Scraping Tencent Video Movie Information (Scrapy Framework with a MySQL Database)

2018-01-30 16:02
I. Preparation

 1. From a cmd command line, run pip install pymysql, pip install lxml, and pip install requests.

 2. Create the Scrapy project and configure the MySQL database.

 The detailed configuration steps are covered in this reposted blog post on connecting Scrapy to a MySQL database: http://blog.csdn.net/qq_31518899/article/details/76576537

 If anything there is unclear, compare the MySQLConnect class in this project's items.py against the settings.py code and adjust your own configuration to match.
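
 Before running the spider, the target tables must exist. The insert statements in items.py (section II.4 below) write to three tables, abstract, overview, and comment, but the article never shows their schemas, so the following is only a sketch inferred from those insert statements. The column types and lengths are my assumptions, and the connection values mirror the settings.py defaults shown below; timeDiffrence is spelled to match the code.

# One-off setup sketch: create the tables that insert_data() in items.py
# writes to. Column names come from the insert statements; the types and
# lengths are assumptions.
import pymysql

con = pymysql.connect(host='127.0.0.1', user='root', passwd='root',
                      db='test', charset='utf8', port=3306)
with con.cursor() as cur:
    cur.execute("""create table if not exists abstract(
        movieTitle varchar(255), director varchar(255), directorLink varchar(255),
        act text, actLink text, abstract text)""")
    cur.execute("""create table if not exists overview(
        movieTitle varchar(255), score varchar(16),
        playCount varchar(32), link varchar(255))""")
    cur.execute("""create table if not exists comment(
        userID varchar(64), userLink varchar(255), timeDiffrence varchar(64),
        content text, praise varchar(16), ref varchar(16), movieTitle varchar(255))""")
con.commit()
con.close()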

II. Code

 1. settings.py

# -*- coding: utf-8 -*-

# Scrapy settings for Tencent project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'Tencent'

SPIDER_MODULES = ['Tencent.spiders']
NEWSPIDER_MODULE = 'Tencent.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'Tencent (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:53.0) Gecko/20100101 Firefox/53.0',
}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'Tencent.middlewares.TencentSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'Tencent.middlewares.TencentDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'Tencent.pipelines.TencentPipeline': 300,
    #'Tencent.pipelines.MySQLConnectPipeline': 305,
}

# MySQL database connection settings
MYSQL_HOST = '127.0.0.1'
MYSQL_USER = 'root'
# the password of your own database
MYSQL_PASSWORD = 'root'
MYSQL_PORT = 3306
# the name of your own database
MYSQL_DB = 'test'
CHARSET = 'utf8'

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
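
 One compatibility note: pipelines.py and items.py below read these keys with from scrapy.conf import settings, which matched Scrapy at the time this was written (early 2018) but is deprecated and removed in newer releases. On a current Scrapy, a minimal sketch of the replacement looks like this:

# Reading the same MYSQL_* keys on newer Scrapy versions, where the old
# scrapy.conf module no longer exists.
from scrapy.utils.project import get_project_settings

settings = get_project_settings()
print(settings['MYSQL_HOST'], settings.getint('MYSQL_PORT'))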
 2. pipelines.py
 
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html


class TencentPipeline(object):

    def process_item(self, item, spider):
        # each item class carries its own insert_data() (see items.py)
        item.insert_data(item)
        return item
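
 As written, every stored item opens and closes its own MySQL connection inside MySQLConnect.mysqlConnect (items.py below). A sketch of an alternative pipeline that holds a single connection for the whole crawl; it assumes the same MYSQL_* settings keys, and item.sql()/item.values() are hypothetical hooks each item class would have to provide:

import pymysql


class MySQLPipeline(object):
    """Sketch: one shared connection per crawl instead of one per insert."""

    @classmethod
    def from_crawler(cls, crawler):
        # crawler.settings exposes the MYSQL_* keys from settings.py
        s = crawler.settings
        pipe = cls()
        pipe.params = dict(host=s.get('MYSQL_HOST'), user=s.get('MYSQL_USER'),
                           passwd=s.get('MYSQL_PASSWORD'), db=s.get('MYSQL_DB'),
                           charset=s.get('CHARSET'), port=s.getint('MYSQL_PORT'))
        return pipe

    def open_spider(self, spider):
        self.con = pymysql.connect(**self.params)

    def close_spider(self, spider):
        self.con.close()

    def process_item(self, item, spider):
        # item.sql() / item.values() are hypothetical: each item class would
        # return its INSERT statement and the matching value tuple
        with self.con.cursor() as cur:
            cur.execute(item.sql(), item.values())
        self.con.commit()
        return item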


 3. tencent_spider.py
 
# -*- coding: utf-8 -*-
import scrapy
from ..items import TencentItem, CommentItem
import re, requests, json


class TencentSpiderSpider(scrapy.Spider):
    name = 'tencent_spider'
    allowed_domains = ['v.qq.com']
    start_urls = ['https://v.qq.com/x/list/movie']

    def parse(self, response):
        category_part = response.xpath('//div[@class="mod_row_filter"]/ul/li/a/@href').extract()
        for href in category_part:
            detail_url = 'https://v.qq.com/x/list/movie{}'.format(href)
            yield scrapy.Request(url=detail_url,
                                 callback=self.detail_parse
                                 )

    def detail_parse(self, response):
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:53.0) Gecko/20100101 '
                                 'Firefox/53.0'}
        # movie info on the category listing page
        movie_links = response.xpath('//div[@class="mod_figures mod_figure_v"]/ul/li/a/@href').extract()
        movie_titles = response.xpath('//div[@class="figure_title_score"]/strong/a/text()').extract()
        movie_scores = response.xpath('//div[@class="figure_score"]//text()').extract()
        score_list = []
        total_score = []
        # build the cleaned score list: drop whitespace-only text nodes, then
        # rejoin the remaining nodes in pairs, since the page renders each
        # score as two nodes (integer part and decimal part)
        for movie_score in movie_scores:
            if movie_score != '\n\t\t\t\t\t\t\t' and movie_score != '\n\t\t\t\t\t\t':
                score_list.append(movie_score)
        # print(score_list)
        j = 0
        while j < len(score_list) - 1:
            score = score_list[j] + score_list[j + 1]
            j += 2
            total_score.append(score)
        # print(total_score)
        movie_playCounts = response.xpath('//div[@class="figure_count"]/span/text()').extract()  # play counts
        movie_account = response.xpath('//span[@class="option_txt"]/em/text()').extract_first('')  # number of movies
        # walk into each movie's detail page
        for x in range(0, len(movie_links)):
            # Take the cid from the movie link (e.g. the bracketed part of
            # https://ncgi.video.qq.com/fcgi-bin/video_comment_id?otype=json&callback=jQuery(19109829145422060698_1517407245638)&op=3&[cid=b5i4g9z3u5h31jy]),
            # then GET the comment-id JSON endpoint to obtain comment_id,
            # build the comment-page url from it, and fetch the comments
            cid = movie_links[x].split('/')[-1]  # extract the cid
            cid = cid.split('.')[0]
            # print(cid)
            # fetch the comment_id
            comment_id_url = 'https://ncgi.video.qq.com/fcgi-bin/video_comment_id?otype=json&callback=jQuery&op=3&cid={}'.format(cid)
            html = requests.get(comment_id_url).text
            pattern = re.compile(r'comment_id":"(.*?)"')
            comment_id = re.search(pattern, html).group(1)
            # print(comment_id)
            # fetch the comment page
            comment_url = 'http://coral.qq.com/article/{}/comment/'.format(comment_id)
            comment_html = requests.get(comment_url, headers=headers).text
            comment_dict = json.loads(comment_html)  # parse the JSON and pull out the fields we need
            data_dict = comment_dict['data']
            commentid_list = data_dict['commentid']
            if commentid_list:  # the movie has comments
                for detail in commentid_list:
                    comment = CommentItem()
                    comment['movie_title'] = movie_titles[x]  # movie title
                    comment['timeDifference'] = detail['timeDifference']  # post time
                    comment['content'] = detail['content']  # comment text
                    comment['up'] = detail['up']  # upvotes
                    comment['rep'] = detail['rep']  # downvotes
                    userinfo_dict = detail['userinfo']  # user info (dict)
                    userid = userinfo_dict['userid']
                    comment['userid'] = userid  # user id
                    comment['userLink'] = 'http://video.coral.qq.com/review/user/{}'.format(userid)  # user link
                    yield comment

            yield scrapy.Request(url=movie_links[x],
                                 callback=self.movie_parse,
                                 meta={'movie_link': movie_links[x],
                                       'movie_title': movie_titles[x],
                                       'score': total_score[x],
                                       'movie_playCount': movie_playCounts[x],
                                       'movie_account': movie_account}
                                 )
        # next page
        next_pg = response.xpath('//a[@class="page_next"]/@href').extract_first('')
        print(next_pg)
        if next_pg:
            next_url = 'https://v.qq.com/x/list/movie{}'.format(next_pg)
            yield scrapy.Request(url=next_url,
                                 callback=self.detail_parse
                                 )

    def movie_parse(self, response):
        # synopsis section
        abstract = response.xpath('//div[@class="mod_row_box"]/div[2]/ul/li/div[2]/p/text('
                                  ')').extract_first('')
        directors = response.xpath('//div[@class="mod_row_box"]/div[2]/ul/li/div[1]/a//text()').extract()
        director_links = response.xpath('//div[@class="mod_row_box"]/div[2]/ul/li/div[1]/a//@href').extract()
        if directors:  # director info exists
            director = directors[0]
            act = ','.join(directors[1:])
            director_link = director_links[0]
            act_link = ','.join(director_links[1:])
        else:
            director = '#'
            act = '#'
            director_link = '#'
            act_link = '#'
        # overview section (carried over in meta)
        movie_title = response.meta['movie_title']
        score = response.meta['score']
        movie_playCount = response.meta['movie_playCount']
        movie_account = response.meta['movie_account']
        movie_link = response.meta['movie_link']

        movie = TencentItem()
        # synopsis
        movie['abstract'] = abstract
        movie['director'] = director
        movie['act'] = act
        movie['director_link'] = director_link
        movie['act_link'] = act_link
        # overview
        movie['movie_title'] = movie_title
        movie['score'] = score
        movie['movie_playCount'] = movie_playCount
        movie['movie_link'] = movie_link
        movie['movie_account'] = movie_account
        yield movie
This is the main code of the project.
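
The cid → comment_id → comments chain that detail_parse performs can be tested outside Scrapy. Below is a minimal standalone sketch of the same flow; the cid value is the example from the spider's own comment, and the endpoints are the ones used above, which may have changed since 2018:

import re, json, requests

cid = 'b5i4g9z3u5h31jy'  # the example cid from the comment in detail_parse

# step 1: resolve the cid to a comment_id via the JSONP endpoint
id_url = ('https://ncgi.video.qq.com/fcgi-bin/video_comment_id'
          '?otype=json&callback=jQuery&op=3&cid={}'.format(cid))
comment_id = re.search(r'comment_id":"(.*?)"', requests.get(id_url).text).group(1)

# step 2: fetch the first page of comments for that comment_id
comment_json = json.loads(requests.get(
    'http://coral.qq.com/article/{}/comment/'.format(comment_id)).text)
for detail in comment_json['data']['commentid']:
    print(detail['content'], detail['up'])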
 

4. items.py
 
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html 
import scrapy
import pymysql
from scrapy.conf import settings


class MySQLConnect(scrapy.Item):
    @staticmethod
    def mysqlConnect(sql):
        # these keys must match the ones defined in settings.py
        host = settings['MYSQL_HOST']
        user = settings['MYSQL_USER']
        psd = settings['MYSQL_PASSWORD']
        db = settings['MYSQL_DB']
        charset = settings['CHARSET']
        port = settings['MYSQL_PORT']
        # open the database connection
        con = pymysql.connect(host=host, user=user, passwd=psd, db=db, charset=charset, port=port)
        # database cursor
        cur = con.cursor()
        try:
            cur.execute(sql)
            print("insert success")  # debug output
        except Exception as e:
            print('Insert error:', e)
            con.rollback()
        else:
            con.commit()
        con.close()

class TencentItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # synopsis section
    abstract = scrapy.Field()
    director = scrapy.Field()
    director_link = scrapy.Field()
    act = scrapy.Field()
    act_link = scrapy.Field()

    # overview section
    movie_title = scrapy.Field()
    score = scrapy.Field()
    movie_playCount = scrapy.Field()
    movie_account = scrapy.Field()
    movie_link = scrapy.Field()

    def insert_data(self, item):
        sql1 = "insert into abstract(movieTitle,director,directorLink,act,actLink,abstract) values('%s','%s','%s','%s','%s','%s');" % (item['movie_title'], item['director'], item['director_link'], item['act'], item['act_link'], item['abstract'])
        print('TencentItem insert.....................')
        MySQLConnect.mysqlConnect(sql1)
        sql3 = "insert into overview(movieTitle,score,playCount,link) values('%s','%s','%s','%s');" % (item['movie_title'], item['score'], item['movie_playCount'], item['movie_link'])
        MySQLConnect.mysqlConnect(sql3)


class CommentItem(scrapy.Item):
    # comment section
    movie_title = scrapy.Field()
    timeDifference = scrapy.Field()
    content = scrapy.Field()
    up = scrapy.Field()
    rep = scrapy.Field()
    userLink = scrapy.Field()
    userid = scrapy.Field()

    def insert_data(self, item):
        sql2 = "insert into comment(userID,userLink,timeDiffrence,content,praise,ref,movieTitle) values('%s','%s','%s','%s','%s','%s','%s');" % (item['userid'], item['userLink'], item['timeDifference'], item['content'], item['up'], item['rep'], item['movie_title'])
        print('CommentItem insert................')
        MySQLConnect.mysqlConnect(sql2)
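
A caveat on the insert statements above: building SQL with % string formatting breaks as soon as a title or comment contains a quote, and it invites SQL injection. Below is a sketch of the comment insert done with pymysql's parameterized execute instead; insert_comment is a name introduced here for illustration, and it expects an already-open pymysql connection:

import pymysql

def insert_comment(con, item):
    # pymysql escapes the values itself, so quotes in the comment text
    # cannot break the statement
    sql = ("insert into comment(userID,userLink,timeDiffrence,content,"
           "praise,ref,movieTitle) values(%s,%s,%s,%s,%s,%s,%s)")
    with con.cursor() as cur:
        cur.execute(sql, (item['userid'], item['userLink'],
                          item['timeDifference'], item['content'],
                          item['up'], item['rep'], item['movie_title']))
    con.commit()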
The full project is available on Gitee: https://gitee.com/YunZhiBiDuan3555/TengXunShiPinWangYeBanDianYingXinXiPaQu.git
Tags: regex, XPath, JSON, MySQL, Scrapy