腾讯视频电影信息爬取(scrapy框架下采用MySQL数据库)
2018-01-30 16:02
501 查看
一、准备
1.cmd 命令行 pip install pymysql,pip install lxml,pip install requests
2.创建scrapy项目并进行MySQL数据库配置
具体配置过程可见转载博客:http://blog.csdn.net/qq_31518899/article/details/76576537 Scrapy连接MySQL数据库
当然若有不太明白的可见本项目具体代码items.py中MysqlConnect类和settings.py代码比较后进行修改操作
二、代码块
1.settings.py
2.pipelines.py
3.tencent_spider.py
4. items.py
1.cmd 命令行 pip install pymysql,pip install lxml,pip install requests
2.创建scrapy项目并进行MySQL数据库配置
具体配置过程可见转载博客:http://blog.csdn.net/qq_31518899/article/details/76576537 Scrapy连接MySQL数据库
当然若有不太明白的可见本项目具体代码items.py中MysqlConnect类和settings.py代码比较后进行修改操作
二、代码块
1.settings.py
# -*- coding: utf-8 -*- # Scrapy settings for Tencent project # # For simplicity, this file contains only settings considered important or # commonly used. You can find more settings consulting the documentation: # # https://doc.scrapy.org/en/latest/topics/settings.html # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html # https://doc.scrapy.org/en/latest/topics/spider-middleware.html BOT_NAME = 'Tencent' SPIDER_MODULES = ['Tencent.spiders'] NEWSPIDER_MODULE = 'Tencent.spiders' # Crawl responsibly by identifying yourself (and your website) on the user-agent #USER_AGENT = 'Tencent (+http://www.yourdomain.com)' # Obey robots.txt rules ROBOTSTXT_OBEY =False # Configure maximum concurrent requests performed by Scrapy (default: 16) #CONCURRENT_REQUESTS = 32 # Configure a delay for requests for the same website (default: 0) # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay # See also autothrottle settings and docs #DOWNLOAD_DELAY = 3 # The download delay setting will honor only one of: #CONCURRENT_REQUESTS_PER_DOMAIN = 16 #CONCURRENT_REQUESTS_PER_IP = 16 # Disable cookies (enabled by default) #COOKIES_ENABLED = False # Disable Telnet Console (enabled by default) #TELNETCONSOLE_ENABLED = False # Override the default request headers: DEFAULT_REQUEST_HEADERS = { 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Language': 'en', 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:53.0) Gecko/20100101 Firefox/53.0', } # Enable or disable spider middlewares # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html #SPIDER_MIDDLEWARES = { # 'Tencent.middlewares.TencentSpiderMiddleware': 543, #} # Enable or disable downloader middlewares # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html #DOWNLOADER_MIDDLEWARES = { # 'Tencent.middlewares.TencentDownloaderMiddleware': 543, #} # Enable or disable extensions # See https://doc.scrapy.org/en/latest/topics/extensions.html 
#EXTENSIONS = { # 'scrapy.extensions.telnet.TelnetConsole': None, #} # Configure item pipelines # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html ITEM_PIPELINES = { 'Tencent.pipelines.TencentPipeline': 300, #'Tencent.pipelines.MySQLConnectPipeline':305, } #Mysql数据库的配置信息 MYSQL_HOST='127.0.0.1' MYSQL_USER='root' #你自己数据库的密码 MYSQL_PASSWORD='root' MYSQL_PORT =3306 #你自己数据库的名称 MYSQL_DB='test' CHARSET='utf8' # Enable and configure the AutoThrottle extension (disabled by default) # See https://doc.scrapy.org/en/latest/topics/autothrottle.html #AUTOTHROTTLE_ENABLED = True # The init 4000 ial download delay #AUTOTHROTTLE_START_DELAY = 5 # The maximum download delay to be set in case of high latencies #AUTOTHROTTLE_MAX_DELAY = 60 # The average number of requests Scrapy should be sending in parallel to # each remote server #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 # Enable showing throttling stats for every response received: #AUTOTHROTTLE_DEBUG = False # Enable and configure HTTP caching (disabled by default) # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings #HTTPCACHE_ENABLED = True #HTTPCACHE_EXPIRATION_SECS = 0 #HTTPCACHE_DIR = 'httpcache' #HTTPCACHE_IGNORE_HTTP_CODES = [] #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'2.pipelines.py
# -*- coding: utf-8 -*- import scrapy import xlwt import pymysql from scrapy.conf import settings # Define your item pipelines here # # Don't forget to add your pipeline to the ITEM_PIPELINES setting # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html class TencentPipeline(object): def process_item(self, item, spider): item.insert_data(item) return item
3.tencent_spider.py
# -*- coding: utf-8 -*-
import scrapy
from ..items import TencentItem,CommentItem
import re,requests,json


class TencentSpiderSpider(scrapy.Spider):
    """Crawl movie listings on v.qq.com.

    Flow: start page -> category listings (parse) -> each listing page
    (detail_parse, which also fetches the comment feed via requests) ->
    each movie detail page (movie_parse).
    """

    name = 'tencent_spider'
    allowed_domains = ['v.qq.com']
    start_urls = ['https://v.qq.com/x/list/movie']

    def parse(self, response):
        # Collect the category filter links and schedule one listing
        # request per category.
        category_part = response.xpath('//div[@class="mod_row_filter"]/ul/li/a/@href').extract()
        for href in category_part:
            detail_url = 'https://v.qq.com/x/list/movie{}'.format(href)
            yield scrapy.Request(url=detail_url,
                                 callback=self.detail_parse
                                 )

    def detail_parse(self, response):
        # Headers for the out-of-band requests.get() calls below (the
        # comment API is not crawled through Scrapy).
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:53.0) Gecko/20100101 ' \
                                 'Firefox/53.0'}
        # Movie info on the (filtered) category listing page
        movie_links = response.xpath('//div[@class="mod_figures mod_figure_v"]/ul/li/a/@href').extract()
        movie_titles = response.xpath('//div[@class="figure_title_score"]/strong/a/text()').extract()
        movie_scores = response.xpath('//div[@class="figure_score"]//text()').extract()
        score_list = []
        total_score = []
        # Drop the pure-layout whitespace text nodes; what remains are the
        # score fragments.
        for movie_score in movie_scores:
            if movie_score != '\n\t\t\t\t\t\t\t' and movie_score != '\n\t\t\t\t\t\t':
                score_list.append(movie_score)
        #print(score_list)
        # Each rating is split into two adjacent text nodes (integer part
        # and decimal part); glue consecutive pairs back together.
        j = 0
        while j in range(0, len(score_list) - 1):
            score = score_list[j] + score_list[j + 1]
            j += 2
            total_score.append(score)
        #print(total_score)
        movie_playCounts = response.xpath('//div[@class="figure_count"]/span/text()').extract()  # play counts
        movie_account = response.xpath('//span[@class="option_txt"]/em/text()').extract_first('')  # number of movies
        # Visit every movie's comments and detail page
        for x in range(0, len(movie_links)):
            # Extract the cid from the movie link -- e.g. the bracketed part of
            # https://ncgi.video.qq.com/fcgi-bin/video_comment_id?otype=json&callback=jQuery(...)&op=3&[cid=b5i4g9z3u5h31jy]
            # -- then call the comment-id JSON endpoint to obtain comment_id,
            # and finally build the comment-page URL to fetch the comments.
            cid = movie_links[x].split('/')[-1]  # last path segment, e.g. 'b5i4g9z3u5h31jy.html'
            cid = cid.split('.')[0]  # strip the '.html' extension
            #print(cid)
            # Fetch the comment_id for this cid
            comment_id_url = 'https://ncgi.video.qq.com/fcgi-bin/video_comment_id?otype=json&callback=jQuery&op=3&cid={}'.format(cid)
            html = requests.get(comment_id_url).text
            pattern = re.compile(r'comment_id":"(.*?)"')
            # NOTE(review): re.search may return None when the endpoint
            # answers without a comment_id -- .group(1) would then raise.
            comment_id = re.search(pattern, html).group(1)
            #print(comment_id)
            # Fetch the comment page content
            comment_url = 'http://coral.qq.com/article/{}/comment/'.format(comment_id)
            comment_html = requests.get(comment_url, headers=headers).text
            # Parse the JSON payload and pick out the fields we need.
            # NOTE(review): the name 'dict' shadows the builtin -- consider renaming.
            dict = json.loads(comment_html)
            data_dict = dict['data']
            commentid_list = data_dict['commentid']
            if commentid_list:  # the movie has comments
                for detail in commentid_list:
                    comment = CommentItem()
                    comment['movie_title'] = movie_titles[x]  # movie title
                    comment['timeDifference'] = detail['timeDifference']  # publish time
                    comment['content'] = detail['content']  # comment body
                    comment['up'] = detail['up']  # upvotes
                    comment['rep'] = detail['rep']  # downvotes
                    userinfo_dict = detail['userinfo']  # user info (dict)
                    userid = userinfo_dict['userid']
                    comment['userid'] = userid  # user id
                    comment['userLink'] = 'http://video.coral.qq.com/review/user/{}'.format(userid)  # user profile link
                    yield comment
            # Overview data is forwarded to movie_parse through request meta.
            yield scrapy.Request(url=movie_links[x],
                                 callback=self.movie_parse,
                                 meta={'movie_link': movie_links[x],
                                       'movie_title': movie_titles[x],
                                       'score': total_score[x],
                                       'movie_playCount': movie_playCounts[x],
                                       'movie_account': movie_account}
                                 )
        # Next page of the current category listing
        next_pg = response.xpath('//a[@class="page_next"]/@href').extract_first('')
        print(next_pg)
        if next_pg:
            next_url = 'https://v.qq.com/x/list/movie{}'.format(next_pg)
            yield scrapy.Request(url=next_url,
                                 callback=self.detail_parse
                                 )

    def movie_parse(self, response):
        # Synopsis section
        abstract = response.xpath('//div[@class="mod_row_box"]/div[2]/ul/li/div[2]/p/text(' ')').extract_first('')
        directors = response.xpath('//div[@class="mod_row_box"]/div[2]/ul/li/div[1]/a//text()').extract()
        director_links = response.xpath('//div[@class="mod_row_box"]/div[2]/ul/li/div[1]/a//@href').extract()
        if directors:  # director info present: first entry is the director, the rest are actors
            director = directors[0]
            act = ','.join(directors[1:])
            director_link = director_links[0]
            act_link = ','.join(director_links[1:])
        else:
            # '#' is the placeholder stored when no credits are listed
            director = '#'
            act = '#'
            director_link = '#'
            act_link = '#'
        # Overview section (forwarded by detail_parse through request meta)
        movie_title = response.meta['movie_title']
        score = response.meta['score']
        movie_playCount = response.meta['movie_playCount']
        movie_account = response.meta['movie_account']
        movie_link = response.meta['movie_link']
        movie = TencentItem()
        # synopsis fields
        movie['abstract'] = abstract
        movie['director'] = director
        movie['act'] = act
        movie['director_link'] = director_link
        movie['act_link'] = act_link
        # overview fields
        movie['movie_title'] = movie_title
        movie['score'] = score
        movie['movie_playCount'] = movie_playCount
        movie['movie_link'] = movie_link
        movie['movie_account'] = movie_account
        yield movie
# (this is the main spider code)
4. items.py
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
import pymysql
from scrapy.conf import settings


class MySQLConnect(scrapy.Item):
    """Helper that opens a MySQL connection, runs one statement and
    commits it (or rolls back on failure)."""

    @staticmethod
    def mysqlConnect(sql, params=None):
        """Execute *sql* against the database configured in settings.py.

        :param sql: an INSERT statement, optionally containing ``%s``
            placeholders.
        :param params: optional sequence of values bound to the placeholders
            by the driver; passing values this way (instead of %-formatting
            them into the string) is safe against quoting/SQL-injection.
        """
        # FIX: settings.py defines MYSQL_HOST -- the old code read the
        # non-existent key 'MYSQL_HOSTS' and therefore connected with
        # host=None, which fails at runtime.
        host = settings['MYSQL_HOST']
        user = settings['MYSQL_USER']
        psd = settings['MYSQL_PASSWORD']
        db = settings['MYSQL_DB']
        charset = settings['CHARSET']
        port = settings['MYSQL_PORT']
        # database connection
        con = pymysql.connect(host=host, user=user, passwd=psd, db=db,
                              charset=charset, port=port)
        try:
            # database cursor
            cur = con.cursor()
            try:
                cur.execute(sql, params or ())
                print("insert success")  # debug statement
            except Exception as e:
                print('Insert error:', e)
                con.rollback()
            else:
                con.commit()
        finally:
            # Close the connection even if execute/rollback itself raised.
            con.close()


class TencentItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # -- synopsis section --
    abstract = scrapy.Field()
    director = scrapy.Field()
    director_link = scrapy.Field()
    act = scrapy.Field()
    act_link = scrapy.Field()
    # -- overview section --
    movie_title = scrapy.Field()
    score = scrapy.Field()
    movie_playCount = scrapy.Field()
    movie_account = scrapy.Field()
    movie_link = scrapy.Field()

    def insert_data(self, item):
        """Insert one movie into the ``abstract`` and ``overview`` tables.

        Called by TencentPipeline.process_item with the item itself.
        """
        # Parameterized queries: the driver binds the values, so titles or
        # abstracts containing quotes no longer break the statement.
        sql1 = ("insert into abstract(movieTitle,director,directorLink,act,actLink,abstract)"
                "values(%s,%s,%s,%s,%s,%s);")
        print('TencentItem insert.....................')
        MySQLConnect.mysqlConnect(sql1, (item['movie_title'], item['director'],
                                         item['director_link'], item['act'],
                                         item['act_link'], item['abstract']))
        sql3 = ("insert into overview(movieTitle,score,playCount,link)"
                "values(%s,%s,%s,%s);")
        MySQLConnect.mysqlConnect(sql3, (item['movie_title'], item['score'],
                                         item['movie_playCount'], item['movie_link']))


class CommentItem(scrapy.Item):
    # -- comment section --
    movie_title = scrapy.Field()
    timeDifference = scrapy.Field()
    content = scrapy.Field()
    up = scrapy.Field()
    rep = scrapy.Field()
    userLink = scrapy.Field()
    userid = scrapy.Field()

    def insert_data(self, item):
        """Insert one comment into the ``comment`` table."""
        # NOTE(review): column names 'timeDiffrence' and 'ref' reproduce the
        # existing table schema (typo included) -- do not rename them here
        # without also altering the table.
        sql2 = ("insert into comment(userID,userLink,timeDiffrence,content,praise,ref,movieTitle)"
                "values(%s,%s,%s,%s,%s,%s,%s);")
        print('CommentItem insert................')
        MySQLConnect.mysqlConnect(sql2, (item['userid'], item['userLink'],
                                         item['timeDifference'], item['content'],
                                         item['up'], item['rep'], item['movie_title']))

# Full project available on Gitee:
# https://gitee.com/YunZhiBiDuan3555/TengXunShiPinWangYeBanDianYingXinXiPaQu.git
相关文章推荐
- scrapy框架学习-爬取腾讯社招信息-tencent.py
- P_010.~慢慢悠悠~使用Python的Scrapy框架成功爬取豆瓣电影的全部信息
- scrapy框架学习-爬取腾讯社招信息-item字段和管道文件
- Scrapy框架学习 - 爬取腾讯社招全部职位信息
- Scrapy框架学习 - 爬取豆瓣电影排行榜TOP250所有电影信息并保存到MongoDB数据库中
- scrapy框架学习-爬取腾讯社招信息-部分运行结果
- Python 采用Scrapy爬虫框架爬取豆瓣电影top250
- scrapy框架爬取知乎110w用户信息,并存入mysql数据库和mongoDB数据库
- 利用Scrapy框架爬取博客信息并存到mysql数据库
- Python爬虫框架Scrapy实战 - 抓取BOSS直聘招聘信息
- 在MyEclipse环境下将采用SSH2框架的项目从连接到MySQL数据库改成连接到Oracle数据库
- scrapy框架爬虫将数据保存到MySQL数据库(20170214)
- Python爬虫框架Scrapy实战之定向批量获取职位招聘信息
- python 爬虫学习三(Scrapy 实战,豆瓣爬取电影信息)
- 【项目实战】使用Scrapy爬取商品信息并写入MySQL数据库
- scrapy爬虫框架教程(二)-- 爬取豆瓣电影TOP250
- Python爬虫框架Scrapy实战之定向批量获取职位招聘信息
- Scrapy:Python3版本上安装数据挖掘必备的scrapy框架详细攻略(二最完整爬取网页内容信息攻略)——Jason niu
- SpringCloud(第 044 篇)链接Mysql数据库简单的集成Mybatis框架采用MapperXml访问数据库
- 用scrapy框架爬取豆瓣Top250电影