pyspider—爬取视频链接
2015-09-30 14:52
316 查看
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2015-03-20 09:46:20
# Project: fly_spider

import re
import time

#from pyspider.database.mysql.mysqldb import SQL
from pyspider.libs.base_handler import *
from pyquery import PyQuery as pq


class Handler(BaseHandler):
    """pyspider handler that crawls zhanqi.tv from the game index to room pages.

    Crawl flow, as wired by the callbacks below:

    1. ``on_start`` fetches ``http://www.zhanqi.tv/games``.
    2. ``index_page`` follows every link under ``/games/<name>``.
    3. ``list_page`` follows the room links found in the ``#hotList`` list.
    4. ``detail_page`` extracts the room's fields and returns them as the
       task result.
    """

    # Browser-like request headers sent with every fetch (via crawl_config).
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, sdch",
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Cache-Control": "max-age=0",
        "Connection": "keep-alive",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36",
    }

    crawl_config = {
        "headers": headers,
        "timeout": 100,
    }

    # Script handed to the JS fetcher for the list and room pages.
    # NOTE(review): `window.scrollTo(...)` is evaluated immediately as the
    # argument expression, so the page is scrolled as soon as the script runs
    # and setTimeout receives scrollTo's return value rather than a callback.
    # The script is kept as-is; wrapping the call in a function would defer
    # the scroll by 5 s, which may be later than the fetcher captures the
    # page -- confirm before changing.
    _SCROLL_TO_BOTTOM_JS = """
        function() {
            setTimeout(window.scrollTo(0,document.body.scrollHeight), 5000);
        }
        """

    # Raw strings so `\w` reaches the regex engine verbatim; dots are escaped
    # so they only match a literal '.'.
    _GAME_URL_RE = re.compile(r"http://www\.zhanqi\.tv/games/\w+", re.U)
    _ROOM_URL_RE = re.compile(r"http://www\.zhanqi\.tv/\w+", re.U)

    @every(minutes=1)
    def on_start(self):
        """Entry point: queue the game index page."""
        self.crawl('http://www.zhanqi.tv/games', callback=self.index_page)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        """Queue each per-game listing page linked from the game index."""
        print(response)
        for each in response.doc('a[href^="http://www.zhanqi.tv/games/"]').items():
            href = each.attr.href
            # Require at least one word character after "/games/".
            if href and self._GAME_URL_RE.match(href):
                self.crawl(href,
                           fetch_type='js',
                           js_script=self._SCROLL_TO_BOTTOM_JS,
                           callback=self.list_page)

    @config(age=1 * 60 * 60, priority=2)
    def list_page(self, response):
        """Queue each room page linked from the listing's hot list."""
        selector = '.active > div.live-list-tabc > ul#hotList.clearfix > li > a'
        for each in response.doc(selector).items():
            href = each.attr.href
            # Anchors without an href yield None; skip them instead of
            # passing None to the regex (TypeError).
            if href and self._ROOM_URL_RE.match(href):
                self.crawl(href,
                           fetch_type='js',
                           js_script=self._SCROLL_TO_BOTTOM_JS,
                           callback=self.detail_page)

    @config(age=1 * 60 * 60, priority=2)
    def detail_page(self, response):
        """Extract the room's fields and return them as the task result.

        Returns a dict with the keys ``url``, ``author``, ``title``,
        ``game-name``, ``users2``, ``flash-cont`` and ``picture``.
        ``flash-cont`` is the inner HTML of the last ``.video-flash-cont``
        element, or ``None`` when the page has none.
        """
        # Start from None so a page without a player container produces a
        # result instead of a NameError.
        flash_html = None
        # .items() already yields PyQuery objects, so no re-wrapping is needed.
        for each in response.doc('.video-flash-cont').items():
            flash_html = each.html()
            print(flash_html)

        online_selector = (
            'div.live-anchor-info.clearfix > div.sub-anchor-info > div.clearfix > '
            'div.meat-info > span.num.dv.js-onlines-panel > span.dv.js-onlines-txt > span'
        )
        return {
            "url": response.url,
            "author": response.doc('.meat > span').text(),
            "title": response.doc('.title-name').text(),
            "game-name": response.doc('span > .game-name').text(),
            "users2": response.doc(online_selector).text(),
            "flash-cont": flash_html,
            # An <img> has no text content, so read its src attribute; fall
            # back to '' (the value .text() always produced) when it is absent.
            "picture": response.doc('.active > img').attr('src') or '',
        }
相关文章推荐
- MySQL 如何只导出 指定的表 的表结构和数据 ( 转 )
- CF 581F 树形dp
- iOS8中使用CoreLocation定位
- 做简单的集群SSH免密码配置(只需要在1台机器操作!只需要2步!)
- activity生命周期
- mysql数据库管理工具sqlyog在首选项里可以设置默认查询分页条数和字体,改写关键字大小写
- Demo Nec
- ubuntu 用户sudo组不小心被删除解决方案
- 解决程序提示“应用程序发生异常 未知的软件异常(0x0eedfade),位置为 0x7c812fd3”
- Ajax与HTML5 history pushState/replaceState实例
- 基于注解的 Spring MVC 简单入门
- 简单自动补全(js+css)
- pyspider—爬取下载图片
- 数据结构广义表实验
- [Error]Win8安装程序出现2502、2503错误解决方法
- .net学习笔记---lambda表达式(自执行方法)
- JAVA8函数式编程
- 浅析nginx刚刚发布的JavaScript能力nginScript
- linux下dns设置详解
- Active Learning 主动学习