
Python (5): Fetching AcFun danmaku, comments and video info

2016-03-15 10:44



1. Using Python to get the info, comments and danmaku of every AcFun bangumi
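The script below defines four helpers: geturl() pages through the bangumi index, info() scrapes each bangumi's detail page, PL() walks its comment pages, and dm() pulls the danmaku of a single video. Every paged endpoint is driven by the same trick of rewriting the pageNo query parameter with re.sub; a minimal sketch of that pattern (paged_urls is an illustrative name for this note only, not part of the script):

import re

def paged_urls(base_url, pages):
    # Rewrite the pageNo value in place to walk pages 1..pages.
    for i in range(1, pages + 1):
        yield re.sub(r'pageNo=\d+', 'pageNo=%d' % i, base_url)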

#! /usr/bin/env python
# -*- coding: utf-8 -*-
# Crawl info, comments and danmaku for AcFun bangumi (Python 2).
import re
import requests
import sys
import json

reload(sys)
sys.setdefaultencoding("utf-8")

head = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0'}  # pretend to be a browser so requests are not blocked
def dm(ht):
    # Danmaku API: page through the bullet comments of one video id.
    oldURL = 'http://danmu.aixifan.com/V2/' + ht + '?pageSize=500&pageNo=0'
    for i in range(1, 5):
        newURL = re.sub(r'pageNo=\d+', 'pageNo=%d' % i, oldURL)
        print newURL
        html = requests.get(newURL, headers=head)
        aa = json.loads(html.text)
        try:
            for j in range(0, 501):
                print aa[2][j]['m']  # the 'm' field of each entry is the danmaku text
        except Exception:
            break  # ran past the entries on this page: stop paging
def PL(ht):
    # Comment API: the first request gives the total page count, then walk every page.
    url = 'http://www.acfun.tv/comment/bangumi/web/list?bangumiId=' + ht
    print url
    jscontent = requests.get(url, headers=head).content
    jsDict = json.loads(jscontent)
    pag = jsDict['data']['totalPage']
    print pag
    nurl = url + '&pageNo=1'
    for i in range(1, pag + 1):
        ourl = re.sub(r'pageNo=\d+', 'pageNo=%d' % i, nurl)
        jscontent = requests.get(ourl, headers=head).content
        jsDict = json.loads(jscontent)  # one page of comments as JSON; process as needed
def geturl():
    # Bangumi index: 42 entries per page, 7 pages; hand each bangumi id to info().
    ourl = 'http://www.acfun.tv/bangumi/bangumi/page?pageSize=42&isWeb=1&pageNo=1&sort=1'
    for i in range(1, 8):
        nurl = re.sub(r'pageNo=\d+', 'pageNo=%d' % i, ourl)
        print nurl
        jscontent = requests.get(nurl, headers=head).content
        jsDict = json.loads(jscontent)
        for item in jsDict['data']['list']:
            info(str(item['id']))
            break  # only the first bangumi of this page (testing)
        break  # only the first index page (testing)
def info(ht):
    url = "http://www.acfun.tv/v/ab" + ht
    sc = "http://www.acfun.tv/bangumi/stow/isStowed?bangumiId=" + ht            # favourite (stow) count
    pl = "http://www.acfun.tv/bangumi/count/bangumi_view.aspx?bangumiId=" + ht  # comment count
    html = requests.get(url)
    htpl = requests.get(pl)
    title = re.findall('h3 class="title">(.*?)</h3><span', html.text, re.S)[0]
    print 'Title: ' + title
    up = re.findall('</h3><span class="last">(.*?)</span>', html.text, re.S)[0]
    print 'Update status: ' + up
    pp = re.search(r'\[(.*?)\]', htpl.text, re.S).group(1)
    print 'Total comments: ' + pp
    jsconten = requests.get(sc, headers=head).content
    jsDict = json.loads(jsconten)
    print 'Total favourites: ' + str(jsDict['data']['stowCount'])
    jianjie = re.findall('pan class="desc">(.*?)</span>', html.text, re.S)[0]
    print 'Description: ' + jianjie
    page = int(re.findall('" data-count="(.*?)" data-index="', html.text, re.S)[0])  # number of episodes
    nurl = url + '_1'
    for i in range(1, page + 1):  # one page per episode
        nurl = re.sub(r'_\d+', '_%d' % i, nurl)  # URL of this episode
        print nurl
        print 'Episode %d danmaku:' % i
        html = requests.get(nurl)
        vid = re.findall('data-vid="(.*?)" data-sid', html.text, re.S)[0]  # video id used by the danmaku API
        # dm(vid)
        print 'Episode %d comments:' % i
        PL(ht)

if __name__ == "__main__":
    geturl()
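Note the two break statements in geturl(): as written the crawl stops after the first bangumi on the first index page, which keeps test runs short; remove them to walk all seven pages. The helpers can also be called on their own once you have an id; a minimal sketch, assuming bangumi_id holds a real id taken from the bangumi index JSON (the value below is a placeholder, not an actual AcFun id):

bangumi_id = '12345'  # placeholder only; substitute an id returned by the bangumi index
info(bangumi_id)      # prints the title, update status, counts and description, then walks each episode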