您的位置:首页 > 其它

抓取豆瓣小组内一定数量页面中(pic).py

2018-03-15 16:56 666 查看
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import  urllib2
from lxml import  etree
import os
import re
import random
from lxml import etree

# Downloaded pictures are written into ./pic_from_douban (relative to the
# current working directory); create that folder the first time the script
# runs.
output_dir_missing = not os.path.exists('pic_from_douban')
if output_dir_missing:
    os.mkdir("pic_from_douban")

# Base URL of the group's discussion index.  It ends with an empty "start="
# query parameter, which is where a page offset belongs.
url = "https://www.douban.com/group/haixiuzu/discussion?start="

# Candidate browser User-Agent strings (one mobile Chrome entry followed by
# desktop Safari / Firefox / Opera / Chrome / IE-family entries).
agentlist = [
    # Mobile
    "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Mobile Safari/537.36",
    # Safari
    "Mozilla/5.0(Windows;U;WindowsNT6.1;en-us)AppleWebKit/534.50(KHTML,likeGecko)Version/5.1Safari/534.5",
    "Mozilla/5.0(Macintosh;U;IntelMacOSX10_6_8;en-us)AppleWebKit/534.50(KHTML,likeGecko)Version/5.1Safari/534.50",
    # Firefox / Opera / Chrome
    "Mozilla/5.0(Macintosh;IntelMacOSX10.6;rv:2.0.1)Gecko/20100101Firefox/4.0.1",
    "Opera/9.80(Macintosh;IntelMacOSX10.6.8;U;en)Presto/2.8.131Version/11.11",
    "Mozilla/5.0(Macintosh;IntelMacOSX10_7_0)AppleWebKit/535.11(KHTML,likeGecko)Chrome/17.0.963.56Safari/535.11",
    # MSIE 7 and shells built on top of it
    "Mozilla/4.0(compatible;MSIE7.0;WindowsNT5.1;360SE)",
    "Mozilla/4.0(compatible;MSIE7.0;WindowsNT5.1;Maxthon2.0)",
    "Mozilla/4.0(compatible;MSIE7.0;WindowsNT5.1;TencentTraveler4.0)",
    "Mozilla/4.0(compatible;MSIE7.0;WindowsNT5.1)",
]

# Request headers sent with every index-page and topic-page request.  They do
# not vary between pages, so they are built once instead of on every loop
# iteration.  The User-Agent stays at the fixed value this script has always
# sent: the former per-page ``random.choice(agentlist)`` result was never put
# into the headers and has been dropped as dead code.
header = {
    "Accept": "text/event-stream",
    "Accept-Language": "zh-CN,zh;q=0.8",
    "Cache-Control": "no-cache",
    "Connection": "keep-alive",
    "Origin": "https://www.douban.com",
    "Referer": "https://www.douban.com/group/topic/109924260/",  # change this if you want
    "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Mobile Safari/537.36",
}

i = 0  # number of topics handled on the most recently fetched index page
for start in range(0, 25, 25):
    # One index page of the group lists 25 topics, hence the step of 25.
    # The offset has to be appended to the base URL; without it every
    # iteration would request the very same page.
    request = urllib2.Request(url + str(start), headers=header)
    index_page = urllib2.urlopen(request)
    try:
        response = index_page.read()
    finally:
        index_page.close()
    print(response)

    # Three independent scans of the index HTML: author ids, topic URLs and
    # topic titles.  They are paired up purely by position.
    id_list = re.findall(r'<a href="https://www.douban.com/people/(.*?)/" ', response)
    title_url_list = re.findall(r'<a href="(.*?)".*?title=', response)
    title_list = re.findall(r'<a href=".*?".*?title="(.*)" class=.*?', response)
    if not (len(id_list) == len(title_url_list) == len(title_list)):
        # Indexing all three lists with one counter used to raise IndexError
        # here; zip() below stops at the shortest list instead.
        print("warning: index page lists differ in length (ids=%d, urls=%d, titles=%d)"
              % (len(id_list), len(title_url_list), len(title_list)))

    i = 0
    for title_url, title, author_id in zip(title_url_list, title_list, id_list):
        print(title_url)
        print(title)
        i += 1

        title_request = urllib2.Request(title_url, headers=header)
        topic_page = urllib2.urlopen(title_request)
        try:
            title_response = topic_page.read()
        finally:
            topic_page.close()
        # Pictures inside a topic are located with lxml/XPath rather than
        # regular expressions.
        title_content = etree.HTML(title_response)

        # Topics use one of two layouts; try the deeper one first and fall
        # back to the shallower one when it yields nothing.
        pic_list = title_content.xpath('//*[@id="link-report"]/div/div/div/div/img/@src')
        print(str(len(pic_list)) + "----------------------------------")
        if not pic_list:
            pic_list = title_content.xpath('//*[@id="link-report"]/div/div/img/@src')
        print(str(len(pic_list)) + "***********")

        for count, pic_url in enumerate(pic_list):
            print(pic_url)
            # Take the real extension (".jpg", ".jpeg", ".webp", ...) from the
            # URL path instead of blindly slicing the last four characters,
            # and ignore any query string.
            last_name = os.path.splitext(pic_url.split('?', 1)[0])[1]
            # NOTE(review): names are "<author id>_<picture index>", so two
            # topics by the same author overwrite each other's files.
            pic_name = str(author_id) + '_' + str(count) + last_name
            try:
                pic_response = urllib2.urlopen(pic_url)
                try:
                    pic_data = pic_response.read()
                finally:
                    pic_response.close()
            except urllib2.URLError as err:
                # One unreachable picture should not abort the whole run.
                print("skip %s: %s" % (pic_url, err))
                continue
            with open('pic_from_douban/' + pic_name, 'wb') as f:
                f.write(pic_data)

print(i)
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: