抓取豆瓣小组内一定数量页面中的图片(pic).py
2018-03-15 16:56
666 查看
#!/usr/bin/env python
# _*_ coding :'utf-8' _*_
import urllib2
from lxml import etree
import os
import re
import random
from lxml import etree
# Download every image from the topics listed on the first page of the
# douban "haixiuzu" group and save them under ./pic_from_douban/.
# (Python 2 script: uses urllib2 and lxml, both imported at the top of
# the file.)
if not os.path.exists('pic_from_douban'):
    os.mkdir("pic_from_douban")

# Group listing URL; the trailing "start" query parameter is the topic offset.
url = "https://www.douban.com/group/haixiuzu/discussion?start="

# Pool of User-Agent strings rotated between listing-page requests.
agentlist = [
    "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Mobile Safari/537.36",
    "Mozilla/5.0(Windows;U;WindowsNT6.1;en-us)AppleWebKit/534.50(KHTML,likeGecko)Version/5.1Safari/534.5",
    "Mozilla/5.0(Macintosh;U;IntelMacOSX10_6_8;en-us)AppleWebKit/534.50(KHTML,likeGecko)Version/5.1Safari/534.50",
    "Mozilla/5.0(Macintosh;IntelMacOSX10.6;rv:2.0.1)Gecko/20100101Firefox/4.0.1",
    "Opera/9.80(Macintosh;IntelMacOSX10.6.8;U;en)Presto/2.8.131Version/11.11",
    "Mozilla/5.0(Macintosh;IntelMacOSX10_7_0)AppleWebKit/535.11(KHTML,likeGecko)Chrome/17.0.963.56Safari/535.11",
    "Mozilla/4.0(compatible;MSIE7.0;WindowsNT5.1;360SE)",
    "Mozilla/4.0(compatible;MSIE7.0;WindowsNT5.1;Maxthon2.0)",
    "Mozilla/4.0(compatible;MSIE7.0;WindowsNT5.1;TencentTraveler4.0)",
    "Mozilla/4.0(compatible;MSIE7.0;WindowsNT5.1)",
]

# Each listing page holds 25 topics; widen the range() stop to crawl
# more pages (e.g. range(0, 100, 25) for the first four pages).
for start in range(0, 25, 25):
    user_agent = random.choice(agentlist)
    header = {
        "Accept": "text/event-stream",
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
        "Origin": "https://www.douban.com",
        "Referer": "https://www.douban.com/group/topic/109924260/",  # change this if u want
        # BUG FIX: the User-Agent was hard-coded, so the random choice
        # made above was never actually used.
        "User-Agent": user_agent,
    }

    # BUG FIX: `start` was never appended to the listing URL, so every
    # iteration of the outer loop re-fetched the same (first) page.
    request = urllib2.Request(url + str(start), headers=header)
    response = urllib2.urlopen(request).read()

    # Scrape poster ids, topic URLs and topic titles from the listing HTML.
    id_list = re.findall(r'<a href="https://www.douban.com/people/(.*?)/" ', str(response))
    title_url_list = re.findall(r'<a href="(.*?)".*?title=', str(response))
    title_list = re.findall(r'<a href=".*?".*?title="(.*)" class=.*?', str(response))

    # zip() stops at the shortest list, so a scrape that yields lists of
    # different lengths no longer raises IndexError like the old
    # index-based loop did.  `user_id` renamed from `id`, which shadowed
    # the builtin.
    for user_id, title_url, title in zip(id_list, title_url_list, title_list):
        print(title_url)
        print(title)

        title_request = urllib2.Request(title_url, headers=header)
        title_response = urllib2.urlopen(title_request).read()
        # The image markup inside a topic is too irregular for a regex,
        # so the topic page is parsed with lxml and queried via XPath.
        title_content = etree.HTML(title_response)

        pic_list = title_content.xpath('//*[@id="link-report"]/div/div/div/div/img/@src')
        if len(pic_list) == 0:
            # The group uses two different topic layouts; fall back to
            # the shallower XPath when the first one matches nothing.
            pic_list = title_content.xpath('//*[@id="link-report"]/div/div/img/@src')

        for count in range(len(pic_list)):
            pic_url = pic_list[count]
            print(pic_url)
            last_name = pic_url[-4:]  # keep the ".jpg"/".png"-style suffix
            pic_name = str(user_id) + '_' + str(count) + last_name
            pic_data = urllib2.urlopen(pic_url).read()
            with open('pic_from_douban/' + pic_name, 'wb') as f:
                f.write(pic_data)
# _*_ coding :'utf-8' _*_
# NOTE(review): everything below is a VERBATIM second copy of the script
# above — the blog scrape pasted the article body twice.  The original
# indentation was also lost in the scrape, so this text is not runnable
# as-is; the structure (loop/`if` bodies) must be re-indented.
import urllib2
from lxml import etree
import os
import re
import random
# NOTE(review): duplicate import — `etree` was already imported above.
from lxml import etree
# Create the output directory for the downloaded images, once.
if not os.path.exists('pic_from_douban'):
os.mkdir("pic_from_douban")
# Group listing URL; "start" is the topic offset query parameter.
url = "https://www.douban.com/group/haixiuzu/discussion?start="
# Pool of User-Agent strings to pick from at random.
agentlist =[ # no useless
"Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Mobile Safari/537.36",
"Mozilla/5.0(Windows;U;WindowsNT6.1;en-us)AppleWebKit/534.50(KHTML,likeGecko)Version/5.1Safari/534.5",
"Mozilla/5.0(Macintosh;U;IntelMacOSX10_6_8;en-us)AppleWebKit/534.50(KHTML,likeGecko)Version/5.1Safari/534.50",
"Mozilla/5.0(Macintosh;IntelMacOSX10.6;rv:2.0.1)Gecko/20100101Firefox/4.0.1",
"Opera/9.80(Macintosh;IntelMacOSX10.6.8;U;en)Presto/2.8.131Version/11.11",
"Mozilla/5.0(Macintosh;IntelMacOSX10_7_0)AppleWebKit/535.11(KHTML,likeGecko)Chrome/17.0.963.56Safari/535.11",
"Mozilla/4.0(compatible;MSIE7.0;WindowsNT5.1;360SE)",
"Mozilla/4.0(compatible;MSIE7.0;WindowsNT5.1;Maxthon2.0)",
"Mozilla/4.0(compatible;MSIE7.0;WindowsNT5.1;TencentTraveler4.0)",
"Mozilla/4.0(compatible;MSIE7.0;WindowsNT5.1)"
]
for start in range(0,25,25):
# each listing page of a douban group holds 25 topics
user_agent =random.choice(agentlist)
# NOTE(review): `user_agent` is never used — the header below hard-codes
# a User-Agent string instead of this random choice.
header={
"Accept":"text/event-stream",
#"Accept-Encoding":"gzip, deflate, sdch, br",
"Accept-Language":"zh-CN,zh;q=0.8",
"Cache-Control":"no-cache",
"Connection":"keep-alive",
#"Host":"push.douban.com":"4397",
"Origin":"https://www.douban.com",
"Referer":"https://www.douban.com/group/topic/109924260/", # change this if u want
"User-Agent":"Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Mobile Safari/537.36",
}
# NOTE(review): BUG — `start` is never appended to `url`, so every
# iteration of the outer loop fetches the same first listing page.
request = urllib2.Request(url,headers=header)
response = urllib2.urlopen(request).read()
print response
# Scrape poster ids, topic URLs and topic titles from the listing HTML.
id_list=re.findall(r'<a href="https://www.douban.com/people/(.*?)/" ',str(response)) #every title of group
title_url_list=re.findall(r'<a href="(.*?)".*?title=',str(response))
title_list=re.findall(r'<a href=".*?".*?title="(.*)" class=.*?',str(response))
#for id in id_list:
# print id
#for title in title_list:
# print title
i = 0
for title_url in title_url_list:
# NOTE(review): `id` shadows the builtin; indexing the parallel lists by
# `i` raises IndexError if the three findall() results differ in length.
id =id_list[i]
print title_url
print title_list[i]
i+=1
#user_agent = random.choice(agentlist)
#header={
# "User-Agent": user_agent
#}
title_request = urllib2.Request(title_url,headers=header)
title_response = urllib2.urlopen(title_request).read()
title_content =etree.HTML(title_response)# images in a topic are too irregular to match with a regex
count =0
pic_list = title_content.xpath('//*[@id="link-report"]/div/div/div/div/img/@src') # so lxml + XPath is used instead
print str(len(pic_list))+"----------------------------------"
if len(pic_list) == 0:
pic_list = title_content.xpath('//*[@id="link-report"]/div/div/img/@src')# the group uses two different topic layouts; fall back to the second XPath
print str(len(pic_list))+"***********"
# Download each image, naming it "<poster id>_<index><suffix>".
for count in range(0,len(pic_list)):
print pic_list[count]
pic_url= pic_list[count]
last_name= pic_url[-4:]
pic_name= str(id)+'_'+str(count)+last_name
pic_data = urllib2.urlopen(pic_url).read()
with open('pic_from_douban/'+pic_name,'wb') as f:
f.write(pic_data)
print i
相关文章推荐
- 爬虫抓取豆瓣小组里的图片
- 用Scrapy抓取豆瓣小组数据(三)
- 用Scrapy抓取豆瓣小组数据(一)
- 豆瓣2100部动漫页面的网页源码(包括评分,导演,类型,简介等信息,附抓取代码)
- 用Scrapy抓取豆瓣小组数据(二)
- 利用python scrapy 框架抓取豆瓣小组数据
- 用Scrapy抓取豆瓣小组数据(三)
- Scrapy 爬虫实例 抓取豆瓣小组信息并保存到mongodb中 推荐
- 用Scrapy抓取豆瓣小组数据(二)
- 用Scrapy抓取豆瓣小组数据(一)
- 用Scrapy抓取豆瓣小组数据(二)
- 用Scrapy抓取豆瓣小组数据
- 页面上用c标签循环输出结果集时,怎么按一定数量和整齐的格式排序
- puppeteer学习(三)——抓取“相关搜索”关键词&搜索豆瓣图书榜
- Java HTML页面抓取实例
- 利用python抓取豆瓣top500的电影
- Python动态页面抓取超级指南
- php中如何实现页面抓取功能
- 简单抓取页面
- 使用java jsoup抓取页面中的数据