您的位置:首页 > 其它

7、批量关键字百度搜索结果url解码

2017-03-05 12:23 776 查看
import requests
from bs4 import BeautifulSoup
import re
import time

#coding:utf-8

# Headers sent with every Baidu search request.
# NOTE(review): the Cookie value embeds what appears to be a logged-in session
# token (BDUSS). Hard-coding credentials in source is a security risk; consider
# loading it from the environment or a local config file instead.
HEADER = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36',
    'Cookie': 'PSTM=1476231684; BIDUPSID=4F526560482E2A5E68D69CC8B0998806; plus_cv=1::m:92e3c68f; BAIDUID=C5A710455602AEA5BEC3D1B13B26321B:FG=1;'
              ' BDUSS=W5zS3JSeVYwSHZjVm5SdTdjQjlKNC1FLWJqbklvaEptZjVZVkl2bXhMN1o1amhZSVFBQUFBJCQAAAAAAAAAAAEAAACj2nZjanVleWluZ3MAAAAAAAAAAAAAAAAAAAA'
              'AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAANlZEVjZWRFYT; BD_HOME=1; BD_UPN=12314353; sug=3; sugstore=0; ORIGIN=2; bdime=0;'
              ' H_PS_645EC=78d5XI4%2Bj6NkSjLKSmkiYdx%2F5jHNa0c4UemYz6WwEpyczIPebiQwaLtzwnXd2gUHv28P; BDRCVFR[feWj1Vr5u3D]=I67x6TjHwwYf0; BD_CK_SAM=1;'
              ' PSINO=6; H_PS_PSSID=1448_18288_21112_17001_20241_21455_21406_21394_21377_21192_20929; BDSVRTM=0',
}

SEARCH_URL = 'https://www.baidu.com/s'
REQUEST_TIMEOUT = 10  # seconds; without a timeout a stalled connection hangs the whole batch
MAX_RESULTS = 10      # only the first ten organic results are ranked
REDIRECT_CODES = (301, 302, 303, 307, 308)


def _read_keywords(path):
    """Return ``[(1, keyword), (2, keyword), ...]`` for the non-blank lines of *path*."""
    # utf-8-sig transparently drops a BOM that some editors prepend.
    with open(path, 'r', encoding='utf-8-sig') as f:
        raw_lines = f.read().split('\n')
    # Strip stray '\r' / spaces and drop blank lines so that a trailing newline
    # in the file does not turn into an empty search.
    keywords = [line.strip() for line in raw_lines]
    return list(enumerate((word for word in keywords if word), start=1))


def _resolve_result_url(href):
    """Turn a Baidu result href into the real target URL, or ``'error'`` on failure."""
    if not href:
        return 'error'
    # Only Baidu tracking links (".../link?url=...") need to be resolved.
    if href.find('link?url=') <= 0:
        return href
    try:
        response = requests.get(href, allow_redirects=False, timeout=REQUEST_TIMEOUT)
    except requests.RequestException:
        return 'error'
    if response.status_code == 200:
        # Baidu answered with an interstitial page; the target is the first
        # single-quoted string inside <head><noscript>.
        noscript = BeautifulSoup(response.text, 'lxml').select('head > noscript')
        if not noscript:
            return 'error'
        match = re.search(r'\'(.*?)\'', str(noscript[0]), re.S)
        return match.group(1) if match else 'error'
    if response.status_code in REDIRECT_CODES:
        return response.headers.get('location', 'error')
    return 'error'


def main(key_path='key.txt', out_path='info.txt'):
    """Search Baidu for every keyword in *key_path* and append the results to *out_path*.

    For each keyword the first ``MAX_RESULTS`` result titles are collected,
    their redirect links are resolved to the real URLs, and one ``str(dict)``
    line per result (keys ``key``, ``title``, ``url``, ``rank``) is appended
    to *out_path*. A progress message is printed after each keyword.
    """
    key_words = _read_keywords(key_path)
    total = len(key_words)

    for key in key_words:
        index, keyword = key
        # Pass the keyword through ``params`` so it is URL-encoded; raw string
        # concatenation breaks on '&', '#', '+', spaces, etc.
        response = requests.get(
            SEARCH_URL,
            params={'wd': keyword},
            headers=HEADER,
            timeout=REQUEST_TIMEOUT,
        )
        time.sleep(2)  # throttle between search requests
        soup = BeautifulSoup(response.text, 'lxml')
        anchors = soup.select('#content_left > div > h3 > a')[:MAX_RESULTS]

        lines = []
        for rank, anchor in enumerate(anchors, start=1):
            data = {
                'key': key,  # (index, keyword) tuple, same shape as before
                'title': anchor.get_text(),
                # Stored as text: encoding to bytes would be written as b'...'.
                'url': _resolve_result_url(anchor.get('href')),
                'rank': rank,
            }
            lines.append(str(data) + '\n')

        if lines:
            with open(out_path, 'a', encoding='utf-8') as f:
                f.writelines(lines)
        print('已完成采集任务' + str(index) + '**********总采集任务' + str(total))


if __name__ == '__main__':
    main()
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签:  url 搜索 百度
相关文章推荐