
Python web crawler with login information

2018-01-26 20:04
Notes: 1. Use Fiddler to capture the post-login headers and cookies. 2. Pause briefly after each page fetch to avoid triggering anti-crawler checks. 3. Close Fiddler before running the crawler, otherwise its proxy port stays occupied.
Open issue: when a large number of records is scraped, the site's anti-crawler mechanism is still triggered.
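
One possible mitigation for that open issue (a sketch only, not the method used in the code below; the delay range, retry count, and the polite_get helper name are assumptions, not values known to satisfy this site's rate limits) is to randomize the pause between requests and back off exponentially when a fetch fails:

import random
import time

import requests

def polite_get(url, headers, cookies, max_retries=3):
    """Fetch url, pausing a random 5-15 s first and backing off on failure.

    Hypothetical helper: the thresholds are guesses, not values tested
    against any particular site's rate limiting.
    """
    for attempt in range(max_retries):
        # pause grows 1x, 2x, 4x ... across retries
        time.sleep(random.uniform(5, 15) * (2 ** attempt))
        resp = requests.get(url, headers=headers, cookies=cookies, timeout=30)
        if resp.status_code == 200:
            return resp
        print('retry %d for %s (HTTP %d)' % (attempt + 1, url, resp.status_code))
    raise RuntimeError('giving up on ' + url)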
Capture the post-login headers and cookies with Fiddler; the same values can also be inspected in Firefox via the F12 developer tools.
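
Before wiring the captured values into the crawler, a quick sanity check can confirm they still represent a logged-in session. A minimal sketch (the URL, the placeholder cookie value, and the logged-in marker string are assumptions, not taken from the original code):

import requests

# Placeholders: paste the headers/cookies captured with Fiddler or F12.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko'}
cookies = {'auth_token': '<paste the captured auth_token here>'}

resp = requests.get('https://www.tianyancha.com/', headers=headers, cookies=cookies)

# '退出' ("log out") is a hypothetical logged-in marker; substitute any text
# that only appears on the page when you are logged in.
if '退出' in resp.text:
    print('session looks valid')
else:
    print('cookies expired or wrong - capture them again')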



#-*- coding: utf-8 -*-
import random
import re
import time

import bs4
import requests


def main(startUrl):
    print(startUrl)

    global csvContent

    # Headers captured with Fiddler after logging in
    headers = {'Accept': 'text/html, application/xhtml+xml, */*',
               'Accept-Encoding': 'gzip, deflate',
               'Accept-Language': 'zh-CN',
               'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko'
               }

    cookies = {
        '_csrf': 'iN90P1mtdXxv/ZWpt8W8kg==',
        '_csrf_bk': 'b095b5ac898229ebf3adc8f0e901523a',
        'aliyungf_tc': 'AQAAAAoHdhpO9Q4AHJUE2sFxGtgWCuH9',
        'auth_token': 'eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxODU1MDEzNTUyMSIsImlhdCI6MTUxNzE5MTI3OSwiZXhwIjoxNTMyNzQzMjc5fQ.z9l-sSAyPlLFsD97Yrs7khD1dRBCyyByb-sijUgorQzgR5HdVykD1_W_gn8R2aZSUSRhR_Dq0jPNEYPJlI22ew',
        'bannerFlag': 'true',
        'csrfToken': '9_lfoqS9eAThxvDa8XjDHA6B',
        'Hm_lpvt_e92c8d65d92d534b0fc290df538b4758': '1517191269',
        'Hm_lvt_e92c8d65d92d534b0fc290df538b4758': '1516864063',
        'OA': 'TkU7nzii8Vwbw4JYrV6kjTg0WS645VnS6CIervVVizo=',
        'ssuid': '360989088',
        'TYCID': '709b5a10019e11e89c185fb756815612',
        'tyc-user-info': '%257B%2522token%2522%253A%2522eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxODU1MDEzNTUyMSIsImlhdCI6MTUxNzE5MTI3OSwiZXhwIjoxNTMyNzQzMjc5fQ.z9l-sSAyPlLFsD97Yrs7khD1dRBCyyByb-sijUgorQzgR5HdVykD1_W_gn8R2aZSUSRhR_Dq0jPNEYPJlI22ew%2522%252C%2522integrity%2522%253A%25220%2525%2522%252C%2522state%2522%253A%25220%2522%252C%2522vipManager%2522%253A%25220%2522%252C%2522vnum%2522%253A%25220%2522%252C%2522onum%2522%253A%25220%2522%252C%2522mobile%2522%253A%252218550135521%2522%257D',
        'undefined': '709b5a10019e11e89c185fb756815612'
    }

    resultPage = requests.get(startUrl, headers=headers, cookies=cookies)

    # Pause a random 5-15 seconds between requests to reduce the chance
    # of triggering the site's anti-crawler checks
    randomTime = random.random() * 10 + 5
    print('randomTime ' + str(randomTime))
    time.sleep(randomTime)

    soup = bs4.BeautifulSoup(resultPage.text, 'html.parser')

    # Industry name shown at the top of the search result page
    industry = soup.find_all(attrs={'class': 'in-block overflow-width vertival-middle sec-c2'})[0].string

    # One search_right_item block per company in the result list
    companys = soup.find_all(attrs={'class': 'search_right_item ml10'})

    for company in companys:
        tempCsvContent = ''
        tempCsvContent += industry + ','
        tempCsvContent += company.contents[0].a.string + ','

        # if company.contents[0].a.string == '昆山市大千园艺场':
        #     break

        for child in company.contents[1].div.children:
            # Keep the raw HTML of the node so the regexes below can match
            # both the visible labels and the markup (e.g. the f20 span)
            content = str(child)

            if re.search("法定代表人", content):  # legal representative
                try:
                    tempCsvContent += child.a.string + ','
                except Exception:
                    tempCsvContent += ','
            elif re.search("注册资本", content):  # registered capital
                try:
                    tempCsvContent += child.span.string + ','
                except Exception:
                    tempCsvContent += ','
            elif re.search("注册时间", content):  # registration date
                try:
                    tempCsvContent += child.span.string + ','
                except Exception:
                    tempCsvContent += ','
            elif re.search("江苏", content):  # address block; also carries the score in an f20 span
                try:
                    tempCsvContent += re.match(r'^.*?f20">(\d+).*$', content).group(1) + ','
                except Exception:
                    tempCsvContent += ','

        try:
            tempCsvContent += company.contents[0].a.attrs['href'] + ','

            # Follow the link to the company's detail page
            link = company.contents[0].a.attrs['href']
            linkResult = requests.get(link, headers=headers, cookies=cookies)

            randomTime2 = random.random() * 10 + 5
            print('randomTime 2 ' + str(randomTime2) + ' ' + link)
            time.sleep(randomTime2)

            linkSoup = bs4.BeautifulSoup(linkResult.text, 'html.parser')

            # Registered address; strip the "附近公司" (nearby companies) suffix
            location = linkSoup.find_all(attrs={'colspan': '4'})[0].text.replace('附近公司', '')
            tempCsvContent += location + ','

            selfRisk = linkSoup.find(attrs={'class': 'new-err selfRisk pl5 pr5'}).string
            tempCsvContent += selfRisk + ','

            roundRisk = linkSoup.find(attrs={'class': 'new-err roundRisk pl5 pr5'}).string
            tempCsvContent += roundRisk + ','

            # Navigation bar on the detail page; each nav_item_Box holds one group of counters
            riskItems = linkSoup.find(attrs={'class': 'navigation new-border-top new-border-right new-c3 js-company-navigation'}).find(attrs={'class': 'over-hide'}).find_all(attrs={'class': 'float-left f14 text-center nav_item_Box'})

            # Groups 2-5 hold the per-category counters (lawsuits, court
            # notices, operating anomalies, ...); an item without a
            # <span class="c9"> counter means a count of 0
            for idx in range(2, 6):
                for content in riskItems[idx].contents[1]:
                    value = str(content)
                    try:
                        if '<span class="c9">' in value:
                            tempCsvContent += content.span.string + ','
                        else:
                            tempCsvContent += '0' + ','
                    except Exception:
                        tempCsvContent += '0' + ','

            tempCsvContent = tempCsvContent.rstrip(',')
            tempCsvContent += '\r'

            csvContent += tempCsvContent
        except Exception:
            # Any failure on the detail page drops the whole row
            print('exception')
            tempCsvContent = ''

    print(csvContent)
    # blank lines to separate pages in the console output
    print('\n' * 4)

if __name__ == '__main__':

    for i in range(3, 4):

        # Two-digit industry category code, e.g. oc03
        name = str(i).zfill(2)
        # Uses the platform default encoding; pass encoding='utf-8-sig' to
        # open() if Excel garbles the Chinese column names
        file = open('D:\\result-' + name + '.csv', 'w')
        csvContent = '行业分类,企业描述,法定代表人,注册资本,注册时间,分数, 细节, 注册地址, 天眼风险-自身风险, 天眼风险-周边风险, 法律诉讼, 法院公告, 失信人, 被执行人, 开庭公告, 经营异常, 行政处罚, 严重违法,股权出质,动产抵押,欠税公告,司法拍卖, 招投标,债券信息,购地信息,招聘,税务评级,抽查检查,产品信息,进出口信用,资质证书,微信公众号,商标信息,专利,软件著作权,作品著作权,网站备案\r'

        # Pages 1-5 of search results for this industry category
        for j in range(1, 6):

            # randomTime = random.random()*10+10
            # print('randomTime header ' + str(randomTime))
            # time.sleep(randomTime)

            main('https://szh.tianyancha.com/search/oc' + str(i).zfill(2) + '/p' + str(j))

        file.write(csvContent)
        file.close()
        csvContent = ''

    print(csvContent)


Sample output (screenshot)



Code link