
Getting a site's page count and batch-crawling pages by href with Python (requests + BeautifulSoup)

2017-12-12 17:28
This post is the parameterized follow-up to my previous post (http://blog.csdn.net/trisyp/article/details/78732630): instead of hard-coding URLs, it reads every href out of the page's HTML elements and then crawls the linked pages one by one.
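
The core pattern is: fetch a listing page, pull the href targets out of the parsed HTML, then request each target in turn. Here is a minimal sketch of that idea, assuming a placeholder listing URL and plain <a> tags rather than the real sporttery.cn markup:

import requests
from bs4 import BeautifulSoup

listing_html = requests.get('http://example.com/list?page=1').text  # placeholder URL, not the real site
soup = BeautifulSoup(listing_html, 'html.parser')
hrefs = [a.get('href') for a in soup.find_all('a')]  # collect every link target on the page
for href in hrefs:
    detail_html = requests.get(href).text  # then crawl each linked page in turn
    # ... parse detail_html here ...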

The complete code is as follows:

import requests
from bs4 import BeautifulSoup
import time
import datetime as dt
from datetime import datetime

week_day_dict = ['周一', '周二', '周三', '周四', '周五', '周六', '周日']  # weekday labels as shown on the site (Mon-Sun); kept in Chinese to match the scraped text

def getHTML(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3236.0 Safari/537.36'}
    r = requests.get(url, headers=headers)  # headers must be passed as a keyword argument
    r.raise_for_status()
    r.encoding = r.apparent_encoding  # guess the encoding from the content
    r.close()
    return r.text

def getPages(html):  # read the total page count from the pager
    soup = BeautifulSoup(html, 'html.parser')  # parse the page HTML
    tag = soup.find_all('table', attrs={'class': {'m-page'}})
    pageTag = tag[0].find_all('ul')
    pageTag1 = pageTag[0].find_all('li')
    pageCount = pageTag1[-3].text  # the third-from-last <li> holds the last page number
    return int(pageCount)

def getHrefs(html):  # collect the match-detail links from the results table
    soup = BeautifulSoup(html, 'html.parser')  # parse the page HTML
    tag = soup.find_all('table', attrs={'class': {'m-tab'}})
    hrefTag = tag[0].find_all('tr')
    del hrefTag[-1]  # drop the last two rows, which are not match rows
    del hrefTag[-1]
    hrefs = []
    for i in range(len(hrefTag)):
        hrefTag1 = hrefTag[i].find_all('td')
        hrefTag2 = hrefTag1[-2].find_all('a')
        hrefs.append(hrefTag2[0].get('href'))
    return hrefs

def getTableName(html):  # get the title of each table on a detail page
    soup = BeautifulSoup(html, 'html.parser')  # parse the page HTML
    tag = soup.find_all('div', attrs={'class': {'kj-tit'}})
    tableName = []
    for infoi in tag:
        tableName.append(infoi.text.replace("\n", "").replace(" ", ""))
    return tableName

def fillUnivlist(html):  # save the contents of the two middle tables on the page
    result = []
    tableNames = getTableName(html)  # table titles
    soup = BeautifulSoup(html, 'html.parser')  # parse the page HTML
    timeTag = soup.find_all('div', attrs={'class': {'c-time'}})
    numTag = soup.find_all('div', attrs={'class': {'c-num'}})
    ctime = timeTag[0].text[29:39]  # slice the yyyy-mm-dd date out of the time div
    num = numTag[0].text[0:5]  # weekday label plus match number, e.g. '周一001'
    ctimeweek = datetime.strptime(ctime, "%Y-%m-%d")
    if week_day_dict[ctimeweek.weekday()-1] == num[0:2]:
        # the listed weekday is the day before ctime, so roll the date back one day
        oneday = dt.timedelta(days=1)
        ctimeweek1 = ctimeweek - oneday
        ctime1 = ctimeweek1.strftime("%Y-%m-%d")
        dateNumbers = ctime1.replace("-", "") + num[2:5]
    else:
        dateNumbers = ctime + num
    tag = soup.find_all('table', attrs={'class': {'kj-table'}})  # all tables on the page
    # print(str(tag[0]))
    for i in range(1, 3):  # only the second and third tables are wanted
        infoTag = tag[i]
        contentTr = infoTag.find_all('tr')
        for j in range(len(contentTr)):
            if j == 0:  # header row: collect the <th> cells
                contentTh = contentTr[j].find_all('th')
                info1 = dateNumbers + "," + tableNames[i]
                for infok in contentTh:
                    info1 = info1 + "," + infok.text.replace(" ", "")
            else:  # data row: collect the <td> cells
                contentTd = contentTr[j].find_all('td')
                info1 = dateNumbers + "," + tableNames[i]
                for infok in contentTd:
                    info1 = info1 + "," + infok.text
            result.append(info1)
    return result

def writeUnivlist(result, fpath, num):
    with open(fpath, 'a', encoding='utf-8') as f:  # append to the output file
        for i in range(num):
            f.write(result[i] + '\n')

def main():
    startDate = input("startDate (yyyy-mm-dd): ")
    lastDate = input("lastDate (yyyy-mm-dd): ")
    url = "http://info.sporttery.cn/basketball/match_result.php?page=1&start_date=" + startDate + "&end_date=" + lastDate
    html = getHTML(url)
    pageNumber = getPages(html)
    time.sleep(2)
    hrefs = getHrefs(html)
    count = 1
    for i in range(2, pageNumber + 1):  # walk the remaining result pages
        url = "http://info.sporttery.cn/basketball/match_result.php?page=" + str(i) + "&start_date=" + startDate + "&end_date=" + lastDate
        html = getHTML(url)
        time.sleep(1)
        href = getHrefs(html)
        for hj in href:
            hrefs.append(hj)
        time.sleep(1)
        count += 1
        print("\rpage progress: {:.2f}%".format(count * 100 / pageNumber), end="")

    # output_href = 'D:/JCWBasketballHrefs.txt'
    # writeUnivlist(hrefs, output_href, len(hrefs))

    count = 0
    output_file = 'D:/JCWBasketball.txt'
    hrefNumber = len(hrefs)
    for i in range(hrefNumber):  # crawl each match-detail page and append its tables
        time.sleep(1)
        result = fillUnivlist(getHTML(hrefs[i]))
        time.sleep(1)
        writeUnivlist(result, output_file, len(result))
        count += 1
        print("\rhref progress: {:.2f}%".format(count * 100 / hrefNumber), end="")

if __name__ == '__main__':
    main()
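
To run it, enter the start and end dates at the prompts in yyyy-mm-dd form. The results are appended as comma-separated lines to D:/JCWBasketball.txt; since the file is opened in append mode, delete it first if you want a clean run.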
Tags: web crawler, python, html