python获取网页page数,同时按照href批量爬取网页(requests+BeautifulSoup)
2017-12-12 17:28
615 查看
本篇博客是上篇博客(http://blog.csdn.net/trisyp/article/details/78732630)的传参版,即通过html元素获取页面的所有href,然后逐个爬取
完整代码如下:
import requests
from bs4 import BeautifulSoup
import os
import time
import datetime as dt
from datetime import datetime

# Chinese weekday names; index 0 is Monday, matching datetime.weekday().
week_day_dict = ['周一', '周二', '周三', '周四', '周五', '周六', '周日']


def getHTML(url):
    """Fetch *url* and return its decoded HTML text.

    Raises requests.HTTPError (via raise_for_status) on a non-2xx response.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3236.0 Safari/537.36'}
    # BUG FIX: the original called requests.get(url, headers), which binds the
    # header dict to the *params* argument (query string) — the User-Agent was
    # never actually sent.  It must be passed as the headers= keyword.
    r = requests.get(url, headers=headers)
    r.raise_for_status()
    r.encoding = r.apparent_encoding  # let requests sniff the real charset
    r.close()
    return r.text


def getPages(html):
    """Parse the pagination bar and return the total page count as an int.

    Assumes the count lives in the third-from-last <li> of the first <ul>
    inside the <table class="m-page"> element — TODO confirm against the site.
    """
    soup = BeautifulSoup(html, 'html.parser')
    tag = soup.find_all('table', attrs={'class': {'m-page'}})
    pageTag = tag[0].find_all('ul')
    pageTag1 = pageTag[0].find_all('li')
    pageCount = pageTag1[-3].text
    return int(pageCount)


def getHrefs(html):
    """Return the list of detail-page hrefs found in the results table.

    The last two <tr> rows of <table class="m-tab"> are dropped (presumably
    footer/pagination rows — verify against the live markup).
    """
    soup = BeautifulSoup(html, 'html.parser')
    tag = soup.find_all('table', attrs={'class': {'m-tab'}})
    hrefTag = tag[0].find_all('tr')
    del hrefTag[-1]
    del hrefTag[-1]
    hrefs = []
    for row in hrefTag:
        cells = row.find_all('td')
        # The link sits in the second-to-last cell of each row.
        links = cells[-2].find_all('a')
        hrefs.append(links[0].get('href'))
    return hrefs


def getTableName(html):
    """Return the title text of every <div class="kj-tit"> block, with
    newlines and spaces stripped."""
    soup = BeautifulSoup(html, 'html.parser')
    tag = soup.find_all('div', attrs={'class': {'kj-tit'}})
    tableName = []
    for infoi in tag:
        tableName.append(infoi.text.replace("\n", "").replace(" ", ""))
    return tableName


def fillUnivlist(html):
    """Extract the middle two result tables of a detail page.

    Returns a list of comma-joined CSV-style strings, each prefixed with a
    date+issue-number key derived from the page header.
    """
    result = []
    tableNames = getTableName(html)
    soup = BeautifulSoup(html, 'html.parser')
    timeTag = soup.find_all('div', attrs={'class': {'c-time'}})
    numTag = soup.find_all('div', attrs={'class': {'c-num'}})
    # Magic slices assume a fixed header layout: chars 29-38 hold the
    # yyyy-mm-dd date, the first 5 chars of c-num hold "周X" + issue digits.
    # NOTE(review): confirm these offsets against the live page.
    ctime = timeTag[0].text[29:39]
    num = numTag[0].text[0:5]
    ctimeweek = datetime.strptime(ctime, "%Y-%m-%d")
    # If the weekday label in c-num matches the *previous* day's weekday
    # (weekday()-1 shifts the index back one), the draw belongs to the day
    # before the displayed date, so roll the date back by one day.
    if week_day_dict[ctimeweek.weekday() - 1] == num[0:2]:
        oneday = dt.timedelta(days=1)
        ctimeweek1 = ctimeweek - oneday
        ctime1 = ctimeweek1.strftime("%Y-%m-%d")
        dateNumbers = ctime1.replace("-", "") + num[2:5]
    else:
        dateNumbers = ctime + num
    tag = soup.find_all('table', attrs={'class': {'kj-table'}})  # all result tables
    # Tables 1 and 2 are the two we want; table 0 is skipped.
    for i in range(1, 3):
        infoTag = tag[i]
        contentTr = infoTag.find_all('tr')
        for j in range(len(contentTr)):
            if j == 0:
                # Header row: <th> cells, spaces stripped.
                contentTh = contentTr[j].find_all('th')
                info1 = dateNumbers + "," + tableNames[i]
                for infok in contentTh:
                    info1 = info1 + "," + infok.text.replace(" ", "")
            else:
                # Data row: <td> cells kept verbatim.
                contentTd = contentTr[j].find_all('td')
                info1 = dateNumbers + "," + tableNames[i]
                for infok in contentTd:
                    info1 = info1 + "," + infok.text
            result.append(info1)
    return result


def writeUnivlist(result, fpath, num):
    """Append the first *num* entries of *result* to *fpath* (UTF-8),
    one per line."""
    # BUG FIX: the original read "as bf19 f:" — a scrape artifact that is a
    # SyntaxError; it must be "as f:".  The explicit f.close() was also
    # redundant inside the with-block and has been removed.
    with open(fpath, 'a', encoding='utf-8') as f:
        for i in range(num):
            f.write(result[i] + '\n')


def main():
    """Prompt for a date range, crawl every result page for hrefs, then
    crawl each href and append its tables to D:/JCWBasketball.txt."""
    startDate = input("startDate(格式为yyyy-mm-dd):")
    lastDate = input("lastDate(格式为yyyy-mm-dd):")
    url = "http://info.sporttery.cn/basketball/match_result.php?page=1&start_date=" + startDate + "&end_date=" + lastDate
    html = getHTML(url)
    pageNumber = getPages(html)
    time.sleep(2)  # polite delay between requests
    hrefs = getHrefs(html)
    count = 1
    for i in range(2, pageNumber + 1):
        url = "http://info.sporttery.cn/basketball/match_result.php?page=" + str(i) + "&start_date=" + startDate + "&end_date=" + lastDate
        html = getHTML(url)
        time.sleep(1)
        href = getHrefs(html)
        for hj in href:
            hrefs.append(hj)
        time.sleep(1)
        count += 1
        print("\r当前page进度: {:.2f}%".format(count * 100 / pageNumber), end="")
    # output_href = 'D:/JCWBasketballHrefs.txt'
    # writeUnivlist(hrefs, output_href, len(hrefs))
    count = 0
    output_file = 'D:/JCWBasketball.txt'
    hrefNumber = len(hrefs)
    for i in range(hrefNumber):
        time.sleep(1)
        result = fillUnivlist(getHTML(hrefs[i]))
        time.sleep(1)
        writeUnivlist(result, output_file, len(result))
        count += 1
        print("\r当前href进度: {:.2f}%".format(count * 100 / hrefNumber), end="")


if __name__ == '__main__':
    main()
相关文章推荐
- python BeautifulSoup获取 网页链接的文字内容
- python3 requests 获取网页时中文乱码
- 使用python抓取并分析数据—链家网(requests+BeautifulSoup)(转)
- Python网页解析:BeautifulSoup vs lxml.html
- python简单爬虫开发(urllib2、requests + BeautifulSoup)
- python eclipse 插件安装 及BeautifulSoup requests selenium在线安装 PhantomJS 安装 环境配置
- python+requests+beautifulsoup爬取大众点评评论信息
- 【Python爬虫】requests+Beautifulsoup存入数据库
- Python练习 requests+BeautifulSoup抓取ZD页面
- Python 爬虫—— requests BeautifulSoup
- 使用Python+selenium+BeautifulSoup抓取动态网页的关键信息
- python3+BeautifulSoup+tkinter 爬虫 获取学校成绩
- python基础学习第五课,大批量获取网页数据基础,requests模块尝试
- 萌新的Python学习日记 - 爬虫无影 - 使用BeautifulSoup + css selector 抓取动态网页内容:Knewone
- Python Beautiful Soup+requests实现爬虫
- Python requests+gevent+BeautifulSoup lxml 干点啥-加点速
- 使用requests+beautifulsoup模块实现python网络爬虫功能
- Python + Requests + BeautifulSoup每日BUG汇总
- Python爬虫知识(1)——scrapy vs requests+BeautifulSoup
- python 爬虫试手 requests+BeautifulSoup