您的位置:首页 > 编程语言 > Python开发

初学爬虫一、获取某分类下所有页地址

2017-01-09 20:29 447 查看
import requests
from bs4 import BeautifulSoup
import sys
# NOTE(review): the stdlib `sys` module has no `encoding` attribute; this
# assignment only creates an unused module attribute and does not affect
# text decoding — the response encoding is set per-request below. Looks
# like a Python 2 `sys.setdefaultencoding` habit; safe to remove. TODO confirm.
sys.encoding = "utf8"
class PageCatch(object):
    """Fetch a category listing page and enumerate the URLs of all its pages.

    Pagination links on the target site look like ``list45_2.html`` where
    the number after ``_`` is the (1-based) page number.
    """

    def __init__(self, host, shortUrl):
        # host: site root + category prefix, e.g. "http://www.jb51.net/books/"
        # shortUrl: first page of the listing, e.g. "list152_35.html"
        self.host = host
        self.shortUrl = shortUrl
        self.url = host + shortUrl  # full URL of the listing's first page

    def __getPageContent(self):
        """Return the decoded HTML of ``self.url``, or "" on a non-200 response."""
        req = requests.get(self.url)
        if req.status_code == 200:
            # The target site declares its pages as gb2312; force that
            # encoding before reading .text so Chinese content decodes.
            req.encoding = "gb2312"
            return req.text
        return ""

    def __getMaxPageNumAndUrl(self):
        """Parse the pager block and return ``(page_count, sample_page_url)``.

        Returns ``(0, "")`` when the ``.plist`` pager is missing or has an
        unexpected shape, so callers can always unpack two values.
        """
        reqContent = self.__getPageContent()
        soup = BeautifulSoup(reqContent, "html.parser")
        for ul in soup.select(".plist"):
            # <strong> holds the total page count.
            maxPageNum = ul.select("strong")[0].text
            alink = ul.select("a")
            # On the expected layout the last anchor is a "#" placeholder
            # and the second anchor carries the url pattern (list45_2.html).
            if alink[-1]['href'] == "#":
                return int(maxPageNum), alink[1]['href']
        # BUG FIX: the original returned a bare 0 here, which made the
        # caller's two-value unpacking raise TypeError. Return a 2-tuple.
        return 0, ""

    def __formatPage(self, pageNum):
        """Build the short URL for zero-based page index *pageNum*.

        E.g. shortUrl "list45_1.html" with pageNum 1 -> "list45_2.html".
        """
        lineBeginSite = self.shortUrl.index("_") + 1  # just past the "_"
        docBeginSite = self.shortUrl.index(".")       # start of ".html"
        return (self.shortUrl[:lineBeginSite]
                + str(pageNum + 1)
                + self.shortUrl[docBeginSite:])

    def getBookPageList(self):
        """Return the full URL of every page in this category listing."""
        maxPageNum, urlPattern = self.__getMaxPageNumAndUrl()
        return [self.host + self.__formatPage(i) for i in range(maxPageNum)]
if __name__ == '__main__':
    # Demo: collect and print every page URL of category list152.
    catcher = PageCatch("http://www.jb51.net/books/", "list152_35.html")
    pages = catcher.getBookPageList()
    print(pages)
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签:  Python 爬虫