您的位置:首页 > 编程语言 > Python开发

Python 爬虫(获取小说)

2016-10-14 17:15 357 查看
以《笔趣阁》小说网站为例

需求:python3版本以上

安装方法如下:

先安装python3-pip,然后检查下版本,如果版本可以升级,就--upgrade pip 一下,然后再安装beautifulsoup4

sudo apt-get install python3-pip
pip3 --version
pip3 install --upgrade pip
pip3 install beautifulsoup4

代码如下:

#!/usr/bin/env python3

from urllib import request,parse
from bs4 import BeautifulSoup
import time

def search_book(bookname):
url = 'http://www.biquge5200.com/modules/article/search.php?searchkey=' + parse.quote(bookname)
response = request.urlopen(url)
content = response.read().decode('gbk')
soup = BeautifulSoup(content,'html.parser')
menu = []
key = 0
for row in soup.find('table').find_all('tr'):
td1 = row.select('td:nth-of-type(1)')
td3 = row.select('td:nth-of-type(3)')
if(td1 and td3):
name    = td1[0].find('a').string
href    = td1[0].find('a').get('href')
author  = td3[0].string
menu.append({'name':name,'href':href})
print(str(key) + ' 书名:' + name + ' >> 作者:' + author)
key += 1
if(menu):
select_key = -1
while(select_key >= key or select_key < 0):
select_key = int(input('请输入你要下载的小说序号:'))
return menu[int(select_key)]
return []

def get_novel_menu(url):
response = request.urlopen(url)
content = response.read().decode('gbk')
soup = BeautifulSoup(content, 'html.parser')
list = []
for dd in soup.find('div',id="list").find('dt').find_next('dt').find_all_next('dd'):
title = dd.find('a').string
href = dd.find('a').get('href')
list.append({'title':title,'href':href})
return list

def get_novel_content(title,url):
headers = {
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
}
response = request.urlopen(request.Request(url,headers=headers))
content = response.read().decode('gbk')
soup = BeautifulSoup(content, 'html.parser')
text = soup.find('div',id="content").get_text()
return title + "\r\n" + text
info = []
while(info == []):
bookname = input('请输入你要查找的小说名:')
info = search_book(bookname)
menu_lists = get_novel_menu(info['href'])
if(menu_lists == []):
print('该小说没有可供下载的目录')
exit(0)
for list in menu_lists:
print('正在下载:' + list['title'])
content = get_novel_content(list['title'],list['href'])
f = open('/home/novel.txt', 'a')
f.write(content)
f.close()
time.sleep(0.1)
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: