
Web Crawler Study Notes

1. Building a Sina News web crawler  http://study.163.com/course/courseMain.htm?courseId=1003285002

In Chrome, open Inspect, go to Network > Doc, reload the page, pick the first request, then inspect the element to find the tags that hold the news items.

import requests
from bs4 import BeautifulSoup

newsurl = 'http://news.sina.com.cn/china/'
res = requests.get(newsurl)       # Chrome shows this is a GET request; a User-Agent header could also be set here
res.encoding = 'UTF-8'            # avoid garbled text
print res.text                    # check the structure along the way
soup = BeautifulSoup(res.text, 'html.parser')
for news in soup.select('.news-item'):    # prefix id selectors with "#", class selectors with "."
    if len(news.select('h2')) > 0:
        h2 = news.select('h2')[0].text
        time = news.select('.time')[0].text
        a = news.select('a')[0]['href']   # this takes the link; use .text to take the content instead
        print h2, a, time

----------------------------------------

Parse a single article page:

import requests
from datetime import datetime
from bs4 import BeautifulSoup

newsurl = 'http://news.sina.com.cn/o/2017-05-13/doc-ifyfecvz1234039.shtml'
res = requests.get(newsurl)
res.encoding = 'UTF-8'
soup = BeautifulSoup(res.text, 'html.parser')
bodyTitle = soup.select('#artibodyTitle')[0].text
timesource = soup.select('.time-source')[0].contents[0].strip()
# dt = datetime.strptime(timesource, '%Y%m%d%H:%M')
source = soup.select('.time-source span a')[0].text
article = []
for p in soup.select('#artibody p')[:-1]:
    article.append(p.text.strip())
''.join(article)
# the same join written as a one-liner
''.join([p.text.strip() for p in soup.select('#artibody p')[:-1]])
# soup.select('.article-editor')[0].text.lstrip('责任编辑')

----------------------------------------

In the Network panel, find the JS request that carries the comments and copy its URL:

import requests
import json

newsurl = ('http://comment5.news.sina.com.cn/page/info?version=1&format=js'
           '&channel=gn&newsid=comos-fyfecvz1234039&group=&compress=0'
           '&ie=utf-8&oe=utf-8&page=1&page_size=20')
comments = requests.get(newsurl)
comments.encoding = 'UTF-8'
jd = json.loads(comments.text.strip('var data='))
jd['result']['count']['total']

----------------------------------------

Extract the news id from the article URL:

newsurl = 'http://news.sina.com.cn/o/2017-05-13/doc-ifyfecvz1234039.shtml'
newid = newsurl.split('/')[-1].rstrip('.shtml').lstrip('doc-i')
print newid

# or with a regular expression
import re
m = re.search('doc-i(.*).shtml', newsurl)
newsid = m.group(1)

----------------------------------------

Wrap the comment count lookup in a function:

import re
import json
import requests

commentsurl = ('http://comment5.news.sina.com.cn/page/info?version=1&format=js'
               '&channel=gn&newsid=comos-{}&group=&compress=0'
               '&ie=utf-8&oe=utf-8&page=1&page_size=20')

def getCommentCounts(newsurl):
    m = re.search('doc-i(.+).shtml', newsurl)
    newsid = m.group(1)
    comments = requests.get(commentsurl.format(newsid))   # substitute newsid into the {} of commentsurl
    jd = json.loads(comments.text.strip('var data='))
    return jd['result']['count']['total']

news = 'http://news.sina.com.cn/o/2017-05-13/doc-ifyfecvz1234039.shtml'
getCommentCounts(news)

----------------------------------------

Final result:

import requests
from bs4 import BeautifulSoup

def getNewsDetail(newsurl):
    result = {}
    res = requests.get(newsurl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    result['title'] = soup.select('#artibodyTitle')[0].text
    result['newssource'] = soup.select('.time-source')[0].text
    result['comments'] = getCommentCounts(newsurl)
    return result

getNewsDetail('http://news.sina.com.cn/o/2017-05-13/doc-ifyfecvz1234039.shtml')
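Putting the pieces together: a minimal sketch (not from the course; it assumes the selectors above still match the page, and that getNewsDetail and getCommentCounts are defined as in the final result) that walks the .news-item entries on the index page and fetches the detail of each linked article:

import requests
from bs4 import BeautifulSoup

def getNewsLinks(indexurl):
    # collect article links from the list page, reusing the selectors from the notes above
    res = requests.get(indexurl)
    res.encoding = 'UTF-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    links = []
    for news in soup.select('.news-item'):
        if len(news.select('h2')) > 0:
            links.append(news.select('a')[0]['href'])
    return links

for link in getNewsLinks('http://news.sina.com.cn/china/'):
    print getNewsDetail(link)    # getNewsDetail as defined in the final result above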
2. Developing a simple crawler with Python  http://www.imooc.com/video/10680

Fetching page content:

import urllib2

# plain request
response = urllib2.urlopen('http://www.baidu.com')
# get the status code; 200 means success
print response.getcode()
cont = response.read()

-------------------

import urllib2

url = 'http://www.baidu.com'
request = urllib2.Request(url)
request.add_data('a')                            # attach form data (this turns the request into a POST)
request.add_header('User-Agent', 'Mozilla/5.0')  # pretend to be a regular browser
response = urllib2.urlopen(request)

-------------------

import urllib2, cookielib

# create a cookie container
cj = cookielib.CookieJar()
# create an opener
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
# install the opener into urllib2
urllib2.install_opener(opener)
response = urllib2.urlopen('http://www.baidu.com')

----------------------------------------

Parsing with BeautifulSoup:

from bs4 import BeautifulSoup
import re

html_doc = '''
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
'''

# build a BeautifulSoup object from the HTML
soup = BeautifulSoup(html_doc,          # document string
                     'html.parser',     # parser
                     from_encoding='utf-8')

links = soup.find_all('a')
for link in links:
    print link.name, link['href'], link.get_text()

link_node = soup.find('a', href="http://example.com/lacie")
print link_node.name, link_node['href'], link_node.get_text()

link_node = soup.find('a', href=re.compile(r'ill'))
print link_node.name, link_node['href'], link_node.get_text()

p_node = soup.find('p', class_='title')   # class is a Python keyword, so BeautifulSoup uses class_ here
print p_node.name, p_node.get_text()
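The snippets above are Python 2 (urllib2/cookielib); the module-based crawler below uses Python 3, where the same functionality lives in urllib.request and http.cookiejar. A rough Python 3 equivalent of the three fetch variants, as a sketch (not part of the original notes):

from urllib import request, parse
from http import cookiejar

# 1) plain request
response = request.urlopen('http://www.baidu.com')
print(response.getcode())      # 200 means success
cont = response.read()

# 2) request with data and a User-Agent header (attaching data makes it a POST)
req = request.Request('http://www.baidu.com',
                      data=parse.urlencode({'a': '1'}).encode('utf-8'),
                      headers={'User-Agent': 'Mozilla/5.0'})
response = request.urlopen(req)

# 3) requests routed through an opener that keeps cookies
cj = cookiejar.CookieJar()
opener = request.build_opener(request.HTTPCookieProcessor(cj))
request.install_opener(opener)
response = request.urlopen('http://www.baidu.com')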
----------------------------------------

Final crawler: Crawler_main.py

import URLManager, HTMLDownloader, HTMLParser, HTMLOutputer

class CrawlerMain(object):
    def __init__(self):
        self.urls = URLManager.UrlManager()                   # URL manager
        self.downloader = HTMLDownloader.HtmlDownloader()     # HTML downloader
        self.parser = HTMLParser.HtmlParser()                 # HTML parser
        self.outputer = HTMLOutputer.HtmlOutputer()           # HTML outputer

    def crawl(self, root_url):
        count = 1                                       # number of pages crawled
        self.urls.add_new_url(root_url)                 # seed the manager with the entry URL
        while self.urls.has_new_url():                  # keep going while the URL pool is not empty
            try:
                new_url = self.urls.get_new_url()       # take one URL to download
                print('crawl %d: %s' % (count, new_url))
                html_cont = self.downloader.download(new_url)                 # download the page
                new_urls, new_data = self.parser.hparse(new_url, html_cont)   # extract new URLs and page data
                self.urls.add_new_urls(new_urls)        # feed the new URLs back into the manager
                self.outputer.collect_data(new_data)    # collect the data
                if count == 10:                         # stop after 10 pages
                    break
                count = count + 1
            except:
                print('Crawl Failed')
        self.outputer.output_html()                     # write the collected data out as HTML

if __name__ == '__main__':
    root_url = "http://baike.baidu.com/item/Python"     # entry URL
    obj_crawler = CrawlerMain()                         # create a crawler instance
    obj_crawler.crawl(root_url)                         # start crawling
---
HTMLDownloader.py

from urllib import request

class HtmlDownloader(object):
    def download(self, url):
        if url is None:
            return None
        # open the page
        response = request.urlopen(url)
        if response.getcode() != 200:
            # return None if the request failed
            return None
        else:
            # return the page content on success
            return response.read().decode("utf-8")
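The downloader above uses the standard library's urllib; a requests-based drop-in with the same download() interface could look like the following sketch (the User-Agent header is an illustrative addition, not part of the original notes):

import requests

class HtmlDownloader(object):
    def download(self, url):
        # return the page text on HTTP 200, otherwise None
        if url is None:
            return None
        res = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
        if res.status_code != 200:
            return None
        res.encoding = 'utf-8'
        return res.text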
----
URLManager.py

class UrlManager(object):
    def __init__(self):
        self.new_urls = set()    # URLs waiting to be crawled
        self.old_urls = set()    # URLs already crawled

    def add_new_url(self, url):
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            # only add the URL if it has not been added or visited before
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            # route each URL through add_new_url so visited URLs are not re-queued
            self.add_new_url(url)

    def has_new_url(self):
        # whether the pool of unvisited URLs is non-empty
        return len(self.new_urls) != 0

    def get_new_url(self):
        # pop one unvisited URL, mark it as visited, and return it
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url
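A quick check of the manager's deduplication behaviour (illustrative usage, not part of the original notes):

manager = UrlManager()
manager.add_new_url('http://baike.baidu.com/item/Python')
manager.add_new_url('http://baike.baidu.com/item/Python')   # duplicate, ignored
print(manager.has_new_url())    # True
url = manager.get_new_url()     # moves the URL into old_urls
manager.add_new_url(url)        # already visited, ignored
print(manager.has_new_url())    # False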
------------
HTMLParser.py

from bs4 import BeautifulSoup
import re
from urllib import parse

class HtmlParser(object):
    # page_url is the URL of the page, html_cont is the downloaded page content
    def hparse(self, page_url, html_cont):
        if page_url is None or html_cont is None:
            return
        # parse the page content with BeautifulSoup
        soup = BeautifulSoup(html_cont, 'html.parser')
        # URLs contained in the page
        new_urls = self._get_new_urls(page_url, soup)
        # the data we want to scrape from the page
        new_data = self._get_new_data(page_url, soup)
        return new_urls, new_data

    def _get_new_urls(self, page_url, soup):
        new_urls = set()
        # fuzzy-match the links with a regular expression
        links = soup.find_all('a', href=re.compile(r"/item/"))
        for link in links:
            new_url = link['href']
            # join the relative href into a full URL
            new_full_url = parse.urljoin(page_url, new_url)
            new_urls.add(new_full_url)
        return new_urls

    def _get_new_data(self, page_url, soup):
        res_data = {}
        # url
        res_data['url'] = page_url
        # <dd class="lemmaWgt-lemmaTitle-title"><h1>Python</h1>
        title_node = soup.find('dd', class_="lemmaWgt-lemmaTitle-title").find("h1")
        res_data['title'] = title_node.get_text()
        # <div class="lemma-summary" label-module="lemmaSummary">
        summary_node = soup.find('div', class_="lemma-summary")
        res_data['summary'] = summary_node.get_text()
        return res_data

------------
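For reference, this is how parse.urljoin resolves the relative hrefs collected by _get_new_urls (the item paths are illustrative):

from urllib import parse

page_url = 'http://baike.baidu.com/item/Python'
print(parse.urljoin(page_url, '/item/Guido'))                         # -> http://baike.baidu.com/item/Guido
print(parse.urljoin(page_url, 'http://baike.baidu.com/item/PHP'))     # absolute hrefs pass through unchanged

------------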
HTMLOutputer.py

class HtmlOutputer(object):
    def __init__(self):
        self.datas = []

    def collect_data(self, data):
        if data is None:
            return
        self.datas.append(data)

    def output_html(self):
        fout = open('output.html', 'w', encoding='utf-8')

        fout.write('<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">')
        fout.write("<html>")
        fout.write("<body>")
        fout.write("<table>")

        for data in self.datas:
            fout.write("<tr>")
            fout.write("<td>%s</td>" % data['url'])
            fout.write("<td>%s</td>" % data['title'])
            fout.write("<td>%s</td>" % data['summary'])
            fout.write("</tr>")

        # close the tags in the reverse order they were opened, then close the file
        fout.write("</table>")
        fout.write("</body>")
        fout.write("</html>")
        fout.close()
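As an aside, a with-block closes the output file even if writing a row fails; a minimal standalone sketch of the same table-writing logic (write_rows is a hypothetical helper, not part of the course code):

def write_rows(datas, path='output.html'):
    # write the collected rows as a simple HTML table; the with-block closes the file automatically
    with open(path, 'w', encoding='utf-8') as fout:
        fout.write('<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">')
        fout.write('<html><body><table>')
        for data in datas:
            fout.write('<tr><td>%s</td><td>%s</td><td>%s</td></tr>'
                       % (data['url'], data['title'], data['summary']))
        fout.write('</table></body></html>')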

                                            