
Python Web Crawler

2016-07-13 17:09
1. Controller:

#coding=utf-8
'''
Created on 2016-07-12

@author: wenwen.huang
'''
from spider import html_url, html_dw, html_parse, html_collect

class SpiderMain(object):

    def __init__(self):
        # wire the four components together: url manager, downloader, parser, collector
        self.urlManager = html_url.Urlmanager()
        self.downloader = html_dw.DownLoader()
        self.parser = html_parse.Parser()
        self.collector = html_collect.Collect()

    def start(self, rootUrl):
        count = 1
        self.urlManager.addUrl(rootUrl)
        while self.urlManager.hasUrl():
            try:
                newUrl = self.urlManager.getUrl()
                print 'start download %d -- %s' % (count, newUrl)
                html = self.downloader.startLoad(newUrl)
                newUrls, newData = self.parser.parse(html, newUrl)
                self.urlManager.addUrls(newUrls)
                self.collector.collect(newData)
                if count > 100:   # stop after roughly 100 pages
                    break
                count = count + 1
            except Exception, e:
                print e
                print "parse fail"

        self.collector.writeFile()

if __name__ == '__main__':
    rootUrl = "http://baike.baidu.com/view/21087.htm"
    spider = SpiderMain()
    spider.start(rootUrl)
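
The import line assumes a package named spider containing the four modules used above. The original post does not show this layout, so the following is an inferred sketch:

spider/
    __init__.py
    html_url.py       # Urlmanager
    html_dw.py        # DownLoader
    html_parse.py     # Parser
    html_collect.py   # Collect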


2. URL Manager:

#coding=utf-8
'''
Created on 2016-07-12

@author: wenwen.huang
'''

class Urlmanager(object):

    def __init__(self):
        self.newUrls = set()   # urls waiting to be crawled
        self.oldUrls = set()   # urls already crawled

    def addUrl(self, url):
        if url is None:
            return
        if url in self.newUrls or url in self.oldUrls:
            return
        self.newUrls.add(url)

    def hasUrl(self):
        return len(self.newUrls) != 0

    def getUrl(self):
        url = self.newUrls.pop()
        self.oldUrls.add(url)
        return url

    def addUrls(self, urls):
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            self.addUrl(url)
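
A quick sanity check of the deduplication behavior (a minimal sketch added here, not part of the original post; append it to the module to run it):

if __name__ == '__main__':
    manager = Urlmanager()
    manager.addUrl("http://baike.baidu.com/view/21087.htm")
    manager.addUrl("http://baike.baidu.com/view/21087.htm")   # duplicate, silently ignored
    print manager.hasUrl()    # True
    url = manager.getUrl()    # moves the url from newUrls to oldUrls
    manager.addUrl(url)       # already crawled, silently ignored
    print manager.hasUrl()    # False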


3. Downloader:

#coding=utf-8
'''
Created on 2016-07-12

@author: wenwen.huang
'''
import urllib2

class DownLoader(object):

    def startLoad(self, url):
        res = urllib2.urlopen(url)
        if res.getcode() != 200:
            return

        return res.read()

The version above is problematic: there is no timeout, so one slow page can hang the whole crawl, and the response is never closed. Improved as follows:

    def startLoad(self, url):
        # req = urllib2.Request(url)
        res = urllib2.urlopen(url, timeout=1)
        if res.getcode() != 200:
            return

        result = res.read()
        res.close()   # close the response so connections are not leaked
        return result
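
The commented-out urllib2.Request line hints at a further refinement: some sites reject clients without a browser-like User-Agent. A hedged variant (the header value is illustrative, not from the original post):

    def startLoad(self, url):
        req = urllib2.Request(url)
        req.add_header('User-Agent', 'Mozilla/5.0')   # illustrative value
        try:
            res = urllib2.urlopen(req, timeout=1)
        except urllib2.URLError, e:   # DNS failures, timeouts, refused connections
            print e
            return
        if res.getcode() != 200:
            res.close()
            return
        result = res.read()
        res.close()
        return result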

4. Parser:

#coding=utf-8
'''
Created on 2016-07-12

@author: wenwen.huang
'''
from bs4 import BeautifulSoup
import re
import urlparse

class Parser(object):

    # Sample markup from the target pages:
    #
    # <dd class="lemmaWgt-lemmaTitle-title">
    #   <h1>Python</h1>
    #   <a href="javascript:;" class="edit-lemma cmn-btn-hover-blue cmn-btn-28 j-edit-link" style="display: inline-block;"><em class="cmn-icon wiki-lemma-icons wiki-lemma-icons_edit-lemma"></em>编辑</a>
    #   <a class="lock-lemma" target="_blank" href="/view/10812319.htm" title="锁定"><em class="cmn-icon wiki-lemma-icons wiki-lemma-icons_lock-lemma"></em>锁定</a>
    # </dd>
    #
    # <a target="_blank" href="/view/20965.htm">...</a>
    #
    # <div class="para" label-module="para">

    def __parseUrl(self, soup, parseUrl):
        # collect every /view/<id>.htm link and resolve it to an absolute url
        newUrls = set()
        links = soup.find_all("a", href=re.compile(r"/view/\d+\.htm"))
        for link in links:
            url = link['href']
            url = urlparse.urljoin(parseUrl, url)
            newUrls.add(url)

        return newUrls

    def __parseData(self, soup, parseUrl):
        # extract the entry title and the first summary paragraph
        datas = {}
        titleNode = soup.find("dd", class_="lemmaWgt-lemmaTitle-title").find("h1")
        datas['title'] = titleNode.get_text()
        datas['url'] = parseUrl
        descNode = soup.find("div", class_="para")
        datas['desc'] = descNode.get_text()
        return datas

    def parse(self, html, parseUrl):
        if html is None or parseUrl is None:
            return
        soup = BeautifulSoup(html, 'html.parser', from_encoding='utf-8')
        newUrls = self.__parseUrl(soup, parseUrl)
        data = self.__parseData(soup, parseUrl)

        return newUrls, data
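
The key step in __parseUrl is urlparse.urljoin, which resolves the relative /view/... hrefs against the url of the page being parsed:

import urlparse

base = "http://baike.baidu.com/view/21087.htm"
print urlparse.urljoin(base, "/view/20965.htm")
# -> http://baike.baidu.com/view/20965.htm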


5. Storage and Display:

#coding=utf-8
'''
Created on 2016-07-12

@author: wenwen.huang
'''

class Collect(object):

    def __init__(self):
        self.list = []

    def collect(self, data):
        if data is None:
            return
        self.list.append(data)

    def writeFile(self):
        fout = open("out.html", "w")

        fout.write("<html>")
        fout.write("<head><meta charset='utf-8'>")
        fout.write('<link href="http://cdn.bootcss.com/bootstrap/3.3.6/css/bootstrap.min.css" rel="stylesheet">')
        fout.write("</head>")
        fout.write("<body>")
        fout.write('<div class="container">')
        fout.write('<table class="table table-bordered">')

        for data in self.list:
            fout.write("<tr border='1'>")
            fout.write('<td>%s</td>' % data['url'])
            title = data['title'].encode('utf-8')
            print title   # debug output
            fout.write('<td>%s</td>' % title)
            fout.write('<td>%s</td>' % data['desc'].encode('utf-8'))
            fout.write("</tr>")

        fout.write("</table>")
        fout.write("</div>")
        fout.write("</body>")
        fout.write("</html>")
        # easy to forget:
        fout.close()
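
An alternative sketch (my addition, not from the original post): opening the file through the codecs module with an explicit encoding removes the need to call .encode('utf-8') on each field:

import codecs

def writeFile(self):
    fout = codecs.open("out.html", "w", encoding="utf-8")
    fout.write(u"<html><head><meta charset='utf-8'></head><body><table>")
    for data in self.list:
        fout.write(u"<tr><td>%s</td><td>%s</td><td>%s</td></tr>"
                   % (data['url'], data['title'], data['desc']))
    fout.write(u"</table></body></html>")
    fout.close()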


Threading test:

#coding=utf-8
'''
Created on 2016-07-13

@author: wenwen.huang
'''
from time import sleep, ctime
import threading

class MyThread(object):

    def __init__(self):
        self.threads = []

    def music(self, name):
        for i in range(2):
            print "listening to song %s at %s\n" % (name, ctime())
            sleep(1)

    def movie(self, name):
        for i in range(2):
            print "watching movie %s at %s\n" % (name, ctime())
            sleep(3)

    def useThread(self):
        t1 = threading.Thread(target=self.music, args=('爱情买卖',))
        self.threads.append(t1)
        t2 = threading.Thread(target=self.movie, args=('阿凡达',))
        self.threads.append(t2)
        for t in self.threads:
            # t.setDaemon(True)  # enabling this kills the child threads as soon as the main thread exits
            t.start()

        for t in self.threads:
            t.join()   # wait for every child thread before the main thread continues

if __name__ == '__main__':
    test = MyThread()
    test.useThread()
    print 'all threads finished!'
#     test.music('黑色幽默')
#     test.movie('血色浪漫')
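
To carry the threading experiment over to the crawler itself, one hedged approach (a sketch of mine, not from the original post) is a pool of download workers fed from a Queue; downloader here stands for the DownLoader instance from section 3:

#coding=utf-8
import threading
from Queue import Queue

def worker(taskQueue, downloader, results):
    while True:
        url = taskQueue.get()
        if url is None:            # sentinel value: no more work
            taskQueue.task_done()
            break
        results.append((url, downloader.startLoad(url)))   # list.append is atomic under the GIL
        taskQueue.task_done()

# usage sketch: start 4 workers, feed the urls, then one sentinel per worker
# taskQueue = Queue()
# results = []
# threads = [threading.Thread(target=worker, args=(taskQueue, downloader, results))
#            for _ in range(4)]
# for t in threads: t.start()
# for url in urls: taskQueue.put(url)
# for _ in threads: taskQueue.put(None)
# for t in threads: t.join()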