您的位置:首页 > 编程语言

随想录(改进的豆瓣爬行代码)

2016-03-31 23:06 274 查看
【声明:版权所有,欢迎转载,请勿用于商业用途。联系信箱:feixiaoxing@163.com】

    过去使用 Python 的机会不多,因此以前编写的 Python 代码不够健壮。这次趁着项目的机会,将之前的豆瓣爬虫代码重新修改了一遍,健壮了很多。放在这里,留给需要的朋友。

#encoding=utf-8
#!/usr/bin/python

import os
import sys
import re
import time
import smtplib
import urllib
import urllib2
import traceback
import threading
import thread

from urllib import urlopen

# Shared crawl state used by every function below.
# page: movie ids (strings) discovered so far, in discovery order; seeded in
#       the __main__ section and only ever appended to.
page=[]
# num: index into ``page`` of the next id that get_num() will hand out.
num = 0
# lock: guards ``page``/``num`` updates and also serializes the worker's
#       print output. threading.Lock is not reentrant.
lock = threading.Lock()

def check_num_exist(data):
    """Return True if ``data`` is already stored in the global ``page`` list.

    Args:
        data: Value to look up; it is compared with ``==`` against each
            entry of ``page``.

    Returns:
        bool: True when an equal entry exists, False otherwise (including
        when ``page`` is empty).

    Note:
        This function does not take ``lock`` itself (the lock is not
        reentrant), so a caller that races with writers must already hold it.
    """
    # The built-in membership test performs the same ``==`` scan the manual
    # index loop did, without the indexing boilerplate.
    return data in page

def get_num():
    """Hand out the next not-yet-dispatched entry of the global ``page`` list.

    Returns:
        The entry at index ``num`` (``num`` is then advanced by one), or
        ``-1`` when every entry currently in ``page`` has been handed out.
    """
    global num

    # ``with`` releases the lock on every exit path, including exceptions,
    # instead of relying on a matching release() call per branch.
    with lock:
        if len(page) <= num:
            return -1
        val = page[num]
        num += 1
        return val

def process_num(data):
    """Append every id in ``data`` that is not yet in ``page`` to ``page``.

    Order of first appearance is preserved, and duplicates inside ``data``
    itself are added only once.

    Args:
        data: Iterable of ids (hashable values; the caller in this file
            passes the list of strings returned by ``re.findall``).

    Returns:
        None. ``page`` is extended in place.
    """
    # ``with`` guarantees the lock is released even if iterating ``data``
    # raises; a manual acquire()/release() pair would leave it held forever.
    with lock:
        # One set built per call replaces a full scan of ``page`` per item.
        known = set(page)
        for item in data:
            if item in known:
                continue
            known.add(item)
            page.append(item)

# User-Agent headers indexed by ``int(movie_id) % 3`` so that requests for
# different ids present different clients.
_USER_AGENTS = (
    'Mozilla/5.0(iPad; U; CPU iPhone OS 3_2 like Mac OS X; en-us) AppleWebKit/531.21.10 (KHTML, like Gecko) Version/4.0.4 Mobile/7B314 Safari/531.21.10',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.152 Safari/537.36',
    'Mozilla/5.0 (Linux; U; Android 4.0.3; en-us; EP1T Build/MR1) AppleWebKit/534.30(KHTML, like Gecko) Version/4.0 Safari/534.30',
)

# Patterns are compiled once instead of being looked up on every page.
_TITLE_RE = re.compile(r'<title>\n(.*?)\(.*?\)\n</title>')
_SCORE_RE = re.compile(r'<strong class=.*? property=.*?>(\d\.\d)')
_SUBJECT_RE = re.compile(r'https://movie.douban.com/subject/(\d{7,8})')


def thread_handler(param1, param2):
    """Worker loop: fetch queued movie pages, print title/score, queue links.

    Runs forever. Each iteration sleeps 2 seconds, takes the next id from
    ``get_num()``, downloads the matching subject page, prints
    ``url title score`` and passes every subject id found in the page to
    ``process_num()``. A page that cannot be fetched or parsed is skipped;
    its id is not retried.

    Args:
        param1: Unused; kept so existing thread-start calls keep working.
        param2: Unused; kept so existing thread-start calls keep working.
    """
    while True:
        time.sleep(2)

        val = get_num()
        if val == -1:
            # Queue is drained for now; wait a little longer before polling.
            time.sleep(3)
            continue

        # Build the page address for this movie id.
        url = 'https://movie.douban.com/subject/' + val + '/?from=subject-page'

        req = urllib2.Request(url)
        req.add_header('User-Agent', _USER_AGENTS[int(val) % 3])

        try:
            response = urllib2.urlopen(req)
        except urllib2.URLError as e:
            # HTTPError is a subclass of URLError, so this single handler
            # covers both; a separate HTTPError clause placed after it could
            # never run.
            print(e.reason)
            continue

        try:
            try:
                webdata = response.read()
            finally:
                # Always give the connection back, even when read() fails.
                response.close()
        except Exception:
            # Back off after a broken transfer before taking the next id.
            time.sleep(10)
            continue

        # Title: text in <title> before the parenthesised suffix.
        found = _TITLE_RE.search(webdata)
        if found is None:
            continue
        # 'replace' keeps one malformed byte from killing the worker thread.
        title = found.group(1).strip().decode('utf-8', 'replace')

        # Score: the "d.d" number inside the <strong ... property=...> tag.
        found = _SCORE_RE.search(webdata)
        if found is None:
            continue
        score = found.group(1)

        # Hold the shared lock while printing so lines from several workers
        # do not interleave.
        with lock:
            print('%s %s %s' % (url, title, score))

        # Queue every other movie id linked from this page.
        linked_ids = _SUBJECT_RE.findall(webdata)
        if linked_ids:
            process_num(linked_ids)

if __name__ == '__main__':
    # Number of crawler threads to start; the script has always run one.
    worker_count = 1

    # Seed the queue with one known movie id so the first worker has a page
    # to fetch; every further id is discovered from fetched pages.
    page.append('25798222')

    for _ in range(worker_count):
        # Use the high-level threading API instead of the low-level
        # ``thread`` module. Daemon threads end with the main thread, which
        # matches how start_new_thread workers behaved.
        worker = threading.Thread(target=thread_handler, args=(1, 1))
        worker.daemon = True
        worker.start()

    # Keep the main thread alive; the workers never return on their own.
    while True:
        time.sleep(5)
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: