随想录(改进的豆瓣爬虫代码)
2016-03-31 23:06
274 查看
【 声明:版权所有,欢迎转载,请勿用于商业用途。 联系信箱:feixiaoxing @163.com】
过去使用python机会不多,导致编写的python代码不够健壮。这次趁着项目的机会,将之前的豆瓣爬虫代码重新修改了一下,一下子健壮了很多。放在这里,留给需要的朋友。
#encoding=utf-8
#!/usr/bin/python
import os
import sys
import re
import time
import smtplib
import urllib
import urllib2
import traceback
import threading
import thread
from urllib import urlopen
# Shared crawler state -- all three names are guarded by `lock`:
page=[]  # every douban subject id collected so far (both fetched and pending)
num = 0  # cursor into `page`: entries before this index were already handed out
lock = threading.Lock()  # protects `page` and `num` across worker threads
def check_num_exist(data, pages=None):
    """Return True if *data* was already collected, else False.

    data  -- subject-id string to look for.
    pages -- optional explicit list to search; defaults to the module-level
             `page` list, so existing single-argument callers are unaffected.

    The original index loop (`for i in range(len(page)) ...`) was replaced
    by Python's built-in membership test, which performs the same equality
    scan without the manual bookkeeping.
    """
    if pages is None:
        pages = page
    return data in pages
def get_num():
    """Hand out the next unprocessed subject id from the shared queue.

    Returns the id at the current cursor position and advances the cursor,
    or -1 when every queued id has already been handed out.

    Fix: the original acquired/released the lock manually on two separate
    exit paths; `with lock` guarantees the lock is released on every path,
    including an unexpected exception while indexing `page`.
    """
    global num
    with lock:
        if len(page) <= num:
            return -1
        val = page[num]
        num += 1
        return val
def process_num(data):
    """Append every id from *data* that is not already in the global queue.

    data -- iterable of subject-id strings harvested from a page.

    Fixes: removed the unused local `val`, switched to direct iteration
    over `data` instead of `range(len(data))`, and used `with lock` so the
    lock is released even if a comparison raises.
    """
    with lock:
        for item in data:
            if not check_num_exist(item):
                page.append(item)
def thread_handler(param1, param2):
param1 = param1
param2 = param2
while True:
time.sleep(2)
val = get_num()
if val == -1:
time.sleep(3)
continue
'''produce url address'''
url = 'https://movie.douban.com/subject/' + val + '/?from=subject-page'
'''get web data '''
req = urllib2.Request(url)
if 1 == int(val) % 3:
req.add_header('User-Agent','Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.152 Safari/537.36')
elif 2 == int(val) % 3:
req.add_header('User-Agent','Mozilla/5.0 (Linux; U; Android 4.0.3; en-us; EP1T Build/MR1) AppleWebKit/534.30(KHTML, like Gecko) Version/4.0 Safari/534.30')
else:
req.add_header('User-Agent','Mozilla/5.0(iPad; U; CPU iPhone OS 3_2 like Mac OS X; en-us) AppleWebKit/531.21.10 (KHTML, like Gecko) Version/4.0.4 Mobile/7B314 Safari/531.21.10')
try:
request = urllib2.urlopen(req)
except urllib2.URLError, e:
print e.reason
continue
except urllib2.HTTPError, e:
print e.reason
continue
try:
webdata = request.read()
except:
time.sleep(10)
continue
'''get title '''
find=re.search(r'<title>\n(.*?)\(.*?\)\n</title>',webdata)
if( None == find):
continue
title = find.group(1).strip().decode('utf-8')
'''get score'''
find=re.search(r'<strong class=.*? property=.*?>(\d\.\d)',webdata)
if( None == find):
continue
score = find.group(1)
'''print info about the film '''
lock.acquire()
print ('%s %s %s') %(url,title,score)
lock.release()
'''print webdata'''
find=re.findall(r'https://movie.douban.com/subject/(\d{7,8})', webdata)
if( 0 == len(find)):
continue
process_num(find)
if __name__ == '__main__':
    # Seed the shared queue with one known subject id so the workers
    # have a starting point to crawl outward from.
    page.append('25798222')
    worker_count = 1
    for _ in range(worker_count):
        # the (1, 1) tuple only fills the handler's unused parameters
        thread.start_new_thread(thread_handler, (1, 1))
    # Keep the main thread alive so the spawned workers can keep running.
    while True:
        time.sleep(5)
过去使用python机会不多,导致编写的python代码不够健壮。这次趁着项目的机会,将之前的豆瓣爬虫代码重新修改了一下,一下子健壮了很多。放在这里,留给需要的朋友。
#encoding=utf-8
#!/usr/bin/python
import os
import sys
import re
import time
import smtplib
import urllib
import urllib2
import traceback
import threading
import thread
from urllib import urlopen
# Shared crawler state -- all three names are guarded by `lock`:
page=[]  # every douban subject id collected so far (both fetched and pending)
num = 0  # cursor into `page`: entries before this index were already handed out
lock = threading.Lock()  # protects `page` and `num` across worker threads
def check_num_exist(data, pages=None):
    """Return True if *data* was already collected, else False.

    data  -- subject-id string to look for.
    pages -- optional explicit list to search; defaults to the module-level
             `page` list, so existing single-argument callers are unaffected.

    The original index loop (`for i in range(len(page)) ...`) was replaced
    by Python's built-in membership test, which performs the same equality
    scan without the manual bookkeeping.
    """
    if pages is None:
        pages = page
    return data in pages
def get_num():
    """Hand out the next unprocessed subject id from the shared queue.

    Returns the id at the current cursor position and advances the cursor,
    or -1 when every queued id has already been handed out.

    Fix: the original acquired/released the lock manually on two separate
    exit paths; `with lock` guarantees the lock is released on every path,
    including an unexpected exception while indexing `page`.
    """
    global num
    with lock:
        if len(page) <= num:
            return -1
        val = page[num]
        num += 1
        return val
def process_num(data):
    """Append every id from *data* that is not already in the global queue.

    data -- iterable of subject-id strings harvested from a page.

    Fixes: removed the unused local `val`, switched to direct iteration
    over `data` instead of `range(len(data))`, and used `with lock` so the
    lock is released even if a comparison raises.
    """
    with lock:
        for item in data:
            if not check_num_exist(item):
                page.append(item)
def thread_handler(param1, param2):
param1 = param1
param2 = param2
while True:
time.sleep(2)
val = get_num()
if val == -1:
time.sleep(3)
continue
'''produce url address'''
url = 'https://movie.douban.com/subject/' + val + '/?from=subject-page'
'''get web data '''
req = urllib2.Request(url)
if 1 == int(val) % 3:
req.add_header('User-Agent','Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.152 Safari/537.36')
elif 2 == int(val) % 3:
req.add_header('User-Agent','Mozilla/5.0 (Linux; U; Android 4.0.3; en-us; EP1T Build/MR1) AppleWebKit/534.30(KHTML, like Gecko) Version/4.0 Safari/534.30')
else:
req.add_header('User-Agent','Mozilla/5.0(iPad; U; CPU iPhone OS 3_2 like Mac OS X; en-us) AppleWebKit/531.21.10 (KHTML, like Gecko) Version/4.0.4 Mobile/7B314 Safari/531.21.10')
try:
request = urllib2.urlopen(req)
except urllib2.URLError, e:
print e.reason
continue
except urllib2.HTTPError, e:
print e.reason
continue
try:
webdata = request.read()
except:
time.sleep(10)
continue
'''get title '''
find=re.search(r'<title>\n(.*?)\(.*?\)\n</title>',webdata)
if( None == find):
continue
title = find.group(1).strip().decode('utf-8')
'''get score'''
find=re.search(r'<strong class=.*? property=.*?>(\d\.\d)',webdata)
if( None == find):
continue
score = find.group(1)
'''print info about the film '''
lock.acquire()
print ('%s %s %s') %(url,title,score)
lock.release()
'''print webdata'''
find=re.findall(r'https://movie.douban.com/subject/(\d{7,8})', webdata)
if( 0 == len(find)):
continue
process_num(find)
if __name__ == '__main__':
    # Seed the shared queue with one known subject id so the workers
    # have a starting point to crawl outward from.
    page.append('25798222')
    worker_count = 1
    for _ in range(worker_count):
        # the (1, 1) tuple only fills the handler's unused parameters
        thread.start_new_thread(thread_handler, (1, 1))
    # Keep the main thread alive so the spawned workers can keep running.
    while True:
        time.sleep(5)
相关文章推荐
- c++第二次试验
- windows平台zend optimizer安装配置
- ubuntu server 改变phpmyadmin的默认访问路径
- 牛客网每日打卡-Java基础-20160331
- 每天一篇python:时间日期篇
- Spring
- 20145208 《Java程序设计》第5周学习总结
- C++第二次作业
- ASP.NET Core环境并运行 继续跨平台
- 一起talk C栗子吧(第一百三十四回:C语言实例--创建线程时的内存细节)
- C# DllImport“调用导致堆栈不对称。原因可能是托管的 PInvoke 签名与非托管的目标签名不匹配。请检查 PInvoke 签名的调用约定和参数与非托管的目标签名是否匹配 ”
- eclipse快捷键
- python之邮件操作
- C++作业2:距离
- java基础学习05(面向对象基础02)
- c++第二次实验
- python语言学习1——初识python
- Java中的线程池
- javase学习笔记16.3.31
- php笔记2 连接数据库