
Study Notes: Distributed Data Crawling

2017-09-05 11:40
Remote Procedure Call (RPC) is a technique for invoking methods on a remote machine as if they were local. RPC makes it convenient to build a distributed data crawler. Python ships with an RPC module, xmlrpc. All of the examples below were tested on Python 3.5.3.
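Before the project code, here is a minimal sketch of how the xmlrpc module works (a hypothetical echo service; the file names, port 9000, and the echo function are illustrative assumptions, not part of the project below):

# echo_server.py -- hypothetical minimal XML-RPC server (illustration only)
from xmlrpc.server import SimpleXMLRPCServer

def echo(msg):
    # Return the message unchanged so the client can verify the round trip
    return msg

server = SimpleXMLRPCServer(('0.0.0.0', 9000))
server.register_function(echo, 'echo')
server.serve_forever()   # blocks; run the client in a separate process

# echo_client.py -- hypothetical client for the echo service above
from xmlrpc.client import ServerProxy

proxy = ServerProxy('http://127.0.0.1:9000')
print(proxy.echo('hello'))   # prints: hello

The project below follows the same pattern, except that the server registers a whole instance instead of a single function.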

The basic idea is as follows: deploy the RPC server, rpc_server.py, on two Aliyun (Alibaba Cloud) servers. Each of these servers has its own public IP and acts as a crawling node. The RPC client, rpc_client.py, runs on the local machine, and the logic that actually drives the crawl lives in rpc_client.py.

# The rpc_server.py code is as follows:



from xmlrpc.server import SimpleXMLRPCServer
import socketserver
import requests
import sys

class Crawler:

    # Wrap requests.get; 'user' acts as a simple shared-secret check
    def get(self, user, url, params=None, headers=None):
        try:
            if user == 'username':
                r = requests.get(url, params=params, headers=headers)
                return r.text
            else:
                return ''
        except Exception as e:
            # Return the error as a string; exception objects cannot be marshalled by xmlrpc
            return str(e)

    # Wrap requests.post with the same check
    def post(self, user, url, data=None, headers=None):
        try:
            if user == 'username':
                r = requests.post(url, data=data, headers=headers)
                return r.text
            else:
                return ''
        except Exception as e:
            return str(e)

if __name__ == '__main__':
    ip = sys.argv[1]
    port = sys.argv[2]

    # Multithreaded XML-RPC server so concurrent client requests do not block each other
    class RPCThreading(socketserver.ThreadingMixIn, SimpleXMLRPCServer):
        pass

    crawler_object = Crawler()
    server = RPCThreading((ip, int(port)))
    server.register_instance(crawler_object)
    print("listening")
    server.serve_forever()

# The rpc_client.py code is as follows:



# coding:utf-8

from xmlrpc.client import ServerProxy
from bs4 import BeautifulSoup
import threading
import time

# Read the node configuration file and return a list of [index, address] pairs
def get_server_ips(sfile):
    server_ips = []
    with open(sfile, 'r', encoding='utf8') as f:
        for ind, line in enumerate(f.readlines()):
            ip = line.strip()
            server_ips.append([ind, ip])

    return server_ips

# Strip newlines, spaces and tabs from a string
def format_str(s):
    return s.replace("\n", "").replace(" ", "").replace("\t", "")

# Extract title and href from the given range of result pages via one remote node
def get_urls_in_pages(from_page_num, to_page_num, remote_ip):
    server = ServerProxy(remote_ip[1])
    urls = []
    search_word = '计算机'
    url_part_1 = 'http://www.phei.com.cn/module/goods/searchkey.jsp?Page='
    url_part_2 = '&Page=2&searchKey='

    for i in range(from_page_num, to_page_num + 1):
        urls.append(url_part_1 + str(i) + url_part_2 + search_word)

    all_href_list = []

    for url in urls:
        print(remote_ip[1], url)
        # The actual HTTP request is executed on the remote crawling node
        rstr = server.get('username', url, {}, {})
        bs = BeautifulSoup(rstr, 'html.parser')
        a_list = bs.find_all('a')
        needed_list = []
        for a in a_list:
            if 'href' in a.attrs:
                href_val = a['href']
                title = a.text
                if 'bookid' in href_val and 'shopcar0.jsp' not in href_val and title != '':
                    item = [format_str(title), format_str(href_val)]
                    # Deduplicate on the cleaned-up title/href pair
                    if item not in needed_list:
                        needed_list.append(item)
        all_href_list += needed_list

    all_href_file = open(str(from_page_num) + '_' + str(to_page_num) + '_' + 'all_hrefs.txt', 'w', encoding='utf8')

    for href in all_href_list:
        all_href_file.write('\t'.join(href) + '\n')

    all_href_file.close()

    print(from_page_num, to_page_num, len(all_href_list))

# Split the page ranges across the crawling nodes, one thread per range
def multiple_threads_test():
    server_ips = get_server_ips('remotesip.txt')
    server_cnt = len(server_ips)
    t1 = time.time()
    page_ranges_lst = [
        (1, 10),
        (11, 20),
        (21, 30),
        (31, 40),
    ]

    th_lst = []

    for ind, page_range in enumerate(page_ranges_lst):
        th = threading.Thread(target=get_urls_in_pages,
                              args=(page_range[0], page_range[1], server_ips[ind % server_cnt]))
        th_lst.append(th)

    for th in th_lst:
        th.start()

    for th in th_lst:
        th.join()

    t2 = time.time()
    print("Elapsed:", t2 - t1)

    return t2 - t1

if __name__ == '__main__':
    mt = multiple_threads_test()
    print("mt", mt)
remotesip.txt stores the addresses of the two Aliyun crawling nodes, one per line. Its content is as follows:



http://www.dosec.io:10001
http://www.zhainanbang.net:10001

Run rpc_server.py on each Aliyun server:

python3 rpc_server.py www.zhainanbang.net 10001

python3 rpc_server.py www.dosec.io 10001
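
Before launching the full crawl, it can be useful to confirm that every node listed in remotesip.txt answers. Below is a minimal sketch of such a check (a hypothetical helper script, check_nodes.py; the script name and the test URL are assumptions, not part of the original project):

# check_nodes.py -- hypothetical connectivity check, not part of the original project
from xmlrpc.client import ServerProxy

with open('remotesip.txt', encoding='utf8') as f:
    nodes = [line.strip() for line in f if line.strip()]

for node in nodes:
    try:
        proxy = ServerProxy(node)
        # Ask the node to fetch a small page through its 'get' method;
        # any string coming back means the RPC round trip works
        result = proxy.get('username', 'http://www.phei.com.cn', {}, {})
        print(node, 'OK,', len(result), 'characters received')
    except Exception as e:
        print(node, 'FAILED:', e)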

Run rpc_client.py on the local machine.

The status of the two crawling nodes during the run (screenshots omitted here).