您的位置:首页 > 运维架构 > 网站架构

用scrapy爬取网站数据,以api方式

2017-04-17 20:01 183 查看
# -*- coding: utf-8 -*-

import scrapy

import json

import re

from yiyao.items import YiyaoItem #引入Item

# Item and pipelines definitions omitted here

class YiyaoSpiderSpider(scrapy.Spider):
    """Spider that scrapes drug/hospital price data from 210.73.89.76 via its JSON API.

    Flow:
      * ``parse``          builds the first drug-list query URL,
      * ``parse_getId``    walks the paginated drug list and forwards each
                           drug's metadata to the price pages via ``Request.meta``,
      * ``parse_getPrice`` emits one ``YiyaoItem`` per (drug, hospital, price) row.
    """

    name = "yiyao_spider"                  # spider name used by `scrapy crawl`
    allowed_domains = ["210.73.89.76"]     # restrict crawling to the API host
    start_urls = ['http://210.73.89.76/']  # seed request; the real query is built in parse()

    # Drug-record fields forwarded from the list page to the price pages via
    # Request.meta, and copied verbatim into each emitted item.
    _DRUG_FIELDS = (
        "NAME_CHN",
        "TRADE_NAME",
        "DOSEAGE_FORM_NAME",
        "SPEC",
        "WRAP_NAME",
        "PERMIT_NUMBER",
        "STAND_RATE",
        "ORG_NAME",
    )

    def parse(self, response):
        """Build and issue the first drug-list API request.

        The name/company filters are intentionally empty (the interactive
        ``input()`` calls were disabled by the original author); only the
        permit-number filter is set.
        """
        pro_name = ''   # optionally: input("Please input name : ")
        pro_comp = ''   # optionally: input("Please input company : ")
        pro_numb = "药"  # PermitNumber filter used as the API query term
        main_url = (
            "http://210.73.89.76/ServiceSelect/GetHosSelectList?BaseFlag=&OrgName="
            + pro_comp
            + "&PermitNumber=" + pro_numb
            + "&ProductName=" + pro_name
            + "&filter=&group=&page=1&pageSize=100&sort="
        )
        yield scrapy.Request(url=main_url, callback=self.parse_getId)

    def _next_page_request(self, response, total, callback, meta=None):
        """Yield a Request for the next page of ``response.url`` if more rows remain.

        ``total`` is the API's reported row count; if it exceeds
        page_number * page_size for the current URL, the same URL is re-issued
        with ``page`` incremented by one. Yields nothing when paging is done.
        """
        # The "page=" parameter precedes "pageSize=" in every URL this spider
        # builds, so the non-greedy match picks up the page number first.
        page_num = re.findall("page=(.*?)&", response.url)[0]
        page_size = re.findall("pageSize=(.*?)&", response.url)[0]
        if total > int(page_num) * int(page_size):
            next_url = str(response.url).replace(
                "page=" + page_num, "page=" + str(int(page_num) + 1)
            )
            yield scrapy.Request(url=next_url, callback=callback, meta=meta)

    def parse_getId(self, response):
        """Parse one page of the drug list; request hospital prices for each drug."""
        # Decode the JSON payload once (the original decoded it twice).
        payload = json.loads(response.body.decode("utf-8"))
        for record in payload["Data"]:
            pro_id = record["ID"]
            # Hospital/price listing endpoint for this drug id.
            count_url = (
                "http://210.73.89.76/ServiceSelect/GridOrgInfoList?OrgName=&OrgPrice=&ProductId="
                + pro_id
                + "&filter=&group=&page=1&pageSize=100&sort="
            )
            # Carry the drug's descriptive fields along to the price callback.
            meta = {"pro_id": pro_id}
            meta.update({field: record[field] for field in self._DRUG_FIELDS})
            yield scrapy.Request(url=count_url, callback=self.parse_getPrice, meta=meta)
        # Follow pagination of the drug list itself.
        yield from self._next_page_request(response, payload["Total"], self.parse_getId)

    def parse_getPrice(self, response):
        """Parse one page of hospital/price rows and emit a YiyaoItem per row.

        Each item combines the row's hospital fields with the drug metadata
        forwarded from parse_getId via ``response.meta``.
        """
        payload = json.loads(response.body.decode("utf-8"))
        for record in payload["Data"]:
            item = YiyaoItem()
            item['hos_id'] = record["ID"]
            item['hos_name'] = record["NAME"]
            item['hos_price'] = record["PRICE"]
            item['pro_id'] = response.meta["pro_id"]
            for field in self._DRUG_FIELDS:
                item[field] = response.meta[field]
            yield item
        # Follow pagination, preserving the drug metadata for later pages.
        yield from self._next_page_request(
            response, payload["Total"], self.parse_getPrice, meta=response.meta
        )

徐朝晖 于4月17号,爬取http://210.73.89.76/ServiceSelect/GetServiceSelectList# 网站数据
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息