
A Summary of Three Common Crawler Templates

2017-07-18 15:17
It has been a while since I last wrote a crawler, but recently I was assigned another scraping task. In practice, once you have written this kind of code once, most of the later work is copying, pasting, and tweaking it. So here I am summarizing the reusable parts, so that the next time I write a crawler I can use them directly.

1. Using the urllib2 library

For simple sites where the information I need comes back in JSON format, I usually prefer to write the crawler directly with urllib2.

Code template:

import urllib2
import urllib
import json

requrl = "http://www.baidu.com"  # the site you want to crawl

# If there is data to POST to requrl, put it in a dict
post_data = {'pageIndex': 1, 'pagesize': 12}
post_data_urlencode = urllib.urlencode(post_data)

req = urllib2.Request(url=requrl, data=post_data_urlencode)
res_data = urllib2.urlopen(req)
res = res_data.read()   # the response body is a JSON string; res is of type str

# json.dumps: dict -> str
# json.loads: str -> dict
# Convert the data to a dict so the fields are easy to access
json_data = json.loads(res)
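
What you pull out of json_data depends entirely on the site's response. As a minimal sketch of the usual next step, the key names below ('result', 'list', 'title') are hypothetical placeholders, not part of any real API:

# Minimal sketch: walk the parsed JSON.
# The keys 'result', 'list' and 'title' are assumptions; replace them
# with whatever the target site actually returns.
for entry in json_data.get('result', {}).get('list', []):
    print entry.get('title')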


2. Using selenium

from selenium import webdriver

# With PhantomJS a browser window does not pop up every time the driver starts
driver = webdriver.PhantomJS()
requrl = "http://www.baidu.com"
driver.get(requrl)

# Locate the elements you want via XPath
elements = driver.find_elements_by_xpath('//div[@class="wd"]')
for element in elements:
    next_url = element.get_attribute("href")
    # Once you have the next URL to crawl, hand it off to the next function

driver.quit()
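
For pages that render their content with JavaScript, the elements may not exist yet when the page first loads, so an explicit wait is often needed. A minimal sketch using selenium's WebDriverWait, reusing the same placeholder XPath as above:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.PhantomJS()
driver.get("http://www.baidu.com")

# Wait up to 10 seconds for at least one matching element to appear,
# then extract as usual.
WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.XPATH, '//div[@class="wd"]'))
)
elements = driver.find_elements_by_xpath('//div[@class="wd"]')
driver.quit()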


3. Using scrapy
Create your own .py file under the spiders folder, with code like this:

from scrapy.spiders import Spider
from hospital.items import hospital_301
# hospital is the project's package name, hospital_301 is the item class you want to fill
from scrapy.selector import Selector
import scrapy

# The class name hospital_spider can be anything you like
class hospital_spider(Spider):
    # The spider's name, used when starting the crawl
    # Start command: scrapy crawl <spider name>
    name = "301hospital"
    # allowed_domains = ['http://www.301hospital.com.cn']
    start_urls = ["http://www.301hospital.com.cn/web/expert/myhc/yyzj.html"]

    def parse(self, response):
        sel = Selector(response)

        # Extract the links
        elements = sel.xpath('//div[@class="keshiMenu"]//a/@href').extract()
        for element in elements:
            if element == '?':
                continue
            # Join into an absolute URL
            next_url = "http://www.301hospital.com.cn/" + element
            # Hand off to the next callback
            yield scrapy.Request(next_url, callback=self.parse_department)

            # If an item needs to be passed along, use this form instead
            # yield scrapy.Request(next_url, meta={'item': item1}, callback=self.parse_detail)

    def parse_detail(self, response):
        # ... extract the remaining fields here ...
        # When done, simply return the item and the pipeline will receive it
        item1 = response.meta['item']
        return item1
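
What parse_department looks like depends on the page. As a rough sketch of a method inside the same spider class, assuming hypothetical XPaths (they are placeholders, not the real page structure), it would build a hospital_301 item and pass it on via meta:

    def parse_department(self, response):
        # Sketch only: the XPath below is an assumption, adjust it to the
        # actual page markup.
        sel = Selector(response)
        for doctor in sel.xpath('//div[@class="doctor-list"]//a/@href').extract():
            item1 = hospital_301()
            item1['link'] = "http://www.301hospital.com.cn/" + doctor
            # Pass the half-filled item along to parse_detail via meta
            yield scrapy.Request(item1['link'], meta={'item': item1},
                                 callback=self.parse_detail)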


The items.py file looks like this:

from scrapy import Item, Field

class HospitalItem(Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass

class hospital_301(Item):
    name = Field()          # expert's name
    title = Field()         # professional title
    department = Field()    # department
    introduction = Field()  # detailed introduction
    specialty = Field()     # physician's specialties
    visit_info = Field()    # outpatient schedule
    photo = Field()         # photo
    link = Field()


The pipelines.py file looks like this:

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 
import MySQLdb.cursors
from twisted.enterprise import adbapi
from hospital.items import hospital_301  # hospital is the project package name, hospital_301 is the item class
from scrapy.xlib.pydispatch import dispatcher
from scrapy import signals
from scrapy.utils.project import get_project_settings
from scrapy import log
import chardet

SETTINGS = get_project_settings()

class HospitalPipeline(object):
    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.stats)

    def __init__(self, stats):
        # Instantiate the DB connection pool
        self.dbpool = adbapi.ConnectionPool('MySQLdb',
            host=SETTINGS['DB_HOST'],
            user=SETTINGS['DB_USER'],
            passwd=SETTINGS['DB_PASSWD'],
            port=SETTINGS['DB_PORT'],
            db=SETTINGS['DB_DB'],
            charset='utf8',
            use_unicode=True,
            cursorclass=MySQLdb.cursors.DictCursor
        )
        self.stats = stats
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def spider_closed(self, spider):
        """Cleanup function, called after crawling has finished.
        Closes the ConnectionPool."""
        self.dbpool.close()

    def process_item(self, item, spider):
        query = self.dbpool.runInteraction(self._insert_record, item)
        query.addErrback(self._handle_error)
        return item

    def _insert_record(self, tx, item):
        name = ""
        title = ""
        department = ""
        introduction = ""
        specialty = ""
        visit_info = ""
        photo = ""
        # Wrap each field in a try block; otherwise a single missing field
        # would raise and abort the whole insert.
        try:
            name = str(item['name']).decode('raw_unicode_escape').replace("[u'姓名:", "").replace("']", "")
        except:
            pass
        try:
            title = item['title'][0]
        except:
            pass
        try:
            department = item['department'][0].replace("科室:", "")
        except:
            pass
        try:
            introduction = item['introduction'][0]
        except:
            pass
        try:
            specialty = item['specialty'][0]
        except:
            pass
        try:
            visit_info = ''.join(item['visit_info'])
        except:
            pass
        try:
            photo = str(item['photo']).decode('raw_unicode_escape')
        except:
            pass

        # This step can sometimes be dropped; it is here because Chinese text
        # occasionally ends up garbled in the database without it.
        name = name.encode('utf-8')
        title = title.encode('utf-8')
        department = department.encode('utf-8')
        introduction = introduction.encode('utf-8')
        specialty = specialty.encode('utf-8')
        visit_info = visit_info.encode('utf-8')
        photo = photo.encode('utf-8')

        # For debugging
        # print "name--", name
        # print "title--", title
        # print "department--", department
        # print "introduction--", introduction
        # print "specialty--", specialty
        # print "visit_info--", visit_info
        # print "photo--", photo

        # Use a parameterized query so quotes inside the text cannot break the SQL
        sql = "INSERT INTO hospital_301 VALUES (%s, %s, %s, %s, %s, %s, %s)"
        tx.execute(sql, (name, title, department, introduction, specialty,
                         visit_info, photo))
        print "inserted one record"

    def _handle_error(self, e):
        log.err(e)
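
The INSERT above assumes a table whose columns line up one-to-one with the seven item fields. A minimal sketch of creating that table (column names and types are my assumptions, not taken from the original project):

import MySQLdb

# Sketch only: a hospital_301 table matching the seven values the pipeline inserts.
conn = MySQLdb.connect(host='localhost', user='root', passwd='xxx',
                       db='hospitals', charset='utf8')
cur = conn.cursor()
cur.execute("""
    CREATE TABLE IF NOT EXISTS hospital_301 (
        name         VARCHAR(100),
        title        VARCHAR(100),
        department   VARCHAR(100),
        introduction TEXT,
        specialty    TEXT,
        visit_info   TEXT,
        photo        VARCHAR(255)
    ) DEFAULT CHARSET=utf8
""")
conn.commit()
conn.close()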


The settings.py file looks like this:

# -*- coding: utf-8 -*-

BOT_NAME = 'hospital'

SPIDER_MODULES = ['hospital.spiders']
NEWSPIDER_MODULE = 'hospital.spiders'

COOKIES_ENABLED = False

DOWNLOAD_DELAY = 7  # download delay in seconds

LOG_LEVEL = 'INFO'

# Database parameters
DB_HOST = 'localhost'
DB_PORT = 3306       # port
DB_USER = 'root'     # account
DB_PASSWD = 'xxx'    # password
DB_DB = 'hospitals'  # database name

ITEM_PIPELINES = {
    'hospital.pipelines.HospitalPipeline': 300,  # replace hospital with your project name
}

If you need to download images, the code is as follows:

import urllib

photo_url = "http://www.hfyy.cn/bbyy/upload/2015-5/2015052660742901.jpg"
# Name the file the photo will be saved under (name comes from the item)
filename = 'd:\\photos\\' + name + '.jpg'
# Optionally check whether the file already exists first (see the sketch below)
try:
    urllib.urlretrieve(photo_url, filename)
    print "finished"
except Exception, e:
    print e
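
A minimal sketch of that existence check, assuming the same filename variable as above, so a photo is not downloaded again on a repeated run:

import os
import urllib

# Skip the download if the photo was already saved by a previous run
if not os.path.exists(filename):
    urllib.urlretrieve(photo_url, filename)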