
Scrapy crawler learning notes

2017-10-27 16:42
URL-encoding request parameters with the urllib module
from urllib import parse

postdata = {
    'a': 1,
    'b': 2
}
data = parse.urlencode(postdata)   # -> 'a=1&b=2'
print(data)
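For reference, the encoding can also be reversed with parse.parse_qs from the same module (a minimal sketch; note that parse_qs returns every value as a list of strings):

from urllib import parse

data = parse.urlencode({'a': 1, 'b': 2})
decoded = parse.parse_qs(data)     # {'a': ['1'], 'b': ['2']}
print(decoded)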

On Windows, a crawler script needs the following two lines near the top; otherwise printing non-ASCII text to the console can raise a UnicodeEncodeError:

import sys, io
# rewrap stdout so print() encodes output as gb18030 instead of the console default
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')
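An alternative sketch, assuming Python 3.7 or newer: TextIOWrapper.reconfigure changes the stream's encoding in place, and errors='replace' keeps the script from crashing on characters the console cannot display:

import sys

# Python 3.7+: reconfigure the existing stream instead of rewrapping it
sys.stdout.reconfigure(encoding='gb18030', errors='replace')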

1. Crawling content from jandan.net

items.py    # data field definitions
import scrapy

class JiandanItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    content = scrapy.Field()
    img_url = scrapy.Field()
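A scrapy.Item behaves like a dict, so the fields declared above can be filled and read with ordinary mapping syntax (a quick illustration with placeholder values):

item = JiandanItem(title='demo', img_url='http://example.com/1.jpg')
item['content'] = 'body text'
print(dict(item))   # {'title': 'demo', 'img_url': '...', 'content': 'body text'}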

The spider script, jiandan.py
import sys, io
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')
import scrapy
from ..items import JiandanItem
from scrapy.selector import HtmlXPathSelector

class JianDanSpider(scrapy.Spider):
    name = "jiandan"

    allowed_domains = ["jandan.net"]
    start_urls = [
        "http://jandan.net/",
    ]

    def parse(self, response):
        # title_list = response.xpath('//div[@class="indexs"]//h2/a/text()').extract()
        hxs = HtmlXPathSelector(response)
        items = hxs.select('//div[@class="post f list-post"]')
        for item in items:
            # lazy-loaded images keep the real URL in data-original; fall back to src
            img_url = item.select('.//div[@class="thumbs_b"]/a/img/@data-original').extract_first()
            if not img_url:
                img_url = item.select('.//div[@class="thumbs_b"]/a/img/@src').extract_first()
            if not img_url:
                continue    # no thumbnail at all: skip this post
            img_url = img_url.strip("/")
            img_url = "http://" + img_url    # the page uses protocol-relative // URLs
            title = item.select('.//div[@class="indexs"]/h2/a/text()').extract_first()
            # the fourth text node of the indexs div holds the post summary
            content = item.select('.//div[@class="indexs"]/text()').extract()[3]
            content = content.strip()
            obj = JiandanItem(title=title, img_url=img_url, content=content)
            yield obj
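HtmlXPathSelector and its .select() method have long been deprecated; response.xpath() covers the same ground. A minimal sketch of the same parse method in the modern style, reusing the XPath expressions above:

    def parse(self, response):
        for post in response.xpath('//div[@class="post f list-post"]'):
            img_url = (post.xpath('.//div[@class="thumbs_b"]/a/img/@data-original').extract_first()
                       or post.xpath('.//div[@class="thumbs_b"]/a/img/@src').extract_first())
            if not img_url:
                continue
            yield JiandanItem(
                title=post.xpath('.//div[@class="indexs"]/h2/a/text()').extract_first(),
                img_url="http://" + img_url.strip("/"),
                content=post.xpath('.//div[@class="indexs"]/text()').extract()[3].strip(),
            )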

pipelines.py    # data storage pipelines
import json
import os
import requests

# class JiandanPipeline(object):
#     def process_item(self, item, spider):
#         return item

class JsonPipeline(object):    # writes the fields defined in items.py, one JSON object per line
    def __init__(self):
        self.file = open('jiandan.txt', 'w')

    def process_item(self, item, spider):
        v = json.dumps(dict(item), ensure_ascii=False)
        self.file.write(v)
        self.file.write('\n')
        self.file.flush()
        return item

class FilePipeline(object):    # downloads the image behind each item's img_url
    def __init__(self):
        if not os.path.exists('imgs'):
            os.makedirs('imgs')

    def process_item(self, item, spider):
        response = requests.get(item['img_url'], stream=True)
        # name the file after the last URL segment so successive images
        # do not all overwrite the same file
        file_name = item['img_url'].rsplit('/', 1)[-1]
        with open(os.path.join('imgs', file_name), mode='wb') as f:
            f.write(response.content)
        return item
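Scrapy also ships a built-in ImagesPipeline that handles downloading, deduplication, and file naming itself. A rough sketch of the configuration it would need instead of the requests-based pipeline above (IMAGES_STORE and the image_urls/images field names are the built-in pipeline's conventions, so items.py would have to gain those two fields):

# settings.py - assumed configuration for Scrapy's built-in images pipeline
ITEM_PIPELINES = {'scrapy.pipelines.images.ImagesPipeline': 1}
IMAGES_STORE = 'imgs'

# items.py - the built-in pipeline expects these two field names
# image_urls = scrapy.Field()   # list of URLs to download
# images = scrapy.Field()       # filled in by the pipeline with download results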

settings.py    # enable the storage pipelines
ITEM_PIPELINES = {
    # lower numbers run first; valid priorities range from 0 to 1000
    'jiandan.pipelines.JsonPipeline': 100,
    'jiandan.pipelines.FilePipeline': 300,
}

To export the scraped items as a JSON file from the command line: scrapy crawl jiandan -o items.json
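The same export can be made permanent in settings.py instead of being passed on the command line. A sketch using the pre-2.1 feed settings, which match the Scrapy versions current when this was written:

# settings.py - write all scraped items to items.json on every run
FEED_FORMAT = 'json'
FEED_URI = 'items.json'
FEED_EXPORT_ENCODING = 'utf-8'   # keep Chinese text readable instead of \uXXXX escapes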