您的位置:首页 > 其它

爬取知名社区技术文章_pipelines_4

2017-07-25 12:02 211 查看
对抓取到的字段进行存储处理(写入 MySQL),并获取图片下载后的本地路径

#!/usr/bin/python3
# -*- coding: utf-8 -*-

import pymysql
import gevent
import pymysql
from gevent import monkey
from scrapy.pipelines.images import ImagesPipeline
import pymysql.cursors

class JobboleImagerPipeline(ImagesPipeline):
    """
    Image pipeline that records the stored path of a downloaded image on the item.
    """

    def item_completed(self, results, item, info):
        """
        Copy the download path of the item's image into ``item['img_path']``.

        :param results: iterable of ``(success, value)`` 2-tuples, one per
            requested image. ``value`` is indexed with ``'path'`` for
            successful downloads; for failed downloads it is an error object
            and is skipped.
        :param item: the scraped item; only touched when it has ``'img_url'``.
        :param info: unused here.
        :return: the same ``item``. ``img_path`` is left unset when no
            download succeeded.
        """
        if 'img_url' in item:
            for success, value in results:
                # A failed download carries an error object instead of a
                # dict, so subscripting it with 'path' would raise.
                if not success:
                    continue
                # When several images succeeded, the last one wins.
                item['img_path'] = value['path']
        return item

# class SqlSave(object):
#     """常规同步方式存入数据库"""
#     def __init__(self):
#         SQL_DBA = {
#             'host': 'localhost',
#             'db': 'jobole',
#             'user': 'root',
#             'password': 'password',
#             'use_unicode': True,
#             'charset': 'utf8'
#         }
#         self.conn = pymysql.connect(**SQL_DBA)
#         self.cursor = self.conn.cursor()
#
#     def process_item(self, item, spider):
#         sql = self.get_sql(item)
#         print(sql)
#         self.cursor.execute(sql)
#         self.conn.commit()
#
#         return item
#
#     def get_sql(self, item):
#         sql = """insert into article(cont_id, cont_url, title, publish_time, cont, img_url, img_path, like_num, collection_num, comment_num) value ('%s','%s','%s','%s','%s','%s','%s', %d, %d, %d)
#         """ % (item['cont_id'], item['cont_url'],item['title'],item['publish_time'],item['cont'],item['img_url'][0],item['img_path'],item['link_num'],item['collection_num'],item['comment_num'],)
#         return sql

class SqlSave(object):
    """
    Item pipeline that inserts scraped articles into MySQL from a gevent greenlet.

    One connection and one cursor are opened in ``__init__`` and reused for
    every item; ``close_spider`` releases them.
    """

    def __init__(self):
        # Connection parameters are hard-coded here. They could instead be
        # kept in settings.py as a SQL_DBA dict and injected through a
        # classmethod, for example:
        #     @classmethod
        #     def from_settings(cls, settings):
        #         return cls(settings['SQL_DBA'])
        # which would require __init__ to accept that extra argument.
        SQL_DBA = {
            'host': 'localhost',
            'db': 'jobole',
            'user': 'root',
            'password': 'password',
            'use_unicode': True,
            'charset': 'utf8',
        }
        self.conn = pymysql.connect(**SQL_DBA)
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        """
        Insert ``item`` into the ``article`` table and return it unchanged.

        :param item: mapping providing the fields read by ``__go_sql``.
        :param spider: unused here.
        :return: the same ``item``.
        """
        sql = self.__get_sql(item)
        # joinall blocks until the spawned greenlet has finished, so the item
        # is only returned after the insert attempt has completed.
        gevent.joinall([
            gevent.spawn(self.__go_sql, self.cursor, self.conn, sql, item),
        ])
        return item

    def close_spider(self, spider):
        """
        Close the cursor and the connection opened in ``__init__``.

        Intended as the pipeline's shutdown hook so the database connection
        is not leaked.

        :param spider: unused here.
        """
        try:
            self.cursor.close()
        finally:
            # Close the connection even if closing the cursor failed.
            self.conn.close()

    def __go_sql(self, cursor, conn, sql, item):
        """
        Execute the parameterized insert for ``item`` and commit it.

        Any failure (missing item field, database error) is logged and the
        transaction is rolled back; the exception is not propagated.

        :param cursor: cursor used to execute the statement.
        :param conn: connection used to commit or roll back.
        :param sql: insert statement with ten ``%s`` placeholders.
        :param item: mapping holding the article fields.
        """
        import logging

        log = logging.getLogger(__name__)
        try:
            params = (
                item['cont_id'], item['cont_url'], item['title'],
                item['publish_time'], item['cont'], item['img_url'][0],
                item['img_path'], item['link_num'],
                item['collection_num'], item['comment_num'],
            )
            cursor.execute(sql, params)
            conn.commit()
        except Exception:
            log.exception("Failed to insert article into database")
            # Discard the failed statement so it cannot leak into the next
            # item's commit on this shared connection.
            try:
                conn.rollback()
            except Exception:
                log.exception("Rollback after failed insert also failed")

    def __get_sql(self, item):
        """
        Return the parameterized insert statement for the ``article`` table.

        :param item: unused; the statement is the same for every item.
        :return: SQL text with one ``%s`` placeholder per column.
        """
        sql = """insert into
        article(cont_id, cont_url, title, publish_time,
        cont, img_url, img_path, like_num,
        collection_num, comment_num)
        value
        (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"""
        return sql


  
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: