
Crawler Study Notes: A First Look at Scrapy

2016-10-29 00:40
Scrapy is a fast, high-level screen-scraping and web-crawling framework written in Python, used to crawl websites and extract structured data from their pages. It has a wide range of applications, including data mining, monitoring, and automated testing. What makes Scrapy appealing is that it is a framework: anyone can adapt it to their own needs, and it also ships with base classes for several types of spiders.

Install Scrapy on Python 2.7:

pip install scrapy

pip install scrapy-redis

If the installation complains about a missing library, simply install that library and run the command again.

Once Scrapy is installed you can start crawling. There are already plenty of write-ups on how the framework works internally, so I will skip that here and go straight to an example.

Create a new Scrapy project:

scrapy startproject qqgroup_crawler
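This generates the usual Scrapy skeleton; all of the files edited below live in the inner package directory (if your Scrapy version's template does not generate middlewares.py, simply create it there):

qqgroup_crawler/
    scrapy.cfg
    qqgroup_crawler/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py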

Edit settings.py. A few simple counter-anti-crawling measures are applied here, such as randomly rotating the User-Agent and the proxy IP. The example uses the Redis-based scheduler from scrapy-redis, and the scraped data is stored in MongoDB.

# -*- coding: utf-8 -*-

# Scrapy settings for qqgroup_crawler project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'qqgroup_crawler'

SPIDER_MODULES = ['qqgroup_crawler.spiders']
NEWSPIDER_MODULE = 'qqgroup_crawler.spiders'

RETRY_TIMES = 3
RETRY_HTTP_CODES = [404, 408, 500, 502, 503, 504]

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'qqgroup_crawler (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

REACTOR_THREADPOOL_MAXSIZE = 10

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
COOKIES_ENABLED = True

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'qqgroup_crawler.middlewares.MyCustomSpiderMiddleware': 543,
#}

DOWNLOADER_MIDDLEWARES = {
    'qqgroup_crawler.middlewares.UserAgentMiddleware': 1,
    'qqgroup_crawler.middlewares.DynamicProxyMiddleware': 80,
    'qqgroup_crawler.middlewares.StaticProxyMiddleware': 100,
    'qqgroup_crawler.middlewares.SeleniumProxyMiddleware': 120,
    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 140,
}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'qqgroup_crawler.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'qqgroup_crawler.pipelines.ConsolePipeline': 200,
    'qqgroup_crawler.pipelines.MongoDBPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

SCHEDULER = "scrapy_redis.scheduler.Scheduler"
SCHEDULER_PERSIST = True
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.SpiderPriorityQueue'
REDIS_URL = 'redis://192.168.0.1:6379'
REDIS_HOST = '192.168.0.1'
REDIS_PORT = 6379
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"

USER_AGENTS = [
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0",
"Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
"Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
"Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
"Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
"Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
"Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
"Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
"Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
"MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
"Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10",
"Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
"Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+",
"Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0",
"Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)",
"UCWEB7.0.2.37/28/999",
"NOKIA5700/ UCWEB7.0.2.37/28/999",
"Openwave/ UCWEB7.0.2.37/28/999",
"Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999",
"Dalvik/2.1.0 (Linux; U; Android 5.0.2; Redmi Note 2 MIUI/V7.5.5.0.LHMCNDE",
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
"Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
"Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
"Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
"Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
"Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
"Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
"Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
]

PROXIES = [
    {'ip_port': '111.11.228.75:80', 'user_pass': ''},
    {'ip_port': '120.198.243.22:80', 'user_pass': ''},
    {'ip_port': '111.8.60.9:8123', 'user_pass': ''},
]
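Both the scheduler and the proxy caches used later live in Redis, so it is worth confirming the REDIS_HOST configured above is reachable before the first run. A minimal, purely illustrative check (host and port as in the settings):

import redis

# Prints True if the Redis instance at 192.168.0.1:6379 is reachable.
client = redis.Redis(host='192.168.0.1', port=6379)
print client.ping()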

Edit pipelines.py:

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 
import pymongo
import traceback

class ConsolePipeline(object):

    def process_item(self, item, spider):
        print item
        return item

class MongoDBPipeline(object):

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(host='192.168.0.21', port=27017)
        self.db = self.client['test']
        self.db.authenticate('root', '123456')

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        try:
            self.db['qqgroup'].insert(dict(item))
        except pymongo.errors.DuplicateKeyError:
            pass
        except Exception, e:
            print e.message
            print traceback.format_exc()
        return item
The pipeline talks to MongoDB through the pymongo driver, so install it first: pip install pymongo
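After a short crawl you can check that items are actually landing in MongoDB. A minimal sketch using the same host, credentials, database and collection as MongoDBPipeline above:

import pymongo

client = pymongo.MongoClient(host='192.168.0.21', port=27017)
db = client['test']
db.authenticate('root', '123456')
print db['qqgroup'].count()      # number of stored groups
print db['qqgroup'].find_one()   # one sample document
client.close()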

Edit items.py:

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html 
import scrapy

class QQGroupItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    _id = scrapy.Field()
    groupCode = scrapy.Field()
    groupFlag = scrapy.Field()
    groupLevel = scrapy.Field()
    groupMemNum = scrapy.Field()
    groupMaxMem = scrapy.Field()
    groupOwner = scrapy.Field()
    groupName = scrapy.Field()
    groupIntro = scrapy.Field()
    groupTags = scrapy.Field()
    groupClass = scrapy.Field()
    groupClass2 = scrapy.Field()
    groupClassText = scrapy.Field()
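A QQGroupItem behaves like a dict, which is exactly what MongoDBPipeline relies on when it calls dict(item). A quick illustration with made-up values:

from qqgroup_crawler.items import QQGroupItem

item = QQGroupItem()
item['groupCode'] = 123456
item['groupName'] = u'Scrapy study group'
print dict(item)  # {'groupCode': 123456, 'groupName': u'Scrapy study group'}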

Edit middlewares.py:

# -*- coding: utf-8 -*-

import os
import re
import sys
import json
import redis
import random
import base64
import requests
import platform
from bs4 import BeautifulSoup
from selenium import webdriver
from settings import REDIS_HOST, REDIS_PORT

default_encoding = 'utf-8'
if sys.getdefaultencoding() != default_encoding:
    print sys.getdefaultencoding()
    reload(sys)
    sys.setdefaultencoding(default_encoding)

class UserAgentMiddleware(object):

    def __init__(self, agents):
        self.agents = agents

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings.getlist('USER_AGENTS'))

    def process_request(self, request, spider):
        user_agent = random.choice(self.agents)
        print "**********User-Agent: " + user_agent
        request.headers.setdefault('User-Agent', user_agent)

class QueueProxyMiddleware(object):

    def __init__(self):
        self.redis_client = redis.Redis(host=REDIS_HOST, port=REDIS_PORT)

    def process_request(self, request, spider):
        proxies_data = self.redis_client.get('QUEUE_PROXIES')
        if proxies_data:
            proxies = json.loads(proxies_data)
        else:
            proxies = []
            proxies_queue = self.redis_client.zrange('proxy_id_queue', 0, 200)
            for ip_port in proxies_queue:
                proxy = {}
                proxy['ip_port'] = str(ip_port).strip().replace(' ', '')
                proxy['user_pass'] = ''
                proxies.append(proxy)
            # cache the rebuilt list in Redis for 180 seconds
            self.redis_client.set('QUEUE_PROXIES', json.dumps(proxies), 180)

        proxy = random.choice(proxies)
        request.meta['proxy'] = "http://%s" % proxy['ip_port']
        if proxy['user_pass']:
            # only send credentials when the proxy actually has them
            encoded_user_pass = base64.encodestring(proxy['user_pass'])
            request.headers['Proxy-Authorization'] = 'Basic ' + encoded_user_pass
            print "**************QueueProxyMiddleware with auth************" + proxy['ip_port']
        else:
            print "**************QueueProxyMiddleware without auth************" + proxy['ip_port']

class StaticProxyMiddleware(object):

    def __init__(self):
        self.redis_client = redis.Redis(host=REDIS_HOST, port=REDIS_PORT)

    def process_request(self, request, spider):
        proxies_data = self.redis_client.get('STATIC_PROXIES')
        if proxies_data:
            proxies = json.loads(proxies_data)
        else:
            proxies = []
            proxies_01 = static_crawl_proxy360_proxy_ip()
            proxies_02 = static_crawl_xicidaili_proxy_ip()
            proxies.extend(proxies_01)
            proxies.extend(proxies_02)
            # cache the freshly scraped list in Redis for 300 seconds
            self.redis_client.set('STATIC_PROXIES', json.dumps(proxies), 300)

        proxy = random.choice(proxies)
        request.meta['proxy'] = "http://%s" % proxy['ip_port']
        if proxy['user_pass']:
            # only send credentials when the proxy actually has them
            encoded_user_pass = base64.encodestring(proxy['user_pass'])
            request.headers['Proxy-Authorization'] = 'Basic ' + encoded_user_pass
            print "**********StaticProxyMiddleware with auth**********" + proxy['ip_port']
        else:
            print "**********StaticProxyMiddleware without auth**********" + proxy['ip_port']

class DynamicProxyMiddleware(object):

    def __init__(self):
        self.redis_client = redis.Redis(host=REDIS_HOST, port=REDIS_PORT)

    def process_request(self, request, spider):
        proxies_data = self.redis_client.get('DYNAMIC_PROXIES')
        if proxies_data:
            proxies = json.loads(proxies_data)
        else:
            proxies = dynamic_crawl_goubanjia_proxy_ip()
            # cache the freshly fetched list in Redis for 60 seconds
            self.redis_client.set('DYNAMIC_PROXIES', json.dumps(proxies), 60)

        proxy = random.choice(proxies)
        request.meta['proxy'] = "http://%s" % proxy['ip_port']
        if proxy['user_pass']:
            # only send credentials when the proxy actually has them
            encoded_user_pass = base64.encodestring(proxy['user_pass'])
            request.headers['Proxy-Authorization'] = 'Basic ' + encoded_user_pass
            print "**********DynamicProxyMiddleware with auth**********" + proxy['ip_port']
        else:
            print "**********DynamicProxyMiddleware without auth**********" + proxy['ip_port']

class SeleniumProxyMiddleware(object):

    def __init__(self):
        self.redis_client = redis.Redis(host=REDIS_HOST, port=REDIS_PORT)

    def process_request(self, request, spider):
        proxies_data = self.redis_client.get('SELENIUM_PROXIES')
        if proxies_data:
            proxies = json.loads(proxies_data)
        else:
            proxies = []
            goubanjia_proxies = selenium_crawl_goubanjia_proxy_ip()
            proxies.extend(goubanjia_proxies)
            xicidaili_proxies = selenium_crawl_xicidaili_proxy_ip()
            proxies.extend(xicidaili_proxies)
            # cache the freshly scraped list in Redis for 300 seconds
            self.redis_client.set('SELENIUM_PROXIES', json.dumps(proxies), 300)

        proxy = random.choice(proxies)
        request.meta['proxy'] = "http://%s" % proxy['ip_port']
        if proxy['user_pass']:
            # only send credentials when the proxy actually has them
            encoded_user_pass = base64.encodestring(proxy['user_pass'])
            request.headers['Proxy-Authorization'] = 'Basic ' + encoded_user_pass
            print "**********SeleniumProxyMiddleware with auth**********" + proxy['ip_port']
        else:
            print "**********SeleniumProxyMiddleware without auth**********" + proxy['ip_port']

def static_crawl_proxy360_proxy_ip():
    url = 'http://www.proxy360.cn/default.aspx'
    response = requests.get(url)
    html = BeautifulSoup(response.text, 'html.parser')
    proxies = []
    ip_port = ''
    span_tags = html.select('div.proxylistitem span.tbBottomLine')
    for span_tag in span_tags:
        if span_tag.has_attr('style'):
            style_value = span_tag.get('style')
            if style_value == 'width:140px;':
                # IP column
                ip_port = ip_port + span_tag.string.strip()
            elif style_value == 'width:50px;':
                # port column: complete the ip:port pair and start over
                ip_port = ip_port + ':' + span_tag.string.strip()
                proxy = {}
                proxy['ip_port'] = ip_port
                proxy['user_pass'] = ''
                proxies.append(proxy)
                ip_port = ''
    return proxies

def static_crawl_goubanjia_proxy_id():
    url = 'http://www.goubanjia.com/'
    req_session = requests.session()
    headers = {"Accept": "text/html,application/xhtml+xml,application/xml;",
               "Accept-Encoding": "gzip",
               "Accept-Language": "zh-CN,zh;q=0.8",
               "Referer": "http://www.baidu.com/",
               "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36"
               }
    response = req_session.get(url, headers=headers)
    html = BeautifulSoup(response.text, 'html.parser')
    proxies = []
    td_tags = html.select('table.table tr td')
    ip_port = ''
    for td_tag in td_tags:
        if td_tag.has_attr('class'):
            class_value = td_tag.get('class')
            if class_value[0] == 'ip':
                # the ip cell is obfuscated with nested inline tags
                td_tag_all_tags = td_tag.contents
                ip = ''
                for td_tag_tag in td_tag_all_tags:
                    if td_tag_tag.has_attr('style'):
                        style_name = td_tag_tag.get('style').strip().replace(' ', '')
                        if style_name and (style_name == 'display:inline-block;'):
                            if td_tag_tag.string:
                                ip = ip + td_tag_tag.string
                    else:
                        if td_tag_tag.string:
                            ip = ip + td_tag_tag.string
                print ip
                ip_port = ip_port + ip
            else:
                print td_tag
                print td_tag.string
                ip_port = ip_port + ':' + td_tag.string
                proxy = {}
                proxy['ip_port'] = ip_port
                proxy['user_pass'] = ''
                proxies.append(proxy)
                ip_port = ''
    print proxies
    return proxies  # return the list like the other helpers do

def static_crawl_xicidaili_proxy_ip():
    url = 'http://api.xicidaili.com/free2016.txt'
    response = requests.get(url)
    pattern = '(\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}):(\\d+)'
    ip_port_array = re.findall(pattern, response.text)
    proxies = []
    for ip_port in ip_port_array:
        proxy = {}
        proxy['ip_port'] = ip_port[0] + ':' + ip_port[1]
        proxy['user_pass'] = ''
        proxies.append(proxy)
    return proxies

def dynamic_crawl_goubanjia_proxy_ip():
    proxies = []
    ips = set()
    order_id = 'a66cff43be83d8f1c3724945ded69549'
    for i in xrange(100):
        url = 'http://dynamic.goubanjia.com/dynamic/get/' + order_id + '.html?ttl'
        response = requests.get(url)
        datas = str(response.text).split(':')
        port_time = datas[1].split(',')
        if datas[0] not in ips:
            ips.add(datas[0])
            proxy = {}
            proxy['ip_port'] = datas[0] + ':' + port_time[0].strip()
            proxy['user_pass'] = ''
            proxies.append(proxy)
    return proxies

def selenium_crawl_goubanjia_proxy_ip():
    parent_dir = os.path.dirname(__file__)  # directory containing this file
    current_operation_system = platform.system()
    if current_operation_system == 'Windows':
        driver_file_path = os.path.join(parent_dir, 'driver', 'chromedriver.exe')
    elif current_operation_system == 'Linux':
        driver_file_path = os.path.join(parent_dir, 'driver', 'chromedriver')
    print driver_file_path

    chrome_driver = os.path.abspath(driver_file_path)
    os.environ['webdriver.chrome.driver'] = chrome_driver

    if current_operation_system == 'Windows':
        browser = webdriver.Chrome(chrome_driver)
    elif current_operation_system == 'Linux':
        # write the chromedriver log next to the driver binary, not "inside" it
        service_log_path = "{}/chromedriver.log".format(os.path.dirname(chrome_driver))
        service_args = ['--verbose']
        browser = webdriver.Chrome(chrome_driver, service_args=service_args, service_log_path=service_log_path)

    browser.get("http://www.goubanjia.com/")
    ips = []
    ip_elements = browser.find_elements_by_css_selector('table.table tr td.ip')
    for ip_element in ip_elements:
        ips.append(ip_element.text)
    ports = []
    port_elements = browser.find_elements_by_css_selector('table.table tr td.port')
    for port_element in port_elements:
        ports.append(port_element.text)
    proxies = []
    for i in xrange(len(ips)):
        proxy = {}
        proxy['ip_port'] = ips[i] + ':' + ports[i]
        proxy['user_pass'] = ''
        proxies.append(proxy)

    browser.close()
    browser.quit()
    return proxies

def selenium_crawl_xicidaili_proxy_ip():
    parent_dir = os.path.dirname(__file__)  # directory containing this file
    current_operation_system = platform.system()
    if current_operation_system == 'Windows':
        driver_file_path = os.path.join(parent_dir, 'driver', 'chromedriver.exe')
    elif current_operation_system == 'Linux':
        driver_file_path = os.path.join(parent_dir, 'driver', 'chromedriver')
    print driver_file_path

    chrome_driver = os.path.abspath(driver_file_path)
    os.environ['webdriver.chrome.driver'] = chrome_driver

    if current_operation_system == 'Windows':
        browser = webdriver.Chrome(chrome_driver)
    elif current_operation_system == 'Linux':
        # write the chromedriver log next to the driver binary
        service_log_path = "{}/chromedriver.log".format(os.path.dirname(chrome_driver))
        service_args = ['--verbose']
        browser = webdriver.Chrome(chrome_driver, service_args=service_args, service_log_path=service_log_path)

    proxies = []
    for i in xrange(11):
        target_url = "http://www.xicidaili.com/nt/" + str(i)
        browser.get(target_url)
        tr_elements = browser.find_elements_by_css_selector('tr.odd')
        for tr_element in tr_elements:
            datas = str(tr_element.text).split(' ')
            proxy = {}
            proxy['ip_port'] = datas[0] + ':' + datas[1]
            proxy['user_pass'] = ''
            proxies.append(proxy)

    browser.close()
    browser.quit()
    return proxies
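The free proxy sites these helpers scrape come and go, so it can be worth testing one of them in isolation before wiring everything through Scrapy. A minimal, purely illustrative check that can be appended to middlewares.py and run directly from the package directory (assuming the xicidaili endpoint is still reachable):

if __name__ == '__main__':
    # Ad-hoc test: fetch the free proxy list once and print a small sample.
    sample = static_crawl_xicidaili_proxy_ip()
    print 'fetched %d proxies' % len(sample)
    for p in sample[:3]:
        print p['ip_port']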

Finally, create the spider file qqgroupspider.py under the spiders directory:

# -*- coding: utf-8 -*-

import os
import re
import sys
import json
import urllib
import scrapy
from qqgroup_crawler.items import QQGroupItem
from scrapy_redis.spiders import RedisCrawlSpider

default_encoding = 'utf-8'
if sys.getdefaultencoding() != default_encoding:
    print sys.getdefaultencoding()
    reload(sys)
    sys.setdefaultencoding(default_encoding)

class QQGroupSpider(RedisCrawlSpider):

    name = "qqgroup_spider"
    allowed_domains = ["qq.com"]
    redis_key = 'qqgroup_spider:qqgroup'

    def parse(self, response):
        json_obj = json.loads(response.body)
        error_code = json_obj['ec']
        if error_code == 0:
            group_list = json_obj['gList']
            for group in group_list:
                qq_group_item = QQGroupItem()
                qq_group_item['_id'] = group['gc']
                qq_group_item['groupCode'] = group['gc']
                qq_group_item['groupFlag'] = group['gFlag']
                qq_group_item['groupLevel'] = group['gLevel']
                qq_group_item['groupMemNum'] = group['gMemNum']
                qq_group_item['groupMaxMem'] = group['gMaxMem']
                qq_group_item['groupOwner'] = group['gOwner']
                qq_group_item['groupName'] = group['gName'].strip()
                qq_group_item['groupIntro'] = group['gIntro'].strip()
                qq_group_item['groupTags'] = group['gTags'].strip()
                qq_group_item['groupClass'] = group['gClass']
                qq_group_item['groupClass2'] = group['gClass2'].strip()
                qq_group_item['groupClassText'] = group['gClassText'].strip()
                yield qq_group_item
            is_end_flag = json_obj['IsEnd']
            if is_end_flag == 1:
                print 'current url pagination has finished!'
            else:
                # bump the p= parameter to request the next result page
                current_url = response.url
                regex = 'p=\d+'
                matcher = re.compile(regex).search(current_url)
                if matcher:
                    page_string = matcher.group()
                    page_tag = page_string[0:page_string.index('=') + 1]
                    page_num = int(page_string[page_string.index('=') + 1:])
                    new_page_string = page_tag + str(page_num + 1)
                    next_url = current_url.replace(page_string, new_page_string)
                    cookies = response.request.meta['cookies']
                    meta = {'cookiejar': 1, 'cookies': cookies}
                    request = scrapy.Request(
                        url=next_url,
                        method='GET',
                        cookies=cookies,
                        meta=meta
                    )
                    yield request
                else:
                    print 'not match'
        else:
            print json_obj['em']

    def start_requests(self):
        parent_dir = os.path.dirname(__file__)
        zh_file_path = os.path.join(parent_dir, 'qqgroup_zh.txt')
        keywords = []
        with open(zh_file_path, 'r') as f:
            line = f.readline()
            while line:
                keywords.append(line.strip())
                line = f.readline()
        en_file_path = os.path.join(parent_dir, 'qqgroup_en.txt')
        with open(en_file_path, 'r') as f:
            line = f.readline()
            while line:
                keywords.append(line.strip())
                line = f.readline()
        for keyword in keywords:
            url = 'http://qqun.qq.com/cgi-bin/qun_search/search_group?k=%s&p=2&n=8&c=1&t=0&st=1&r=0.8119000566657633&d=1&bkn=825315115&v=0' % (urllib.quote(keyword))
            cookie = 'xxxxxxxxxxxxxxxxxxx'  # fill in a valid QQ cookie string here
            cookies = {}
            items = cookie.split(';')
            for item in items:
                kv = item.split('=')
                if len(kv) == 2:  # skip fragments that are not key=value pairs
                    cookies[kv[0].strip()] = kv[1]
            # cookies = {'Cookie': cookie}
            meta = {'cookiejar': 1, 'cookies': cookies}
            request = scrapy.Request(
                url=url,
                method='GET',
                cookies=cookies,
                meta=meta
            )
            yield request

Run the Scrapy project:

scrapy crawl qqgroup_spider
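If you would rather launch the crawl from a script than from the command line (for example under a process supervisor), the following sketch works as well; the spider's module path is assumed from the file name qqgroupspider.py used above.

# run_spider.py -- placed next to scrapy.cfg; loads the project settings and
# starts the same spider programmatically.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from qqgroup_crawler.spiders.qqgroupspider import QQGroupSpider

process = CrawlerProcess(get_project_settings())
process.crawl(QQGroupSpider)
process.start()  # blocks until the crawl finishes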

And the crawler gets to work!