
Python - Selenium + Chromedriver + Wish

# -*- coding: utf-8 -*-
"""
Created on Tue Jun 28 21:51:02 2016

@author: wzhao3
"""
import os
import xlrd
import random
import time
import datetime
import xlsxwriter
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
os.chdir("C:\\Temp")

import MySQLdb  ## imported here but not used in this script
##################################################################
#### Read tag ID list
workbook=xlrd.open_workbook('2016-06-28 08-50-15.xlsx')
len(workbook.sheet_names()) ## 13

id_list=[]

for i in range(len(workbook.sheet_names())):
    id_list.extend(workbook.sheet_by_index(i).col_values(0)[1:])

len(id_list) ## 2724
len(set(id_list)) ## 2376

id_list=list(set(id_list))
id_list=id_list[:1000]

##################################################################
#### Loop over 200 IDs each time
my_headers = [
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30; .NET CLR 3.0.04506.648)',
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; InfoPath.1)',
'Mozilla/4.0 (compatible; GoogleToolbar 5.0.2124.2070; Windows 6.0; MSIE 8.0.6001.18241)',
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)',
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; Sleipnir/2.9.8)',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36'
# additional user-agent strings omitted here for brevity
]

print "Start time is : "+ str(datetime.datetime.now())
ts = time.clock()
dt=datetime.datetime.strftime(datetime.datetime.now(),'%Y-%m-%d %H-%M-%S')
workbook = xlsxwriter.Workbook('ID_'+dt+'.xlsx')

loop_cnt=200

for id_num in range(len(id_list)/loop_cnt + 1):
    print "ID_num= "+str(id_num)
    tag_ids=id_list[id_num*loop_cnt:(id_num+1)*loop_cnt]
    if len(tag_ids)==0:
        continue
    random_header = random.choice(my_headers)

    ######### Wish login
    options = webdriver.ChromeOptions()
    options.add_argument('--user-agent=%s'%random_header)
    driver = webdriver.Chrome(chrome_options=options) # start a local Chrome session with the randomized user agent
    driver.get('https://www.wish.com/')

    ## Switch to email login
    driver.find_element_by_xpath("//*[@id=\"signup-form\"]/div[6]").click()
    ## Input email & password
    driver.find_element_by_xpath("//*[@id=\"login-email\"]").send_keys("cmes1988@163.com")
    driver.find_element_by_xpath("//*[@id=\"login-password\"]").send_keys("film007love")

    ## Click login
    driver.find_element_by_xpath("//*[@id=\"email-login-form\"]/button").click()
    driver.implicitly_wait(10)
    time.sleep(3)
    ######### Wish login done

    Items=[['external_url','num_extra_photos', 'num_bought', 'variations', 'price', 'shipping_price', 'brand', 'name', 'generation_time', 'tags_number', 'tags', 'description']]
    ######### Loop over each tag URL in this batch
    for i in range(len(tag_ids)):
        print "Tag num= "+str(i)
        item=[]
        url=tag_ids[i]
        driver.get(url)
        time.sleep(3)

        ## Wish embeds the product data in a page-level JS object named pageParams
        d=driver.execute_script("return pageParams;")
        if 'mainContestObj' not in d.keys():
            print url+" has been removed"
            continue

        d=d['mainContestObj']

        item.append(d['external_url'])      ## url
        item.append(d['num_extra_photos'])  ## number of extra photos
        item.append(d['num_bought'])        ## number bought
        item.append(len(d['commerce_product_info']['variations']))  ## number of variations
        item.append(d['commerce_product_info']['variations'][0]['original_price'])     ## original price
        item.append(d['commerce_product_info']['variations'][0]['original_shipping'])  ## original shipping
        item.append(d['brand'])             ## brand
        item.append(d['name'])              ## name
        item.append(d['generation_time'])   ## generation time
        item.append(len(d['tags']))         ## number of tags

        ## Join all tag names into one comma-separated string
        tag=''
        for j in range(len(d['tags'])):
            if j==0:
                tag=unicode(d['tags'][j]['name'])
            else:
                tag=tag+', '+unicode(d['tags'][j]['name'])
        item.append(tag) ## tags
        item.append(d['description'])

        for j in d['extra_photo_urls'].keys():
            item.append(d['extra_photo_urls'][j])

        Items.append(item)
    ## End for i

    ## Write this batch to its own worksheet
    worksheet = workbook.add_worksheet('Sheets_'+str(id_num))
    for ij in range(len(Items)):
        worksheet.write_row('A'+str(ij+1), Items[ij])
    worksheet.set_column('A:A', 25)

    driver.close()
    time.sleep(5)

## End for id_num
workbook.close()
print "End time is : "+ str(datetime.datetime.now())
te = time.clock()
print "%f seconds for program" %(te-ts)

"""
fo = open("foo.txt", "wb")
fo.write( str(d));
fo.close()

tag=''

for j in range(len(d['tags'])):
    if j==0:
        tag=str(d['tags'][j]['name'])
    else:
        tag=tag+', '+str(d['tags'][j]['name'])

a='--user-agent=%s'%random_header

##### Wish login
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

options = webdriver.ChromeOptions()
options.add_argument('--user-agent=%s'%random_header)
driver = webdriver.Chrome(chrome_options=options) # start a local Chrome session with the custom user agent

driver.get(url)
driver.refresh()
driver.implicitly_wait(30)

## driver.get_cookies()

driver.get('https://www.wish.com/')

## Email Login
driver.find_element_by_xpath("//*[@id=\"signup-form\"]/div[6]").click()
## Input email & password
driver.find_element_by_xpath("//*[@id=\"login-email\"]").send_keys("cmes1988@163.com")
driver.find_element_by_xpath("//*[@id=\"login-password\"]").send_keys("film007love")

## Click login
driver.find_element_by_xpath("//*[@id=\"email-login-form\"]/button").click()

driver.get(url)

mainContestObj=driver.execute_script("return pageParams;")['mainContestObj']
type(mainContestObj)

mainContestObj.keys()

## Download photos (img_links: a list of image URLs collected beforehand)
import urllib
img_counter = 0
for img_link in img_links:
    img_name = '%s.jpg' % img_counter
    urllib.urlretrieve(img_link, "//Users//Sean//Downloads//tieba//%s" % img_name)
    img_counter += 1

d['is_clean']
len(d['extra_photo_urls'])
d['num_extra_photos']

d
d['num_bought']

len(d['commerce_product_info']['variations']) ## 8
d['commerce_product_info']['variations'][0]

d['commerce_product_info']['variations'][0]['original_price']
d['commerce_product_info']['variations'][0]['return_policy_long']
d['commerce_product_info']['variations'][0]['shipping_before_personal_price'] ## 8
d['commerce_product_info']['variations'][0]['original_shipping'] ## 6

d['description']
d['generation_time']

d['tags']
len(d['tags']) ## 14
d['tags'][0]['name']

d['brand']
d['name']
d['value'] ## 245
d['external_url']


driver.close()
"""

"""
import requests

a=requests.get('https://contestimg.wish.com/api/webimage/565584faae31864d490cc0f4-10-small.jpg')
with open('a.jpg', "wb") as fo:
    fo.write(a.content)

"""

"""

import urllib2
import re
import os
import json
from bs4 import BeautifulSoup

response=urllib2.urlopen('https://www.wish.com/c/565584faae31864d490cc0f4')
html=response.read()
response.close()

print response.getcode()
## Prints 403, i.e. access denied; likewise 200 means the request completed successfully, and 404 means the URL was not found.

soup = BeautifulSoup(html)

b=[x.extract() for x in soup.find_all( 'script' )][14]
pattern = re.compile(r'{.+}')
c=pattern.search(str(b)).group()
d=json.loads(c)

# print random_header to inspect the header info being submitted
req = urllib2.Request(url)
req.add_header("User-Agent", random_header)
req.add_header('Host', 'www.wish.com')
req.add_header('Referer', 'https://www.wish.com/c/565584faae31864d490cc0f4')
req.add_header('GET', url)
content = urllib2.urlopen(req).read()

fo = open("foo.txt", "wb")
fo.write( str(d));
fo.close()

soup.title
soup.p
soup.a['href']

soup.find_all('b')
soup.head

print soup.find_all('title')
print soup.find_all('a')
print soup.find_all(id="link2")
print soup.find_all(id=True)

print soup.find_all("a", class_="sister")
print soup.select("p.title")

# search by attribute
print soup.find_all("a", attrs={"class": "sister"})

# search by text
print soup.find_all(text="Elsie")
print soup.find_all(text=["Tillie", "Elsie", "Lacie"])

# limit the number of results
print soup.find_all("a", limit=2)

soup.find_all(id="footer-like")

a=[x.extract() for x in soup.find_all( 'script' )]
print a[14]

type(a) ## list
type(a[14]) # bs4.element.Tag
b=a[14]

print str(b)
len(str(b))

pattern = re.compile("{*}")
res = pattern.search(str(b)).groups()
print res
re.findall("{*}",a)

pattern = re.compile(r'{.+}')

c=pattern.search(str(b)).group()

d=json.loads(c)
type(d)
d.keys()
d['is_clean']

# Compile the regular expression into a Pattern object
pattern = re.compile(r'hello')

# Match text against the Pattern; returns None when there is no match
match = pattern.match('hello world!')

match.group()
"""
Tags: python selenium