您的位置:首页 > 其它

vp_获取页面所有信息

2017-08-14 23:30 225 查看
#!/usr/bin/python
# encoding=utf-8
"""Scrape search results from qikan.cqvip.com with Selenium.

For every entry (<dl>) on every result page, collect title, journal,
author, citation count, fund, abstract and keywords, print them, and
append one row per entry to a CSV file ('+' separators are inserted so
the single CSV cell can later be split into columns in Excel).
"""
__author__ = 'henson'
from bs4 import BeautifulSoup
from prettytable import PrettyTable
import selenium
import sys
import urllib
import requests
import time
import re
import csv


def _entry_xpath(j, suffix):
    """Return the XPath of field *suffix* inside the j-th result entry."""
    return '//*[@id="body"]/div/div[3]/div[4]/dl[' + str(j) + ']/' + suffix


if __name__ == "__main__":
    import os
    from selenium import webdriver
    from selenium.webdriver.support.ui import WebDriverWait

    # NOTE(review): defined but never passed to Selenium; presumably intended
    # for a requests fallback — confirm before deleting.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}

    chromedriver = "/home/henson/Documents/pycharm/webdriver/chromedriver"
    os.environ["webdriver.chrome.driver"] = chromedriver
    driver = webdriver.Chrome(chromedriver)
    driver.get('http://qikan.cqvip.com/zk/search.aspx?from=zk_search&key=U%3D%E7%94%9F%E6%80%81%E6%81%A2%E5%A4%8D%E4%B8%8E%E6%B2%BB%E7%90%86&size=50&page=1#search-result-list')
    # Scroll to the bottom so lazily-rendered entries appear.  (Originally this
    # ran before driver.get(), where it had no page to act on.)
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

    # 'with' guarantees the CSV handle is closed even if a locator raises.
    with open("/home/henson/Downloads/vp/new/cloudy.csv", "a+", encoding='utf-8') as f:
        writer = csv.writer(f)
        for page in range(1, 300):          # result pages
            for j in range(1, 51):          # up to 50 entries per page
                title = driver.find_element_by_xpath(_entry_xpath(j, 'dt/a')).text
                journal = driver.find_element_by_xpath(_entry_xpath(j, 'dd[3]/span[1]')).text
                author = driver.find_element_by_xpath(_entry_xpath(j, 'dd[3]/span[@class="writer"]')).text
                cited = driver.find_element_by_xpath(_entry_xpath(j, 'dt/span[@class="cited"]')).text

                print(title)
                print(journal)
                print(author)
                print(cited)

                try:
                    # Entry WITH a fund line: the "expand more" link sits at dd[5]
                    # and the fund/abstract/keywords follow at dd[4]/dd[5]/dd[6].
                    driver.find_element_by_xpath(_entry_xpath(j, 'dd[5]/a')).click()  # expand more
                    fund = driver.find_element_by_xpath(_entry_xpath(j, 'dd[4]/span[@class="fund"]')).text
                    abstract = driver.find_element_by_xpath(_entry_xpath(j, 'dd[5]/span')).text
                    keyword = driver.find_element_by_xpath(_entry_xpath(j, 'dd[6]/span[@class="subject"]')).text
                except Exception:
                    # Entry WITHOUT a fund line: everything shifts up one <dd>.
                    try:
                        driver.find_element_by_xpath(_entry_xpath(j, 'dd[4]/a')).click()  # expand more
                    except Exception:
                        break  # no such entry — this page has fewer than 50 results
                    fund = " "
                    abstract = driver.find_element_by_xpath(_entry_xpath(j, 'dd[4]/span[1]')).text
                    keyword = driver.find_element_by_xpath(_entry_xpath(j, 'dd[5]/span[@class="subject"]')).text

                print(abstract)
                print(fund)
                print(keyword)

                # '+' markers let the single CSV field be split into columns later.
                writer.writerows([(title, '+', author, '+', cited, '+', fund)])

            now_handle = driver.current_window_handle   # current window handle
            all_handles = driver.window_handles         # all window handles
            driver.find_element_by_xpath(
                '//*[@id="body"]/div/div[3]/div[6]/div[2]/span[2]/a[2]').click()  # next page
            time.sleep(2)
            # If pagination opened a new window, follow it.
            for handle in all_handles:
                if handle != now_handle:
                    driver.switch_to_window(handle)

# XPath reference (observed on the result page):
#   //*[@id="body"]/div/div[3]/div[4]/dl[7]/dd[4]/a        expand-more link
#   //*[@id="body"]/div/div[3]/div[4]/dl[8]/dd[4]/span[1]  abstract when no fund line
#   //*[@id="body"]/div/div[3]/div[4]/dl[9]/dd[4]/span     fund line
#   //*[@id="body"]/div/div[3]/div[4]/dl[9]/dd[5]/span     abstract below the fund line


根据需求改写了代码。

通过 try 把"有展开更多"和"没有展开更多"两种情况下的摘要都获取下来了。
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: