python 批量下载知网(CNKI)论文
2017-03-20 15:09
816 查看
1、目的:
朋友找我去知网帮他下载点相关论文,发现老是要去点击文章、点击下载,显得很麻烦。百度一下,别人的方法太复杂,所以自己写了一个python脚本自动下载知网论文。
2、前期准备
1)安装python 2.7
2)安装 selenium
3)下载一个chromedriver.exe,放到脚本同一个文件夹内
4)安装chrome浏览器
3、直接撸代码
(a)指定关键字下载知网论文
(b)指定论文题目下载知网论文
这个需要在和脚本同目录下新建一个downfile.txt,按行存放需要下载的论文题目
很好用,让我帮助同学下载知网论文,妈妈再也不要担心我点错了。。。
朋友找我去知网帮他下载点相关论文,发现老是要去点击文章、点击下载,显得很麻烦。百度一下,别人的方法太复杂,所以自己写了一个python脚本自动下载知网论文。
2、前期准备
1)安装python 2.7
2)安装 selenium
pip install selenium
3)下载一个chromedriver.exe,放到脚本同一个文件夹内
4)安装chrome浏览器
3、直接撸代码
(a)指定关键字下载知网论文
downloadCNKI.py
#!/usr/bin/env Python # coding=utf-8 import os from time import sleep from selenium import webdriver def browser_init(isWait): options = webdriver.ChromeOptions() prefs = {'profile.default_content_settings.popups': 0, 'download.default_directory': 'E:\\PycharmProjects\\downloadCNKI\\output'} options.add_experimental_option('prefs', prefs) browser = webdriver.Chrome(executable_path='chromedriver.exe', chrome_options=options) browser.set_window_size(500,500) if isWait: browser.implicitly_wait(50) return browser def searchKey(keyword): browser.get("http://kns.cnki.net/kns/brief/default_result.aspx") browser.find_element_by_id('txt_1_value1').send_keys(keyword) browser.find_element_by_id('btnSearch').click() def switchToFrame(browser): #print 'start switch' browser.switch_to.frame('iframeResult') #print 'end switch' def getDownloadLinks(browser,paper_downloadLinks): for link in browser.find_elements_by_css_selector('a[href^=\/kns\/detail]'): #link.click() url=link.get_attribute('href') url_part = url.split('&')[3:6] url_str= '&'.join(url_part) down_url='http://kns.cnki.net/KCMS/detail/detail.aspx?'+url_str #print down_url paper_downloadLinks.append(down_url) def switchToPage(browser,n): for link in browser.find_elements_by_css_selector('a[href^=\?curpage]'): url=link.get_attribute('href') print url pageInd='curpage=%d&'%n print pageInd if pageInd in url: print "page: "+url link.click() break def switchNextPage(browser): browser.find_element_by_link_text(u'下一页').click() def do_download(driver,urls,fail_downLoadUrl): for url in urls: print url driver.get(url) paper_title=driver.title print "paper title"+paper_title if u'中国专利全文数据库' in paper_title: continue print "try download :"+paper_title try: driver.find_element_by_xpath("//a[contains(text(),'PDF下载')]").click() print "download success!!!" except Exception as e: try: driver.find_element_by_xpath("//a[contains(text(),'整本下载')]").click() print "download success!!!" except Exception as e: print "download fail!!!" 
fail_downLoadUrl.append(url) def usage(): print "example : python downloadCNKI.py -k keyword -p 1" if __name__=="__main__": keyword=u'三角形' #论文搜索的关键字 pageNum = 1 # 下载多少页的论文 browser=browser_init(True) searchKey(keyword) switchToFrame(browser) paper_downloadLinks = [] #论文下载链接 e6fb curPage=1 while curPage<=pageNum: getDownloadLinks(browser,paper_downloadLinks) switchNextPage(browser); curPage+=1 browser.quit() print "采集了%d条数据"% len(paper_downloadLinks) driver=browser_init(False) fail_downLoadUrl=[] #记录下失败的网站 do_download(driver,paper_downloadLinks,fail_downLoadUrl) print fail_downLoadUrl tryNum=0 #尝试N次重新下载没有下载的 while tryNum<5: if len(fail_downLoadUrl) !=0: paper_downloadLinks=fail_downLoadUrl fail_downLoadUrl=[] do_download(driver, paper_downloadLinks, fail_downLoadUrl) print fail_downLoadUrl else: break tryNum+=1 sleep(60) driver.quit()
(b)指定论文题目下载知网论文
这个需要在和脚本同目录下新建一个downfile.txt,按行存放需要下载的论文题目
指定题目到downfile.txt的知网下载.py
#!/usr/bin/env Python # coding=utf-8 import os from time import sleep from selenium import webdriver def browser_init(isWait): options = webdriver.ChromeOptions() prefs = {'profile.default_content_settings.popups': 0, 'download.default_directory': 'E:\\PycharmProjects\\downloadCNKI\\output'} options.add_experimental_option('prefs', prefs) browser = webdriver.Chrome(executable_path='chromedriver.exe', chrome_options=options) browser.set_window_size(500,500) if isWait: browser.implicitly_wait(50) return browser def searchKey(keyword): browser.get("http://kns.cnki.net/kns/brief/default_result.aspx") browser.find_element_by_id('txt_1_value1').send_keys(keyword) browser.find_element_by_id('btnSearch').click() def switchToFrame(browser): #print 'start switch' browser.switch_to.frame('iframeResult') #print 'end switch' def getDownloadLinks(browser,paper_downloadLinks): for link in browser.find_elements_by_css_selector('a[href^=\/kns\/detail]'): #link.click() url=link.get_attribute('href') url_part = url.split('&')[3:6] url_str= '&'.join(url_part) down_url='http://kns.cnki.net/KCMS/detail/detail.aspx?'+url_str #print down_url paper_downloadLinks.append(down_url) def getKeywordDownloadLink(browser,keyword,paper_downloadLinks): link=browser.find_element_by_link_text(keyword) url = link.get_attribute('href') #print url url_part = url.split('&')[3:6] url_str = '&'.join(url_part) down_url = 'http://kns.cnki.net/KCMS/detail/detail.aspx?' 
+ url_str #print down_url paper_downloadLinks.append(down_url) def switchToPage(browser,n): for link in browser.find_elements_by_css_selector('a[href^=\?curpage]'): url=link.get_attribute('href') print url pageInd='curpage=%d&'%n print pageInd if pageInd in url: print "page: "+url link.click() break def switchNextPage(browser): browser.find_element_by_link_text(u'下一页').click() def do_download(driver,urls,fail_downLoadUrl): for url in urls: print url driver.get(url) paper_title=driver.title print "paper title"+paper_title if u'数据库' in paper_title: continue print "try download :"+paper_title try: driver.find_element_by_xpath("//a[contains(text(),'PDF下载')]").click() print "download success!!!" except Exception as e: try: driver.find_element_by_xpath("//a[contains(text(),'整本下载')]").click() print "download success!!!" except Exception as e: print "download fail!!!" fail_downLoadUrl.append(url) def usage(): print "example : python downloadCNKI.py -k keyword -p 1" if __name__=="__main__": paper_downloadLinks = [] # 论文下载链接 pageNum = 1 # 下载多少页的论文 browser = browser_init(True) file = open("downfile.txt") lineDatas = file.readlines(); for line in lineDatas: keyword=line.strip('\n').decode('gbk') #keyword=u'三角形' #论文搜索的关键字 print u"采集: %s"% keyword searchKey(keyword) switchToFrame(browser) downloadLinks=[] getKeywordDownloadLink(browser,keyword,downloadLinks) paper_downloadLinks.append(''.join(downloadLinks)) file.close() browser.quit() print "采集了%d条数据"% len(paper_downloadLinks) driver=browser_init(False) fail_downLoadUrl=[] #记录下失败的网站 do_download(driver,paper_downloadLinks,fail_downLoadUrl) print fail_downLoadUrl tryNum=0 #尝试N次重新下载没有下载的 while tryNum<5: if len(fail_downLoadUrl) !=0: paper_downloadLinks=fail_downLoadUrl fail_downLoadUrl=[] do_download(driver, paper_downloadLinks, fail_downLoadUrl) print "重新下载 ", print fail_downLoadUrl else: break tryNum+=1 sleep(60) driver.quit()
很好用,让我帮助同学下载知网论文,妈妈再也不要担心我点错了。。。
相关文章推荐
- Python 实现 CNKI批量下载 和FireFox Extension 入门学习笔记
- CNKI知网论文下载工具
- Python代码练习--批量下载
- python写的批量操作远程主机脚本(命令执行,上传、下载文件)
- 登陆新浪微博&批量下载收藏内容[Python脚本实现]
- python写的批量操作远程主机脚本(命令执行,上传、下载文件)
- python 批量下载文件
- python写的批量下载baidu mp3的程序 至少到09-9-18仍然可用
- PYthon 批量下载网页图片
- CSDN博客专栏文章批量下载脚本[python实现]
- python写的批量操作远程主机脚本(命令执行,上传、下载文件)
- Python抓取网页&批量下载文件方法初探(正则表达式+BeautifulSoup)
- 批量下载RFC文档(python实现)
- python写的批量操作远程主机脚本(命令执行,上传、下载文件)
- CSDN博客专栏文章批量下载脚本[python实现]
- Python代码练习--批量下载(改进篇)
- python写的批量操作远程主机脚本(命令执行,上传、下载文件)
- 使用Python批量下载数据
- 利用Python脚本实现-----登陆新浪微博&批量下载收藏内容