您的位置:首页 > 其它

Download arxiv paper

2017-02-05 15:24 295 查看

1. Code

#!/usr/bin/env python
# -*- coding: utf-8 -*-

'''
###########
Usage:
python download.py site.txt(containing https://...) 
'''

from selenium import webdriver
import time
from pymouse import PyMouse

# Module-level mouse controller shared by all calls; download() uses it to
# click at a computed (x, y) position after triggering the PDF download.
m = PyMouse()

def pause(length=1):
    """Sleep for ``length`` seconds (default: 1).

    Used to give the browser time to finish an action before the next one.
    """
    time.sleep(length)

def download(url, dst_type="Computer Vision", loading_time=60):
    """Download the PDFs of the matching papers listed on one result page.

    Opens ``url`` in a new Firefox window and walks the page's ``<dt>`` /
    ``<dd>`` element pairs.  For every pair whose ``primary-subject`` text
    contains ``dst_type`` it follows the ``pdf`` link, waits, clicks the
    element with id ``download`` and finally clicks a fixed screen position
    with the mouse.

    Args:
        url: Address of the listing page to process.
        dst_type: Text that must occur in an entry's primary subject for the
            entry to be downloaded.
        loading_time: Seconds to wait after opening a PDF before clicking
            the ``download`` element.

    Raises:
        RuntimeError: If the page has different numbers of ``<dt>`` and
            ``<dd>`` elements, so entries cannot be paired by index.
    """
    # Local import: keeps this function self-contained with respect to the
    # file's import section.
    from selenium.common.exceptions import WebDriverException

    # Offset (in pixels) from the window centre to the point that is clicked
    # after the download is triggered.
    # NOTE(review): presumably the confirm button of the save dialog on the
    # author's screen -- adjust for your own setup.
    bias = (254, 171)
    screen_is_vertical = False

    b = webdriver.Firefox()
    try:
        b.maximize_window()
        pause(1)

        b.get(url)
        pause(2)

        dt = b.find_elements_by_tag_name('dt')
        dd = b.find_elements_by_tag_name('dd')
        if len(dt) != len(dd):
            raise RuntimeError(
                "expected matching <dt>/<dd> counts, got %d and %d"
                % (len(dt), len(dd)))

        window_size = b.get_window_size()
        print(window_size)
        if screen_is_vertical:
            print("No implement when screen is vertical")
            return
        # Floor division keeps the click coordinates integral on Python 3 too.
        pos = [window_size['width'] // 2 + bias[0],
               window_size['height'] // 2 + bias[1]]

        # NOTE(review): the first four <dt>/<dd> pairs are skipped, exactly as
        # in the original; presumably they are not paper entries -- confirm.
        for i in range(4, len(dt)):
            # Skip entries outside the requested subject.
            subject = dd[i].find_element_by_class_name('primary-subject').text
            if dst_type not in subject:
                continue

            # Skip entries that have no usable 'pdf' link.
            try:
                dt[i].find_element_by_link_text('pdf').click()
            except WebDriverException:
                continue

            pause(loading_time)

            b.find_element_by_id('download').click()
            pause(2)

            m.click(pos[0], pos[1], 1, 1)
            pause(1)

            b.back()
            pause(1)
            # Elements located before the navigation belong to the previous
            # page load, so look them up again after going back.
            dt = b.find_elements_by_tag_name('dt')
            dd = b.find_elements_by_tag_name('dd')
    finally:
        # Always end the browser session, including on errors and on the
        # early return above.
        b.quit()

def main():
    """Process every active URL listed in the file named on the command line.

    Expects exactly one argument: the path of a text file with one URL per
    line.  Blank lines and lines starting with ``#`` are skipped; every other
    line is passed to ``download``.  With any other argument count the module
    docstring is printed as usage help and nothing is downloaded.
    """
    import sys

    if len(sys.argv) != 2:
        print(__doc__)
        return

    # Read the whole list first so the file is closed before the (slow)
    # browser automation starts.
    with open(sys.argv[1], 'r') as fid:
        urls = [line.strip() for line in fid]

    for url in urls:
        # An empty line carries no URL; a leading '#' disables an entry.
        if not url or url.startswith('#'):
            continue
        download(url)

# Start processing only when the file is executed as a script.
if __name__ == "__main__":
    main()


2. Usage

python download.py site.txt


site.txt (example)

https://arxiv.org/find/all/1/ti:+AND+object+detection/0/1/0/all/0/1
https://arxiv.org/find/all/1/ti:+AND+object+detection/0/1/0/all/0/1?skip=25&query_id=a6b6ed358647ff57
#https://arxiv.org/find/all/1/ti:+AND+object+detection/0/1/0/all/0/1?skip=50&query_id=a6b6ed358647ff57
https://arxiv.org/find/all/1/ti:+AND+object+detection/0/1/0/all/0/1?skip=75&query_id=a6b6ed358647ff57
Put one URL on each line. Prefix a line with # to skip that URL.

Refer to this post for instructions on installing the requirements.
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: