您的位置:首页 > 其它

RCurl抓取

2016-04-20 16:10 274 查看


RCurl抓取


junjun


2016年4月20日

参考:http://www.w2bc.com/Article/28489


1、RCurl抓取验证码

RCurl这个程序包提供了由R到libcurl库的接口,从而实现HTTP的一些功能。例如,从 服务器下载文件、保持连接、上传文件、采用二进制格式读取、句柄重定向、密码认证等等。 什么是curl&libcurl – curl:利用URL语法在命令行方式下工作的开源文件传输工具 – curl背后的库就是libcurl 功能 – 获得页面 – 有关认证 – 上传下载 – 信息搜索 – ……
library(RCurl)

## Loading required package: bitops

myHttpheader <- c("Accept"="image/webp,*/*;q=0.8","Accept-Encoding"="gzip,deflate,sdch","Accept-Language"="zh-CN,zh;q=0.8","Cache-Control"="no-cache","Connection"="keep-alive","Pragma"="no-cache","Referer"="http://bbs.shangdu.com/s/regform.htm","User-Agent"="Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.124 Safari/537.36")

png <- getBinaryURL("http://push.shangdu.com/regcode.php?token=prrX73LtmN_63uEcKh0a2Z1opo1ZgdcBJ",httpheader=myHttpheader)

writeBin(png, con ="E:\\新技术\\爬虫\\验证码/123.png")

#shell.exec("http://push.shangdu.com/regcode.php?token=prrX73LtmN_63uEcKh0a2Z1opo1ZgdcBJ")



2、RCurl三大函数


1)getURL函数

#1)判读URL是否存在
url.exists(url = "www.baidu.com")

## [1] TRUE

#收集调试信息
d <- debugGatherer()

#verbose = TRUE 这时候,d$value()值是会叠加的
tmp <- getURL(url="www.baidu.com", debugfunction=d$update, verbose=T)
names(d$value())

## [1] "text"       "headerIn"   "headerOut"  "dataIn"     "dataOut"
## [6] "sslDataIn"  "sslDataOut"

#服务器地址及端口号
cat(d$value()[1])

## Rebuilt URL to: www.baidu.com/
##   Trying 180.149.132.151...
## Connected to www.baidu.com (180.149.132.151) port 80 (#0)
## Connection #0 to host www.baidu.com left intact

#print(d$value()[1])

#服务器返回的头信息
cat(d$value()[2])

## HTTP/1.1 200 OK
## Date: Wed, 20 Apr 2016 08:04:09 GMT
## Content-Type: text/html
## Content-Length: 14613
## Last-Modified: Wed, 03 Sep 2014 02:48:32 GMT
## Connection: Keep-Alive
## Vary: Accept-Encoding
## Set-Cookie: BAIDUID=905A08993D710BA9389B6B3048107981:FG=1; expires=Thu, 31-Dec-37 23:55:55 GMT; max-age=2147483647; path=/; domain=.baidu.com
## Set-Cookie: BIDUPSID=905A08993D710BA9389B6B3048107981; expires=Thu, 31-Dec-37 23:55:55 GMT; max-age=2147483647; path=/; domain=.baidu.com
## Set-Cookie: PSTM=1461139449; expires=Thu, 31-Dec-37 23:55:55 GMT; max-age=2147483647; path=/; domain=.baidu.com
## P3P: CP=" OTI DSP COR IVA OUR IND COM "
## Server: BWS/1.1
## X-UA-Compatible: IE=Edge,chrome=1
## Pragma: no-cache
## Cache-control: no-cache
## Accept-Ranges: bytes
##

#提交给服务器的头信息
cat(d$value()[3])

## GET / HTTP/1.1
## Host: www.baidu.com
## Accept: */*
##

#清除d$value(),清除之后全部为空
d$reset()
d$value()

##       text   headerIn  headerOut     dataIn    dataOut  sslDataIn
##         ""         ""         ""         ""         ""         ""
## sslDataOut
##         ""

#2)查看服务器返回的头信息,列表形式
h <- basicHeaderGatherer()
txt <- getURL(url="http://www.baidu.com", headerfunction=h$update)
names(h$value())

##  [1] "Date"            "Content-Type"    "Content-Length"
##  [4] "Last-Modified"   "Connection"      "Vary"
##  [7] "Set-Cookie"      "Set-Cookie"      "Set-Cookie"
## [10] "P3P"             "Server"          "X-UA-Compatible"
## [13] "Pragma"          "Cache-control"   "Accept-Ranges"
## [16] "status"          "statusMessage"

h$value()      # 所有的内容只是一个字符串

##                                                                                                                                Date
##                                                                                                     "Wed, 20 Apr 2016 08:04:10 GMT"
##                                                                                                                        Content-Type
##                                                                                                                         "text/html"
##                                                                                                                      Content-Length
##                                                                                                                             "14613"
##                                                                                                                       Last-Modified
##                                                                                                     "Wed, 03 Sep 2014 02:48:32 GMT"
##                                                                                                                          Connection
##                                                                                                                        "Keep-Alive"
##                                                                                                                                Vary
##                                                                                                                   "Accept-Encoding"
##                                                                                                                          Set-Cookie
## "BAIDUID=48D8EBCEAFED4F0D6F78FAE967562852:FG=1; expires=Thu, 31-Dec-37 23:55:55 GMT; max-age=2147483647; path=/; domain=.baidu.com"
##                                                                                                                          Set-Cookie
##     "BIDUPSID=48D8EBCEAFED4F0D6F78FAE967562852; expires=Thu, 31-Dec-37 23:55:55 GMT; max-age=2147483647; path=/; domain=.baidu.com"
##                                                                                                                          Set-Cookie
##                               "PSTM=1461139450; expires=Thu, 31-Dec-37 23:55:55 GMT; max-age=2147483647; path=/; domain=.baidu.com"
##                                                                                                                                 P3P
##                                                                                              "CP=\" OTI DSP COR IVA OUR IND COM \""
##                                                                                                                              Server
##                                                                                                                           "BWS/1.1"
##                                                                                                                     X-UA-Compatible
##                                                                                                                  "IE=Edge,chrome=1"
##                                                                                                                              Pragma
##                                                                                                                          "no-cache"
##                                                                                                                       Cache-control
##                                                                                                                          "no-cache"
##                                                                                                                       Accept-Ranges
##                                                                                                                             "bytes"
##                                                                                                                              status
##                                                                                                                               "200"
##                                                                                                                       statusMessage
##                                                                                                                                "OK"

cat(h$value())  # 用cat显示的,会比较好看

## Wed, 20 Apr 2016 08:04:10 GMT text/html 14613 Wed, 03 Sep 2014 02:48:32 GMT Keep-Alive Accept-Encoding BAIDUID=48D8EBCEAFED4F0D6F78FAE967562852:FG=1; expires=Thu, 31-Dec-37 23:55:55 GMT; max-age=2147483647; path=/; domain=.baidu.com BIDUPSID=48D8EBCEAFED4F0D6F78FAE967562852; expires=Thu, 31-Dec-37 23:55:55 GMT; max-age=2147483647; path=/; domain=.baidu.com PSTM=1461139450; expires=Thu, 31-Dec-37 23:55:55 GMT; max-age=2147483647; path=/; domain=.baidu.com CP=" OTI DSP COR IVA OUR IND COM " BWS/1.1 IE=Edge,chrome=1 no-cache no-cache bytes 200 OK

#查看url请求的访问信息
curl <- getCurlHandle()
txt <- getURL(url="http://www.baidu.com", curl=curl)
names(getCurlInfo(curl))

##  [1] "effective.url"           "response.code"
##  [3] "total.time"              "namelookup.time"
##  [5] "connect.time"            "pretransfer.time"
##  [7] "size.upload"             "size.download"
##  [9] "speed.download"          "speed.upload"
## [11] "header.size"             "request.size"
## [13] "ssl.verifyresult"        "filetime"
## [15] "content.length.download" "content.length.upload"
## [17] "starttransfer.time"      "content.type"
## [19] "redirect.time"           "redirect.count"
## [21] "private"                 "http.connectcode"
## [23] "httpauth.avail"          "proxyauth.avail"
## [25] "os.errno"                "num.connects"
## [27] "ssl.engines"             "cookielist"
## [29] "lastsocket"              "ftp.entry.path"
## [31] "redirect.url"            "primary.ip"
## [33] "appconnect.time"         "certinfo"
## [35] "condition.unmet"

getCurlInfo(curl)$response.code

## [1] 200

getCurlInfo(curl=curl)

## $effective.url
## [1] "http://www.baidu.com/"
##
## $response.code
## [1] 200
##
## $total.time
## [1] 0.063
##
## $namelookup.time
## [1] 0
##
## $connect.time
## [1] 0.016
##
## $pretransfer.time
## [1] 0.016
##
## $size.upload
## [1] 0
##
## $size.download
## [1] 14613
##
## $speed.download
## [1] 231952
##
## $speed.upload
## [1] 0
##
## $header.size
## [1] 750
##
## $request.size
## [1] 52
##
## $ssl.verifyresult
## [1] 0
##
## $filetime
## [1] -1
##
## $content.length.download
## [1] 14613
##
## $content.length.upload
## [1] -1
##
## $starttransfer.time
## [1] 0.032
##
## $content.type
## [1] "text/html"
##
## $redirect.time
## [1] 0
##
## $redirect.count
## [1] 0
##
## $private
## NULL
##
## $http.connectcode
## [1] 0
##
## $httpauth.avail
## [1] 0
##
## $proxyauth.avail
## [1] 0
##
## $os.errno
## [1] 0
##
## $num.connects
## [1] 1
##
## $ssl.engines
## [1] "dynamic"
##
## $cookielist
## character(0)
##
## $lastsocket
## [1] 652
##
## $ftp.entry.path
## NULL
##
## $redirect.url
## NULL
##
## $primary.ip
## [1] "180.149.132.151"
##
## $appconnect.time
## [1] 0
##
## $certinfo
## list()
##
## $condition.unmet
## [1] 0

#3)设置自己的header:设置自己的header,把系统设置成ihpone的系统Mac OS
myheader <- c(
"User-Agent"="Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_0_1 like Mac OS X; ja-jp) AppleWebKit/532.9 (KHTML, like Gecko) Version/4.0.5 Mobile/8A306 Safari/6531.22.7",
"Accept"="text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Language"="en-us",
"Connection"="keep-alive",
"Accept-Charset"="GB2312,utf-8;q=0.7,*;q=0.7"
)
d <- debugGatherer()
tmp <- getURL(url = "http://www.baidu.com", httpheader=myheader, debugfunction=d$update, verbose=T)
cat(d$value()[3])

## GET / HTTP/1.1
## Host: www.baidu.com
## User-Agent: Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_0_1 like Mac OS X; ja-jp) AppleWebKit/532.9 (KHTML, like Gecko) Version/4.0.5 Mobile/8A306 Safari/6531.22.7
## Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8
## Accept-Language: en-us
## Connection: keep-alive
## Accept-Charset: GB2312,utf-8;q=0.7,*;q=0.7
##

# 设置其他参数,共172个参数
listCurlOptions()

##   [1] "address.scope"              "append"
##   [3] "autoreferer"                "buffersize"
##   [5] "cainfo"                     "capath"
##   [7] "certinfo"                   "closepolicy"
##   [9] "connect.only"               "connecttimeout"
##  [11] "connecttimeout.ms"          "conv.from.network.function"
##  [13] "conv.from.utf8.function"    "conv.to.network.function"
##  [15] "cookie"                     "cookiefile"
##  [17] "cookiejar"                  "cookielist"
##  [19] "cookiesession"              "copypostfields"
##  [21] "crlf"                       "crlfile"
##  [23] "customrequest"              "debugdata"
##  [25] "debugfunction"              "dirlistonly"
##  [27] "dns.cache.timeout"          "dns.use.global.cache"
##  [29] "egdsocket"                  "encoding"
##  [31] "errorbuffer"                "failonerror"
##  [33] "file"                       "filetime"
##  [35] "followlocation"             "forbid.reuse"
##  [37] "fresh.connect"              "ftp.account"
##  [39] "ftp.alternative.to.user"    "ftp.create.missing.dirs"
##  [41] "ftp.filemethod"             "ftp.response.timeout"
##  [43] "ftp.skip.pasv.ip"           "ftp.ssl"
##  [45] "ftp.ssl.ccc"                "ftp.use.eprt"
##  [47] "ftp.use.epsv"               "ftpappend"
##  [49] "ftplistonly"                "ftpport"
##  [51] "ftpsslauth"                 "header"
##  [53] "headerfunction"             "http.content.decoding"
##  [55] "http.transfer.decoding"     "http.version"
##  [57] "http200aliases"             "httpauth"
##  [59] "httpget"                    "httpheader"
##  [61] "httppost"                   "httpproxytunnel"
##  [63] "ignore.content.length"      "infile"
##  [65] "infilesize"                 "infilesize.large"
##  [67] "interface"                  "ioctldata"
##  [69] "ioctlfunction"              "ipresolve"
##  [71] "issuercert"                 "keypasswd"
##  [73] "krb4level"                  "krblevel"
##  [75] "localport"                  "localportrange"
##  [77] "low.speed.limit"            "low.speed.time"
##  [79] "max.recv.speed.large"       "max.send.speed.large"
##  [81] "maxconnects"                "maxfilesize"
##  [83] "maxfilesize.large"          "maxredirs"
##  [85] "netrc"                      "netrc.file"
##  [87] "new.directory.perms"        "new.file.perms"
##  [89] "nobody"                     "noprogress"
##  [91] "noproxy"                    "nosignal"
##  [93] "opensocketdata"             "opensocketfunction"
##  [95] "password"                   "port"
##  [97] "post"                       "post301"
##  [99] "postfields"                 "postfieldsize"
## [101] "postfieldsize.large"        "postquote"
## [103] "postredir"                  "prequote"
## [105] "private"                    "progressdata"
## [107] "progressfunction"           "protocols"
## [109] "proxy"                      "proxy.transfer.mode"
## [111] "proxyauth"                  "proxypassword"
## [113] "proxyport"                  "proxytype"
## [115] "proxyusername"              "proxyuserpwd"
## [117] "put"                        "quote"
## [119] "random.file"                "range"
## [121] "readfunction"               "redir.protocols"
## [123] "referer"                    "resume.from"
## [125] "resume.from.large"          "seekdata"
## [127] "seekfunction"               "share"
## [129] "sockoptdata"                "sockoptfunction"
## [131] "socks5.gssapi.nec"          "socks5.gssapi.service"
## [133] "ssh.auth.types"             "ssh.host.public.key.md5"
## [135] "ssh.private.keyfile"        "ssh.public.keyfile"
## [137] "ssl.cipher.list"            "ssl.ctx.data"
## [139] "ssl.ctx.function"           "ssl.sessionid.cache"
## [141] "ssl.verifyhost"             "ssl.verifypeer"
## [143] "sslcert"                    "sslcertpasswd"
## [145] "sslcerttype"                "sslengine"
## [147] "sslengine.default"          "sslkey"
## [149] "sslkeypasswd"               "sslkeytype"
## [151] "sslversion"                 "stderr"
## [153] "tcp.nodelay"                "telnetoptions"
## [155] "tftp.blksize"               "timecondition"
## [157] "timeout"                    "timeout.ms"
## [159] "timevalue"                  "transfertext"
## [161] "unrestricted.auth"          "upload"
## [163] "url"                        "use.ssl"
## [165] "useragent"                  "username"
## [167] "userpwd"                    "verbose"
## [169] "writedata"                  "writefunction"
## [171] "writeheader"                "writeinfo"



2)getForm()函数

# wd=rcurl 这里就是关键字为rcurl
url <- c("http://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&rsv_idx=2&ch=&tn=SE_hldp02870_0v135xhf&bar=&wd=rcurl&rsv_spt=1&rsv_pq=a3ed162a0088df8f&rsv_t=43d18gWNyd6HWpqDiKov7Dm548s4HY4cgcJlXc8ujpzRW9Okec2aOb5screzftZo5DJ60Cp7aILvRK2Q&rsv_enter=1&inputT=2119")

# 查看url的结构和值
getFormParams(query=url)

##                                                                                 ie
##                                                                            "utf-8"
##                                                                                  f
##                                                                                "8"
##                                                                             rsv_bp
##                                                                                "1"
##                                                                            rsv_idx
##                                                                                "2"
##                                                                                 ch
##                                                                                 ""
##                                                                                 tn
##                                                            "SE_hldp02870_0v135xhf"
##                                                                                bar
##                                                                                 ""
##                                                                                 wd
##                                                                            "rcurl"
##                                                                            rsv_spt
##                                                                                "1"
##                                                                             rsv_pq
##                                                                 "a3ed162a0088df8f"
##                                                                              rsv_t
## "43d18gWNyd6HWpqDiKov7Dm548s4HY4cgcJlXc8ujpzRW9Okec2aOb5screzftZo5DJ60Cp7aILvRK2Q"
##                                                                          rsv_enter
##                                                                                "1"
##                                                                             inputT
##                                                                             "2119"

names(getFormParams(query=url))

##  [1] "ie"        "f"         "rsv_bp"    "rsv_idx"   "ch"
##  [6] "tn"        "bar"       "wd"        "rsv_spt"   "rsv_pq"
## [11] "rsv_t"     "rsv_enter" "inputT"

tmp <- getForm(uri="http://www.baidu.com/s", ie="utf-8", f="8", rsv_bp="1", rsv_idx="2", ch="", tn="SE_hldp02870_0v135xhf", bar="", wd="rcurl", rsv_spt="1", rsv_pq="a3ed162a0088df8f", rsv_t="43d18gWNyd6HWpqDiKov7Dm548s4HY4cgcJlXc8ujpzRW9Okec2aOb5screzftZo5DJ60Cp7aILvRK2Q", rsv_enter="1", inputT="2119")
# 注意:getForm函数不稳定(原因还不知道),有时候运行2到3次,才能真正找到页面



3)getBinaryURL()下载一个文件和批量下载文件

#1)下载一个文件
url <- "http://rfunction.com/code/1201/120103.R"
tmp <- getBinaryURL(url)
note <- file("E:\\新技术\\爬虫\\验证码/120103.R", open = "wb")
writeBin(tmp, note)
close(note)

#2)批量下载文件
url <- "http://rfunction.com/code/1202/"
tmp <- RCurl::getURL(url, httpheader = myheader) # 获取网页
tmp_files <- strsplit(x=tmp, split="<li><a href=\"")[[1]]
tmp_files1 <- strsplit(tmp_files, split="\"")
tmp_files2 <- lapply(X=tmp_files1, function(file) {file[1]})
files <- unlist(tmp_files2)
files <- files[c(-1, -2)]
baseURL <- "http://rfunction.com/code/1202/"
for(i in 1:length(files)){
fullURL <- paste(baseURL, files[i], sep = "")
tmp <- getBinaryURL(fullURL)
note <- file(paste("E:\\新技术\\爬虫\\验证码/1202-", files[i], sep = ""), open = "wb")
writeBin(tmp, note)
close(note)
Sys.sleep(0.002) # 休眠0.002秒
}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: