您的位置:首页 > 编程语言 > Go语言

curl抓取Google论坛搜索结果

2011-03-15 16:06 375 查看
1.Google就是厉害,做了防抓处理

2.
// First attempt: read one Google forum-search result page with a plain
// file_get_contents() call and dump whatever comes back.
$target = 'http://www.google.com.hk/search?q=%E5%9B%BD%E5%AE%B6%E7%94%B5%E7%BD%91&hl=zh-CN&newwindow=1&safe=strict&biw=1419&bih=715&prmdo=1&tbs=frm:1&ei=1attTe7qO4jKvQOszpzbBA&start=10&sa=N';
// Alternative target left disabled in the original: http://www.baidu.com
echo '##1';
$body = file_get_contents($target);
var_dump($body);


直接使用file_get_contents 报错:

file_get_contents(http://www.google.com.hk/search?q=%E5%9B%BD%E5%AE%B6%E7%94%B5%E7%BD%91&hl=zh-CN&newwindow=1&safe=strict&biw=1419&bih=715&prmdo=1&tbs=frm:1&ei=1attTe7qO4jKvQOszpzbBA&start=10&sa=N)
[function.file-get-contents]: failed to open stream: Redirection limit
reached, aborting

放弃使用file_get_contents();

3.

<?php
header("Content-type:text/html;charset=gbk");

// Counters: $f = pages that produced no parsed entries, $s = pages that did.
$f = 0;
$s = 0;

// Request 100 consecutive result pages; the `start` query parameter advances
// by 10 for each page.
for ($i = 0; $i < 100; $i++) {
    // Fixed: the original wrote "/r/n", which prints those four literal
    // characters instead of a CRLF line break.
    echo "\r\n page {$i} --";
    $start = $i * 10;

    $url = "http://www.google.com.hk/search?q=%E5%9B%BD%E5%AE%B6%E7%94%B5%E7%BD%91&hl=zh-CN&newwindow=1&safe=strict&biw=1419&bih=715&prmdo=1&tbs=frm:1&ei=1attTe7qO4jKvQOszpzbBA&start={$start}&sa=N";

    $lists = get_google_bbs($url);
    print_r($lists);

    if (empty($lists)) {
        // Fixed: the original incremented an undefined $j here, so the
        // failure total printed below was always 0.
        $f++;
    } else {
        $s++;
    }
}

echo "\r\n  faild::{$f}---success::{$s}";

/**
 * Fetches one Google forum-search result page and extracts its entries.
 *
 * Two HTTP requests are made with cURL: the first loads the Google home page
 * so that its cookies are stored in cookie.txt next to this script; the second
 * loads $url, sending those cookies and the home page as the referer.
 *
 * The response is converted from UTF-8 to GBK before parsing.
 *
 * @param string $url Full search-result URL to fetch.
 * @return array List of entries, each an array with the keys 'title', 'url',
 *               'reply' and 'description'. An empty array is returned when the
 *               page cannot be fetched or converted, or when nothing matches.
 */
function get_google_bbs($url)
{
    $surl = 'http://www.google.com.hk/';
    $cookie_file = dirname(__FILE__) . "/cookie.txt";

    // Options shared by both requests. They are merged AFTER the per-request
    // options because curl_setopt_array() stops at the first option it cannot
    // set, and the URL must not be skipped in that case.
    $common = array(
        CURLOPT_USERAGENT      => 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
        CURLOPT_HEADER         => 0,
        CURLOPT_RETURNTRANSFER => 1,
        CURLOPT_MAXREDIRS      => 1,
        CURLOPT_FOLLOWLOCATION => 1,
        CURLOPT_AUTOREFERER    => 1,
    );

    // Request 1: visit the home page only to collect cookies.
    $ch = curl_init();
    curl_setopt_array($ch, array(
        CURLOPT_URL       => $surl,
        CURLOPT_COOKIEJAR => $cookie_file,
    ) + $common);
    curl_exec($ch);
    curl_close($ch);
    // On PHP >= 8 curl_close() does nothing; the cookie jar is written when the
    // handle is destroyed, so drop it explicitly before the next request.
    unset($ch);

    // Request 2: the actual search page, replaying the stored cookies.
    $ch = curl_init();
    curl_setopt_array($ch, array(
        CURLOPT_URL        => $url,
        CURLOPT_REFERER    => $surl,
        CURLOPT_COOKIEFILE => $cookie_file,
    ) + $common);
    $contents = curl_exec($ch);
    curl_close($ch);
    unset($ch);

    // curl_exec() returns false on failure; nothing to parse in that case.
    if (!is_string($contents) || $contents === '') {
        return array();
    }

    // //IGNORE drops characters GBK cannot represent instead of failing the
    // whole conversion; a false result is still handled below.
    $html = iconv('UTF-8', 'GBK//IGNORE', $contents);
    if ($html === false) {
        return array();
    }

    // The result list is the first <ol> element of the page.
    if (!preg_match("|<ol>(.*?)</ol>|ims", $html, $out)) {
        return array();
    }

    // Split the list into individual result items.
    if (!preg_match_all('|<li class=g>(.*?)</span></span></div></div>|ims', $out[1], $ol_out)) {
        return array();
    }

    // Capture groups: 1 = link target, 2 = anchor text, 3 = snippet block.
    // NOTE(review): the published pattern also contained a `mce_href="(.*?)"`
    // group. That attribute appears to be a rich-text-editor artifact; it
    // shifted the groups so that 2 and 3 no longer held the anchor text and
    // snippet the code below reads. It has been removed - confirm against the
    // author's original script if available.
    $entry_pattern = '|<h3 class="r"><a href="(.*?)" target=_blank class=l .*?>(.*?)</a></h3><button class=vspib></button></span><div class="s">(.*?)<br><span class=f>|ims';

    $all = array();
    foreach ($ol_out[1] as $li) {
        // Items that do not look like a forum result are skipped; the original
        // appended a stale or undefined entry for them.
        if (!preg_match($entry_pattern, $li, $o_out)) {
            continue;
        }

        // Reset per item so values never leak from the previous result.
        $title = '';
        $reply = '';
        $description = '';

        if (!empty($o_out[2])) {
            // Keep only the part of the anchor text before the first '-'.
            list($title) = explode('-', $o_out[2]);
        }

        if (!empty($o_out[3])) {
            // NOTE(review): $html is GBK at this point, so the Chinese literal
            // in this pattern only matches if this file is saved as GBK -
            // confirm the file encoding.
            if (preg_match('|<div class="f">(.*?)个帖子.*?</div>|ims', $o_out[3], $reply_out)) {
                $reply = $reply_out[1];
                // Strip the post-count <div> so only the summary text remains.
                $description = preg_replace('|(<div class="f">.*?</div>)|ims', '', $o_out[3]);
            } else {
                $description = $o_out[3];
            }
        }

        $all[] = array(
            'title'       => $title,
            'url'         => $o_out[1],
            'reply'       => $reply,
            'description' => $description,
        );
    }

    return $all;
}

?>
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: