百度收录信息抓取
2016-01-17 11:51
399 查看
#!/usr/bin/perl

=pod

=head1 NAME

seo_scraper - collect SEO statistics for a list of hosts into a CSV file

=head1 DESCRIPTION

Reads host names (one per line, e.g. C<www.example.com>) from
C<D:\seo_web\url.txt>.  For every host it downloads the report page at
C<http://seo.chinaz.com/?host=...> and the Baidu C<site:> result page,
and extracts fields with regular expressions: domain, Baidu weight,
outbound links, internal links, domain IP, number of sites on the same
IP, domain age and the Baidu index count.

One comma-separated row per host is written to
C<D:\seo_web\YYYY-M-D.csv.txt> (labels and values) and echoed to STDOUT
(values only).  Pages are fetched with LWP::Simple.

=cut

use strict;
use warnings;
use utf8;
use Encode;
use Encode::CN;
use File::Find;
use LWP::Simple;
use POSIX qw(strftime);

binmode(STDOUT, ":utf8");

# Write one field: UTF-8 encoded text to the CSV handle and, when given,
# a (possibly different) text to the console.
sub emit {
    my ($fh, $file_text, $console_text) = @_;
    print {$fh} encode("utf8", $file_text);
    print $console_text if defined $console_text;
    return;
}

# Date stamp used in the output file name (month/day are not zero padded).
my ($day, $mon, $year) = (localtime(time()))[3, 4, 5];
$year += 1900;
$mon  += 1;
my $date = "$year-$mon-$day";

my $dir      = "D:\\seo_web";
my $url_file = "$dir\\url.txt";
my $out_file = "$dir\\$date.csv.txt";

mkdir($dir, 0775) unless -e $dir;

if (!-e $url_file) {
    print "Please create 'D:\\seo_web\\url.txt' file\n and write info to 'D:\\seo_web\\url.txt'\n example:www.example.com in url.txt\n";
    # Give the user time to read the hint before the open below fails.
    sleep(15);
    print "\n";
}

# Three-argument open with lexical handles: no mode injection via the name.
open(my $out, '>', $out_file)
    or die "Cant write 'D:\\seo_web\\$date.csv.txt':$!\n";
open(my $in, '<', $url_file)
    or die "Can't open 'D:\\seo_web\\url.txt':$!\n";

my $count = 0;

HOST:
while (my $val = <$in>) {
    # Strip the line terminator and surrounding blanks; otherwise the
    # newline ends up inside both request URLs.
    $val =~ s/\r?\n\z//;
    $val =~ s/\A\s+|\s+\z//g;
    next HOST if $val eq '';

    my $url     = "http://seo.chinaz.com/?host=$val/";
    my $content = get(encode("utf-8", $url));
    die "Couldn't get $url" unless defined $content;

    # Row number.
    $count++;
    print {$out} "$count,";
    print "$count,";

    # Domain name, taken from the search box markup.
    if ($content =~ /(请输入网站地址.*\/")/) {
        my $chunk = $1;
        if ($chunk =~ /(\w+\.\w+\.\w+)/) {
            emit($out, "域名,$1,", "$1,");
        }
    }

    # "Baidu weight" label (file only); the value follows from the icon name.
    if ($content =~ /(?:<span\s+style=\"\s+margin-left\:\d+px\;\">)(百度权重)/) {
        emit($out, "$1,");
    }
    if ($content =~ /(?:images\/baiduapp\/)(\d+)(?:\.gif)/) {
        emit($out, "$1,", "$1,");
    }

    # Outbound links; "none" when the matched fragment holds no number.
    if ($content =~ /(出站链接.*<\/a>)/) {
        my $chunk = "$1\n";
        if ($chunk =~ /(\d+)/) {
            emit($out, "出站链接,$1,", "$1,");
        }
        else {
            emit($out, "出站链接,none,", "none,");
        }
    }

    # Internal links.  Captures are read-only, so the value is copied
    # before a false value (e.g. "0") is replaced by "none".
    if ($content =~ /(站内链接.*<\/span>)/) {
        my $chunk = $1;
        if ($chunk =~ /(站内链接)/) {
            emit($out, "$1,");
            if ($chunk =~ /(\d+)/) {
                my $links = $1 || "none";
                emit($out, "$links,", "$links,");
            }
        }
    }

    # Domain IP.
    if ($content =~ /(域名IP.* )/) {
        my $chunk = $1;
        if ($chunk =~ /(\d+\.\d+\.\d+\.\d+)/) {
            my $ip = $1 || "none";
            emit($out, "域名IP,$ip,", "$ip,");
        }
    }

    # Number of sites sharing the IP.
    if ($content =~ /(同IP网站.*个)/) {
        my $chunk = $1;
        if ($chunk =~ /(\d+)/) {
            emit($out, "同IP网站,$1,", "$1,");
        }
    }

    # Domain age.  The original used "= ~" (assign the bitwise negation of
    # a match on $_) where the binding operator "=~" was meant.
    if ($content =~ /(域名年龄.*<\/font>)/) {
        my $chunk = $1;
        if ($chunk =~ /(\d+月\d+天)/) {
            my $age = $1 || "none";
            # The console gets an English rendering of the units.
            my $shown = $age;
            $shown =~ s/月/ Months /;
            $shown =~ s/天/ Days/;
            emit($out, "域名年龄,$age,", "$shown,");
        }
    }

    # Baidu index count from the site: query; skipped if the fetch failed.
    my $baidu_page = get("http://www.baidu.com/s?wd=site%3A$val");
    if (defined $baidu_page && $baidu_page =~ /(该网站共有.*<\/b>)/) {
        my $chunk = $1;
        if ($chunk =~ /(\d+)(?:<\/b>)/) {
            my $indexed = $1 || "none";
            emit($out, "索引量-百度收录,$indexed", "$indexed");
        }
    }

    print {$out} "\n";
    print "\n";
}

close($in);
# Buffered write errors surface at close time.
close($out) or warn "Can't close '$out_file':$!\n";

print "\n";
print " File path is:'D:\\seo_web\\$date.csv.txt'\n";
print " Author\@Laomeng\nEmail:18682093512\@163.com\n";
# Presumably keeps the console window open so the summary can be read.
sleep(60);
相关文章推荐
- 房产界已上市和IPO路上的难兄难弟:房天下与房多多
- 利用百度地图提供的API做的应用
- 搜狗百度360市值齐跌:搜索引擎们陷入集体焦虑?
- 百度20年:搜索帝国的崛起、式微与重生
- 小白观察:Google 开始清除百度旗下公司开发的 46 款应用程序
- 百度全面恢复网站权重,流量暴涨中
- 百度工程师讲PHP函数的实现原理及性能分析(一)
- C#使用ImitateLogin模拟登录百度
- 百度 popup.js 完美修正版非常的不错 脚本之家推荐
- javascript实现类似百度分享功能的方法
- 百度空间的popup效果分析第1/3页
- jQuery实现的仿百度分页足迹效果代码
- 模仿百度三维地图的js数据分享
- 教你如何自定义百度分享插件以及bshare分享插件的分享按钮
- 百度最近不收站分析,为什么不收录你的站
- 百度手写板代码JavaScript远程调用的实现(鼠标输入法)
- php使用curl检测网页是否被百度收录的示例分享
- asp.net下百度的编码和解码
- 做网站要主要的百度分词技术
- 百度工程师讲PHP函数的实现原理及性能分析(三)