
Implementing a Simple Crawler in Erlang

2012-10-31 17:28
%% spider.erl -- fetch a page over raw TCP and pull the links out of it.
-module(spider).
-compile(export_all).
-import(lists, [reverse/1, reverse/2, map/2]).

%% Fetch "/" from Host over plain HTTP/1.0 and return the raw response
%% (status line and headers included) as one binary. A Host header is
%% included; many servers refuse requests without one.
nano_get_url(Host) ->
    {ok, Socket} = gen_tcp:connect(Host, 80, [binary, {packet, 0}]),
    ok = gen_tcp:send(Socket, "GET / HTTP/1.0\r\nHost: " ++ Host ++ "\r\n\r\n"),
    receive_data(Socket, []).

%% Accumulate TCP packets until the server closes the connection, then
%% return everything received as a single binary.
receive_data(Socket, SoFar) ->
    receive
        {tcp, Socket, Bin} ->
            receive_data(Socket, [Bin | SoFar]);
        {tcp_closed, Socket} ->
            list_to_binary(lists:reverse(SoFar))
    end.

%% Render a list of links as HTML and write it to File.
urls2htmlFile(Urls, File) ->
    file:write_file(File, urls2html(Urls)).

%% Extract "<a href" tags from a response binary.
bin2urls(Bin) -> gather_urls(binary_to_list(Bin), []).

%% Like bin2urls/1, but also picks up "<link href" tags.
bin2urls2(Bin) -> gather_urls2(binary_to_list(Bin), []).

urls2html(Urls) -> [h1("Urls"), make_list(Urls)].

h1(Title) -> ["<h1>", Title, "</h1>\n"].

make_list(L) ->
    ["<ul>\n",
     map(fun(I) -> ["<li>", I, "</li>\n"] end, L),
     "</ul>\n"].

gather_urls("<a href" ++ T,L) ->
{Url,T1}=collect_url_body(T,reverse("<a href")),

%%      case spider1:gather_urls(Url, []) of
%%                              []->
%%                                  nothing;
%%                              [SubUrl]->
%%                                  io:format("SubUrl:~p~n",[SubUrl]),
%%                                  make2(SubUrl)
%%                          end,
gather_urls(T1,[Url|L]);

gather_urls([_|T],L) ->
gather_urls(T,L);

gather_urls([],L) ->
L.

gather_urls2("<a href" ++ T,L) ->
{Url,T1}=collect_url_body(T,reverse("<a href")),
gather_urls2(T1,[Url|L]);

gather_urls2("<link href" ++ T,L) ->
{Url,T1}=collect_url_body(T,reverse("<link href")),
gather_urls2(T1,[Url|L]);

gather_urls2([_|T],L) ->
gather_urls2(T,L);
gather_urls2([],L) ->
L.

collect_url_body("</a>" ++ T,L) -> {reverse(L,"</a>"),T};
collect_url_body(">" ++ T,L) -> {reverse(L,">"),T};
collect_url_body([H|T],L)       -> collect_url_body(T,[H|L]);
collect_url_body([],_)          -> {[],[]}.

%% Entry point: fetch www.baidu.com, write its links to http/1.html,
%% then fetch every host found in those links and write each host's
%% links to its own file. The "http" directory must already exist.
make() ->
    B = nano_get_url("www.baidu.com"),
    L = bin2urls(B),
    MakeSubFun = fun(Url) ->
        io:format("Url1:~p~n", [Url]),
        %% spider1:gather_urls/2 pulls the host name out of an
        %% "http://..." link; relative links yield [] and are skipped.
        case spider1:gather_urls(Url, []) of
            [] ->
                nothing;
            [SubUrl | _] ->  % use the first host found in the tag
                io:format("SubUrl:~p~n", [SubUrl]),
                make2(SubUrl)
        end
    end,
    lists:foreach(MakeSubFun, L),
    urls2htmlFile(L, "http/1.html").

%% Fetch one sub-host and write its link list to http/<host>.html.
make2(SubUrl) ->
    B = nano_get_url(SubUrl),
    L = bin2urls2(B),
    urls2htmlFile(L, "http/" ++ SubUrl ++ ".html").


%% spider1.erl -- pulls the host part out of absolute "http://" links.
-module(spider1).
-compile(export_all).
-import(lists, [reverse/1, reverse/2]).

%% Each time "http://" occurs in the input, collect the host name that
%% follows it and continue scanning.
gather_urls("http://" ++ T, L) ->
    {Url, T1} = collect_url_body(T, reverse("")),
    gather_urls(T1, [Url | L]);
gather_urls([_ | T], L) ->
    gather_urls(T, L);
gather_urls([], L) ->
    L.

%% The host ends at the first "/" or double quote; everything collected
%% up to that point is returned as the host name.
collect_url_body("/" ++ T, W)  -> {reverse(W, ""), T};
collect_url_body("\"" ++ T, W) -> {reverse(W, ""), T};
collect_url_body([Q | T1], W)  -> collect_url_body(T1, [Q | W]);
collect_url_body([], _)        -> {[], []}.
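For example, feeding it one of the anchor fragments that spider collects returns just the host (www.example.com is a made-up URL):

1> spider1:gather_urls("<a href=\"http://www.example.com/index.html\">x</a>", []).
["www.example.com"]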


[Screenshot: output of a run]

The code above crawls a page's links and their second-level links: it only picks up whatever follows an <a href ...> tag, so it can fetch links and nothing else. There are quite a few articles on Erlang crawlers now, some of which fetch images and more. The code is a bit messy and unoptimized; treat it as a reference only.
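Fetching image URLs would only take the same pattern-matching trick applied to "<img src" tags. A minimal sketch, assuming it is added to spider.erl (gather_imgs/2 is a hypothetical name, not part of the original, and it reuses collect_url_body/2; like the original, it collects the raw tag text up to the closing ">", not a cleaned URL):

%% Hypothetical addition to spider.erl: collect "<img src" tags the
%% same way gather_urls2/2 collects links.
gather_imgs("<img src" ++ T, L) ->
    {Url, T1} = collect_url_body(T, reverse("<img src")),
    gather_imgs(T1, [Url | L]);
gather_imgs([_ | T], L) ->
    gather_imgs(T, L);
gather_imgs([], L) ->
    L.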