Java初学者实践:httpclient+HTMLParser 的应用
2010-10-07 14:18
459 查看
使用httpclient调用google在线翻译以及爱词霸在线翻译,并使用HTMLParser对返回的结果进行处理,以此实现一个简单的在线翻译小程序,并利用多线程编程合并两个翻译工具。具体实现如下:
1.multithreadDict.java
注:本来我想先显示google翻译的结果,后显示爱词霸翻译的结果,但即使使用sleep函数也很难保证这一点。
Java线程调度是Java多线程的核心,只有良好的调度,才能充分发挥系统的性能,提高程序的执行效率。
这里要明确的一点,不管程序员怎么编写调度,只能最大限度的影响线程执行的次序,而不能做到精准控制。
线程休眠是使线程让出CPU的最简单的做法之一,线程休眠时候,会将CPU资源交给其他线程,以便能轮换执行,当休眠一定时间后,线程会苏醒,进入准备状态等待执行。
线程休眠的方法是Thread.sleep(long millis) 和Thread.sleep(long millis,int nanos) ,均为静态方法。简单说,哪个线程调用sleep,就休眠哪个线程。
2.GoogleDict.java
注:在使用HTMLParser处理google翻译返回的结果时,由于同时存在<span>...</span>,<div>...</div>,<span><div>...</div></span>三种标签,导致处理比较困难,个人对HTMLParser库也不是很熟悉,所以最终所得结果并不是很满意。
3.IcibaDict.java
view source
print?
1.multithreadDict.java
package MultiTread;

import java.util.Scanner;

/**
 * Entry point: reads one word from stdin and looks it up with both the
 * Google dictionary and the iciba dictionary, each on its own thread.
 *
 * The author's note above says the Google result should always print
 * before the iciba result. Thread.sleep cannot guarantee that ordering
 * (scheduling is not precise); Thread.join can: we wait for the Google
 * thread to finish before even starting the iciba thread.
 */
public class multithreadDict {
    public static void main(String[] args) throws InterruptedException {
        System.out.print("Input the word > ");
        Scanner scanner = new Scanner(System.in);
        String input = scanner.nextLine();

        GoogleDict google = new GoogleDict(input);
        google.start();
        // join(), not sleep(): deterministically prints Google's output first.
        google.join();

        IcibaDict iciba = new IcibaDict(input);
        iciba.start();
        // Wait for the second lookup too, so main() exits only when done.
        iciba.join();
    }
}
Java线程调度是Java多线程的核心,只有良好的调度,才能充分发挥系统的性能,提高程序的执行效率。
这里要明确的一点,不管程序员怎么编写调度,只能最大限度的影响线程执行的次序,而不能做到精准控制。
线程休眠是使线程让出CPU的最简单的做法之一,线程休眠时候,会将CPU资源交给其他线程,以便能轮换执行,当休眠一定时间后,线程会苏醒,进入准备状态等待执行。
线程休眠的方法是Thread.sleep(long millis) 和Thread.sleep(long millis,int nanos) ,均为静态方法。简单说,哪个线程调用sleep,就休眠哪个线程。
2.GoogleDict.java
package MultiTread;

import java.io.File;
import java.io.FileWriter;
import java.net.URI;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.utils.URIUtils;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.util.EntityUtils;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.util.NodeList;
import org.htmlparser.visitors.TextExtractingVisitor;

/**
 * Worker thread that queries the Google online dictionary for one term,
 * extracts the translation text from the returned HTML with HTMLParser,
 * prints it, and saves it to D:/study/Java/GoogleDict/&lt;term&gt;.txt.
 *
 * Example request:
 * http://www.google.com/dictionary?source=translation&hl=zh-CN&q=computer&langpair=en|zh-CN
 */
class GoogleDict extends Thread {

    /** The word or phrase to translate. */
    private final String searchterm;

    public GoogleDict(String input) {
        this.searchterm = input;
    }

    @Override
    public void run() {
        try {
            HttpClient httpclient = new DefaultHttpClient();
            String query = "source=translation&hl=zh-CN&q=" + searchterm
                    + "&langpair=en%7Czh-CN"; // %7C is the '|' separator
            URI uri = URIUtils.createURI("http", "www.google.com", -1,
                    "/dictionary", query, null);
            HttpGet httpget = new HttpGet(uri);
            HttpResponse response = httpclient.execute(httpget);
            HttpEntity entity = response.getEntity();
            if (entity == null) {
                return;
            }

            Parser parser = new Parser(EntityUtils.toString(entity));
            parser.setEncoding("gb2312");
            NodeFilter divFilter = new TagNameFilter("div");
            NodeList divs = parser.parse(divFilter);
            int length = divs.size();

            // Empirically chosen window of <div> elements that holds the
            // translation content. An all-ASCII term (bytes == chars) uses
            // the English-headword page layout, which has two extra leading
            // divs. NOTE(review): these magic offsets are tied to Google's
            // page structure of the time — verify against a live response.
            int first = (searchterm.getBytes().length == searchterm.length()) ? 10 : 8;

            // Bug fix: the original started from a null String and appended
            // the literal "/n", producing a leading "null" and no real line
            // breaks (later patched over with replaceAll hacks). Build the
            // text properly with a StringBuilder and '\n'.
            StringBuilder sb = new StringBuilder();
            for (int i = first; i < length - 3; i++) {
                Node div = divs.elementAt(i);
                Parser divParser = new Parser(div.toHtml());
                TextExtractingVisitor visitor = new TextExtractingVisitor();
                divParser.visitAllNodesWith(visitor);
                sb.append('\n').append(visitor.getExtractedText().trim());
            }

            String text = sb.toString();
            text = text.replaceAll("相关搜索", "相关搜索:");
            // Collapse runs of blank lines in one pass (replaces the three
            // repeated replaceAll("/n/n","/n") calls of the original).
            text = text.replaceAll("\n{2,}", "\n");

            System.out.println("-----------------------------------------"
                    + "谷歌翻译-------------------------------------------");
            System.out.println(uri);
            System.out.println(text);

            File f = new File("D://study/Java/GoogleDict/" + searchterm + ".txt");
            FileWriter fw = new FileWriter(f);
            try {
                fw.write(text);
                fw.flush();
            } finally {
                // Always release the file handle, even if write() throws.
                fw.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
3.IcibaDict.java
view source
print?
package MultiTread;

import java.io.File;
import java.io.FileWriter;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.util.EntityUtils;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.util.NodeList;
import org.htmlparser.visitors.TextExtractingVisitor;

/**
 * Worker thread that queries the iciba online dictionary for one term,
 * extracts the translation (div.tab_content) and the web-definition
 * section (div.content_block) with HTMLParser, prints the result, and
 * saves it to D:/study/Java/IcibaDict/&lt;term&gt;.txt.
 */
class IcibaDict extends Thread {

    /** The word or phrase to translate. */
    private final String searchterm;

    public IcibaDict(String input) {
        this.searchterm = input;
    }

    @Override
    public void run() {
        try {
            HttpClient httpclient = new DefaultHttpClient();
            String searchstring = "http://www.iciba.com/" + searchterm + "/";
            HttpGet httpget = new HttpGet(searchstring);
            HttpResponse response = httpclient.execute(httpget);
            HttpEntity entity = response.getEntity();
            if (entity == null) {
                return;
            }

            String content = EntityUtils.toString(entity);
            // Insert a space before each link so extracted words don't fuse.
            content = content.replaceAll("<a href", " <a href");
            Parser parser = new Parser(content);
            parser.setEncoding("gb2312");

            // Translation tabs: <div class="tab_content">.
            NodeFilter tabFilter = new AndFilter(new TagNameFilter("div"),
                    new HasAttributeFilter("class", "tab_content"));
            NodeList tabs = parser.parse(tabFilter);

            // Bug fix: the original started from a null String and appended
            // the literal "/n" (not a newline), yielding a leading "null"
            // and no real line breaks. Build the text with a StringBuilder.
            StringBuilder sb = new StringBuilder();
            for (int i = 0; i < tabs.size(); i++) {
                Node tab = tabs.elementAt(i);
                Parser tabParser = new Parser(tab.toHtml());
                TextExtractingVisitor visitor = new TextExtractingVisitor();
                tabParser.visitAllNodesWith(visitor);
                sb.append('\n').append(visitor.getExtractedText().trim());
            }

            // Web definitions: first <div class="content_block">, if any.
            parser.reset();
            NodeFilter webFilter = new AndFilter(new TagNameFilter("div"),
                    new HasAttributeFilter("class", "content_block"));
            NodeList webNodes = parser.parse(webFilter);
            Node webNode = webNodes.elementAt(0);
            String webContent = ""; // bug fix: was null, which printed "null"
            if (webNode != null) {
                Parser webParser = new Parser(webNode.toHtml());
                TextExtractingVisitor visitor = new TextExtractingVisitor();
                webParser.visitAllNodesWith(visitor);
                webContent = visitor.getExtractedText().trim();
            }

            String text = sb.toString() + webContent;
            // NOTE(review): the following space replacements look like they
            // were originally &nbsp;/full-width-space substitutions mangled
            // by the blog scraper — preserved as-is; verify against intent.
            text = text.replaceAll(" ", "");
            text = text.replaceAll(" ", "\n");
            // Collapse runs of blank lines in one pass (replaces the three
            // repeated "/n/n"-style replaceAll calls of the original).
            text = text.replaceAll("\n{2,}", "\n");
            // Strip navigation labels that leak out of the extracted HTML.
            text = text.replace("相关搜索", "");
            text = text.replace("句库", "");
            text = text.replace("韦氏词典", "");
            text = text.replace("Dictionary", "");

            System.out.println("*************************************"
                    + "爱词霸翻译*************************************");
            System.out.println(searchstring);
            System.out.println(text);

            File f = new File("D://study/Java/IcibaDict/" + searchterm + ".txt");
            FileWriter fw = new FileWriter(f);
            try {
                fw.write(text);
                fw.flush();
            } finally {
                // Always release the file handle, even if write() throws.
                fw.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
相关文章推荐
- Java初学者实践:httpclient+HTMLParser 的应用
- Java:HttpClient篇,HttpClient4.2在Java中的几则应用:Get、Post参数、Session(会话)保持、Proxy(代理服务器)设置,多线程设置...
- java应用集锦9:httpclient4.2.2的几个常用方法,登录之后访问页面问题,下载文件
- java 模拟 POST表单操作 HTTPCLIENT的应用 及注意事项
- Java:HttpClient篇,Cookie概述,及其在HttpClient4.2中的应用
- crawler_java应用集锦9:httpclient4.2.2的几个常用方法,登录之后访问页面问题,下载文件_设置代理
- Java:HttpClient篇,HttpClient4.2在Java中的几则应用:Get、Post参数、Session(会话)保持、Proxy(代理服务器)设置,多线程设置...
- Java:HttpClient篇,HttpClient4.2在Java中的几则应用:Get、Post参数、Session(会话)保持、Proxy(代理服务器)设置,多线程设置...
- commons-httpclient和htmlparser应用之博客搬家
- 基于Java HttpClient和Htmlparser实现网络爬虫代码
- Java:HttpClient篇,Cookie概述,及其在HttpClient4.2中的应用
- Java HttpClient 的简单应用
- java JWS 应用及实践
- Java HttpClient4 get方式多线程下载文件
- Java-HttpClient-03
- JavaWeb开发中的HttpServletRequest的应用
- [Java] HttpClient有个古怪的stalecheck选项
- java初学者实践教程25-多线程
- java初学者实践教程9-数组
- Java用org.apache.http.client的HttpClient发送Post请求 可获取返回Header