您的位置:首页 > 理论基础 > 计算机网络

Java初学者实践:httpclient+HTMLParser 的应用

2010-10-07 14:18 459 查看
使用httpclient调用google在线翻译以及爱词霸在线翻译,并使用HTMLParser对返回的结果进行处理,以此实现一个简单的在线翻译小程序,并利用多线程编程合并两个翻译工具。具体实现如下:

1.multithreadDict.java

package MultiTread;

import java.util.Scanner;

/**
 * Entry point: reads a word from stdin and looks it up with two
 * translation services, each running in its own thread.
 *
 * The Google lookup is started first and joined before the iciba
 * lookup starts, so the Google output is always printed first.
 * (sleep() cannot guarantee ordering; join() can.)
 */
public class multithreadDict {

    public static void main(String[] args) throws InterruptedException {
        System.out.print("Input the word > ");
        Scanner s = new Scanner(System.in);
        String input = s.nextLine();

        GoogleDict google = new GoogleDict(input);
        google.start();
        // Wait for the Google thread to finish so its output always
        // appears before iciba's. This replaces the unreliable
        // sleep(2000)/sleep(4000) approach.
        google.join();

        IcibaDict iciba = new IcibaDict(input);
        iciba.start();
        iciba.join();
    }
}
注:本来我想先显示google翻译的结果,后显示爱词霸翻译的结果,但即使使用sleep函数也很难保证这一点——sleep只能让出CPU一段时间,并不能约束两个线程的先后次序。正确的做法是在启动第二个线程之前调用第一个线程的 join() 方法,等待它执行完毕。

Java线程调度是Java多线程的核心,只有良好的调度,才能充分发挥系统的性能,提高程序的执行效率。

这里要明确一点:不管程序员怎样编写调度代码,都只能最大限度地影响线程执行的次序,而不能做到精准控制;若需要确定的先后次序,必须借助 join()、锁等同步手段。
线程休眠是使线程让出CPU的最简单的做法之一,线程休眠时候,会将CPU资源交给其他线程,以便能轮换执行,当休眠一定时间后,线程会苏醒,进入准备状态等待执行。

线程休眠的方法是Thread.sleep(long millis) 和Thread.sleep(long millis,int nanos) ,均为静态方法。简单说,哪个线程调用sleep,就休眠哪个线程。

2.GoogleDict.java

package MultiTread;

import java.io.File;
import java.io.FileWriter;
import java.net.URI;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.utils.URIUtils;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.util.EntityUtils;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.util.NodeList;
import org.htmlparser.visitors.TextExtractingVisitor;

/**
 * Worker thread that queries Google Dictionary for a search term,
 * extracts the translation text from the returned HTML with
 * HTMLParser, prints it to stdout, and saves it to
 * {@code D://study/Java/GoogleDict/<term>.txt}.
 */
class GoogleDict extends Thread {

    private final String searchterm;

    /** @param input the word/phrase to translate (en -> zh-CN) */
    public GoogleDict(String input) {
        this.searchterm = input;
    }

    /** Re-parses one node's HTML and returns its plain text, trimmed. */
    private static String extractText(Node node) throws Exception {
        Parser nodeParser = new Parser(node.toHtml());
        TextExtractingVisitor visitor = new TextExtractingVisitor();
        nodeParser.visitAllNodesWith(visitor);
        return visitor.getExtractedText().trim();
    }

    public void run() {
        // Request shape:
        // http://www.google.com/dictionary?source=translation&hl=zh-CN&q=<term>&langpair=en|zh-CN
        try {
            HttpClient httpclient = new DefaultHttpClient();
            String searchstring =
                    "source=translation&hl=zh-CN&q=" + searchterm + "&langpair=en%7Czh-CN";
            URI uri = URIUtils.createURI(
                    "http", "www.google.com", -1, "/dictionary", searchstring, null);
            HttpGet httpget = new HttpGet(uri);
            HttpResponse response = httpclient.execute(httpget);
            HttpEntity entity = response.getEntity();

            if (entity != null) {
                Parser parser = new Parser(EntityUtils.toString(entity));
                parser.setEncoding("gb2312");
                NodeFilter divFilter = new TagNameFilter("div");
                NodeList divs = parser.parse(divFilter);
                int length = divs.size();

                // For a pure-ASCII (English) term the page carries two extra
                // leading boilerplate <div>s, so start later. The last three
                // <div>s are footer boilerplate in both cases.
                // NOTE(review): these offsets were found empirically against
                // the 2010 page layout — confirm if the markup changes.
                int first = (searchterm.getBytes().length == searchterm.length()) ? 10 : 8;

                // Build with StringBuilder instead of "null + ..." string
                // concatenation; this also removes the need for the old
                // replaceAll("null", "") hack, which corrupted any
                // translation that legitimately contained the word "null".
                StringBuilder sb = new StringBuilder();
                for (int i = first; i < length - 3; i++) {
                    sb.append('\n').append(extractText(divs.elementAt(i)));
                }

                String text = sb.toString();
                text = text.replaceAll("相关搜索", "相关搜索:");
                // Collapse any run of blank lines left by empty <div>s.
                // (The original applied replaceAll("\n\n", "\n") three times;
                // a quantified pattern does it in one pass.)
                text = text.replaceAll("\n{2,}", "\n");

                System.out.println("-----------------------------------------"
                        + "谷歌翻译-------------------------------------------");
                System.out.println(uri);
                System.out.println(text);

                File f = new File("D://study/Java/GoogleDict/" + searchterm + ".txt");
                f.getParentFile().mkdirs(); // ensure the target directory exists
                // Close the writer even if write() throws (the original
                // leaked the handle on any exception).
                FileWriter fw = new FileWriter(f);
                try {
                    fw.write(text);
                    fw.flush();
                } finally {
                    fw.close();
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
注:在使用HTMLParser处理google翻译返回的结果时,由于同时存在<span>...</span>,<div>...</div>,<span><div>...</div></span>三种标签,导致处理比较困难,个人对HTMLParser库也不是很熟悉,所以最终所得结果并不是很满意。

3.IcibaDict.java

view source

print?

package MultiTread;

import java.io.File;
import java.io.FileWriter;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.util.EntityUtils;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.util.NodeList;
import org.htmlparser.visitors.TextExtractingVisitor;

/**
 * Worker thread that queries iciba (爱词霸) for a search term,
 * extracts the translation text from the page's
 * {@code div.tab_content} and {@code div.content_block} sections,
 * prints it to stdout, and saves it to
 * {@code D://study/Java/IcibaDict/<term>.txt}.
 */
class IcibaDict extends Thread {

    private final String searchterm;

    /** @param input the word/phrase to look up */
    public IcibaDict(String input) {
        this.searchterm = input;
    }

    /** Re-parses one node's HTML and returns its plain text, trimmed. */
    private static String extractText(Node node) throws Exception {
        Parser nodeParser = new Parser(node.toHtml());
        TextExtractingVisitor visitor = new TextExtractingVisitor();
        nodeParser.visitAllNodesWith(visitor);
        return visitor.getExtractedText().trim();
    }

    public void run() {
        try {
            HttpClient httpclient = new DefaultHttpClient();
            String searchstring = "http://www.iciba.com/" + searchterm + "/";
            HttpGet httpget = new HttpGet(searchstring);
            HttpResponse response = httpclient.execute(httpget);
            HttpEntity entity = response.getEntity();

            if (entity != null) {
                String content = EntityUtils.toString(entity);
                // Mark every link with a double space so that after text
                // extraction each linked entry can be split onto its own line.
                content = content.replaceAll("<a href", "  <a href");
                Parser parser = new Parser(content);
                parser.setEncoding("gb2312");

                // Main definition panes: <div class="tab_content">.
                NodeFilter tabFilter = new AndFilter(
                        new TagNameFilter("div"),
                        new HasAttributeFilter("class", "tab_content"));
                NodeList tabs = parser.parse(tabFilter);

                // Build with StringBuilder instead of "null + ..." string
                // concatenation; this removes the need for the old
                // replace("null", "") hack, which corrupted any entry that
                // legitimately contained the word "null".
                StringBuilder sb = new StringBuilder();
                for (int i = 0; i < tabs.size(); i++) {
                    sb.append('\n').append(extractText(tabs.elementAt(i)));
                }

                // Web-definitions pane: first <div class="content_block">.
                parser.reset();
                NodeFilter webFilter = new AndFilter(
                        new TagNameFilter("div"),
                        new HasAttributeFilter("class", "content_block"));
                NodeList webs = parser.parse(webFilter);
                Node webNode = webs.elementAt(0);
                if (webNode != null) {
                    sb.append(extractText(webNode));
                }

                String text = sb.toString();
                // Turn the double-space link markers into line breaks, then
                // collapse any resulting runs of blank lines. (The original's
                // chain of repeated replaceAll("  ", ...) calls with "/n"
                // escapes was a mangled version of this cleanup.)
                text = text.replaceAll(" {2,}", "\n");
                text = text.replaceAll("\n{2,}", "\n");
                // Strip site boilerplate headings.
                text = text.replace("相关搜索", "");
                text = text.replace("句库", "");
                text = text.replace("韦氏词典", "");
                text = text.replace("Dictionary", "");

                System.out.println("*************************************"
                        + "爱词霸翻译*************************************");
                System.out.println(searchstring);
                System.out.println(text);

                File f = new File("D://study/Java/IcibaDict/" + searchterm + ".txt");
                f.getParentFile().mkdirs(); // ensure the target directory exists
                // Close the writer even if write() throws (the original
                // leaked the handle on any exception).
                FileWriter fw = new FileWriter(f);
                try {
                    fw.write(text);
                    fw.flush();
                } finally {
                    fw.close();
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: