您的位置：首页 > 编程语言 > Java开发
Java程序获取网页源代码

2013-05-27 17:58 435 查看
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.Date;
import java.util.zip.GZIPInputStream;

/**
 * Downloads the source of a web page, echoes it to standard output and saves
 * a copy to a file.
 *
 * <p>The response body is decoded with the charset named in the
 * {@code Content-Type} header (UTF-8 when none is given or the JVM does not
 * support it) and transparently gunzipped when {@code Content-Encoding} is
 * {@code gzip}. The saved copy is always written as UTF-8.
 */
public class TestHtmlCode {

    /** Output file used by {@link #display(String)}. */
    private static final String DEFAULT_OUTPUT_PATH = "d:\\category.txt";

    /** Charset assumed when the response does not declare a usable one. */
    private static final Charset DEFAULT_CHARSET = StandardCharsets.UTF_8;

    /** Prefix of the charset parameter inside a Content-Type header value. */
    private static final String CHARSET_PARAM = "charset=";

    /**
     * Fetches {@code urlString}, prints its response headers and body, and
     * saves the body to the default output file.
     *
     * @param urlString absolute URL of the page to fetch
     */
    void display(String urlString) {
        display(urlString, DEFAULT_OUTPUT_PATH);
    }

    /**
     * Fetches {@code urlString}, prints its response headers and body, and
     * saves the body to {@code outputPath}.
     *
     * <p>I/O failures (malformed URL, connection error, unwritable output
     * file) are reported on standard error; nothing is thrown to the caller.
     *
     * @param urlString  absolute URL of the page to fetch
     * @param outputPath file that receives the page source, encoded as UTF-8
     */
    void display(String urlString, String outputPath) {
        try {
            URL url = new URL(urlString);
            URLConnection c = url.openConnection();
            c.connect();

            // Response metadata exposed by URLConnection.
            System.out.println("编码:" + c.getContentEncoding());
            System.out.println("内容类型: " + c.getContentType());
            System.out.println("内容长度: " + c.getContentLength());
            System.out.println("创建日期: " + new Date(c.getDate()));
            System.out.println("最后修改日期: " + new Date(c.getLastModified()));
            System.out.println("终止日期: " + new Date(c.getExpiration()));

            // Content-coding tokens are case-insensitive, so "GZIP" must match too.
            if ("gzip".equalsIgnoreCase(c.getContentEncoding())) {
                doGzipHtml(c, outputPath);
            } else {
                doSimpleHtml(c, outputPath);
            }
        } catch (IOException e) {
            System.err.println("Failed to fetch " + urlString + ": " + e);
        }
    }

    /**
     * Reads a gzip-compressed response body and saves the decompressed text.
     *
     * @param c          connected URL connection whose body is gzip-encoded
     * @param outputPath file that receives the page source
     * @throws IOException if the body cannot be read or the file cannot be written
     */
    private void doGzipHtml(URLConnection c, String outputPath) throws IOException {
        // The raw stream is a separate resource so it is closed even when the
        // GZIPInputStream constructor rejects the data.
        try (InputStream raw = c.getInputStream();
             InputStream unzipped = new GZIPInputStream(raw)) {
            copySource(unzipped, charsetOf(c.getContentType()), outputPath);
        }
    }

    /**
     * Reads an uncompressed response body and saves it.
     *
     * @param c          connected URL connection
     * @param outputPath file that receives the page source
     * @throws IOException if the body cannot be read or the file cannot be written
     */
    private void doSimpleHtml(URLConnection c, String outputPath) throws IOException {
        try (InputStream raw = c.getInputStream()) {
            copySource(raw, charsetOf(c.getContentType()), outputPath);
        }
    }

    /**
     * Decodes {@code in} line by line, echoing each line to standard output
     * and appending it, followed by the platform line separator, to
     * {@code outputPath}.
     *
     * @param in         stream holding the (already decompressed) page bytes
     * @param charset    charset used to decode {@code in}
     * @param outputPath file that receives the text, encoded as UTF-8
     * @throws IOException if reading or writing fails
     */
    private static void copySource(InputStream in, Charset charset, String outputPath)
            throws IOException {
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(in, charset));
             BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(
                     new FileOutputStream(outputPath), StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                System.out.println(line);
                writer.write(line);
                // readLine() strips the terminator; restore it so the saved
                // file keeps the page's line structure.
                writer.newLine();
            }
        }
    }

    /**
     * Extracts the charset from a {@code Content-Type} header value such as
     * {@code text/html; charset=gb2312}.
     *
     * @param contentType header value, may be {@code null}
     * @return the declared charset, or UTF-8 when none is declared or the
     *         declared name is illegal or unsupported on this JVM
     */
    private static Charset charsetOf(String contentType) {
        if (contentType == null) {
            return DEFAULT_CHARSET;
        }
        for (String part : contentType.split(";")) {
            String param = part.trim();
            if (!param.regionMatches(true, 0, CHARSET_PARAM, 0, CHARSET_PARAM.length())) {
                continue;
            }
            String name = param.substring(CHARSET_PARAM.length()).trim();
            // The value may be a quoted string: charset="utf-8".
            if (name.length() >= 2 && name.startsWith("\"") && name.endsWith("\"")) {
                name = name.substring(1, name.length() - 1);
            }
            try {
                return Charset.forName(name);
            } catch (IllegalArgumentException ignored) {
                // Illegal or unsupported charset name: fall back to the default.
                return DEFAULT_CHARSET;
            }
        }
        return DEFAULT_CHARSET;
    }

    public static void main(String[] args) {
        String urlString = "http://www.baidu.com"; // page to download
        new TestHtmlCode().display(urlString);
    }

}
内容来自用户分享和网络整理，不保证内容的准确性，如有侵权内容，可联系管理员处理
标签： Java URL 网页源代码
相关文章推荐
新的分享
章节导航