Java程序获取网页源代码
2013-05-27 17:58
435 查看
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.Date;
import java.util.zip.GZIPInputStream;

/**
 * Fetches the source of a web page, echoes it to standard output and saves a
 * copy to a local file.
 *
 * <p>The page charset is taken from the {@code charset} parameter of the
 * Content-Type header when present. When it is absent, gzip-encoded responses
 * are decoded as UTF-8 and plain responses as GB2312, matching the charsets the
 * original implementation hard-coded for each path.
 */
public class TestHtmlCode {

    /** Output file used by {@link #display(String)}. */
    private static final String DEFAULT_OUTPUT_FILE = "d:\\category.txt";

    /**
     * Prints the response metadata of {@code urlString}, then dumps the page
     * source to standard output and to the default output file.
     *
     * @param urlString absolute URL of the page to fetch
     */
    void display(String urlString) {
        display(urlString, DEFAULT_OUTPUT_FILE);
    }

    /**
     * Prints the response metadata of {@code urlString}, then dumps the page
     * source to standard output and to {@code outputFile}.
     *
     * <p>Connection-level I/O failures are reported on standard output;
     * failures while copying the body are reported as a stack trace.
     *
     * @param urlString  absolute URL of the page to fetch
     * @param outputFile path of the file that receives a copy of the page
     */
    void display(String urlString, String outputFile) {
        try {
            URL url = new URL(urlString);
            URLConnection c = url.openConnection();
            c.connect();

            // Response metadata exposed by URLConnection.
            System.out.println("编码:" + c.getContentEncoding());
            System.out.println("内容类型: " + c.getContentType());
            System.out.println("内容长度: " + c.getContentLength());
            System.out.println("创建日期: " + new Date(c.getDate()));
            System.out.println("最后修改日期: " + new Date(c.getLastModified()));
            System.out.println("终止日期: " + new Date(c.getExpiration()));

            // Content-coding tokens are case-insensitive, so "GZIP" must match too.
            if ("gzip".equalsIgnoreCase(c.getContentEncoding())) {
                this.doGzipHtml(c, outputFile);
                return;
            }
            this.doSimpleHtml(c, outputFile);
        } catch (IOException e) {
            System.out.println(e);
        }
    }

    /**
     * Handles a response whose Content-Encoding is gzip: the body is inflated
     * before being decoded as text.
     *
     * @param c          connected URL connection
     * @param outputFile path of the file that receives a copy of the page
     */
    private void doGzipHtml(URLConnection c, String outputFile) {
        // The raw stream is a separate resource so it is still closed when the
        // GZIPInputStream constructor rejects a malformed header.
        try (InputStream raw = c.getInputStream();
             InputStream inflated = new GZIPInputStream(raw)) {
            Charset charset = resolveCharset(c.getContentType(), StandardCharsets.UTF_8);
            copyLines(inflated, charset, outputFile);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Handles a response that is not gzip-encoded: the body is decoded as text
     * directly.
     *
     * @param c          connected URL connection
     * @param outputFile path of the file that receives a copy of the page
     */
    private void doSimpleHtml(URLConnection c, String outputFile) {
        try (InputStream body = c.getInputStream()) {
            Charset charset = resolveCharset(c.getContentType(), legacyPlainCharset());
            copyLines(body, charset, outputFile);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Copies {@code in} line by line to standard output and to
     * {@code outputFile}. Bytes are decoded with {@code charset} at the reader
     * and the file is written with the same charset, so any charset the page
     * declares about itself remains accurate in the saved copy.
     *
     * @throws IOException if reading the body or writing the file fails
     */
    private static void copyLines(InputStream in, Charset charset, String outputFile)
            throws IOException {
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(in, charset));
             BufferedWriter writer = new BufferedWriter(
                     new OutputStreamWriter(new FileOutputStream(outputFile), charset))) {
            String line;
            while ((line = reader.readLine()) != null) {
                System.out.println(line);
                writer.write(line);
                // readLine() strips the terminator; restore it so the saved
                // file keeps the page's line structure.
                writer.newLine();
            }
        }
    }

    /**
     * Returns the charset named by the {@code charset} parameter of a
     * Content-Type value, or {@code fallback} when the value is {@code null},
     * has no such parameter, or names a charset this JVM does not support.
     */
    private static Charset resolveCharset(String contentType, Charset fallback) {
        if (contentType == null) {
            return fallback;
        }
        final String key = "charset=";
        for (String part : contentType.split(";")) {
            String param = part.trim();
            if (!param.regionMatches(true, 0, key, 0, key.length())) {
                continue;
            }
            String name = param.substring(key.length()).trim();
            // The parameter value may be a quoted string.
            if (name.length() >= 2 && name.startsWith("\"") && name.endsWith("\"")) {
                name = name.substring(1, name.length() - 1).trim();
            }
            if (name.isEmpty()) {
                continue;
            }
            try {
                return Charset.forName(name);
            } catch (IllegalArgumentException ignored) {
                // Illegal or unsupported charset name: keep looking, then fall back.
            }
        }
        return fallback;
    }

    /**
     * Fallback charset for plain responses that declare none: GB2312 as in the
     * original code, or UTF-8 on runtimes that do not ship GB2312.
     */
    private static Charset legacyPlainCharset() {
        try {
            return Charset.forName("gb2312");
        } catch (IllegalArgumentException e) {
            return StandardCharsets.UTF_8;
        }
    }

    /**
     * Entry point. With no arguments it fetches http://www.baidu.com into the
     * default output file; an optional first argument overrides the URL and an
     * optional second argument overrides the output file.
     */
    public static void main(String[] args) {
        String urlString = args.length > 0 ? args[0] : "http://www.baidu.com";
        String outputFile = args.length > 1 ? args[1] : DEFAULT_OUTPUT_FILE;
        new TestHtmlCode().display(urlString, outputFile);
    }
}
相关文章推荐
- Java 网络爬虫获取网页源代码原理及实现
- [JavaWeb]JavaSocket获取网页源代码遇见的问题
- Java--使用net包获取网页源代码
- [JavaWeb]JavaSocket获取网页源代码遇见的问题
- JAVA获取网页中的电影下载地址小程序
- java网络连接搭配apache,java HttpClient获取网页源代码
- win C/C++程序通过Get方式获取网页源代码
- 如何用Java获取网页源代码
- java获取特定网页的源代码
- Java获取任意http网页源代码的方法分享
- 第一个java小程序 applet 画一个圆 获取网页自定义的参数来输出
- java 获取网页源代码
- java获取响应网页源代码
- Java 网络爬虫获取网页源代码原理及实现
- Java 网络爬虫获取网页源代码原理及实现
- java 获取网页源代码 (防乱码)
- Java 网络爬虫获取网页源代码原理及实现
- Java 网络爬虫获取网页源代码原理及实现
- 黑马程序员_JAVA获取网页源代码
- java 获取网页源代码