Java 爬虫
2015-06-07 14:28
393 查看
一个用 Java 编写的简单网页爬虫
package calcium.tools.grex;

import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Regex-based scraper for a thread-list page of bbs.zhiyoo.com.
 *
 * <p>{@link #main} loads a saved copy of the page, extracts three columns with
 * regular expressions (link text, the {@code xi2} span value and the {@code em}
 * value of each {@code num} cell) and writes them to a text file, one row per
 * link.
 */
public class MyGrex {

    /** Charset used to decode both the local file and the remote page. */
    private static final String ENCODING = "GBK";

    /** Separator written between the columns of an output row. */
    private static final String FIELD_SEPARATOR = "V";

    // The user-facing messages are spelled with Unicode escapes so that they
    // survive a source-encoding mismatch (the original literals had been
    // corrupted by exactly such a mismatch).

    /** Chinese for "the specified file could not be found". */
    private static final String MSG_FILE_NOT_FOUND =
            "\u627e\u4e0d\u5230\u6307\u5b9a\u7684\u6587\u4ef6";

    /** Chinese for "error while reading the file content". */
    private static final String MSG_READ_FILE_FAILED =
            "\u8bfb\u53d6\u6587\u4ef6\u5185\u5bb9\u51fa\u9519";

    /** Chinese for "error while reading the web page content". */
    private static final String MSG_READ_PAGE_FAILED =
            "\u8bfb\u53d6\u7f51\u9875\u5185\u5bb9\u51fa\u9519";

    /** Thread link whose text (group 2) is captured; group 1 is the optional bold style. */
    private static final Pattern LINK_TEXT = Pattern.compile(
            "<a href=\"http://bbs.zhiyoo.com/forum.php"
            + "\\?mod=viewthread&tid=[0-9]{7}\" target=\"_blank\"(| style=\"font-weight: bold;color: #8F2A90\")>"
            + "(.*?)</a>");

    /**
     * Thread link wrapped in a {@code thread_NNNNNNN} span; group 1 is the href.
     *
     * <p>NOTE(review): {@code [\s\S]} matches exactly one character, so only
     * links whose text is a single character can match, and the pattern ends
     * with an opening {@code <span>} rather than a closing one. Kept as in the
     * original; confirm the intent against the real page markup.
     */
    private static final Pattern THREAD_HREF = Pattern.compile(
            "<span id=\"thread_[0-9]{7}\"><a href=\"(.*?)\" target=\"_blank\"(| "
            + "style=\"font-weight: bold;color: #8F2A90\")>[\\s\\S]</a><span>");

    /** Content of every {@code <span class="xi2">} element. */
    private static final Pattern XI2_SPAN = Pattern.compile(
            "<span class=\"xi2\">(.+?)</span>", Pattern.DOTALL);

    /**
     * The {@code em} value of a {@code num} cell.
     *
     * <p>NOTE(review): the preceding span must hold exactly two digits
     * ({@code [0-9]{2}}), while {@link #getHuifu} accepts any span content, so
     * the two result lists can differ in length. Kept as in the original;
     * confirm whether one-or-more digits was intended.
     */
    private static final Pattern NUM_CELL_EM = Pattern.compile(
            "<td class=\"num\"><span class=\"xi2\">[0-9]{2}</span><em>(.+?)</em></td>",
            Pattern.DOTALL);

    /**
     * Reads {@code F:a.html}, extracts the columns and writes them to
     * {@code F:a.txt} as {@code text V xi2 V em} rows terminated by a newline.
     *
     * @param args ignored
     * @throws IOException if the output file cannot be opened
     */
    public static void main(String[] args) throws IOException {
        // try-with-resources: the writer is flushed and closed even when
        // extraction fails part-way through.
        try (PrintWriter pw = new PrintWriter(new FileWriter("F:a.txt"))) {
            StringBuffer buf = new StringBuffer(1024 * 1024);
            String html = "F:a.html";
            readTxtFile(html, buf);
            // To scrape the live page instead of the saved copy:
            // readHtml("http://bbs.zhiyoo.com/forum.php?mod=modcp&action=thread&op=thread", buf);

            // Capture-group reminder: Pattern.compile("href=\"(.+?)\"") applied to
            // <a href="index.html">...</a> yields index.html in group(1).

            String page = buf.toString();
            List<String> titles = getLink(page);
            List<String> xi2Values = getHuifu(page);
            List<String> emValues = getDianji(page);
            getLinkl(page); // only echoes the matched hrefs; the result is unused

            for (int i = 0; i < titles.size(); i++) {
                System.out.println(titles.get(i));
                pw.write(titles.get(i));
                pw.write(FIELD_SEPARATOR);
                // The three regexes may match different numbers of elements;
                // a missing column is written as an empty field instead of
                // aborting with IndexOutOfBoundsException.
                pw.write(fieldAt(xi2Values, i));
                pw.write(FIELD_SEPARATOR);
                pw.write(fieldAt(emValues, i));
                pw.write("\n");
            }
        }
    }

    /** Returns {@code column.get(index)}, or the empty string when the column is too short. */
    private static String fieldAt(List<String> column, int index) {
        return index < column.size() ? column.get(index) : "";
    }

    /** Collects capture group {@code group} of every match, echoing each value to stdout. */
    private static List<String> collect(Pattern pattern, String text, int group) {
        List<String> values = new ArrayList<>();
        Matcher matcher = pattern.matcher(text);
        while (matcher.find()) {
            String value = matcher.group(group);
            values.add(value);
            System.out.println(value);
        }
        return values;
    }

    /**
     * Extracts the link text of every thread link and prints each one.
     *
     * @param html page content; line breaks must already be removed because
     *             the pattern's {@code .*?} does not cross line terminators
     * @return the link texts in document order, empty if nothing matches
     * @throws IOException never thrown by this implementation; kept so the
     *                     signature stays compatible with existing callers
     */
    public static List<String> getLink(String html) throws IOException {
        return collect(LINK_TEXT, html, 2);
    }

    // Note from the original author: non-Latin-1 text (e.g. Chinese) can be
    // matched with [^\x00-\xff]*

    /**
     * Extracts the href of every thread link wrapped in a {@code thread_NNNNNNN}
     * span, printing the href followed by the whole match.
     *
     * @param html page content
     * @return the hrefs in document order, empty if nothing matches
     * @throws IOException never thrown by this implementation; kept so the
     *                     signature stays compatible with existing callers
     */
    public static List<String> getLinkl(String html) throws IOException {
        List<String> hrefs = new ArrayList<>();
        Matcher matcher = THREAD_HREF.matcher(html);
        while (matcher.find()) {
            hrefs.add(matcher.group(1));
            System.out.println(matcher.group(1));
            System.out.println(matcher.group());
        }
        return hrefs;
    }

    /**
     * Extracts the content of every {@code <span class="xi2">} element and
     * prints each one. Presumably the reply count ("huifu" is pinyin for
     * "reply") - confirm against the page markup.
     *
     * <p>Sample markup: {@code <span class="xi2">31</span><em>374</em>}
     *
     * @param s page content
     * @return the span contents in document order, empty if nothing matches
     */
    public static List<String> getHuifu(String s) {
        return collect(XI2_SPAN, s, 1);
    }

    /**
     * Extracts the {@code em} value of every {@code num} cell and prints each
     * one. Presumably the view count ("dianji" is pinyin for "click") -
     * confirm against the page markup.
     *
     * <p>Sample markup:
     * {@code <td class="num"><span class="xi2">25</span><em>504</em></td>}
     *
     * @param s page content
     * @return the {@code em} values in document order, empty if nothing matches
     */
    public static List<String> getDianji(String s) {
        return collect(NUM_CELL_EM, s, 1);
    }

    /**
     * Appends the GBK-decoded content of a local file to {@code buf}.
     *
     * <p>Lines are appended without their terminators, so the whole file ends
     * up as a single line. Failures are reported on stdout/stderr and are not
     * propagated.
     *
     * @param filePath path of the file to read
     * @param buf      buffer that receives the content
     */
    public static void readTxtFile(String filePath, StringBuffer buf) {
        try {
            File file = new File(filePath);
            // isFile() is also false for paths that do not exist.
            if (!file.isFile()) {
                System.out.println(MSG_FILE_NOT_FOUND);
                return;
            }
            // The stream is declared on its own so it is closed even if the
            // reader cannot be created for the requested encoding.
            try (InputStream in = new FileInputStream(file);
                    BufferedReader reader = new BufferedReader(new InputStreamReader(in, ENCODING))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    buf.append(line);
                }
            }
        } catch (Exception e) {
            // Best-effort helper: report the failure and leave buf as it is.
            System.out.println(MSG_READ_FILE_FAILED);
            e.printStackTrace();
        }
    }

    /**
     * Appends the GBK-decoded content of a URL to {@code buf}, echoing every
     * line to stdout.
     *
     * <p>Lines are appended without their terminators. Failures are reported
     * on stdout/stderr and are not propagated.
     *
     * @param html URL of the page to fetch
     * @param buf  buffer that receives the content
     */
    public static void readHtml(String html, StringBuffer buf) {
        try (InputStream in = new URL(html).openStream();
                BufferedReader reader = new BufferedReader(new InputStreamReader(in, ENCODING))) {
            String line;
            // Read each line exactly once; reading in both the loop condition
            // and the body would skip every other line.
            while ((line = reader.readLine()) != null) {
                buf.append(line);
                System.out.println(line);
            }
        } catch (Exception e) {
            // Best-effort helper: report the failure and leave buf as it is.
            System.out.println(MSG_READ_PAGE_FAILED + ": " + html);
            e.printStackTrace();
        }
    }
}
相关文章推荐
- java输入输出高速
- Java枚举类型
- Java Map 迭代
- eclipse Maven 配置
- Struts2中的ModelDriven机制及其运用
- ubuntu下安装配置JDK
- 在Windows下搭建Eclipse+Android4.0开发环境
- springMVC学习笔记汇总(持续更新)
- Java语言描述:递归与分治策略之全排列问题
- Java中static关键字的作用及用法
- Java内存以及回收机制
- 简单的初学者配置Android SDK+ADT+Eclipse
- 如何终止java线程
- 为什么Java里的Arrays.asList不能用add和remove方法?
- 常见的Java的软件包
- SpringMVC 学习笔记(十一) SpringMVC运行流程
- Eclipse 关联源码 ( src.zip)
- Eclipse 关联中文 api
- java中各种数据类型(非基本类型)的大小比较
- java中 this和super的区别