您的位置:首页 > 编程语言 > Java开发

Java 爬虫

2015-06-07 14:28 393 查看
Java 的一个网页爬虫

package calcium.tools.grex;

import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Regex-based scraper for a thread-list page of {@code bbs.zhiyoo.com}.
 *
 * <p>The page is loaded into a single string with all line terminators removed
 * (see {@link #readTxtFile} and {@link #readHtml}), then a set of regular
 * expressions pulls out thread titles, thread links and the two per-thread
 * counters. {@link #main} writes one output line per thread title.
 *
 * <p>Every extractor also echoes what it finds to standard output.
 */
public class MyGrex {

    /** Charset used to decode both the local HTML file and a remote page. */
    private static final String ENCODING = "GBK";

    /** Input file used by {@link #main} when no argument is given. */
    private static final String DEFAULT_INPUT = "F:a.html";

    /** Output file used by {@link #main} when fewer than two arguments are given. */
    private static final String DEFAULT_OUTPUT = "F:a.txt";

    // The two user-facing messages are Chinese. They are spelled with \\u escapes
    // so that the source stays pure ASCII and cannot be corrupted by being saved
    // or compiled with the wrong source encoding.

    /** "Cannot find the specified file". */
    private static final String MSG_FILE_NOT_FOUND =
            "\u627e\u4e0d\u5230\u6307\u5b9a\u7684\u6587\u4ef6";

    /** "Error while reading the file content". */
    private static final String MSG_READ_FAILED =
            "\u8bfb\u53d6\u6587\u4ef6\u5185\u5bb9\u51fa\u9519";

    /**
     * Thread anchor; group 2 is the link text (the thread title). Only thread
     * ids of exactly seven digits are matched.
     */
    private static final Pattern THREAD_TITLE = Pattern
            .compile("<a href=\"http://bbs.zhiyoo.com/forum.php"
                    + "\\?mod=viewthread&tid=[0-9]{7}\" target=\"_blank\"(| style=\"font-weight: bold;color: #8F2A90\")>"
                    + "(.*?)</a>");

    /**
     * Thread anchor inside its {@code <span id="thread_NNNNNNN">} wrapper;
     * group 1 is the href. The link text may have any length.
     */
    // NOTE(review): the pattern ends in "</a><span>", copied from the original;
    // confirm against the real markup that this should not be "</a></span>".
    private static final Pattern THREAD_HREF = Pattern
            .compile("<span id=\"thread_[0-9]{7}\"><a href=\"(.*?)\" target=\"_blank\"(| style=\"font-weight: bold;color: #8F2A90\")>[\\s\\S]*?</a><span>");

    /** Content of every {@code <span class="xi2">}; group 1 is the content. */
    // Sample markup: <span class="xi2">31</span><em>374</em>
    private static final Pattern XI2_SPAN = Pattern.compile(
            "<span class=\"xi2\">(.+?)</span>", Pattern.DOTALL);

    /** The {@code <em>} counter of a {@code <td class="num">} cell; group 1 is its content. */
    // Sample markup: <td class="num"><span class="xi2">25</span><em>504</em></td>
    private static final Pattern NUM_CELL_EM = Pattern.compile(
            "<td class=\"num\"><span class=\"xi2\">[0-9]+</span><em>(.+?)</em></td>",
            Pattern.DOTALL);

    /**
     * Reads a saved thread-list page and writes one line per thread title to a
     * text file, in the form {@code title V span-counter V em-counter} (without
     * the spaces), each line terminated by {@code \n}.
     *
     * <p>If a counter list is shorter than the title list, the missing values
     * are written as empty strings instead of aborting the run. The output is
     * written with {@link FileWriter}, i.e. in the platform default charset.
     *
     * <p>{@link #readHtml} can be used in place of {@link #readTxtFile} to
     * fetch the page from a URL instead of a local file.
     *
     * @param args optional: {@code args[0]} is the input HTML file (default
     *             {@code F:a.html}), {@code args[1]} the output text file
     *             (default {@code F:a.txt})
     * @throws IOException if the output file cannot be opened
     */
    public static void main(String[] args) throws IOException {
        String inputPath = args.length > 0 ? args[0] : DEFAULT_INPUT;
        String outputPath = args.length > 1 ? args[1] : DEFAULT_OUTPUT;

        StringBuffer buf = new StringBuffer(1024 * 1024);
        readTxtFile(inputPath, buf);
        String page = buf.toString();

        List<String> titles = getLink(page);
        List<String> spanCounters = getHuifu(page);
        List<String> emCounters = getDianji(page);
        getLinkl(page); // result unused; the call only echoes the links to stdout

        PrintWriter pw = new PrintWriter(new FileWriter(outputPath));
        try {
            for (int i = 0; i < titles.size(); i++) {
                System.out.println(titles.get(i));
                pw.write(titles.get(i));
                pw.write("V");
                pw.write(valueAt(spanCounters, i));
                pw.write("V");
                pw.write(valueAt(emCounters, i));
                pw.write("\n");
            }
        } finally {
            // Closing also flushes, so rows written before a failure are kept.
            pw.close();
        }
    }

    /**
     * Extracts the link text of every thread anchor.
     *
     * @param html page content
     * @return the link texts in document order; empty if nothing matches
     * @throws IOException never thrown by this implementation; kept for
     *                     source compatibility with existing callers
     */
    public static List<String> getLink(String html) throws IOException {
        return collectGroup(THREAD_TITLE, html, 2);
    }

    /**
     * Extracts the href of every thread anchor that sits directly inside a
     * {@code <span id="thread_NNNNNNN">} element, and echoes both the href and
     * the complete match to standard output.
     *
     * @param html page content
     * @return the hrefs in document order; empty if nothing matches
     * @throws IOException never thrown by this implementation; kept for
     *                     source compatibility with existing callers
     */
    public static List<String> getLinkl(String html) throws IOException {
        Matcher m = THREAD_HREF.matcher(html);
        List<String> links = new ArrayList<String>();
        while (m.find()) {
            links.add(m.group(1));
            System.out.println(m.group(1));
            System.out.println(m.group());
        }
        return links;
    }

    /**
     * Extracts the content of every {@code <span class="xi2">} element.
     * Presumably the reply count ("huifu") of each thread - confirm against
     * the page markup.
     *
     * @param s page content
     * @return the span contents in document order; empty if nothing matches
     */
    public static List<String> getHuifu(String s) {
        return collectGroup(XI2_SPAN, s, 1);
    }

    /**
     * Extracts the {@code <em>} value of every {@code <td class="num">} cell
     * whose leading {@code xi2} span holds a number of any length. Presumably
     * the view/click count ("dianji") of each thread - confirm against the
     * page markup.
     *
     * @param s page content
     * @return the {@code <em>} contents in document order; empty if nothing matches
     */
    public static List<String> getDianji(String s) {
        return collectGroup(NUM_CELL_EM, s, 1);
    }

    /**
     * Appends the content of a GBK-encoded local file to {@code buf}, with all
     * line terminators removed.
     *
     * <p>Best effort: a missing file or a read failure is reported on standard
     * output (plus a stack trace for failures) and is not propagated.
     *
     * @param filePath path of the file to read
     * @param buf      buffer that receives the file content
     */
    public static void readTxtFile(String filePath, StringBuffer buf) {
        try {
            File file = new File(filePath);
            if (file.isFile()) { // false for directories and for missing files
                appendLines(new FileInputStream(file), buf, false);
            } else {
                System.out.println(MSG_FILE_NOT_FOUND);
            }
        } catch (Exception e) {
            // Deliberately broad: callers rely on this method never throwing.
            System.out.println(MSG_READ_FAILED);
            e.printStackTrace();
        }
    }

    /**
     * Downloads a GBK-encoded page and appends it to {@code buf}, with all
     * line terminators removed. Every line is also echoed to standard output.
     *
     * <p>Best effort: a failure is reported on standard output together with a
     * stack trace and is not propagated.
     *
     * @param html URL of the page to fetch
     * @param buf  buffer that receives the page content
     */
    public static void readHtml(String html, StringBuffer buf) {
        try {
            appendLines(new URL(html).openStream(), buf, true);
        } catch (Exception e) {
            // Deliberately broad: callers rely on this method never throwing.
            System.out.println("Failed to read page " + html);
            e.printStackTrace();
        }
    }

    /** Collects one capture group of every match, echoing each value to stdout. */
    private static List<String> collectGroup(Pattern pattern, String input, int group) {
        Matcher m = pattern.matcher(input);
        List<String> found = new ArrayList<String>();
        while (m.find()) {
            found.add(m.group(group));
            System.out.println(m.group(group));
        }
        return found;
    }

    /** Returns {@code values.get(index)}, or an empty string when the list is too short. */
    private static String valueAt(List<String> values, int index) {
        return index < values.size() ? values.get(index) : "";
    }

    /**
     * Decodes {@code in} as GBK and appends every line, without its terminator,
     * to {@code buf}; always closes {@code in}, even when reading fails.
     */
    private static void appendLines(InputStream in, StringBuffer buf, boolean echo)
            throws IOException {
        try {
            BufferedReader reader = new BufferedReader(new InputStreamReader(in, ENCODING));
            String line;
            // Read each line exactly once; the null check and the append must
            // use the same readLine() result.
            while ((line = reader.readLine()) != null) {
                buf.append(line);
                if (echo) {
                    System.out.println(line);
                }
            }
        } finally {
            in.close();
        }
    }
}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: