您的位置:首页 > 数据库

使用NekoHTML解析网页并抓取超链接的Java爬虫示例

2009-07-10 09:29 447 查看
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.StringReader;
import java.net.URL;
import java.nio.CharBuffer;
import java.util.HashMap;

import org.cyberneko.html.parsers.DOMParser;
import org.xml.sax.InputSource;

/**
 * Minimal demo crawler: downloads one HTML page and prints every anchor
 * ({@code <a>}) element's href/text pair, parsed leniently via NekoHTML.
 */
public class Crawler {

    public static void main(String[] args) {
        String url = "http://www.sina.com.cn";
        // sina.com.cn serves GBK-encoded HTML, hence the explicit charset
        getLinksByNeko(getPage(url, "gbk"));
    }

    /**
     * Fetches the resource at {@code url} and decodes it as text.
     *
     * @param url      absolute URL to fetch
     * @param encoding charset name used to decode the response body
     * @return the page contents, or a partial/empty string if an I/O error
     *         occurred (errors are logged, not propagated — best-effort,
     *         matching the original contract)
     */
    public static String getPage(String url, String encoding) {
        StringBuilder page = new StringBuilder();
        // try-with-resources: the reader (and underlying stream) is always
        // closed, and a failed openStream() no longer leaves a null reader
        // that the original code would dereference (NPE).
        try (BufferedReader in = new BufferedReader(
                new InputStreamReader(new URL(url).openStream(), encoding))) {
            CharBuffer buf = CharBuffer.allocate(20480);
            while (in.read(buf) != -1) {
                buf.flip();       // switch buffer to drain mode: position=0, limit=count
                page.append(buf); // appends chars from position to limit
                // BUG FIX: reset the buffer for the next read. The original
                // only flipped, so after the first chunk the limit stayed at
                // the first read's size and later iterations re-appended
                // stale characters, corrupting pages larger than one buffer.
                buf.clear();
            }
        } catch (IOException e) {
            // MalformedURLException is an IOException, so bad URLs land here too
            e.printStackTrace();
        }
        return page.toString();
    }

    /**
     * Parses {@code page} with NekoHTML and prints a map of
     * {@code href -> anchor text} for every {@code <a>} element.
     *
     * @param page raw HTML to parse
     */
    private static void getLinksByNeko(String page) {
        DOMParser parser = new DOMParser();
        HashMap<String, String> links = new HashMap<String, String>();
        try {
            // disable namespace processing so the bare "//A" XPath matches
            parser.setFeature("http://xml.org/sax/features/namespaces", false);
            parser.parse(new InputSource(new StringReader(page)));
            org.w3c.dom.Document doc = parser.getDocument();
            // NekoHTML upper-cases element names, hence "//A" not "//a"
            org.w3c.dom.NodeList anchors = org.apache.xpath.XPathAPI
                    .selectNodeList(doc, "//A");
            for (int i = 0; i < anchors.getLength(); i++) {
                org.w3c.dom.Node anchor = anchors.item(i);
                org.w3c.dom.Node href = anchor.getAttributes().getNamedItem("href");
                // BUG FIX: anchors without an href attribute (e.g.
                // <a name="...">) made the original NPE; skip them instead.
                if (href != null) {
                    links.put(href.getNodeValue(), anchor.getTextContent());
                }
            }
        } catch (Exception e) {
            // parse/XPath failures are logged; whatever was collected is printed
            e.printStackTrace();
        }
        System.out.println(links.toString());
        System.out.println(links.size());
    }
}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: