JAVA过滤html标签
2013-12-27 17:41
363 查看
过滤URL网址,邮箱地址,html标签,JS代码,各种转义字符:
/**
 * Strips markup and noise from a piece of news text: CSS style spans, HTML
 * tags, URL/e-mail-like tokens, {@code if}/{@code else} script fragments,
 * character entities and punctuation.
 *
 * <p>The text is first un-escaped ({@code amp;} is dropped, then
 * {@code &lt;}/{@code &gt;} become angle brackets) so that entity-escaped and
 * double-escaped tags are removed like ordinary tags.
 *
 * @param news raw text, may be {@code null}
 * @return the filtered text; {@code ""} when {@code news} is {@code null}.
 *         Every space-separated token of the intermediate text is re-emitted
 *         followed by a single space, so a non-null input always yields a
 *         result ending in a space.
 */
public static String killTags(String news) {
    if (news == null) {
        return "";
    }
    // Collapse "&amp;lt;" to "&lt;" first, then decode the angle-bracket
    // entities so escaped markup is caught by the tag filters below.
    String s = news.replaceAll("amp;", "")
            .replace("&lt;", "<")
            .replace("&gt;", ">");

    // Strip CSS styles: <style ...>...style> blocks and tags opening with style=.
    Pattern stylePattern = Pattern.compile(
            "<(span)?(\\s)*style.*?style>|<(span)?(\\s)*style=.*?>",
            Pattern.DOTALL);
    String withoutStyles = stylePattern.matcher(s).replaceAll("");

    // Strip the remaining HTML tags, leaving a space where each tag was.
    Pattern tagPattern = Pattern.compile("(<[^>]+>)", Pattern.DOTALL);
    String withoutTags = tagPattern.matcher(withoutStyles).replaceAll(" ");

    // Strip URLs (optional scheme, optional user@, IPv4 or host name,
    // optional port, optional path).
    // NOTE(review): "(\\.|.)" under DOTALL matches ANY character, so this
    // expression also deletes most runs of two or more ASCII letters/digits.
    // The second alternative was possibly a full-width character lost in
    // transcription - confirm before tightening.
    String regEx = "(((http|https|ftp)(\\s)*((\\:)|:))(\\s)*(//|//)(\\s)*)?"
            + "([\\sa-zA-Z0-9(\\.|.)(\\s)*\\-]+((\\:)|(:)[\\sa-zA-Z0-9(\\.|.)&%\\$\\-]+)*@(\\s)*)?"
            + "("
            + "(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])"
            + "(\\.|.)(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)"
            + "(\\.|.)(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)"
            + "(\\.|.)(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])"
            + "|([\\sa-zA-Z0-9\\-]+(\\.|.)(\\s)*)*[\\sa-zA-Z0-9\\-]+(\\.|.)(\\s)*[\\sa-zA-Z]*"
            + ")"
            + "((\\s)*(\\:)|(:)(\\s)*[0-9]+)?"
            + "(/(\\s)*[^/][\\sa-zA-Z0-9\\.\\,\\?\\'\\\\/\\+&%\\$\\=~_\\-@]*)*";
    Pattern urlPattern = Pattern.compile(regEx, Pattern.DOTALL);

    // The URL filter is applied token by token so a match cannot span a space.
    StringBuilder buf = new StringBuilder();
    for (String token : withoutTags.split(" ")) {
        buf.append(urlPattern.matcher(token).replaceAll(""));
        buf.append(" ");
    }

    // Drop script-like fragments running from "if(" / "else" to the next ';'.
    String withoutScript = buf.toString()
            .replaceAll("(if[\\s]*\\(|else|elseif[\\s]*\\().*?;", " ");

    // Strip leftover character entities (&...;) and then all punctuation.
    Pattern entityPattern = Pattern.compile("(&[^;]+;)", Pattern.DOTALL);
    return entityPattern.matcher(withoutScript)
            .replaceAll(" ")
            .replaceAll("\\pP", " ");
}
JAVA过滤html标签(与上文 killTags 方法相同的重复代码;去除新闻来源和记者信息的 killFrom 方法见后文):
/**
 * Strips markup and noise from a piece of news text: CSS style spans, HTML
 * tags, URL/e-mail-like tokens, {@code if}/{@code else} script fragments,
 * character entities and punctuation.
 *
 * <p>The text is first un-escaped ({@code amp;} is dropped, then
 * {@code &lt;}/{@code &gt;} become angle brackets) so that entity-escaped and
 * double-escaped tags are removed like ordinary tags.
 *
 * @param news raw text, may be {@code null}
 * @return the filtered text; {@code ""} when {@code news} is {@code null}.
 *         Every space-separated token of the intermediate text is re-emitted
 *         followed by a single space, so a non-null input always yields a
 *         result ending in a space.
 */
public static String killTags(String news) {
    if (news == null) {
        return "";
    }
    // Collapse "&amp;lt;" to "&lt;" first, then decode the angle-bracket
    // entities so escaped markup is caught by the tag filters below.
    String s = news.replaceAll("amp;", "")
            .replace("&lt;", "<")
            .replace("&gt;", ">");

    // Strip CSS styles: <style ...>...style> blocks and tags opening with style=.
    Pattern stylePattern = Pattern.compile(
            "<(span)?(\\s)*style.*?style>|<(span)?(\\s)*style=.*?>",
            Pattern.DOTALL);
    String withoutStyles = stylePattern.matcher(s).replaceAll("");

    // Strip the remaining HTML tags, leaving a space where each tag was.
    Pattern tagPattern = Pattern.compile("(<[^>]+>)", Pattern.DOTALL);
    String withoutTags = tagPattern.matcher(withoutStyles).replaceAll(" ");

    // Strip URLs (optional scheme, optional user@, IPv4 or host name,
    // optional port, optional path).
    // NOTE(review): "(\\.|.)" under DOTALL matches ANY character, so this
    // expression also deletes most runs of two or more ASCII letters/digits.
    // The second alternative was possibly a full-width character lost in
    // transcription - confirm before tightening.
    String regEx = "(((http|https|ftp)(\\s)*((\\:)|:))(\\s)*(//|//)(\\s)*)?"
            + "([\\sa-zA-Z0-9(\\.|.)(\\s)*\\-]+((\\:)|(:)[\\sa-zA-Z0-9(\\.|.)&%\\$\\-]+)*@(\\s)*)?"
            + "("
            + "(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])"
            + "(\\.|.)(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)"
            + "(\\.|.)(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)"
            + "(\\.|.)(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])"
            + "|([\\sa-zA-Z0-9\\-]+(\\.|.)(\\s)*)*[\\sa-zA-Z0-9\\-]+(\\.|.)(\\s)*[\\sa-zA-Z]*"
            + ")"
            + "((\\s)*(\\:)|(:)(\\s)*[0-9]+)?"
            + "(/(\\s)*[^/][\\sa-zA-Z0-9\\.\\,\\?\\'\\\\/\\+&%\\$\\=~_\\-@]*)*";
    Pattern urlPattern = Pattern.compile(regEx, Pattern.DOTALL);

    // The URL filter is applied token by token so a match cannot span a space.
    StringBuilder buf = new StringBuilder();
    for (String token : withoutTags.split(" ")) {
        buf.append(urlPattern.matcher(token).replaceAll(""));
        buf.append(" ");
    }

    // Drop script-like fragments running from "if(" / "else" to the next ';'.
    String withoutScript = buf.toString()
            .replaceAll("(if[\\s]*\\(|else|elseif[\\s]*\\().*?;", " ");

    // Strip leftover character entities (&...;) and then all punctuation.
    Pattern entityPattern = Pattern.compile("(&[^;]+;)", Pattern.DOTALL);
    return entityPattern.matcher(withoutScript)
            .replaceAll(" ")
            .replaceAll("\\pP", " ");
}
JAVA去除新闻来源和记者信息(需要用到ANSJ进行词性判断):
// delete where news come from public static String killFrom(String content) { if (content == null) { return ""; } String answer = ""; String reg = "(^| )[\u4E00-\u9FA5]*(网|社|报)[\\S]*(报道|电|讯)"; answer = content.replaceAll(reg, ""); reg = "(^| )[\\S]*(通讯员|记者)[\\S]*"; Pattern pattern = Pattern.compile(reg); Matcher matcher = pattern.matcher(answer); int flag = 0; String match = ""; int start = 0; int end = 0; while (matcher.find()) { start = matcher.start(); end = matcher.end(); match = answer.substring(start, end); List<Term> parse = ToAnalysis.parse(match); for (Term t : parse) { if ("nr".equals(t.getNatrue().natureStr) || "nrf".equals(t.getNatrue().natureStr)) { flag++; } } if (flag > 0) { break; } } String left = answer.substring(0, start); String right = answer.substring(end, answer.length()); if (flag > 0 && (end - start <= 5 * flag + 5)) { answer = left + " " + right; } return answer; }
相关文章推荐
- Java过滤html标签
- Java过滤HTML标签工具类
- java正则表达式 过滤特殊字符的正则表达式
- 过滤html标签(java)
- [Java] 使用htmlparser在爬虫时过滤网页
- Java解决在浏览器地址栏中输入url访问action的问题以及拦截方法过滤的简易实现
- Java过滤Unicode
- java String 中 startsWith() 对字符串开头进行过滤 endsWith()对字符串结尾进行过滤
- Java实现DFA算法对敏感词、广告词过滤功能
- Java实现敏感词过滤
- Java实现敏感词过滤
- transform实现广告计费日志实时黑名单过滤(Java版本)
- java过滤文章中的敏感词的小例题
- 好记性不如烂笔头31-java应用中的敏感词过滤实现(3)
- java中利用正则表达式过滤特殊字符
- java把过滤掉的 % 号拿回来
- java DFA 敏感词过滤
- JavaWeb-过滤器高级案例、URL过滤系统
- Java Map 通过 key 或者 value 过滤的实例代码
- java 反射提取类信息, 动态代理 和过滤某些方法演示