您的位置：首页 > 编程语言 > Java开发

JAVA过滤html标签

2013-12-27 17:41 363 查看

过滤URL网址，邮箱地址，html标签，JS代码，各种转义字符：

public static String killTags(String news) {

if(news==null){
return "";
}
String s = news.replaceAll("amp;", "");
if(s==null){
return "";
}
s =s.replaceAll("<", "<");
if(s==null){
return "";
}
s =s.replaceAll(">", ">");
if(s==null){
return "";
}

/*
* 过滤CSS样式
*/
Pattern pattern = Pattern.compile(
"<(span)?(\\s)*style.*?style>|<(span)?(\\s)*style=.*?>",
Pattern.DOTALL);
Matcher matcher = pattern.matcher(s);
String str = matcher.replaceAll("");

/*
* 过滤HTML标签
*/
Pattern pattern2 = Pattern.compile("(<[^>]+>)", Pattern.DOTALL);
Matcher matcher2 = pattern2.matcher(str);
String strhttp = matcher2.replaceAll(" ");

/*
* 过滤URL网址
*/

String regEx = "(((http|https|ftp)(\\s)*((\\:)|：))(\\s)*(//|／／)(\\s)*)?"
+ "([\\sa-zA-Z0-9(\\.|．)(\\s)*\\-]+((\\:)|(:)[\\sa-zA-Z0-9(\\.|．)&%\\$\\-]+)*@(\\s)*)?"
+ "("
+ "(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])"
+ "(\\.|．)(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)"
+ "(\\.|．)(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)"
+ "(\\.|．)(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])"
+ "|([\\sa-zA-Z0-9\\-]+(\\.|．)(\\s)*)*[\\sa-zA-Z0-9\\-]+(\\.|．)(\\s)*[\\sa-zA-Z]*"
+ ")"
+ "((\\s)*(\\:)|(：)(\\s)*[0-9]+)?"
+ "(/(\\s)*[^/][\\sa-zA-Z0-9\\.\\,\\?\\'\\\\/\\+&%\\$\\=~_\\-@]*)*";

Pattern p1 = Pattern.compile(regEx, Pattern.DOTALL);

String[] subs = strhttp.split(" ");
StringBuffer buf = new StringBuffer();
for(String strElement:subs){
Matcher matchhttp = p1.matcher(strElement);
String temp = matchhttp.replaceAll("");
buf.append(temp);
buf.append(" ");
}
String strnew = buf.toString().replaceAll("(if[\\s]*\\(|else|elseif[\\s]*\\().*?;", " ");

/*
* 过滤标点符号
*/
Pattern patterncomma = Pattern.compile("(&[^;]+;)", Pattern.DOTALL);
Matcher matchercomma = patterncomma.matcher(strnew);
String strout = matchercomma.replaceAll(" ").replaceAll("\\pP", " ");
return strout;
}

JAVA去除新闻来源和记者信息（需要用到ANSJ进行词性判断）：

// delete where news come from
public static String killFrom(String content) {
if (content == null) {
return "";
}
String answer = "";
String reg = "(^| )[\u4E00-\u9FA5]*(网|社|报)[\\S]*(报道|电|讯)";
answer = content.replaceAll(reg, "");

reg = "(^| )[\\S]*(通讯员|记者)[\\S]*";
Pattern pattern = Pattern.compile(reg);
Matcher matcher = pattern.matcher(answer);
int flag = 0;
String match = "";
int start = 0;
int end = 0;
while (matcher.find()) {
start = matcher.start();
end = matcher.end();
match = answer.substring(start, end);
List<Term> parse = ToAnalysis.parse(match);
for (Term t : parse) {
if ("nr".equals(t.getNatrue().natureStr)
|| "nrf".equals(t.getNatrue().natureStr)) {
flag++;
}
}
if (flag > 0) {
break;
}
}

String left = answer.substring(0, start);
String right = answer.substring(end, answer.length());
if (flag > 0 && (end - start <= 5 * flag + 5)) {
answer = left + " " + right;

}
return answer;
}

内容来自用户分享和网络整理，不保证内容的准确性，如有侵权内容，可联系管理员处理

标签：

相关文章推荐

新的分享

章节导航