您的位置:首页 > 其它

DFA算法过滤敏感词,替换为*

2012-08-14 20:11 316 查看
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.List;
import java.util.Properties;

/**
 * Sensitive-word filter built on a byte-level keyword trie (DFA).
 *
 * <p>Keywords are encoded with the configured charset and inserted byte by byte
 * into a tree of {@link TreeNode}s. Searching walks the encoded text through the
 * tree and overwrites every byte of a matched keyword with {@code '*'}.
 *
 * <p>A search keeps no state between calls; all scan state lives in local variables.
 */
public class test {
    /** Byte written over every byte of a matched keyword (ASCII '*', value 42). */
    private static final byte MASK = (byte) '*';

    /** Root node of the keyword tree. */
    private TreeNode rootNode = new TreeNode();

    /** Charset name used to encode keywords and text and to decode the masked result. */
    private String charset = "utf-8";

    /**
     * Adds every keyword in the list to the keyword tree.
     *
     * <p>{@code null} entries are skipped. Each keyword is trimmed before it is
     * encoded; a keyword that is empty after trimming adds nothing to the tree.
     *
     * @param keywordList keywords to add; a {@code null} list is treated as empty
     * @throws UnsupportedEncodingException if the configured charset is not supported
     */
    public void createKeywordTree(List<String> keywordList) throws UnsupportedEncodingException {
        if (keywordList == null) {
            return;
        }
        for (String keyword : keywordList) {
            if (keyword == null) {
                continue;
            }
            byte[] bytes = keyword.trim().getBytes(charset);
            if (bytes.length == 0) {
                // Never mark the root itself as a keyword end.
                continue;
            }
            TreeNode current = rootNode;
            for (byte b : bytes) {
                int index = b & 0xff; // unsigned byte value selects the child slot
                TreeNode child = current.getSubNode(index);
                if (child == null) {
                    child = new TreeNode();
                    current.setSubNode(index, child);
                }
                current = child;
            }
            current.setKeywordEnd(true);
        }
    }

    /**
     * Masks every keyword occurrence found in the given text.
     *
     * @param text text to filter; {@code null} yields an empty string, matching
     *     the behavior of {@link #searchKeyword(byte[])} for {@code null} input
     * @return the text with each byte of every matched keyword replaced by {@code '*'}
     * @throws UnsupportedEncodingException if the configured charset is not supported
     */
    public String searchKeyword(String text) throws UnsupportedEncodingException {
        if (text == null) {
            return "";
        }
        return searchKeyword(text.getBytes(charset));
    }

    /**
     * Masks every keyword occurrence found in the given encoded text.
     *
     * <p>The array is modified in place: each byte belonging to a match is
     * overwritten with {@code '*'}, so a multi-byte character inside a keyword
     * produces one {@code '*'} per encoded byte. The array is then decoded with
     * the configured charset.
     *
     * @param bytes text encoded with the configured charset; modified in place
     * @return the masked text, or an empty string when {@code bytes} is
     *     {@code null} or empty
     * @throws IllegalStateException if the configured charset is not supported
     */
    public String searchKeyword(byte[] bytes) {
        if (bytes == null || bytes.length == 0) {
            return "";
        }
        TreeNode current = rootNode;
        // Number of bytes to step back when the current path fails to match.
        int rollback = 0;
        int position = 0;
        while (position < bytes.length) {
            current = current.getSubNode(bytes[position] & 0xFF);
            if (current == null) {
                // Dead end: rewind so that, after the increment below, scanning
                // resumes from the root at the first byte not yet retried.
                position -= rollback;
                rollback = 0;
                current = rootNode;
            } else if (current.isKeywordEnd()) {
                // Mask the bytes walked since the last reset, including this one.
                for (int i = 0; i <= rollback; i++) {
                    bytes[position - i] = MASK;
                }
                // Stay on this node so a longer keyword can still extend the match;
                // rollback = 1 makes a following mismatch re-examine that byte
                // from the root instead of skipping it.
                rollback = 1;
            } else {
                rollback++;
            }
            position++;
        }
        try {
            // Decode with the same charset that was used for encoding.
            return new String(bytes, charset);
        } catch (UnsupportedEncodingException e) {
            throw new IllegalStateException("Unsupported charset: " + charset, e);
        }
    }

    /**
     * Sets the charset name used for encoding keywords and text and for decoding
     * results. Keywords already in the tree keep the bytes they were encoded
     * with, so this should be called before {@link #createKeywordTree(List)}.
     *
     * @param charset charset name, e.g. {@code "utf-8"}
     */
    public void setCharset(String charset) {
        this.charset = charset;
    }
}


import java.util.ArrayList;
import java.util.List;

/**
 * One node of the keyword tree: a fixed table of 256 child slots plus a flag
 * marking whether a keyword ends at this node.
 */
public class TreeNode {
    /** Number of child slots held by every node. */
    private static final int NODE_LEN = 256;

    /** Child nodes by slot index; a {@code null} entry means no child in that slot. */
    private final List<TreeNode> children;

    /** {@code true} when a keyword ends at this node, {@code false} otherwise. */
    private boolean terminal;

    /** Creates a node with all {@value #NODE_LEN} child slots empty and the end flag cleared. */
    public TreeNode() {
        children = new ArrayList<TreeNode>(NODE_LEN);
        while (children.size() < NODE_LEN) {
            children.add(null);
        }
        terminal = false;
    }

    /**
     * Reports whether a keyword ends at this node.
     *
     * @return {@code true} if this node terminates a keyword
     */
    public boolean isKeywordEnd() {
        return terminal;
    }

    /**
     * Marks or unmarks this node as the end of a keyword.
     *
     * @param end {@code true} if a keyword ends here
     */
    public void setKeywordEnd(boolean end) {
        terminal = end;
    }

    /**
     * Returns the child stored in the given slot.
     *
     * @param index slot index, from 0 to 255
     * @return the child node, or {@code null} if the slot is empty
     */
    public TreeNode getSubNode(int index) {
        return children.get(index);
    }

    /**
     * Stores a child node in the given slot, replacing whatever was there.
     *
     * @param index slot index, from 0 to 255
     * @param node child node to store
     */
    public void setSubNode(int index, TreeNode node) {
        children.set(index, node);
    }
}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: