您的位置:首页 > 其它

利用AC自动机进行关键字的提取和过滤

2015-02-01 15:50 676 查看


昨天看了meituan.com的AC算法在美团上单系统的应用一文,深受启发,原来ACM算法在工程中也能有这样赤裸裸的运用~~~ 于是便复习了AC自动机,并把代码用java重新搞了一遍~~

AC自动机整体的结构大概是长这样的,其实就是在trie树上做KMP:



AC自动机里面比较难理解的应该是它的失配指针的计算过程。
这个计算过程从本质上讲就是进行一遍广搜,与此同时维护
fail指针,每一步的维护过程可用下图表示。



Keyword.java

package com.AC.domain;

import java.io.*;
import java.util.*;
import java.math.*;

public class Keyword implements Serializable{

/**
*
*/

private Integer id;
private Map<Integer, Integer> categoryTypeMap;
private String word;
private List<Integer> categories;

private static final long serialVersionUID = 1L;

public Keyword(){
id = null;
categories=null;
categoryTypeMap=null;
word=null;
}

public Keyword(String key){
id = null;
categories=null;
categoryTypeMap=null;
word=key;
}

public Keyword(Keyword p){
this.categories=p.categories;
this.categoryTypeMap=p.categoryTypeMap;
this.id=p.id;
this.word=p.word;
}

@Override
public boolean equals(Object o) {
// TODO Auto-generated method stub

if (this == o) return true;
if(o==null||getClass()!=o.getClass()) return false;

Keyword keyword = (Keyword) o;

if(id!=null?!id.equals(keyword.id):keyword.id!=null)
return false;

return true;
}
@Override
public int hashCode() {
// TODO Auto-generated method stub
return id != null ?id.hashCode():0;
}
public Integer getId() {
return id;
}
public void setId(Integer id) {
this.id = id;
}
public Map<Integer, Integer> getCategoryTypeMap() {
return categoryTypeMap;
}
public void setCategoryTypeMap(Map<Integer, Integer> categoryTypeMap) {
this.categoryTypeMap = categoryTypeMap;
}
public String getWord() {
return word;
}
public void setWord(String word) {
this.word = word;
}
public List<Integer> getCategories() {
return categories;
}
public void setCategories(List<Integer> categories) {
this.categories = categories;
}

}


Node.java

package com.AC.domain;

import java.util.ArrayList;
import java.util.List;

/**
 * One node of the keyword trie used by the matcher. Fields are public and are
 * read and written directly by the automaton that owns the nodes.
 */
public class Node {
    /** 0 for a node built with the no-arg constructor, 1 for a node built with a character. */
    public Integer state;
    /** Character that leads to this node from its parent. */
    public char character = 0;
    /** Node to fall back to when no child matches the next input character. */
    public Node failureNode;
    /** Keywords attached to this node. */
    public List<Keyword> keywords = new ArrayList<Keyword>();
    /** Child nodes, in insertion order. */
    public List<Node> childrenList = new ArrayList<Node>();

    /** Creates a node with state 0, no character and no failure link. */
    public Node() {
        this.state = 0;
        this.failureNode = null;
        this.character = 0;
    }

    /**
     * Creates a node with state 1.
     *
     * @param c    character that leads to this node
     * @param node initial failure link
     */
    public Node(char c, Node node) {
        this.state = 1;
        this.character = c;
        this.failureNode = node;
    }

    /**
     * @param c character to look for
     * @return {@code true} when a direct child is labelled with {@code c}
     */
    public Boolean containsChild(char c) {
        return getChild(c) != null;
    }

    /**
     * @param c character to look for
     * @return the first direct child labelled with {@code c}, or {@code null} if there is none
     */
    public Node getChild(char c) {
        for (int i = 0; i < childrenList.size(); i++) {
            Node candidate = childrenList.get(i);
            if (candidate.character == c) {
                return candidate;
            }
        }
        return null;
    }

    /** Appends a single keyword to this node. */
    public void addKeyword(Keyword keyword) {
        keywords.add(keyword);
    }

    /** Appends every keyword of {@code k} to this node. */
    public void addKeywords(List<Keyword> k) {
        keywords.addAll(k);
    }

    /** Appends {@code child} to this node's children. */
    public void addChild(Node child) {
        childrenList.add(child);
    }
}


Patterns.java
package com.AC.domain;

import java.util.*;
import java.io.*;
import java.math.*;

/**
 * Aho-Corasick automaton built from a fixed list of keywords.
 *
 * <p>The constructor inserts every keyword into a trie and then computes the
 * failure links with a breadth-first traversal; {@link #searchKeyword} scans a
 * text once and reports every keyword occurrence found in it.
 */
public class Patterns {
    private final Node root = new Node();

    /** Every node of the trie in creation order, root first. */
    private final List<Node> tree = new ArrayList<Node>();

    /**
     * Builds the automaton.
     *
     * @param keywords keywords to index; a {@code null} list is treated as
     *                 empty, and entries that are {@code null} or have a
     *                 {@code null}/empty word are skipped
     */
    public Patterns(List<Keyword> keywords) {
        // The root falls back to itself so a failure walk never meets null.
        root.failureNode = root;
        tree.add(root);
        if (keywords != null) {
            for (Keyword keyword : keywords) {
                addKeyword(keyword);
            }
        }
        setFailNode();
    }

    /**
     * Computes the failure link of every node below depth 1 and merges the
     * keywords of the failure target into the node.
     */
    private void setFailNode() {
        // Depth-1 nodes already point at the root (set when they were created),
        // so they only seed the breadth-first queue.
        Queue<Node> queue = new ArrayDeque<Node>(root.childrenList);
        while (!queue.isEmpty()) {
            Node parent = queue.poll();
            for (Node child : parent.childrenList) {
                queue.offer(child);

                // Follow the parent's failure chain until some node has a
                // child with the same character, or the root has been tried.
                Node fallback = parent.failureNode;
                Node target = fallback.getChild(child.character);
                while (target == null && fallback != root) {
                    fallback = fallback.failureNode;
                    target = fallback.getChild(child.character);
                }

                if (target != null) {
                    child.failureNode = target;
                    // The target is shallower and was finished earlier in the
                    // traversal, so its keyword list is already complete.
                    child.addKeywords(target.keywords);
                }
                // Otherwise the link stays on the root, as set at creation.
            }
        }
    }

    /**
     * Inserts one keyword into the trie, creating nodes as needed.
     *
     * @param keyword keyword to insert; ignored when it or its word is
     *                {@code null}, or when the word is empty
     */
    private void addKeyword(Keyword keyword) {
        if (keyword == null) {
            return;
        }
        String word = keyword.getWord();
        if (word == null || word.isEmpty()) {
            return;
        }

        Node current = root;
        for (int i = 0; i < word.length(); i++) {
            char currentChar = word.charAt(i);
            Node next = current.getChild(currentChar);
            if (next == null) {
                next = new Node(currentChar, root);
                current.addChild(next);
                tree.add(next);
            }
            current = next;
        }
        current.addKeyword(keyword);
    }

    /**
     * Finds every keyword occurrence in {@code data}.
     *
     * @param data     text to scan; {@code null} or empty yields an empty result
     * @param category when {@code null}, every occurrence is reported as a new
     *                 {@link Keyword} carrying only the matched word; otherwise
     *                 only keywords whose category list contains this value are
     *                 reported, and the indexed {@link Keyword} instance itself
     *                 is returned
     * @return matches in the order they are found; a keyword appears once per
     *         occurrence; never {@code null}
     */
    public List<Keyword> searchKeyword(String data, Integer category) {
        List<Keyword> matchResult = new ArrayList<Keyword>();
        if (data == null || data.isEmpty()) {
            return matchResult;
        }

        Node node = root;
        for (int i = 0; i < data.length(); i++) {
            char ch = data.charAt(i);

            // Fall back along failure links until a transition exists or the
            // root itself has been tried.
            Node next = node.getChild(ch);
            while (next == null && node != root) {
                node = node.failureNode;
                next = node.getChild(ch);
            }
            if (next == null) {
                // No keyword can continue with this character; stay at root.
                continue;
            }

            node = next;
            for (Keyword pattern : node.keywords) {
                if (category == null) {
                    matchResult.add(new Keyword(pattern.getWord()));
                } else {
                    // A keyword without category data matches no category.
                    List<Integer> categories = pattern.getCategories();
                    if (categories != null && categories.contains(category)) {
                        matchResult.add(pattern);
                    }
                }
            }
        }
        return matchResult;
    }
}


Test.java

package com.AC.domain;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

/** Small console demo: indexes a handful of keywords and prints the ones found in a sample text. */
public class Test {
    public static void main(String[] args) {
        // Sample dictionary; several entries overlap on purpose (shared prefixes and suffixes).
        String[] words = {"abcd", "abc", "abe", "ae", "bc", "be", "bce", "cm"};

        List<Keyword> keywords = new ArrayList<Keyword>();
        for (String word : words) {
            Keyword keyword = new Keyword();
            keyword.setWord(word);
            keywords.add(keyword);
        }

        Patterns patterns = new Patterns(keywords);
        // A null category means no category filtering is applied.
        List<Keyword> result = patterns.searchKeyword("kcabcmgha", null);

        System.out.println("keys: ");
        for (Keyword key : result) {
            System.out.println(key.getWord());
        }
    }
}


附美团文章链接:http://tech.meituan.com/ac.html
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: