
The Apriori algorithm

This algorithm mines association rules (e.g. "users who visited /news also visited /sports") and is mainly used in recommender systems.

The code below is what I used when analyzing web logs; the log records are read from a Lucene index. Its main job is to generate the frequent item sets.
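The listing imports model.Session, which the post does not include. A minimal sketch of what it might look like, assuming (from the setSes() and getList() calls below) it only carries a session id and the ordered list of requested URLs:

package model;

import java.util.ArrayList;
import java.util.List;

// Hypothetical reconstruction of model.Session: a generated session id
// plus the URLs requested during that session, in order.
public class Session {
    private String ses;                                   // session id (a UUID string)
    private List<String> list = new ArrayList<String>();  // visited URLs, in request order

    public String getSes() { return ses; }
    public void setSes(String ses) { this.ses = ses; }
    public List<String> getList() { return list; }
}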

package test;

import java.io.File;

import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.UUID;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.queryParser.MultiFieldQueryParser;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

import model.Session;

/*
* author zlx
* date 2015-9-15
* */

public class Apriori {
    private static String indexFilePath = "./indexFile/webLog";
    private static int minSupport = 2;

    public static void main(String[] args) throws Exception {
        Apriori apriori = new Apriori();
        List<Session> sessions = apriori.getSession();            // identify user sessions from the index
        List<List<String>> paths = apriori.maxForward(sessions);  // split sessions into maximal forward paths
        for (Set<String> itemSet : apriori.frequentItemSet(paths))
            System.out.println(itemSet);
    }
    // Maximal forward references: split each session's click path at backtracks.
    public List<List<String>> maxForward(List<Session> list) {
        List<List<String>> result = new ArrayList<List<String>>();
        for (Session s : list) {
            List<String> path = new ArrayList<String>();
            for (String url : s.getList()) {
                int pos = path.indexOf(url); // equals()-based lookup, not ==
                if (pos != -1) {
                    // Backtrack: the forward path so far is maximal, record it.
                    if (path.size() >= 2)
                        result.add(new ArrayList<String>(path));
                    // Truncate back to the revisited page and keep going.
                    path = new ArrayList<String>(path.subList(0, pos + 1));
                } else {
                    path.add(url);
                }
            }
            if (!path.isEmpty())
                result.add(path);
        }
        return result;
    }
    // Compute all frequent item sets, level by level.
    public List<Set<String>> frequentItemSet(List<List<String>> list) {
        List<Set<String>> urlSets = countUrl(list, minSupport); // frequent 1-item sets
        List<Set<String>> result = new ArrayList<Set<String>>();
        result.addAll(urlSets);
        while (urlSets.size() > 0) {
            urlSets = getNextFrequentItemSets(urlSets, list);
            result.addAll(urlSets);
        }
        return result;
    }
    // Initial frequent 1-item sets: count each URL and keep those meeting minsupport.
    public List<Set<String>> countUrl(List<List<String>> list, int minsupport) {
        Map<String, Long> map = new HashMap<String, Long>();
        List<Set<String>> urlSet = new ArrayList<Set<String>>();
        for (List<String> l : list) {
            for (String s : l) {
                if (map.get(s) == null)
                    map.put(s, 1L);
                else
                    map.put(s, map.get(s) + 1);
            }
        }
        for (Entry<String, Long> entry : map.entrySet()) {
            if (entry.getValue() >= minsupport) { // >=, consistent with countSupport()
                Set<String> set = new HashSet<String>();
                set.add(entry.getKey());
                urlSet.add(set);
            }
        }
        return urlSet;
    }
    // Self-join: merge every joinable pair of k-item sets into a (k+1)-item candidate.
    public List<Set<String>> getNextFrequentItemSets(List<Set<String>> list, List<List<String>> dic) {
        List<Set<String>> urlSets = new ArrayList<Set<String>>();
        for (int i = 0; i < list.size() - 1; i++) {
            for (int j = i + 1; j < list.size(); j++) {
                if (check(list.get(i), list.get(j))) {
                    // Build a fresh union set; mutating list.get(i) in place
                    // would corrupt the later joins in this same loop.
                    Set<String> union = new HashSet<String>(list.get(i));
                    union.addAll(list.get(j));
                    if (!urlSets.contains(union))
                        urlSets.add(union);
                }
            }
        }
        if (urlSets.size() > 0)
            cutFrequentSets(urlSets, list);
        if (urlSets.size() > 0)
            countSupport(urlSets, dic);
        return urlSets;
    }
    // Prune: a candidate survives only if every k-item subset is itself frequent.
    public void cutFrequentSets(List<Set<String>> l1, List<Set<String>> l2) {
        Iterator<Set<String>> it = l1.iterator();
        while (it.hasNext()) {
            Set<String> s = it.next();
            for (String str : s) {
                // Subset obtained by dropping one element, tested against level k.
                Set<String> subset = new HashSet<String>(s);
                subset.remove(str);
                if (!l2.contains(subset)) {
                    it.remove();
                    break;
                }
            }
        }
    }
    // Keep only candidates whose support in the transaction list meets minSupport.
    public void countSupport(List<Set<String>> l1, List<List<String>> l2) {
        Iterator<Set<String>> it = l1.iterator();
        while (it.hasNext()) {
            Set<String> set = it.next();
            int count = 0;
            for (List<String> list : l2) {
                if (list.containsAll(set))
                    count++;
            }
            if (count < minSupport)
                it.remove();
        }
    }
    // Two k-item sets are joinable when they share exactly k-1 elements.
    public boolean check(Set<String> set1, Set<String> set2) {
        int shared = 0;
        for (String s : set1) {
            if (set2.contains(s))
                shared++;
        }
        return shared == set1.size() - 1;
    }
    // Session identification: group requests by IP, starting a new session
    // after 30 minutes of inactivity.
    public List<Session> getSession() throws CorruptIndexException, IOException, ParseException, java.text.ParseException {
        List<Session> list = new ArrayList<Session>();
        IndexSearcher searcher = new IndexSearcher(FSDirectory.open(new File(indexFilePath)));
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_33);
        String[] queryString = { "tomcat" };
        String[] fields = { "type" };
        BooleanClause.Occur[] flags = { BooleanClause.Occur.MUST };
        Query query = MultiFieldQueryParser.parse(Version.LUCENE_33, queryString, fields, flags, analyzer);
        // Sort ascending by time so gaps between consecutive requests are meaningful.
        Sort sort = new Sort(new SortField("DateForSort", SortField.LONG, false));
        TopDocs result = searcher.search(query, 10000, sort);
        ScoreDoc[] hit = result.scoreDocs;
        Map<String, Long> lastSeen = new HashMap<String, Long>();      // ip -> time of last request
        Map<String, Session> current = new HashMap<String, Session>(); // ip -> currently open session
        for (int i = 0; i < hit.length; i++) {
            Document doc = searcher.doc(hit[i].doc);
            String ip = doc.get("ip");
            String str = doc.get("date");
            String url = doc.get("requset"); // field name as written at index time
            Date date = new SimpleDateFormat("dd/MMM/yyyy:HH:mm:ss", Locale.US).parse(str);
            Long pretime = lastSeen.get(ip);
            lastSeen.put(ip, date.getTime()); // always refresh the last-seen time
            if (pretime == null || (date.getTime() - pretime) / (1000 * 60) > 30) {
                // First request from this IP, or a gap over 30 minutes: open a new session.
                Session session = new Session();
                session.setSes(UUID.randomUUID().toString());
                session.getList().add(url);
                list.add(session);
                current.put(ip, session);
            } else {
                current.get(ip).getList().add(url);
            }
        }
        return list;
    }
}
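To sanity-check the mining code without a Lucene index, frequentItemSet() can be fed hand-built paths in place of maxForward() output. A small sketch, with three made-up transactions for illustration:

package test;

import java.util.Arrays;
import java.util.List;
import java.util.Set;

public class AprioriDemo {
    public static void main(String[] args) {
        // Three fake "maximal forward paths" standing in for maxForward() output.
        List<List<String>> paths = Arrays.asList(
                Arrays.asList("/index", "/news", "/sports"),
                Arrays.asList("/index", "/news", "/finance"),
                Arrays.asList("/index", "/news", "/sports"));
        Apriori apriori = new Apriori();
        for (Set<String> itemSet : apriori.frequentItemSet(paths))
            System.out.println(itemSet);
    }
}

With minSupport=2 this prints the frequent sets level by level: the singletons /index, /news, /sports, the three pairs they form, and finally {/index, /news, /sports}, which occurs in two of the three paths.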