apriori算法
2015-09-16 23:08
405 查看
这个算法用于挖掘关联规则,主要用于推荐系统。
下面这个代码是我分析web日志的时候用的,日志从索引里读。主要功能是产生频繁项集。
package test;

import java.io.File;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.UUID;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.queryParser.MultiFieldQueryParser;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

import model.Session;

/*
 * Apriori frequent-itemset mining over web-access logs read from a Lucene index.
 * Pipeline: getSession() groups index hits into user sessions, maxForward()
 * converts each session into maximal forward reference paths (transactions),
 * frequentItemSet() runs level-wise Apriori over those transactions.
 *
 * author zlx
 * date 2015-9-15
 */
public class Apriori {

    private static String indexFilePath = "./indexFile/webLog";
    private static int minSupport = 2;

    public static void main(String[] args) {}

    /**
     * Maximal-forward-reference extraction: walks each session's URL list and,
     * whenever a URL already on the current path is revisited (a backward
     * reference), emits the path accumulated so far and backtracks to the
     * revisited URL. The trailing path of each session is also emitted.
     *
     * @param list user sessions (each holds an ordered URL list)
     * @return one URL path per maximal forward reference
     */
    public List<List<String>> maxForward(List<Session> list) {
        List<List<String>> result = new ArrayList<List<String>>();
        for (Session s : list) {
            List<String> path = new ArrayList<String>();
            for (String url : s.getList()) {
                // BUG FIX: original compared with '==' (object identity), so
                // backward references were essentially never detected.
                int flag = path.indexOf(url);
                if (flag != -1 && path.size() >= 2) {
                    // Emit the maximal forward path, then backtrack to the
                    // revisited node (keep indexes 0..flag inclusive).
                    // BUG FIX: the original shrank its fixed 20-slot array to
                    // length flag+1, so the next append overflowed it; a List
                    // also removes the hard 20-URL cap per path.
                    result.add(new ArrayList<String>(path));
                    path = new ArrayList<String>(path.subList(0, flag + 1));
                } else {
                    path.add(url);
                }
            }
            if (path.size() >= 1) {
                result.add(path);
            }
        }
        return result;
    }

    /**
     * Level-wise Apriori: start from the frequent 1-itemsets and repeatedly
     * self-join/prune/count until no larger frequent itemset exists.
     *
     * @param list transactions (URL paths) to mine
     * @return all frequent itemsets of every size, in discovery order
     */
    public List<Set<String>> frequentItemSet(List<List<String>> list) {
        // minSupport is used instead of the original hard-coded literal 2
        // (same value, kept in one place).
        List<Set<String>> current = countUrl(list, minSupport);
        List<Set<String>> result = new ArrayList<Set<String>>(current);
        while (current.size() > 0) {
            current = getNextFrequentItemSets(current, list);
            result.addAll(current);
        }
        return result;
    }

    /**
     * Builds the initial frequent 1-itemsets: counts every URL's occurrences
     * across all transactions and keeps those above the support threshold.
     *
     * NOTE(review): this keeps items with count STRICTLY greater than
     * minsupport, while countSupport() at later levels keeps count >= minSupport.
     * The original code had the same asymmetry; confirm which is intended.
     *
     * @param list       transactions to count over
     * @param minsupport support threshold
     * @return one singleton Set per sufficiently frequent URL
     */
    public List<Set<String>> countUrl(List<List<String>> list, int minsupport) {
        Map<String, Long> map = new HashMap<String, Long>();
        for (List<String> l : list) {
            for (String s : l) {
                Long c = map.get(s);
                map.put(s, c == null ? Long.valueOf(1) : c + 1);
            }
        }
        List<Set<String>> urlSet = new ArrayList<Set<String>>();
        for (Entry<String, Long> entry : map.entrySet()) {
            if (entry.getValue() > minsupport) {
                Set<String> set = new HashSet<String>();
                set.add(entry.getKey());
                urlSet.add(set);
            }
        }
        return urlSet;
    }

    /**
     * Self-join step: unions every joinable pair of level-k itemsets into
     * (k+1)-candidates, then prunes and support-counts them.
     *
     * @param list frequent itemsets of the current level (not modified)
     * @param dic  the transactions, for support counting
     * @return the frequent itemsets of the next level (possibly empty)
     */
    public List<Set<String>> getNextFrequentItemSets(List<Set<String>> list, List<List<String>> dic) {
        List<Set<String>> urlSets = new ArrayList<Set<String>>();
        for (int i = 0; i < list.size() - 1; i++) {
            for (int j = i + 1; j < list.size(); j++) {
                if (check(list.get(i), list.get(j))) {
                    // BUG FIX: the original did list.get(i).addAll(list.get(j)),
                    // mutating the previous level in place and corrupting every
                    // later join; build the union in a fresh set instead.
                    Set<String> union = new HashSet<String>(list.get(i));
                    union.addAll(list.get(j));
                    if (!urlSets.contains(union)) { // joins can repeat a candidate
                        urlSets.add(union);
                    }
                }
            }
        }
        if (urlSets.size() > 0) {
            cutFrequentSets(urlSets, list);
        }
        if (urlSets.size() > 0) {
            countSupport(urlSets, dic);
        }
        return urlSets;
    }

    /**
     * Apriori prune step: drop any candidate that has a (k-1)-subset which is
     * not among the previous level's frequent itemsets.
     *
     * BUG FIX: the original mutated each set while iterating it
     * (ConcurrentModificationException) and called l1.remove(s.add(str)),
     * which tries to remove a Boolean from the list — a no-op.
     *
     * @param l1 candidate itemsets (pruned in place)
     * @param l2 the previous level's frequent itemsets
     */
    public void cutFrequentSets(List<Set<String>> l1, List<Set<String>> l2) {
        Iterator<Set<String>> it = l1.iterator();
        while (it.hasNext()) {
            Set<String> cand = it.next();
            for (String str : cand) {
                Set<String> subset = new HashSet<String>(cand);
                subset.remove(str);
                if (!l2.contains(subset)) {
                    it.remove();
                    break;
                }
            }
        }
    }

    /**
     * Support-count step: removes every candidate contained in fewer than
     * minSupport transactions.
     *
     * BUG FIX: the original removed from l1 inside a for-each over l1, which
     * throws ConcurrentModificationException; use Iterator.remove instead.
     *
     * @param l1 candidate itemsets (filtered in place)
     * @param l2 the transactions
     */
    public void countSupport(List<Set<String>> l1, List<List<String>> l2) {
        Iterator<Set<String>> it = l1.iterator();
        while (it.hasNext()) {
            Set<String> set = it.next();
            int count = 0;
            for (List<String> transaction : l2) {
                if (transaction.containsAll(set)) {
                    count++;
                }
            }
            if (count < minSupport) {
                it.remove();
            }
        }
    }

    /**
     * Join test: two k-itemsets are joinable when they share exactly k-1
     * elements (so their union has size k+1).
     *
     * @return true when set1 and set2 overlap in all but one element of set1
     */
    public boolean check(Set<String> set1, Set<String> set2) {
        int shared = 0;
        for (String s : set1) {
            if (set2.contains(s)) {
                shared++;
            }
        }
        return shared == set1.size() - 1;
    }

    /**
     * Session identification: reads tomcat log entries from the Lucene index in
     * timestamp order and groups hits from the same IP into a Session, starting
     * a new Session after more than 30 minutes of inactivity.
     *
     * @return the identified sessions, in creation order
     * @throws IOException / CorruptIndexException on index access failure
     * @throws ParseException on a malformed query
     * @throws java.text.ParseException on a malformed log date
     */
    public List<Session> getSession() throws CorruptIndexException, IOException,
            ParseException, java.text.ParseException {
        List<Session> list = new ArrayList<Session>();
        IndexSearcher searcher = new IndexSearcher(FSDirectory.open(new File(indexFilePath)));
        try {
            Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_33);
            String[] queryString = { "tomcat" };
            String[] fields = { "type" };
            BooleanClause.Occur[] flags = { BooleanClause.Occur.MUST };
            Query query = MultiFieldQueryParser.parse(Version.LUCENE_33, queryString, fields, flags, analyzer);
            // ascending index-time order so the inactivity test below is valid
            Sort sort = new Sort(new SortField("DateForSort", SortField.LONG, false));
            TopDocs result = searcher.search(query, 10000, sort);
            ScoreDoc[] hit = result.scoreDocs;
            SimpleDateFormat dateFormat = new SimpleDateFormat("dd/MMM/yyyy:HH:mm:ss", Locale.US);
            Map<String, Long> lastSeen = new HashMap<String, Long>(); // ip -> last hit millis
            Map<String, Session> active = new HashMap<String, Session>(); // ip -> open session
            for (int i = 0; i < hit.length; i++) {
                Document doc = searcher.doc(hit[i].doc);
                String ip = doc.get("ip");
                String str = doc.get("date");
                // NOTE(review): "requset" looks misspelled but must match the
                // field name used at index time — verify against the indexer.
                String url = doc.get("requset");
                Date date = dateFormat.parse(str);
                Long pretime = lastSeen.get(ip);
                // BUG FIX: the original computed (pretime - now), which is
                // negative for ascending logs, and never refreshed the stored
                // timestamp after the first hit; start a new session after a
                // 30-minute inactivity gap.
                if (pretime == null || (date.getTime() - pretime) / (1000 * 60) > 30) {
                    Session session = new Session();
                    session.setSes(UUID.randomUUID().toString());
                    session.getList().add(url);
                    list.add(session);
                    active.put(ip, session);
                } else {
                    active.get(ip).getList().add(url);
                }
                lastSeen.put(ip, date.getTime()); // always refresh last-seen time
            }
        } finally {
            searcher.close(); // BUG FIX: the original leaked the searcher
        }
        return list;
    }
}
相关文章推荐
- http协议的理解
- Delphi COM编程学习笔记(1)
- android sdk 镜像点
- linux下mysql中文显示乱码,读取又正常
- 19岁程序员在谷歌学到的5条经验教训
- WritePrivateProfileString等读写.ini配置文件
- 统计硬币
- 一个关于“权限正常,但是就是在该文件夹下创建文件失败”
- JZ2440 V2的OpenJTAG驱动安装(USB接口的) win7 64位机
- python2.6升级2.7的方法及升级后的故障处理
- iOS经典讲解之利用单例类封装对SQLite数据库增、删、查、改的操作
- cin.get(),cin,cin.getline()的区别
- Entity Framework 6 预热、启动优化
- python实现批量修改文件扩展名
- 求二叉树的宽度(结点的最大距离)
- XML解析
- AndroidStudio如何使用aar依赖包?
- MYVIMRC
- 虚方法
- poj2553 强连通