Hadoop Web Log Analysis: Code Walkthrough
2016-03-11 21:09
1. Main Code Analysis
package org.conan.myhadoop;

import java.io.IOException;
import java.util.HashSet;
import java.util.Set;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

import org.conan.myhadoop.mr.kpi.KPI;

public class KPIIP {

    // Mapper: one access-log line in, (requested page, client IP) out
    public static class KPIIPMapper extends Mapper<Object, Text, Text, Text> {
        private Text word = new Text();
        private Text ips = new Text();

        @Override
        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            KPI kpi = KPI.filterIPs(value.toString());
            if (kpi.isValid()) {
                word.set(kpi.getRequest());
                ips.set(kpi.getRemote_addr());
                context.write(word, ips);
            }
        }
    }

    // Reducer: count the number of distinct IPs for each page
    public static class KPIIPReducer extends Reducer<Text, Text, Text, Text> {
        private Text result = new Text();

        @Override
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            // a fresh set per key, so IPs from one page do not leak into the next
            Set<String> count = new HashSet<String>();
            for (Text value : values) {
                count.add(value.toString());
            }
            result.set(String.valueOf(count.size()));
            context.write(key, result);
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Usage: KPIIP <in> <out>");
            System.exit(2);
        }
        Job job = Job.getInstance(conf, "KPIIP");
        job.setJarByClass(KPIIP.class);
        job.setMapperClass(KPIIPMapper.class);
        // no combiner: a distinct-count reducer is not a valid combiner, because its
        // output (a count) does not have the same shape as its input (IP addresses)
        job.setReducerClass(KPIIPReducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
package org.conan.myhadoop; declares the package the class belongs to.
public static class KPIIPMapper extends Mapper<Object, Text, Text, Text>: the class extends Hadoop's Mapper. Declaring the input key as Object means any key type is accepted; with the default TextInputFormat the key is actually the byte offset of the line.
private Text word = new Text(); and private Text ips = new Text(); create two reusable Text instances that hold the map output key (the requested page) and value (the client IP). The sketch below shows what the mapper emits for one log line.
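To see what the mapper produces without running a full Hadoop job, one can call KPI.filterIPs directly. This is a minimal sketch, assuming the KPI class shown further below is on the classpath; the log line is hypothetical, modelled on the sample in KPI.main but using the whitelisted path /about so that it passes the filter:

import org.conan.myhadoop.mr.kpi.KPI;

public class MapperLogicDemo {
    public static void main(String[] args) {
        // Hypothetical log line in the same combined format as the sample in KPI.main,
        // but requesting /about, which is on the page whitelist in filterIPs()
        String line = "222.68.172.190 - - [18/Sep/2013:06:49:57 +0000] "
                + "\"GET /about HTTP/1.1\" 200 19939 \"http://www.angularjs.cn/A00n\" "
                + "\"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 "
                + "(KHTML, like Gecko) Chrome/29.0.1547.66 Safari/537.36\"";

        KPI kpi = KPI.filterIPs(line);
        if (kpi.isValid()) {
            // This is the (key, value) record KPIIPMapper hands to the shuffle
            System.out.println(kpi.getRequest() + "\t" + kpi.getRemote_addr());
        }
    }
}

The printed pair, /about followed by 222.68.172.190, is exactly what context.write(word, ips) emits for this line.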
The HashSet class is designed for high-performance set operations, such as computing the intersection, union, or difference of two sets. A set holds a group of elements that never repeat and have no particular order; the reducer uses it simply to deduplicate the IP addresses seen for each page.
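A minimal sketch of that deduplication, mirroring what KPIIPReducer does for a single page (the IP strings are made-up sample values):

import java.util.HashSet;
import java.util.Set;

public class HashSetDedupDemo {
    public static void main(String[] args) {
        // Hypothetical sample: three requests to the same page from two distinct IPs
        String[] visitorIps = {"222.68.172.190", "110.52.250.126", "222.68.172.190"};

        Set<String> uniqueIps = new HashSet<String>();
        for (String ip : visitorIps) {
            uniqueIps.add(ip); // duplicates are silently ignored
        }

        // Prints 2: the HashSet keeps only distinct IPs, which is what
        // KPIIPReducer relies on to count unique visitors per page
        System.out.println(uniqueIps.size());
    }
}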
2. The KPI Class: Parsing a Log Line

The KPIIP job delegates all log-line parsing and filtering to the KPI class below.

package org.conan.myhadoop.mr.kpi;

import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;

/*
 * KPI Object: one parsed line of the access log
 */
public class KPI {
    private String remote_addr;     // client IP
    private String remote_user;     // remote user name, "-" when absent
    private String time_local;      // local time of the request
    private String request;         // requested URL
    private String status;          // HTTP status code
    private String body_bytes_sent; // bytes sent in the response body
    private String http_referer;    // referring page
    private String http_user_agent; // client browser information
    private boolean valid = true;   // whether the line is usable for analysis

    private static KPI parser(String line) {
        System.out.println(line);
        KPI kpi = new KPI();
        String[] arr = line.split(" ");
        if (arr.length > 11) {
            kpi.setRemote_addr(arr[0]);
            kpi.setRemote_user(arr[1]);
            kpi.setTime_local(arr[3].substring(1));
            kpi.setRequest(arr[6]);
            kpi.setStatus(arr[8]);
            kpi.setBody_bytes_sent(arr[9]);
            kpi.setHttp_referer(arr[10]);

            if (arr.length > 12) {
                kpi.setHttp_user_agent(arr[11] + " " + arr[12]);
            } else {
                kpi.setHttp_user_agent(arr[11]);
            }

            if (Integer.parseInt(kpi.getStatus()) >= 400) { // status >= 400 is an HTTP error
                kpi.setValid(false);
            }
        } else {
            kpi.setValid(false);
        }
        return kpi;
    }

    /**
     * PV (page views) grouped by page
     */
    public static KPI filterPVs(String line) {
        KPI kpi = parser(line);
        Set<String> pages = new HashSet<String>();
        pages.add("/about");
        pages.add("/black-ip-list/");
        pages.add("/cassandra-clustor/");
        pages.add("/finance-rhive-repurchase/");
        pages.add("/hadoop-family-roadmap/");
        pages.add("/hadoop-hive-intro/");
        pages.add("/hadoop-zookeeper-intro/");
        pages.add("/hadoop-mahout-roadmap/");

        if (!pages.contains(kpi.getRequest())) {
            kpi.setValid(false);
        }
        return kpi;
    }

    /**
     * Unique IPs grouped by page
     */
    public static KPI filterIPs(String line) {
        KPI kpi = parser(line);
        Set<String> pages = new HashSet<String>();
        pages.add("/about");
        pages.add("/black-ip-list/");
        pages.add("/cassandra-clustor/");
        pages.add("/finance-rhive-repurchase/");
        pages.add("/hadoop-family-roadmap/");
        pages.add("/hadoop-hive-intro/");
        pages.add("/hadoop-zookeeper-intro/");
        pages.add("/hadoop-mahout-roadmap/");

        if (!pages.contains(kpi.getRequest())) {
            kpi.setValid(false);
        }
        return kpi;
    }

    /**
     * PV grouped by browser
     */
    public static KPI filterBroswer(String line) {
        return parser(line);
    }

    /**
     * PV grouped by hour
     */
    public static KPI filterTime(String line) {
        return parser(line);
    }

    /**
     * PV grouped by referring domain
     */
    public static KPI filterDomain(String line) {
        return parser(line);
    }

    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder();
        sb.append("valid:" + this.valid);
        sb.append("\nremote_addr:" + this.remote_addr);
        sb.append("\nremote_user:" + this.remote_user);
        sb.append("\ntime_local:" + this.time_local);
        sb.append("\nrequest:" + this.request);
        sb.append("\nstatus:" + this.status);
        sb.append("\nbody_bytes_sent:" + this.body_bytes_sent);
        sb.append("\nhttp_referer:" + this.http_referer);
        sb.append("\nhttp_user_agent:" + this.http_user_agent);
        return sb.toString();
    }

    public String getRemote_addr() {
        return remote_addr;
    }

    public void setRemote_addr(String remote_addr) {
        this.remote_addr = remote_addr;
    }

    public String getRemote_user() {
        return remote_user;
    }

    public void setRemote_user(String remote_user) {
        this.remote_user = remote_user;
    }

    public String getTime_local() {
        return time_local;
    }

    public Date getTime_local_Date() throws ParseException {
        SimpleDateFormat df = new SimpleDateFormat("dd/MMM/yyyy:HH:mm:ss", Locale.US);
        return df.parse(this.time_local);
    }

    public String getTime_local_Date_hour() throws ParseException {
        SimpleDateFormat df = new SimpleDateFormat("yyyyMMddHH");
        return df.format(this.getTime_local_Date());
    }

    public void setTime_local(String time_local) {
        this.time_local = time_local;
    }

    public String getRequest() {
        return request;
    }

    public void setRequest(String request) {
        this.request = request;
    }

    public String getStatus() {
        return status;
    }

    public void setStatus(String status) {
        this.status = status;
    }

    public String getBody_bytes_sent() {
        return body_bytes_sent;
    }

    public void setBody_bytes_sent(String body_bytes_sent) {
        this.body_bytes_sent = body_bytes_sent;
    }

    public String getHttp_referer() {
        return http_referer;
    }

    public String getHttp_referer_domain() {
        if (http_referer.length() < 8) {
            return http_referer;
        }
        String str = this.http_referer.replace("\"", "").replace("http://", "").replace("https://", "");
        return str.indexOf("/") > 0 ? str.substring(0, str.indexOf("/")) : str;
    }

    public void setHttp_referer(String http_referer) {
        this.http_referer = http_referer;
    }

    public String getHttp_user_agent() {
        return http_user_agent;
    }

    public void setHttp_user_agent(String http_user_agent) {
        this.http_user_agent = http_user_agent;
    }

    public boolean isValid() {
        return valid;
    }

    public void setValid(boolean valid) {
        this.valid = valid;
    }

    public static void main(String args[]) {
        String line = "222.68.172.190 - - [18/Sep/2013:06:49:57 +0000] \"GET /images/my.jpg HTTP/1.1\" 200 19939 \"http://www.angularjs.cn/A00n\" \"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.66 Safari/537.36\"";
        System.out.println(line);
        KPI kpi = new KPI();
        String[] arr = line.split(" ");
        kpi.setRemote_addr(arr[0]);
        kpi.setRemote_user(arr[1]);
        kpi.setTime_local(arr[3].substring(1));
        kpi.setRequest(arr[6]);
        kpi.setStatus(arr[8]);
        kpi.setBody_bytes_sent(arr[9]);
        kpi.setHttp_referer(arr[10]);
        kpi.setHttp_user_agent(arr[11] + " " + arr[12]);
        System.out.println(kpi);

        try {
            SimpleDateFormat df = new SimpleDateFormat("yyyy.MM.dd:HH:mm:ss", Locale.US);
            System.out.println(df.format(kpi.getTime_local_Date()));
            System.out.println(kpi.getTime_local_Date_hour());
            System.out.println(kpi.getHttp_referer_domain());
        } catch (ParseException e) {
            e.printStackTrace();
        }
    }
}
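The parser relies on the fixed field order of the combined access-log format: splitting a line on spaces places each field at a known index. A minimal sketch of that mapping, reusing the sample line from KPI.main (the user-agent tail is shortened here for brevity):

public class LogFieldIndexDemo {
    public static void main(String[] args) {
        String line = "222.68.172.190 - - [18/Sep/2013:06:49:57 +0000] "
                + "\"GET /images/my.jpg HTTP/1.1\" 200 19939 "
                + "\"http://www.angularjs.cn/A00n\" \"Mozilla/5.0 ...\"";

        String[] arr = line.split(" ");
        System.out.println("remote_addr     = " + arr[0]);              // client IP
        System.out.println("remote_user     = " + arr[1]);              // "-" when anonymous
        System.out.println("time_local      = " + arr[3].substring(1)); // leading '[' stripped
        System.out.println("request         = " + arr[6]);              // requested path
        System.out.println("status          = " + arr[8]);              // HTTP status code
        System.out.println("body_bytes_sent = " + arr[9]);              // response size in bytes
        System.out.println("http_referer    = " + arr[10]);             // quoted referer URL
    }
}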
String truncation with substring:
substring(int beginIndex) returns a new string that is a substring of this string, from beginIndex (inclusive, counting from 0) to the end of the string.
substring(int beginIndex, int endIndex) returns a new string that is a substring of this string, from beginIndex (inclusive) to endIndex (exclusive).
"unhappy".substring(2) returns "happy"
"hamburger".substring(4, 8) returns "urge"
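Both overloads appear in the KPI class: the one-argument form strips the leading '[' from the time field in parser(), and the two-argument form, driven by indexOf, cuts the domain out of the referer in getHttp_referer_domain(). A small sketch:

public class SubstringDemo {
    public static void main(String[] args) {
        // One-argument form: drop the leading '[' of the time field, as in parser()
        String rawTime = "[18/Sep/2013:06:49:57";
        System.out.println(rawTime.substring(1));        // 18/Sep/2013:06:49:57

        // Two-argument form: cut the domain out of the referer,
        // as in getHttp_referer_domain()
        String referer = "http://www.angularjs.cn/A00n".replace("http://", "");
        int slash = referer.indexOf("/");
        System.out.println(referer.substring(0, slash)); // www.angularjs.cn

        // The textbook examples from above
        System.out.println("unhappy".substring(2));      // happy
        System.out.println("hamburger".substring(4, 8)); // urge
    }
}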
The complete code and the sample data can be downloaded from http://pan.baidu.com/s/1dDZvczf (password: mac7).