您的位置:首页 > 其它

hive、pig对日志处理的对比

2015-09-12 10:57 155 查看
原数据:



每行记录有5部分组成:
1.访问ip
2.访问时间
3.访问资源
4.访问状态
5.本次流量

数据清洗
hive中的数据需要用mr进行处理,然后再导入表中

public static class MyMapper extends
Mapper<LongWritable, Text, NullWritable, Text> {
SimpleDateFormat simpleDateFormat = new SimpleDateFormat("dd/MMM/yyyy:HH:mm:ss" ,
Locale.ENGLISH);
SimpleDateFormat simpleDateFormat2 = new SimpleDateFormat("yyyy-MM-dd
HH:mm:ss" );
Date parse;
String date;
@Override
protected void map(LongWritable key,
Text value,
Mapper<LongWritable, Text, NullWritable, Text>.Context context)
throws IOException, InterruptedException
{
String line = value.toString();
Pattern p = Pattern
. compile("([0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}) - - \\[(.*) \\+0800\\]
\"[A-Z]{1,4} (.*) HTTP/1.1\" ([0-9]*) ([0-9]*)");
Matcher m = p.matcher( line);
if (! m.find())
{
return;
}
String ip = m.group(1);
String time = m.group(2);
String url = m.group(3);
String status = m.group(4);
String traffic = m.group(5);
try {
parse = simpleDateFormat.parse( time);
date = simpleDateFormat2.format( parse);
} catch (ParseException e)
{
e.printStackTrace();
}
context.write(NullWritable. get(), new Text(ip + "\t" + date + "\t" + url + "\t" + status + "\t" + traffic ));
}
}
hive创建表并导入数据
-- External table over the cleaned mapreduce output, one partition per day.
create external table logs (
    ip string,
    time string,
    url string,
    status string,
    traffic string
)
partitioned by (date string)
row format delimited fields terminated by '\t'
location '/user/15910312725/logout';

-- Attach the 2015-03-30 files to their partition.
load data inpath "/user/15910312725/logout/2015-03-30"
into table logs partition (date = '2015-03-30');

pig可直接导入原始数据,然后进行数据清洗

-- Load the raw, space-delimited access log.
logs_tmp = load 'access_2015_03_30.log' using PigStorage(' ') as
(ip:chararray,one:chararray,two:chararray,time:chararray,timezone:chararray,method:chararray,url:chararray,http:chararray,status:long,data:long);

-- FIX: the original projected from an undefined alias `A`;
-- the relation loaded above is `logs_tmp`.
logs = foreach logs_tmp generate ip,time,url,status,data;

浏览量(pv)



定义:页面浏览量即为PV(Page View),是指所有用户浏览页面的总和,用户每打开一个页面就被记录1 次。

计算公式:记录计数

hive:
-- pv: one row per request, so the page-view count is a plain row count.
create table bbs_pv_2015_03_30 as
select count(ip) as num
from logs
where date = '2015-03-30';

pig:
-- pv: pull every record into a single bag, then count the bag.
everything = group logs all;
PV = foreach everything generate COUNT(logs);

独立IP数



定义:一天之内,访问网站的不同独立IP 个数加和。其中同一IP无论访问了几个页面,独立IP 数均为1。

公式:对ip去重求和

hive:
-- uv: number of distinct client ips seen during the day.
create table bbs_ip_2015_03_30 as
select count(distinct ip) as num
from logs
where date = '2015-03-30';

pig:
-- uv: group by ip, keep one row per distinct ip, then count those rows.
per_ip = group logs by ip;
unique_ips = foreach per_ip generate group;
unique_ips_all = group unique_ips all;
UV = foreach unique_ips_all generate COUNT(unique_ips);

注册用户数

计算公式:对访问member.php?mod=register的url,计数

hive:
-- registrations: requests whose url contains the register endpoint
-- (instr is 1-based, so > 0 means "substring present").
create table bbs_register_2015_03_30 as
select count(1) as register
from logs
where date = '2015-03-30'
  and instr(url, 'member.php?mod=register') > 0;

pig:
-- registrations. FIX: Pig's INDEXOF is 0-based (unlike hive's 1-based instr),
-- so a match must be tested with >= 0; the original ">0 / <0" pair silently
-- dropped any url whose match starts at offset 0 from BOTH split branches.
split logs into regester if (INDEXOF(url, 'member.php?mod=register') >= 0), non if (INDEXOF(url, 'member.php?mod=register') < 0);

regester_grouped = group regester all;

regester_num = foreach regester_grouped generate COUNT(regester);

跳出率=跳出数/PV

计算公式:(1)统计一天内只出现一条访问记录的ip,称为跳出数

(2)跳出数/PV
hive:
-- bounce count: ips that produced exactly one record for the day.
create table bbs_jumper_2015_03_30 as
select count(a.c) as jumper
from (
    select ip, count(ip) as c
    from logs
    where date = '2015-03-30'
    group by ip
) as a
where a.c = 1;

pig:
-- bounce count: ips with exactly one visit record.
visits_by_ip = group logs by ip;
visit_counts = foreach visits_by_ip generate COUNT(logs.ip) as num;
one_visit = filter visit_counts by num == 1;
one_visit_all = group one_visit all;
jumper = foreach one_visit_all generate COUNT(one_visit);

板块访问量pv

定义:一天之内每个板块的访问量

分析:板块的链接url:/forum-110-1.html,其中110表示板块的ID,1表示这个板块的第一页,只需要计算url中存在/forum-字符串的数据

公式:对板块分组,求每个板块的访问量

hive:
-- per-board pv: the board id is capture group 1 of '/forum-<id>-<page>'.
create table bbs_band_ip_2015_03_30 as
select b.band as band,
       count(ip) as ip_num
from (
    select regexp_extract(url, '/forum-(\\d*)-(\\d*)', 1) as band,
           ip
    from logs
    where date = '2015-03-30'
      and instr(url, '/forum-') > 0
) as b
group by b.band;

pig:
-- per-board pv. FIX 1: use one consistent pattern ('/forum-...') in both split
-- branches and the projection; the original mixed '/forum-' and 'forum-', so a
-- row matching only the looser pattern fell into NEITHER branch of the split.
split logs into logs_contain_band if ((REGEX_EXTRACT(url,'/forum-([0-9]*)-([0-9]*)',1))!=''), non if ((REGEX_EXTRACT(url,'/forum-([0-9]*)-([0-9]*)',1))=='');

band = foreach logs_contain_band generate REGEX_EXTRACT(url,'/forum-([0-9]*)-([0-9]*)',1) as b;

band_grouped = group band by b;

-- FIX 2: the original counted COUNT(R.b) with an undefined alias R;
-- the bag inside each group is named after the grouped relation, `band`.
band_PV = foreach band_grouped generate group, COUNT(band.b);

板块独立IP访问量

定义:一天之内每个板块的独立IP访问量

分析:板块的链接url:/forum-110-1.html,其中110表示板块的ID,1表示这个板块的第一页,只需要计算url中存在/forum-字符串的数据,然后对IP进行去重

公式:对板块分组,IP去重,求每个板块的独立IP访问量

hive:
-- per-board uv: same board extraction as the pv table, but dedupe ips.
create table bbs_band_pv_2015_03_30 as
select b.band as band,
       count(distinct ip) as pv_num
from (
    select regexp_extract(url, '/forum-(\\d*)-(\\d*)', 1) as band,
           ip
    from logs
    where date = '2015-03-30'
      and instr(url, '/forum-') > 0
) as b
group by b.band;

pig:
-- per-board uv: (board, ip) pairs, deduped, grouped by board, ips counted.
band_all_ip = foreach logs_contain_band generate REGEX_EXTRACT(url,'forum-([0-9]*)-([0-9]*)',1) as b,ip;

-- FIX: the original deduped the unrelated alias `band` (which has no ip
-- column); the (board, ip) pairs built just above must be deduped instead.
band_distinct_ip = distinct band_all_ip;

band_distinct_ip_grouped = group band_distinct_ip by b;

band_UV = foreach band_distinct_ip_grouped generate group, COUNT(band_distinct_ip.b);

汇总表(bbs_info)

日期 log_date
浏览量 pv
注册用户数 reguser
独立ip数 ip
跳出数 jumper

hive:
-- Daily summary, one row: Cartesian join of the four single-row stat tables.
-- FIX: bbs_ip_2015_03_30 exposes its count as `num` (it was created with
-- "count(distinct ip) as num"); the original selected the nonexistent column
-- a.dist_ip. Column aliases added so the CTAS table gets meaningful names
-- instead of _c0-style defaults.
create table bbs_info as
select '2015_03_30' as log_date,
       a.num as ip,
       b.num as pv,
       c.register as reguser,
       d.jumper as jumper
from bbs_ip_2015_03_30 a
join bbs_pv_2015_03_30 b on 1 = 1
join bbs_register_2015_03_30 c on 1 = 1
join bbs_jumper_2015_03_30 d on 1 = 1;

pig:
-- FIX: Pig has no join-on-a-constant; a Cartesian product of single-row
-- relations is written with CROSS. Also use the aliases actually defined
-- above (regester_num, jumper) — `jumper_num` does not exist — and include
-- the registration count so the row matches the hive summary table.
bbs_info = cross PV, UV, regester_num, jumper;

版块访问表(bbs_forum)

日期 log_date
版块 forum
独立ip ip
浏览量 pv

hive:
-- Per-board report: pv and uv tables lined up on the board id.
create table bbs_forum as
select a.band,
       ip_num,
       pv_num
from bbs_band_ip_2015_03_30 as a
join bbs_band_pv_2015_03_30 as b
    on a.band = b.band;

pig:
-- Field $0 of both band_PV and band_UV is the grouped board id, so this
-- positional join lines the two per-board tables up board by board.
bbs_forum = join band_PV by $0, band_UV by $0;
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: