
Hive Knowledge Sharing

2016-06-03 10:54

Internal and external tables

Whether the table and its data live together (a managed/internal table owns its data; an external table only points to it)

The difference when dropping the table (see the sketch after the create example below)

Example: creating a table

create <external> table cr_cdma_fin
(
bil_month string COMMENT "billing month",
accs_nbr string,
asset_row_id string,
fin_amt float,
owe_amt float,
owe_month string,
owe_dur string,
latn_id string)
<partitioned by (month  string)>
row format delimited
fields terminated by '\t'
<location '/credit/bic/m/cr_cdma_fin/'>;
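
The drop behavior is where the two kinds of table differ; a minimal sketch (cr_cdma_fin is the managed version of the table above, cr_cdma_fin_ext a hypothetical external twin):

-- managed (internal) table: dropping it removes the metadata and the data files in the warehouse directory
drop table if exists cr_cdma_fin;
-- external table: dropping it removes only the metadata; the files under its location stay on HDFS
drop table if exists cr_cdma_fin_ext;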


(Multi-level / static / dynamic) partitioned tables

Best understood through the business scenario, e.g. partitioning by city or by time

Benefits: splits a large table into smaller pieces that are easier to manage, and queries that filter on the partition column scan less data, so they run faster

Partition columns are not real table columns, but they can be used in queries just like ordinary columns
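
For example, month below is only a partition directory, yet it can be filtered on like an ordinary column, and the filter prunes which directories are scanned (a sketch, assuming cr_cdma_fin was created with partitioned by (month string)):

select accs_nbr, owe_amt
from cr_cdma_fin
where month = '201605';
-- only the month=201605 partition directory is read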

Viewing partitions

SHOW PARTITIONS <table name>;


Creating a (managed or external) partitioned table
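
A concrete sketch of that syntax, with the optional parts of the earlier template filled in (the table name and path are illustrative):

create external table cr_cdma_fin_p
(
accs_nbr string,
owe_amt float)
partitioned by (month string)
row format delimited
fields terminated by '\t'
location '/credit/bic/m/cr_cdma_fin_p/';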

Adding / dropping partitions

alter table <table name> add if not exists partition (p1 = "xxx",p2 = "yyy") location '/....../xxx/yyy';

INSERT OVERWRITE table test_beilun partition (p1="aa",p2="bb")
select xx,xx,xx
from xxx
where yyy;

ALTER TABLE <table name> DROP IF EXISTS PARTITION(p="xxx");


Dynamic partitions

Used for bulk loads into multi-level partitions when the lower-level partition values are not known in advance and must be detected automatically

set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;

--- SELECT * From partition_test_input;
--- 20110526 1 liujiannan liaoning
--- 20110526 2 wangchaoqun hubei
--- 20110728 3 xuhongxing sichuan
--- 20110728 4 zhudaoyong henan
--- 20110728 5 zhouchengyu heilongjiang

insert overwrite table partition_test partition(stat_date='20110728',province)
select member_id,name,province from partition_test_input where stat_date='20110728';
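
In the statement above stat_date is a static partition value and province is dynamic. With nonstrict mode enabled, every level can be dynamic, in which case the partition columns simply come last in the select list (a sketch against the same tables):

insert overwrite table partition_test partition(stat_date, province)
select member_id, name, stat_date, province from partition_test_input;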


Importing data

Loading local data

LOAD DATA LOCAL INPATH '/home/hadoop/name/tmp' INTO TABLE tmp <PARTITION (day = XXX)>;


Loading data already on the cluster (HDFS)

LOAD DATA INPATH '/home/hadoop/name/tmp' INTO TABLE tmp <PARTITION (day = XXX)>;

alter table NET_CDR_DS_FIX_O  add if not exists partition (dt=20151027) location '/SOURCE/RAW/D/20151027';


Selecting data from another table into a target partition

INSERT OVERWRITE table test_beilun partition (day=20150514)
select id,timestamps,url,ref,day,hour
from table_name
where day= 20150514 and parse_url(concat("http://",url),'HOST') like '%4399.com';


Exporting data / saving query results

Writing to the local filesystem

insert overwrite local directory '/home/hadoop/name/aa/bb/'
select *
from dpi_cdr_zj a join test_aaa b
on a.ad=b.time1
and a.input_day=20141119;


Writing to the cluster (HDFS)

insert overwrite directory '/home/hadoop/name/aa/bb/'
select *
from dpi_cdr_zj a join test_aaa b
on a.ad=b.time1
and a.input_day=20141119;
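
Both exports above write files with Hive's default ^A (\001) field delimiter. Since Hive 0.11 a row format can be specified on the export itself, e.g. (a sketch based on the query above):

insert overwrite directory '/home/hadoop/name/aa/bb/'
row format delimited
fields terminated by '\t'
select *
from dpi_cdr_zj a join test_aaa b
on a.ad=b.time1
and a.input_day=20141119;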


Saving query results as a new Hive table (CTAS)

create table ti_mbl_call_roam_d AS
select hplmn2,md5udf(accs_nbr),md5udf(ccust_row_id),md5udf(asset_row_id),roam_type_cd,vplmn2,count,dur
from ti_mbl_call_roam_d1;


Functions

Built-in functions and their usage

Aggregate functions: sum, count, avg, max, min, ...

Conditional functions: if, case when

String functions: split, concat, length

SELECT
split(trim(Latn_Id),'\\|')[0] as Latn_Id,   -- split takes a regex, so '|' must be escaped
stmt_dt,
bil_asset_row_id,
Bil_Accs_Nbr,
concat(a,b) as ab,
sum (1),
count (distinct opp_nbr),
count (Trans_A_Nbr),
sum (case when Toll_Type_Cd==0 and Roam_Type_Cd==0 then dur else 0 end),
sum (case when Toll_Type_Cd==0 and Roam_Type_Cd==0 then 1 else 0 end),
count(distinct if(Toll_Type_Cd==0 and Roam_Type_Cd==0,latn_id,null))
FROM hadoop_db.net_cdr_vs_o
WHERE DAY LIKE '201506%' and Bil_User_Type_Cd=0
GROUP BY Latn_Id,stmt_dt,bil_asset_row_id,Bil_Accs_Nbr,concat(a,b);   -- Hive does not accept the select alias ab in GROUP BY


URL processing function: parse_url

Returns the specified part of a URL. Valid values for partToExtract are: HOST, PATH, QUERY, REF, PROTOCOL, AUTHORITY, FILE, and USERINFO.

-- JD.com search for "地毯" (carpet)
parse_url("http://search.jd.com/Search?keyword=%E5%9C%B0%E6%AF%AF&enc=utf-8&wq=%E5%9C%B0%E6%AF%AF&pvid=zqx6a6pi.8k3oi90v9en52","HOST") --search.jd.com

parse_url('http://search.jd.com/Search?keyword=%E5%9C%B0%E6%AF%AF&enc=utf-8&wq=%E5%9C%B0%E6%AF%AF&pvid=zqx6a6pi.8k3oi90v9en52',"QUERY") --keyword=%E5%9C%B0%E6%AF%AF&enc=utf-8&wq=%E5%9C%B0%E6%AF%AF&pvid=zqx6a6pi.8k3oi90v9en52

parse_url('http://search.jd.com/Search?keyword=%E5%9C%B0%E6%AF%AF&enc=utf-8&wq=%E5%9C%B0%E6%AF%AF&pvid=zqx6a6pi.8k3oi90v9en52',"QUERY","wq") --%E5%9C%B0%E6%AF%AF


UDFs and their usage

Using a JAR

add jar /data1/middle/lib/Md5.jar;
create temporary function  md5 as 'md5.Md5UDF';
select md5(ccust_name),md5(id_nbr),date_cd
from par_cust_cur
where id_nbr like '65%'
and day like '201504%'
and length(ccust_name)>3
and length(id_nbr)=18
limit 5;


Implementing a method in Python (used via TRANSFORM)

import sys

# read rows from Hive TRANSFORM: one tab-delimited line per row
for line in sys.stdin:
    line = line.strip()
    context = line.split("\t")
    # output the sum of the two input columns
    s = int(context[0]) + int(context[1])
    print(s)


hive> set hive.exec.mode.local.auto=true;
hive> select * from udf_test;
OK
10      6
5       8
79      33
100     1
99      8
70      22
hive> add file /home/common/zl/udf/a.py;
Added resources: [/home/common/zl/udf/a.py]
hive> SELECT
> TRANSFORM(id1,id2)
> USING 'python a.py'
>    AS(xx)
>  FROM udf_test;
16
13
112
101
107
92


When writing a UDF script in Python, arguments can be passed to it

import sys

def fun1():
    # sum of the two input columns
    for line in sys.stdin:
        line = line.strip()
        context = line.split("\t")
        s = int(context[0]) + int(context[1])
        print(s)

def fun2():
    # difference of the two input columns
    for line in sys.stdin:
        line = line.strip()
        context = line.split("\t")
        q = int(context[0]) - int(context[1])
        print(q)

# choose which function to run from the first command-line argument, e.g. "2" -> fun2()
tmp = sys.argv[1]
fun = "fun" + str(tmp) + "()"
exec(fun)


hive> SELECT
> TRANSFORM(id1,id2)
> USING 'python aa.py 2'
>    AS(xx)
>  FROM udf_test;

OK
4
-3
46
99
91
48


hive> create temporary macro sigmoid(x double) x/20;
OK
Time taken: 0.038 seconds
hive> select sigmoid(500);
OK
25.0

hive> DROP TEMPORARY macro IF EXISTS sigmoid;
OK
Time taken: 0.011 seconds
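
Note that the macro body above is just x/20; it is only named sigmoid. A macro computing the actual logistic function would look like this (a sketch using the built-in exp):

create temporary macro sigmoid(x double) 1.0 / (1.0 + exp(-x));
select sigmoid(0);
-- 0.5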


Miscellaneous

Specifying the job queue and job name, and setting map/reduce task memory

set mapreduce.job.queuename=Q_yingykf01;
set mapreduce.job.name=Zhaoli_offset_20160101_007011;
set mapreduce.map.memory.mb=9120;
set mapreduce.reduce.memory.mb=9120;


Specifying compression / serialization file formats

$HIVE_HOME/conf/hive-default.xml

hive.exec.compress.output

hive.exec.compress.intermediate

Bzip2 Gzip LZO Snappy

Compression ratio: bzip2 > gzip > lzo; speed: lzo > gzip > bzip2

CLI

-- compress intermediate (map-stage) results
set hive.exec.compress.intermediate=true;
set hive.intermediate.compression.codec=org.apache.hadoop.io.compress.GzipCodec;
create table comtab8
row format delimited fields terminated by "\t"
as select * from udf_test;


-- compress the final output
-- when listing the corresponding files on HDFS, the file suffix shows the compression format
-- hadoop fs -cat shows garbled content; view it with hadoop fs -text; there is no header information
set hive.exec.compress.output=true;
set mapred.output.compression.codec=org.apache.hadoop.io.compress.GzipCodec;
create table comtab4
row format delimited fields terminated by "\t"
as select * from udf_test;


The STORED AS keyword

Specifies the file storage format of the table on HDFS. The available formats include:

TEXTFILE //plain text, the default

SEQUENCEFILE //binary sequence file

RCFILE //columnar storage format, supported since Hive 0.6

ORC //columnar storage format with better compression and read/write efficiency than RCFILE, supported since Hive 0.11

PARQUET //columnar storage format, supported since Hive 0.13

SEQUENCEFILE

create table comtab6 (
id1 string,
id2 string)
row format delimited
fields terminated by '\t'
stored as SEQUENCEFILE;
--hadoop fs -cat shows header information at the start of the file:
--SEQ"org.apache.hadoop.io.BytesWritableorg.apache.hadoop.io.Text&org.apache.hadoop.io.compress.LzoCodec
set hive.exec.compress.output=true;
set mapred.output.compress=true;
set mapred.output.compression.type=BLOCK;
set mapred.output.compression.codec=org.apache.hadoop.io.compress.LzoCodec;
INSERT OVERWRITE TABLE comtab6
select *
from udf_test;
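
A similar sketch for ORC, which the format list above notes has better compression and read/write efficiency than RCFILE (the table name and the Snappy choice are illustrative):

create table comtab_orc (
id1 string,
id2 string)
stored as ORC
tblproperties ("orc.compress"="SNAPPY");

INSERT OVERWRITE TABLE comtab_orc
select *
from udf_test;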


Running in local mode: set hive.exec.mode.local.auto=true

Setting the following parameter enables Hive's local mode:
hive> set hive.exec.mode.local.auto=true; (the default is false)
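
Even with auto local mode on, Hive only runs a job locally when the input is small; the size thresholds are themselves configurable (a sketch listing the related parameters with their usual defaults):

-- maximum total input size for a local-mode job (128 MB by default)
set hive.exec.mode.local.auto.inputbytes.max=134217728;
-- maximum number of input files for a local-mode job (4 by default)
set hive.exec.mode.local.auto.input.files.max=4;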


Executing with hive -e

hive -e "select * from zl.udf_test;" >> /home/zl/udf_test.txt


Running from a shell script

#!/usr/bin/bash
#test.sh
hive -e"
set mapreduce.job.queuename=Q_yingykf01;
set mapreduce.job.name=Zhaoli_offset_20160101_007011;
select *
from zl.udf_test;
"


-- run it
sh test.sh >> test.txt


Passing parameters

Method 1

#!/usr/bin/bash
#a.sh
tablename="student"
limitcount="8"

hive -e "use test; select * from ${tablename} limit ${limitcount};"


sh a.sh


Method 2

--a.sql;
set hive.exec.mode.local.auto=true;
use zl;
select *
from ${hiveconf:tablename};


hive -hiveconf tablename=udf_test -f a.sql


Method 3

--b.sql;
set hive.exec.mode.local.auto=true;
use zl;
select *
from ${hivevar:tablename};


hive -hivevar tablename=udf_test -f b.sql


hadoop shell

hadoop fs -ls        list a directory
hadoop fs -du -h     show file sizes
hadoop fs -mkdir     create a directory
hadoop fs -put/get   upload / download files
hadoop fs -text XXX  view a file's contents
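
The same commands can also be run without leaving the Hive CLI via the dfs keyword, which reuses the running Hive session instead of starting a new hadoop client each time (a sketch; the path is the external-table location used earlier):

dfs -ls /credit/bic/m/cr_cdma_fin/;
dfs -du -h /credit/bic/m/cr_cdma_fin/;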


Closing exercise

1. Using one hour of gdpi data from a single day, summarize which top-level domains (.com, .cn, etc.) appear

2. Summarize which second-level and third-level domains appear in the JD.com URLs

Requirements:

1 In the Hive environment on test server 5.2, create your own test database

2 In your test database, create an external partitioned table and load one hour of test data

3 Write the HQL for this logic (a sketch follows this list)

4 Once the HQL passes testing, save it in a .sh script and pass the time (day, hour) in as script parameters

5 Save the execution results to a local file
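
A sketch of the HQL for question 1 (the table gdpi_url, its url column, and the day/hour partition columns are hypothetical; the real gdpi table will differ):

select regexp_extract(parse_url(concat('http://', url), 'HOST'), '\\.([A-Za-z]+)$', 1) as tld,
count(*) as cnt
from gdpi_url
where day = '${hivevar:day}' and hour = '${hivevar:hour}'
group by regexp_extract(parse_url(concat('http://', url), 'HOST'), '\\.([A-Za-z]+)$', 1);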