Hive Knowledge Sharing
2016-06-03 10:54
Tables
Internal vs. external tables
The difference shows up when a table is dropped: for an internal (managed) table the data is deleted together with the table, while for an external table the data stays in place.
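A minimal sketch of the drop behavior (the table names and the external location below are illustrative):
create table t_managed (id string);
create external table t_ext (id string) location '/credit/bic/t_ext/';
drop table t_managed;   -- removes the metadata and deletes the data files under the warehouse directory
drop table t_ext;       -- removes the metadata only; the files under /credit/bic/t_ext/ remain on HDFS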
Example: creating a table
-- parts in <angle brackets> are optional
create <external> table cr_cdma_fin (
  bil_month string COMMENT "中文注释",
  accs_nbr string,
  asset_row_id string,
  fin_amt float,
  owe_amt float,
  owe_month string,
  owe_dur string,
  latn_id string)
<partitioned by (month string)>
row format delimited fields terminated by '\t'
<location '/credit/bic/m/cr_cdma_fin/'>;
(Multi-level / static / dynamic) partitioned tables
Understand partitioning in terms of the business scenario, e.g. partitioning by city or by time. The benefits: splitting large data into smaller pieces that are easier to manage, and higher query efficiency.
Partition columns are not actual columns of the table, but they can be used in queries just like regular columns.
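For example, assuming cr_cdma_fin was created with the month partition from the template above, the partition column can be selected and filtered just like a regular column (the value '201601' is made up):
select accs_nbr, owe_amt, month
from cr_cdma_fin
where month = '201601';
-- only the files under the month=201601 partition directory are scanned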
Viewing partitions
SHOW PARTITIONS <table name>
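For instance, against the cr_cdma_fin table above (the partition values shown in the output are illustrative):
SHOW PARTITIONS cr_cdma_fin;
-- month=201601
-- month=201602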
Creating (internal/external) partitioned tables
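For example, a sketch that instantiates the template above as an external partitioned table (the column list is abbreviated; drop the external keyword for an internal table):
create external table cr_cdma_fin (
  bil_month string,
  accs_nbr string,
  fin_amt float,
  owe_amt float)
partitioned by (month string)
row format delimited fields terminated by '\t'
location '/credit/bic/m/cr_cdma_fin/';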
Adding / dropping partitions
alter table <table name> add if not exists partition (p1 = "xxx", p2 = "yyy") location '/....../xxx/yyy';
INSERT OVERWRITE table test_beilun partition (p1="aa", p2="bb") select xx,xx,xx from xxx where yyy;
ALTER TABLE <table name> DROP IF EXISTS PARTITION(p="xxx");
Dynamic partitions
Useful for bulk loading of multi-level partitioned data, where the second-level partition values are not known in advance and must be derived from the data automatically.
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
--- SELECT * From partition_test_input;
--- 20110526 1 liujiannan liaoning
--- 20110526 2 wangchaoqun hubei
--- 20110728 3 xuhongxing sichuan
--- 20110728 4 zhudaoyong henan
--- 20110728 5 zhouchengyu heilongjiang
insert overwrite table partition_test partition(stat_date='20110728',province)
select member_id,name,province from partition_test_input where stat_date='20110728';
Loading data
Loading data from the local filesystem
LOAD DATA LOCAL INPATH '/home/hadoop/name/tmp' INTO TABLE tmp <PARTITION (day = XXX)>;
Loading data already on HDFS (the cluster)
LOAD DATA INPATH '/home/hadoop/name/tmp' INTO TABLE tmp <PARTITION (day = XXX)>;
alter table NET_CDR_DS_FIX_O add if not exists partition (dt=20151027) location '/SOURCE/RAW/D/20151027';
Selecting data from another table into a given partition
INSERT OVERWRITE table test_beilun partition (day=20150514)
select id,timestamps,url,ref,day,hour
from table_name
where day=20150514
  and parse_url(concat("http://",url),'HOST') like '%4399.com';
Exporting data / saving query results
Writing to the local filesystem
insert overwrite local directory '/home/hadoop/name/aa/bb/'
select * from dpi_cdr_zj a join test_aaa b on a.ad=b.time1 and a.input_day=20141119;
Writing to HDFS (the cluster)
insert overwrite directory '/home/hadoop/name/aa/bb/'
select * from dpi_cdr_zj a join test_aaa b on a.ad=b.time1 and a.input_day=20141119;
Saving query results as a new Hive table
create table ti_mbl_call_roam_d AS select hplmn2,md5udf(accs_nbr),md5udf(ccust_row_id),md5udf(asset_row_id),roam_type_cd,vplmn2,count,dur from ti_mbl_call_roam_d1;
Functions
Built-in functions and their usage
Aggregate functions: sum, count, avg, max, min, …
Conditional functions: if, case when
String functions: split, concat, length
SELECT
  split(trim(Latn_Id),'\\|')[0] as Latn_Id,   -- split takes a regex, so '|' must be escaped
  stmt_dt,
  bil_asset_row_id,
  Bil_Accs_Nbr,
  concat(a,b) as ab,
  sum(1),
  count(distinct opp_nbr),
  count(Trans_A_Nbr),
  sum(case when Toll_Type_Cd==0 and Roam_Type_Cd==0 then dur else 0 end),
  sum(case when Toll_Type_Cd==0 and Roam_Type_Cd==0 then 1 else 0 end),
  count(distinct if(Toll_Type_Cd==0 and Roam_Type_Cd==0,latn_id,null))
FROM hadoop_db.net_cdr_vs_o
WHERE DAY LIKE '201506%' and Bil_User_Type_Cd=0
-- GROUP BY needs the full expressions rather than their select aliases
GROUP BY split(trim(Latn_Id),'\\|')[0], stmt_dt, bil_asset_row_id, Bil_Accs_Nbr, concat(a,b);
URL functions: parse_url
Returns the specified part of a URL. Valid values of partToExtract: HOST, PATH, QUERY, REF, PROTOCOL, AUTHORITY, FILE, and USERINFO.
-- a JD search for "地毯" (carpet)
parse_url("http://search.jd.com/Search?keyword=%E5%9C%B0%E6%AF%AF&enc=utf-8&wq=%E5%9C%B0%E6%AF%AF&pvid=zqx6a6pi.8k3oi90v9en52","HOST")
-- search.jd.com
parse_url('http://search.jd.com/Search?keyword=%E5%9C%B0%E6%AF%AF&enc=utf-8&wq=%E5%9C%B0%E6%AF%AF&pvid=zqx6a6pi.8k3oi90v9en52',"QUERY")
-- keyword=%E5%9C%B0%E6%AF%AF&enc=utf-8&wq=%E5%9C%B0%E6%AF%AF&pvid=zqx6a6pi.8k3oi90v9en52
parse_url('http://search.jd.com/Search?keyword=%E5%9C%B0%E6%AF%AF&enc=utf-8&wq=%E5%9C%B0%E6%AF%AF&pvid=zqx6a6pi.8k3oi90v9en52',"QUERY","wq")
-- %E5%9C%B0%E6%AF%AF
UDFs and their usage
Java UDFs packaged as a JAR
add jar /data1/middle/lib/Md5.jar;
create temporary function md5 as 'md5.Md5UDF';
select md5(ccust_name),md5(id_nbr),date_cd
from par_cust_cur
where id_nbr like '65%' and day like '201504%' and length(ccust_name)>3 and length(id_nbr)=18
limit 5;
Implementing a transform script in Python
import sys
for line in sys.stdin:
    line = line.strip()
    context = line.split("\t")
    s = int(context[0]) + int(context[1])
    print s
hive> set hive.exec.mode.local.auto=true;
hive> select * from udf_test;
OK
10 6
5 8
79 33
100 1
99 8
70 22
hive> add file /home/common/zl/udf/a.py;
Added resources: [/home/common/zl/udf/a.py]
hive> SELECT
    > TRANSFORM(id1,id2)
    > USING 'python a.py'
    > AS(xx)
    > FROM udf_test;
16
13
112
101
107
92
A Python transform script can also take arguments
import sys

def fun1():
    for line in sys.stdin:
        line = line.strip()
        context = line.split("\t")
        s = int(context[0]) + int(context[1])
        print s

def fun2():
    for line in sys.stdin:
        line = line.strip()
        context = line.split("\t")
        q = int(context[0]) - int(context[1])
        print q

tmp = sys.argv[1]
fun = "fun" + str(tmp) + "(" + ")"
exec(fun)
hive> SELECT
    > TRANSFORM(id1,id2)
    > USING 'python aa.py 2'
    > AS(xx)
    > FROM udf_test;
OK
4
-3
46
99
91
48
Macros
hive> create temporary macro sigmoid(x double) x/20;
OK
Time taken: 0.038 seconds
hive> select sigmoid(500);
OK
25.0
hive> DROP TEMPORARY macro IF EXISTS sigmoid;
OK
Time taken: 0.011 seconds
Miscellaneous
Specifying the job queue and other job-level settings (job name, memory, etc.)
set mapreduce.job.queuename=Q_yingykf01;
set mapreduce.job.name=Zhaoli_offset_20160101_007011;
set mapreduce.map.memory.mb=9120;
set mapreduce.reduce.memory.mb=9120;
Specifying compression / serialization file formats
$HIVE_HOME/conf/hive-default.xml:
hive.exec.compress.output
hive.exec.compress.intermediate
Bzip2 Gzip LZO Snappy
Compression ratio: bzip2 > gzip > lzo; speed: lzo > gzip > bzip2
CLI
-- Compress intermediate results
set hive.exec.compress.intermediate=true;
set mapred.map.intermediate.compression.codec=org.apache.hadoop.io.compress.GzipCodec;
create table comtab8 row format delimited fields terminated by "\t" as select * from udf_test;
-- Compress the output
-- When looking at the corresponding files on HDFS, the file suffix shows the compression format
-- hadoop fs -cat shows garbled bytes; use hadoop fs -text to view; there is no header information
set hive.exec.compress.output=true;
-- the output codec is set via the Hadoop property mapred.output.compression.codec
set mapred.output.compression.codec=org.apache.hadoop.io.compress.GzipCodec;
create table comtab4 row format delimited fields terminated by "\t" as select * from udf_test;
The STORED AS keyword
Specifies the file storage format of the table on HDFS. The available formats are:
TEXTFILE // plain text, the default
SEQUENCEFILE // binary sequence file
RCFILE // columnar storage format, supported since Hive 0.6
ORC // columnar storage format with a higher compression ratio and better read/write efficiency than RCFILE, supported since Hive 0.11 (see the sketch after this list)
PARQUET // columnar storage format, supported since Hive 0.13
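For example, a minimal sketch of creating an ORC copy of a table (assumes Hive 0.11 or later; comtab_orc is an illustrative name, udf_test is the test table used elsewhere in this note):
create table comtab_orc
stored as ORC
as select * from udf_test;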
SEQUENCEFILE
create table comtab6 (
  id1 string,
  id2 string)
row format delimited fields terminated by '\t'
stored as SEQUENCEFILE;
-- hadoop fs -cat shows header information at the top of the file:
-- SEQ"org.apache.hadoop.io.BytesWritableorg.apache.hadoop.io.Text&org.apache.hadoop.io.compress.LzoCodec
set hive.exec.compress.output=true;
set mapred.output.compress=true;
set mapred.output.compression.type=BLOCK;
set mapred.output.compression.codec=org.apache.hadoop.io.compress.LzoCodec;
INSERT OVERWRITE TABLE comtab6 select * from udf_test;
Running in local mode: set hive.exec.mode.local.auto=true
Setting the following parameter enables Hive's local mode: hive> set hive.exec.mode.local.auto=true; (the default is false)
Executing with hive -e
hive -e"select * from zl.udf_test;" >> /home/zl/udf_test.txt
Executing from a shell script
#!/usr/bin/bash
# test.sh
hive -e"
set mapreduce.job.queuename=Q_yingykf01;
set mapreduce.job.name=Zhaoli_offset_20160101_007011;
select * from zl.udf_test;
"
--- Run: sh test.sh >> test.txt
Passing parameters
Method 1
#!/usr/bin/bash
# a.sh
tablename="student"
limitcount="8"
hive -e "use test; select * from ${tablename} limit ${limitcount};"
sh a.sh
Method 2
-- a.sql
set hive.exec.mode.local.auto=true;
use zl;
select * from ${hiveconf:tablename};
hive -hiveconf tablename=udf_test -f a.sql
Method 3
-- b.sql
set hive.exec.mode.local.auto=true;
use zl;
select * from ${hivevar:tablename};
hive -hivevar tablename=udf_test -f b.sql
hadoop shell
hadoop fs -ls            # list a directory
hadoop fs -du -h         # show file sizes
hadoop fs -mkdir         # create a directory
hadoop fs -put / -get    # upload / download files
hadoop fs -text XXX      # view file contents
Closing exercise
1. Using one hour of gdpi data from a given day, summarize which top-level domains (.com, .cn, etc.) appear.
2. Summarize which second-level and third-level domains appear in JD (jd.com) URLs.
Requirements:
1. Create your own test database in the 5.2 Hive environment on the test server.
2. In your test database, create an external partitioned table and load one hour of test data.
3. Write the HQL for the logic above (a rough starting sketch follows this list).
4. Once the HQL passes testing, save it as an .sh file and pass the time (day, hour) in as parameters when executing it.
5. Save the execution results to a local file.
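As a rough starting point for the top-level-domain part (a sketch only: the table name gdpi_src, its url/day/hour columns, and the date values are assumptions, not given in the exercise):
select regexp_extract(parse_url(concat('http://', url), 'HOST'), '(\\.[a-z]+)$', 1) as tld,
       count(*) as cnt
from gdpi_src
where day = '20160101' and hour = '10'
group by regexp_extract(parse_url(concat('http://', url), 'HOST'), '(\\.[a-z]+)$', 1);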