您的位置:首页 > 编程语言 > Python开发

hive之Python UDF函数操作map数据 详解 全是干货

2017-09-30 10:23 816 查看
#1、Hive基本操作:

查看dw.full_h_usr_base_user的详细信息,可以获取数据文件的存放路径

-- Show full table metadata, including the HDFS location of the data files.
describe formatted dw.full_h_usr_base_user;

-- List the table's data files. (The original line pasted the
-- "dfs -ls" prefix twice, which is not a valid command.)
dfs -ls hdfs://BIGDATA:9000/user/hive/warehouse/dw.db/full_h_usr_base_user;

删除外表full_h_usr_base_user的数据

-- Recursively delete the external table's data directory.
-- (The original pasted "dfs -ls" into the middle of the command, and
-- "-rmdir" only removes EMPTY directories — use "-rm -r" for table data.)
dfs -rm -r hdfs://BIGDATA:9000/user/hive/warehouse/dw.db/full_h_usr_base_user;

#192.168.1.181 192.168.1.1

#2、创建带有map数据类型的外表

-- User test table with a map-typed geo column.
-- In the original DDL the map column's COMMENT string was split around an
-- inline "--" comment, and "--" comments were glued directly onto string
-- literals (e.g. ","--...), which is fragile in the Hive CLI. All comments
-- now sit on their own lines, outside the statement tokens.
create external table dw.full_h_usr_base_user(
    user_id string comment '用户id',
    reg_ip  string comment 'ip',
    -- map<string,string> column; the comment lists the expected keys
    reg_ip_geo_map map<string,string> comment
        'city_id,city_name,isp,province_id,province_name,country_id,country_name,postzip,district,province'
)
comment '用户测试表'
partitioned by (ds string comment '当前时间,用于分区字段')
row format delimited
    fields terminated by '\t'
    -- map entries are separated by "," and key/value pairs by ":"
    collection items terminated by ','
    map keys terminated by ':'
-- plain text storage
stored as textfile;

#3、加载数据(指定user_id和reg_ip即可,reg_ip_geo_map可以通过UDF运算出来)

-- Load the local raw file into the 2017-09-25 partition,
-- replacing any data already in that partition.
load data local inpath '/opt/data/dw.full_h_usr_base_user.del'
overwrite into table dw.full_h_usr_base_user
partition (ds = '2017-09-25');

4000

#4、自定义函数:Python UDF函数ip_to_num.py

#coding=utf-8
# Version: python3.5.2
# Tools: Pycharm
# Date:
#
# Hive TRANSFORM UDF: reads tab-separated rows
# (user_id, reg_ip, reg_ip_geo_map, ds) from stdin, looks up reg_ip in a
# local CSV IP-range database, and writes the same four columns back with
# reg_ip_geo_map replaced by a "key:value,key:value,..." string that Hive
# later turns into a real map via str_to_map(col, ',', ':').

__author__ = "Colby"

import socket
import struct
import sys

# Path to the IP geo database (UTF-8 CSV, one IP range per row).
IP_DB = '/opt/data/IP_utf-8.csv'


def ip_to_num(ip):
    """Convert a dotted-quad IPv4 string to its integer value.

    Unpacks in explicit network byte order ("!I"), which is
    platform-independent — the original unpacked with the native byte
    order and then called socket.ntohl(), which only works on
    little-endian hosts by accident of the double swap.
    """
    return struct.unpack("!I", socket.inet_aton(ip))[0]


def load_ip_db(path):
    """Read the IP database once and return a list of row-field lists.

    The first line is treated as a CSV header and skipped.
    NOTE(review): the original tried to skip a header with
    `if count == 9`, but `count` was only incremented inside that
    branch, so the skip never fired; `count == 0` was presumably
    intended — confirm against the CSV file.
    """
    rows = []
    with open(path, 'r', encoding="utf-8") as f:
        for i, raw in enumerate(f):
            if i == 0:  # header row
                continue
            rows.append(raw.split(','))
    return rows


def lookup_geo(rows, reg_ip, num_ip):
    """Return the "k:v,..." geo string for num_ip, or None if no range matches.

    Each row is expected to be laid out as
    [?, ?, start_ip, end_ip, country, province, city, ?, corporation, ...]
    — assumed from the original index usage; TODO confirm against the CSV.
    """
    for fields in rows:
        if int(fields[2]) <= num_ip <= int(fields[3]):
            # Ordered pairs (not a dict) so the output is deterministic.
            pairs = [
                ('IP', reg_ip),
                ('nationality', fields[4]),
                ('province', fields[5]),
                ('city', fields[6]),
                ('Corporation', fields[8]),
            ]
            # Same wire format the original produced by munging str(dict):
            # comma-separated "key:value" with all spaces stripped.
            return ','.join('{0}:{1}'.format(k, str(v).replace(' ', ''))
                            for k, v in pairs)
    return None


def main():
    """Stream stdin rows through the geo lookup and print enriched rows."""
    # Load the database ONCE — the original reopened and rescanned the
    # whole CSV file for every single stdin row.
    rows = load_ip_db(IP_DB)
    for line in sys.stdin:
        line = line.strip()
        user_id, reg_ip, reg_ip_geo_map, ds = line.split('\t')
        geo = lookup_geo(rows, reg_ip, ip_to_num(reg_ip))
        if geo is not None:
            reg_ip_geo_map = geo
        # When no range matches, the incoming reg_ip_geo_map column is
        # passed through unchanged (matches the original behavior).
        print('\t'.join([user_id, reg_ip, reg_ip_geo_map, ds]))


if __name__ == '__main__':
    main()


#5、将udf函数文件上传文件到服务器指定目录 /opt/udf/

/opt/udf/ip_to_num.py

#6、进入hive命令行,add文件

add file /opt/udf/ip_to_num.py;

#Added resources: [/opt/udf/ip_to_num.py]

#7、使用udf函数并进行测试

-- Sanity-check the UDF: stream every row through ip_to_num.py and
-- inspect the output without writing anything back.
select
    transform (user_id, reg_ip, reg_ip_geo_map, ds)
    using 'python3 ip_to_num.py'
    as (user_id, reg_ip, reg_ip_geo_map, ds)
from dw.full_h_usr_base_user;

#8、函数处理数据,并且overwrite表dw.full_h_usr_base_user,注意动态分区参数

-- Run the UDF over the whole table and write the result back in place,
-- converting the serialized "k:v,..." geo string into a real map column.
-- Dynamic partitioning must be non-strict because ds gets no literal value.
set hive.exec.dynamic.partition.mode=nonstrict;
insert overwrite table dw.full_h_usr_base_user partition (ds)
select
    user_id,
    reg_ip,
    str_to_map(reg_ip_geo_map, ',', ':') as reg_ip_geo_map,
    ds
from (
    select
        transform (user_id, reg_ip, reg_ip_geo_map, ds)
        using 'python3 ip_to_num.py'
        as (user_id, reg_ip, reg_ip_geo_map, ds)
    from dw.full_h_usr_base_user
) as transformed
;

#9、查询处理好的数据,学会查询map类型的数据

-- Read individual keys out of the map column with the map['key'] accessor.
select
    user_id,
    reg_ip_geo_map['province'],
    reg_ip_geo_map['city'],
    reg_ip_geo_map['nationality']
from dw.full_h_usr_base_user
where ds = '2017-09-25'
  and user_id = '1000000015';

输出结果:

OK

1000000015      安徽省  合肥市  中国

Time taken: 0.107 seconds, Fetched: 1 row(s)

#动态分区,将字符串转换成MAP

#set hive.exec.dynamic.partition.mode=nonstrict;

#insert into dw.full_h_usr_base_user partition(ds) 

#select user_id

#, reg_ip

#, str_to_map(reg_ip_geo_map) reg_ip_geo_map

#,ds from dw.full_h_usr_base_user_tmp;
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签:  Hive Python UDF Map