sqoop&Hive实例
2016-06-28 16:47
176 查看
案例:
1. 把原始log数据加载到表beifenglog中;
2. 建立子表beifenglog_hour_visit,存贮常用字段,并从原表中提取数据到子表中;
3. 提取原表数据过程中,使用UDF处理字段信息,存储到子表中;
1. 把原始log数据加载到表beifenglog中;
加载原表数据
-- Bulk-load the raw access-log file from the local filesystem,
-- replacing any existing contents of the source table.
LOAD DATA LOCAL INPATH '/opt/test/beifenglog.data' OVERWRITE INTO TABLE beifenglog;
2. 建立子表beifenglog_hour_visit,存贮常用字段,并从原表中提取数据到子表中;
报错1:
Caused by: org.apache.hadoop.hive.ql.metadata.HiveException: java.lang.ClassNotFoundException: Class org.apache.hadoop.hive.contrib.serde2.RegexSerDe not found
at org.apache.hadoop.hive.ql.exec.MapOperator.getConvertedOI(MapOperator.java:334)
at org.apache.hadoop.hive.ql.exec.MapOperator.setChildren(MapOperator.java:352)
at org.apache.hadoop.hive.ql.exec.mr.ExecMapper.configure(ExecMapper.java:126)
解决: 在 hive-site.xml 中配置 hive.aux.jars.path,使其指向 hive-contrib 的 jar 包(具体配置见下文报错1的解决方案)。
报错2:
Caused by: org.apache.hadoop.hive.ql.metadata.HiveException: org.apache.hadoop.hive.ql.metadata.HiveException: java.lang.IllegalArgumentException: No enum constant org.apache.hadoop.hive.ql.io.orc.CompressionKind.snappy
at org.apache.hadoop.hive.ql.exec.FileSinkOperator.createBucketFiles(FileSinkOperator.java:469)
at org.apache.hadoop.hive.ql.exec.FileSinkOperator.processOp(FileSinkOperator.java:550)
at org.apache.hadoop.hive.ql.exec.Operator.forward(Operator.java:796)
at org.apache.hadoop.hive.ql.exec.SelectOperator.processOp(SelectOperator.java:84)
at org.apache.hadoop.hive.ql.exec.Operator.forward(Operator.java:796)
at org.apache.hadoop.hive.ql.exec.TableScanOperator.processOp(TableScanOperator.java:92)
at org.apache.hadoop.hive.ql.exec.Operator.forward(Operator.java:796)
at org.apache.hadoop.hive.ql.exec.MapOperator.process(MapOperator.java:549)
... 9 more
解决:重新创建子表
("orc.compress"="snappy") --> ("orc.compress"="SNAPPY")。说明:ORC 的表属性名本来就是 "orc.compress",报错是因为 CompressionKind 枚举常量为大写,值必须写成 "SNAPPY";如果把属性名改成 "orc.compression",该属性只会被忽略并退回默认的 ZLIB 压缩,snappy 实际并未生效。
3. 提取原表数据过程中,使用UDF处理字段信息,存储到子表中;
FormatDate.java内容如下:
RemoveQuote.java内容如下:
1. 把原始log数据加载到表beifenglog中;
2. 建立子表beifenglog_hour_visit,存贮常用字段,并从原表中提取数据到子表中;
3. 提取原表数据过程中,使用UDF处理字段信息,存储到子表中;
1. 把原始log数据加载到表beifenglog中;
-- Source table for the raw nginx access-log lines, parsed with the contrib
-- RegexSerDe: each capture group of "input.regex" maps positionally to one
-- declared column, so the column order must match the group order exactly.
-- NOTE(review): org.apache.hadoop.hive.contrib.serde2.RegexSerDe ships in
-- hive-contrib*.jar, which must be registered via hive.aux.jars.path in
-- hive-site.xml, otherwise queries fail with ClassNotFoundException.
create table beifenglog( remote_addr string, remote_user string, time_local string, request string, status string, body_bytes_sent string, request_body string, http_referer string, http_user_agent string, http_x_forwarded_for string, host string) row format serde 'org.apache.hadoop.hive.contrib.serde2.RegexSerDe' with serdeproperties( "input.regex" = "(\\\"[\\d\\.]+\\\") (\\\"[^ ]+\\\") (\\\".*?\\\") (\\\".*?\\\") (\\\"\\d+\\\") (\\\"\\d+\\\") ([^ ]+) (\\\"[^ ]+\\\") (\\\".*?\\\") (\\\"[^ ]+\\\") (\\\"[^ ]+\\\")" ) stored as textfile;
加载原表数据
-- Bulk-load the raw access-log file from the local filesystem into the
-- source table, overwriting whatever it currently holds.
LOAD DATA LOCAL INPATH '/opt/test/beifenglog.data' OVERWRITE INTO TABLE beifenglog;
2. 建立子表beifenglog_hour_visit,存贮常用字段,并从原表中提取数据到子表中;
-- Child table keeping only the frequently queried columns, stored as ORC.
-- FIX: the ORC table property is named "orc.compress", and the Hive
-- CompressionKind enum constants are upper-case, so the value must be
-- "SNAPPY". A lower-case "snappy" raises
-- IllegalArgumentException: No enum constant ...CompressionKind.snappy,
-- while a misspelled key ("orc.compression") is silently ignored and the
-- table falls back to the default ZLIB codec.
CREATE TABLE beifenglog_hour_visit(
    remote_addr  string,
    time_local   string,
    request      string,
    http_referer string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS ORC
TBLPROPERTIES ("orc.compress"="SNAPPY");

-- Populate the child table from the raw source table.
INSERT OVERWRITE TABLE beifenglog_hour_visit
SELECT
    remote_addr,
    time_local,
    request,
    http_referer
FROM beifenglog;
报错1:
Caused by: org.apache.hadoop.hive.ql.metadata.HiveException: java.lang.ClassNotFoundException: Class org.apache.hadoop.hive.contrib.serde2.RegexSerDe not found
at org.apache.hadoop.hive.ql.exec.MapOperator.getConvertedOI(MapOperator.java:334)
at org.apache.hadoop.hive.ql.exec.MapOperator.setChildren(MapOperator.java:352)
at org.apache.hadoop.hive.ql.exec.mr.ExecMapper.configure(ExecMapper.java:126)
解决: 在 hive-site.xml 中增加如下配置,把 hive-contrib 的 jar 加入辅助类路径:
<!-- Put hive-contrib (which provides the contrib RegexSerDe) on the
     auxiliary classpath so every job can resolve it. -->
<property>
    <name>hive.aux.jars.path</name>
    <value>file:///opt/modules/cdh/hive-0.13.1-cdh5.3.6/lib/hive-contrib-0.13.1-cdh5.3.6.jar</value>
    <description>Added by tiger.zeng on 20120202.These JAR file are available to all users for all jobs</description>
</property>
报错2:
Caused by: org.apache.hadoop.hive.ql.metadata.HiveException: org.apache.hadoop.hive.ql.metadata.HiveException: java.lang.IllegalArgumentException: No enum constant org.apache.hadoop.hive.ql.io.orc.CompressionKind.snappy
at org.apache.hadoop.hive.ql.exec.FileSinkOperator.createBucketFiles(FileSinkOperator.java:469)
at org.apache.hadoop.hive.ql.exec.FileSinkOperator.processOp(FileSinkOperator.java:550)
at org.apache.hadoop.hive.ql.exec.Operator.forward(Operator.java:796)
at org.apache.hadoop.hive.ql.exec.SelectOperator.processOp(SelectOperator.java:84)
at org.apache.hadoop.hive.ql.exec.Operator.forward(Operator.java:796)
at org.apache.hadoop.hive.ql.exec.TableScanOperator.processOp(TableScanOperator.java:92)
at org.apache.hadoop.hive.ql.exec.Operator.forward(Operator.java:796)
at org.apache.hadoop.hive.ql.exec.MapOperator.process(MapOperator.java:549)
... 9 more
解决:重新创建子表
("orc.compress"="snappy") --> ("orc.compress"="SNAPPY")。说明:ORC 的表属性名本来就是 "orc.compress",报错是因为 CompressionKind 枚举常量为大写,值必须写成 "SNAPPY";如果把属性名改成 "orc.compression",该属性只会被忽略并退回默认的 ZLIB 压缩,snappy 实际并未生效。
3. 提取原表数据过程中,使用UDF处理字段信息,存储到子表中;
-- Register the UDF jar, expose the two helper functions for this session,
-- then rebuild the child table with cleaned-up column values.
ADD JAR /opt/test/hive.jar;
CREATE TEMPORARY FUNCTION removequote AS 'org.gh.hadoop.hive.test.RemoveQuote';
CREATE TEMPORARY FUNCTION formatdate AS 'org.gh.hadoop.hive.test.FormatDate';

INSERT OVERWRITE TABLE beifenglog_hour_visit
SELECT
    removequote(remote_addr),
    formatdate(removequote(time_local)),
    removequote(request),
    removequote(http_referer)
FROM beifenglog;
FormatDate.java内容如下:
import java.text.SimpleDateFormat; import java.util.Date; import java.util.Locale; import org.apache.hadoop.hive.ql.exec.UDF; public class FormatDate extends UDF{ private SimpleDateFormat inputFormat = new SimpleDateFormat("dd/MMM/yyyy:HH:mm:ss", Locale.ENGLISH); private SimpleDateFormat outputFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); public String evaluate(String dateStr){ if(dateStr == null){ return null; } Date date = null; String ret = null; try{ date = inputFormat.parse(dateStr); ret = outputFormat.format(date); }catch(Exception e){ e.printStackTrace(); return null; } return ret; } }
RemoveQuote.java内容如下:
package org.gh.hadoop.hive.test;

import org.apache.hadoop.hive.ql.exec.UDF;

/**
 * Hive UDF that strips every double-quote character from a column value,
 * undoing the quoting produced by the RegexSerDe capture groups.
 */
public class RemoveQuote extends UDF {

    /**
     * @param value input string; may be null
     * @return the input with all '"' characters removed, or null for null input
     */
    public String evaluate(String value) {
        if (value == null) {
            return null;
        }
        return value.replaceAll("\"", "");
    }
}
相关文章推荐
- MAC下date -r 的使用
- Camera driver&V4L2驱动架构介绍
- windows 安全设置
- Linux grep命令用法以及正则表达式
- (转) Linux调优方案,sysctl.conf的设置
- SQL Server 启用 xp_cmdshell 与bcp 使用
- FastDFS安装与nginx反向代理配置
- 研读《高可用多节点集群技术的研究和实现》-----集群系统的软件体系结构
- linux超级终端minicom的使用方法
- 支付系统开发--简易版支付系统介绍
- Linux 文件IO
- 1千用户与1千万用户的网站系统架构区别?
- 爱Linux、爱开源的微软正式发布了.NET Core 1.0
- Linux系统SCSI磁盘扫描机制解析及命令实例
- 全面了解 Linux 服务器 - 3. 查看 Linux 服务器的硬盘使用情况
- 第7章 构建大型网站的其他因素
- linux小知识之/etc/sudoers文件应用实例
- 使用consul实现服务的注册和发现
- 1号店交易系统架构如何向「高并发高可用」演进
- Linux系统查看版本信息