HBase Batch Import, Part 2
2016-04-09 00:00
288 views
Abstract: This post updates the code from the previous HBase batch-import post. It can now import 300 CSV files (768,000 rows) in a single run, but importing 2000+ files at once fails with a warning that memory usage has exceeded the safe threshold, and I don't know why. (Hadoop runs pseudo-distributed inside a virtual machine.)
The code is adapted from a batch-import-to-HBase example found online; I have lost the original link.
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

public class UptoHBase {

    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        final Configuration configuration = new Configuration();
        // ZooKeeper quorum (pseudo-distributed, so everything runs on localhost)
        configuration.set("hbase.zookeeper.quorum", "localhost");
        // Name of the target HBase table
        configuration.set(TableOutputFormat.OUTPUT_TABLE, "Bearing1_1_acc");
        // Raise this value so HBase does not time out and abort during the import
        configuration.set("dfs.socket.timeout", "180000");

        final Job job = new Job(configuration, "HBaseBatchImport");
        job.setMapperClass(BatchImportMapper.class);
        job.setReducerClass(BatchImportReducer.class);
        // Only the map output types are set; the reduce output goes through TableOutputFormat
        job.setMapOutputKeyClass(LongWritable.class);
        job.setMapOutputValueClass(Text.class);
        job.setInputFormatClass(TextInputFormat.class);
        // No output path is set; the output format writes to the HBase table instead
        job.setOutputFormatClass(TableOutputFormat.class);
        FileInputFormat.setInputPaths(job, "hdfs://localhost:9000/user/hadoop/Bearing1_1_acc");
        job.waitForCompletion(true);
    }

    static class BatchImportMapper extends Mapper<LongWritable, Text, LongWritable, Text> {
        Text v2 = new Text();

        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            final String[] splited = value.toString().split(",");
            try {
                // Left-pad the first three fields to 2 digits and the fourth to 11 digits,
                // then concatenate them into a fixed-width rowkey. This assumes no field is
                // wider than its target width; a wider field makes substring() throw.
                splited[0] = "00".substring(0, 2 - splited[0].length()) + splited[0];
                splited[1] = "00".substring(0, 2 - splited[1].length()) + splited[1];
                splited[2] = "00".substring(0, 2 - splited[2].length()) + splited[2];
                splited[3] = "00000000000".substring(0, 11 - splited[3].length()) + splited[3];
                String rowKey = splited[0] + splited[1] + splited[2] + splited[3];
                // Prepend the rowkey to the original line for the reducer
                v2.set(rowKey + "," + value.toString());
                context.write(key, v2);
            } catch (IndexOutOfBoundsException | NumberFormatException e) {
                // Malformed lines (too few fields, or fields wider than the padding)
                // are counted here instead of crashing the job
                final Counter counter = context.getCounter("BatchImport", "ErrorFormat");
                counter.increment(1L);
                System.out.println("Bad record: " + splited[0] + " " + e.getMessage());
            }
        }
    }

    static class BatchImportReducer extends TableReducer<LongWritable, Text, NullWritable> {
        protected void reduce(LongWritable key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            for (Text text : values) {
                // Field 0 is the rowkey built in the mapper; the original columns follow, shifted by one
                final String[] splited = text.toString().split(",");
                final Put put = new Put(Bytes.toBytes(splited[0]));
                put.add(Bytes.toBytes("data"), Bytes.toBytes("h"), Bytes.toBytes(splited[5]));
                put.add(Bytes.toBytes("data"), Bytes.toBytes("l"), Bytes.toBytes(splited[6]));
                // Other fields are omitted; further put.add(...) calls would store them
                context.write(NullWritable.get(), put);
            }
        }
    }
}
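
To make the rowkey construction concrete, here is a minimal standalone sketch of what one record goes through. The input line is hypothetical (the post does not show the actual Bearing1_1_acc CSV contents), but the padding logic is the same as in BatchImportMapper:

public class RowKeyPaddingDemo {
    public static void main(String[] args) {
        // Hypothetical CSV record: four integer fields followed by two readings
        String line = "1,2,3,123,0.512,-0.034";
        String[] f = line.split(",");
        // Pad fields 0-2 to 2 digits and field 3 to 11 digits, exactly as the mapper does
        f[0] = "00".substring(0, 2 - f[0].length()) + f[0];            // "01"
        f[1] = "00".substring(0, 2 - f[1].length()) + f[1];            // "02"
        f[2] = "00".substring(0, 2 - f[2].length()) + f[2];            // "03"
        f[3] = "00000000000".substring(0, 11 - f[3].length()) + f[3];  // "00000000123"
        String rowKey = f[0] + f[1] + f[2] + f[3];
        System.out.println(rowKey);  // prints 01020300000000123
        // The mapper emits rowKey + "," + the original line, so after the reducer's
        // split(",") all columns shift right by one: splited[5] is "0.512" (stored
        // as data:h) and splited[6] is "-0.034" (stored as data:l).
    }
}

Note that TableOutputFormat writes into an existing table, so Bearing1_1_acc with column family data has to be created in HBase before the job runs.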
The screenshot below shows the error raised when importing 2000+ CSV files in one run; it appeared while scanning the table.
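
One guess about the memory problem, offered as an assumption rather than a confirmed diagnosis: with TextInputFormat every CSV file produces at least one input split, so 2000+ files mean 2000+ map tasks, which is a heavy load for a pseudo-distributed setup inside a VM. Hadoop 2.x ships CombineTextInputFormat, which packs many small files into each split; a minimal sketch of the change in main() would be:

import org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat;

// Replaces the two TextInputFormat/setInputPaths lines above; not verified on this cluster
job.setInputFormatClass(CombineTextInputFormat.class);
// Cap each combined split at 64 MB -- an assumed value to tune for the VM's memory
CombineTextInputFormat.setMaxInputSplitSize(job, 64L * 1024 * 1024);
CombineTextInputFormat.setInputPaths(job, "hdfs://localhost:9000/user/hadoop/Bearing1_1_acc");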