Hadoop Serialization
2015-12-21 16:51
Using Hadoop's serialization
To make a class serializable in the Hadoop framework, it must implement the two methods of the Writable interface:
Java code:
public interface Writable {
    void write(DataOutput out) throws IOException;
    void readFields(DataInput in) throws IOException;
}
This is more work than implementing Java's Serializable, but a comparison shows that Hadoop's serialization mechanism produces far less data than Java's built-in serialization.
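As a rough illustration of that size difference, here is a minimal sketch (not part of the original post; the class name SizeComparison is made up) that writes a single int once through an IntWritable and once through ObjectOutputStream, then prints the resulting byte counts. The Writable path emits just the 4-byte value, while the Java-serialized Integer also carries a stream header and class metadata.
Java code:
import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.ObjectOutputStream;

import org.apache.hadoop.io.IntWritable;

public class SizeComparison {
    public static void main(String[] args) throws IOException {
        // Hadoop-style: IntWritable writes its value as 4 raw bytes via DataOutput.
        ByteArrayOutputStream writableBytes = new ByteArrayOutputStream();
        new IntWritable(163).write(new DataOutputStream(writableBytes));

        // Java-style: ObjectOutputStream adds a stream header and class
        // metadata on top of the 4-byte value.
        ByteArrayOutputStream javaBytes = new ByteArrayOutputStream();
        ObjectOutputStream oos = new ObjectOutputStream(javaBytes);
        oos.writeObject(Integer.valueOf(163));
        oos.close();

        System.out.println("Writable:     " + writableBytes.size() + " bytes"); // 4 bytes
        System.out.println("Serializable: " + javaBytes.size() + " bytes");     // much larger
    }
}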
Within write() and readFields() you control the reading and writing of the fields yourself. If the class holds a reference to another object, that object should also implement Writable (strictly speaking it does not have to, as long as you take care of storing its fields yourself; see the sketch at the end of this article).
Below is a simple example:
The Attribute class
Java code:
package siat.miner.etl.instance;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;

public class Attribute implements Writable {

    public static final int ATTRIBUTE_TYPE_STRING = 1;  // string type
    public static final int ATTRIBUTE_TYPE_NOMINAL = 2; // nominal type
    public static final int ATTRIBUTE_TYPE_REAL = 3;    // real type

    private IntWritable type;
    private Text name;

    public IntWritable getType() {
        return type;
    }

    public void setType(int type) {
        this.type = new IntWritable(type);
    }

    public Text getName() {
        return name;
    }

    public void setName(String name) {
        this.name = new Text(name);
    }

    public Attribute() {
        super();
        this.type = new IntWritable(0);
        this.name = new Text("");
    }

    public Attribute(int type, String name) {
        super();
        this.type = new IntWritable(type);
        this.name = new Text(name);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        // Read the fields back in the same order they were written.
        type.readFields(in);
        name.readFields(in);
    }

    @Override
    public void write(DataOutput out) throws IOException {
        // Delegate to the Writable fields, one after another.
        type.write(out);
        name.write(out);
    }
}
The TestA class:
Java code:
package siat.miner.etl.test;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInput;
import java.io.DataInputStream;
import java.io.DataOutput;
import java.io.DataOutputStream;
import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Writable;

import siat.miner.etl.instance.Attribute;

public class TestA implements Writable {

    private Attribute a;
    private IntWritable b;

    /**
     * @param args
     * @throws IOException
     */
    public static void main(String[] args) throws IOException {
        // Serialize a TestA instance into an in-memory byte array ...
        Attribute a = new Attribute(Attribute.ATTRIBUTE_TYPE_NOMINAL, "name");
        TestA ta = new TestA(a, new IntWritable(1));
        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        DataOutputStream oos = new DataOutputStream(bos);
        ta.write(oos);

        // ... then deserialize it back into a fresh object.
        TestA tb = new TestA();
        tb.readFields(new DataInputStream(new ByteArrayInputStream(bos.toByteArray())));
        System.out.println(tb.a.getName() + " / " + tb.b.get()); // prints: name / 1
    }

    public TestA(Attribute a, IntWritable b) {
        super();
        this.a = a;
        this.b = b;
    }

    public TestA() {
        // No-argument constructor; the fields are populated in readFields().
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        // Recreate each field, then let it read its own bytes, in write order.
        a = new Attribute();
        a.readFields(in);
        b = new IntWritable();
        b.readFields(in);
    }

    @Override
    public void write(DataOutput out) throws IOException {
        // Each field serializes itself; the order must match readFields().
        a.write(out);
        b.write(out);
    }
}
As you can see, Hadoop's serialization mechanism relies on Java's DataInput and DataOutput to serialize the primitive types, and leaves it to the user to handle serialization of their own classes.
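As mentioned earlier, a referenced object does not strictly have to implement Writable; the enclosing class can store its fields by hand. Here is a minimal sketch of that alternative (the class names Point and Record are hypothetical, not from the original example): Point is a plain Java class, and Record writes its fields directly with writeUTF/writeInt.
Java code:
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Writable;

// A plain class that does NOT implement Writable.
class Point {
    int x;
    int y;

    Point(int x, int y) {
        this.x = x;
        this.y = y;
    }
}

public class Record implements Writable {

    private String label;
    private Point point;

    public Record() {
        // Fields are filled in by readFields().
    }

    public Record(String label, Point point) {
        this.label = label;
        this.point = point;
    }

    @Override
    public void write(DataOutput out) throws IOException {
        // Store the non-Writable member field by field.
        out.writeUTF(label);
        out.writeInt(point.x);
        out.writeInt(point.y);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        // Read back in exactly the same order as write().
        label = in.readUTF();
        point = new Point(in.readInt(), in.readInt());
    }
}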