您的位置:首页 > 其它

2013年阿里巴巴一道笔试题(大文件处理)

2016-04-25 23:50 330 查看
现有一个亿级别数据量的文件,其中有按key升序的记录,现要求通过输入key查找对应的记录。

对于这种大文件的读取,一般要采用内存文件映射;另外,通常的处理方式是对文件进行分割。把文件分割为若干小文件后,记录下每个小文件中最小的key值,然后把输入值与这些key值依次比较(由于这些最小key值本身有序,也可以直接对它们进行二分查找),便可以找到key对应的记录所在的小文件,最后把该小文件读入内存,进行二分查找。

下面是所有的程序代码,为方便,文件中仅仅记录了key。

/*
* 分别采用分割文件,在子文件中进行二分查找的方法 和 直接进行键值查找的方法进行时间对比
*/
package com.alibaba;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.EOFException;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.IntBuffer;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel;
import java.util.ArrayList;
import java.util.TreeMap;
import java.util.Map.Entry;

/**
 * Looks up an int key in a large file of big-endian ints stored in ascending order.
 *
 * <p>Two strategies are provided so their running time can be compared:
 * <ul>
 *   <li>{@link #splitFile()} + {@link #findKey(int)}: cut the file into fixed-size part
 *       files, remember the first (smallest) key of every part, pick the part that can
 *       contain the key and binary-search it in memory;</li>
 *   <li>{@link #dirFind(int)}: memory-map the whole file and scan it linearly.</li>
 * </ul>
 *
 * <p>I/O failures are reported as {@link IllegalStateException} with the original
 * {@link IOException} as cause; the public methods declare no checked exceptions.
 */
public class SortBigFile {

    /** Default size in bytes of one part file (10 MiB). */
    private static final int FILE_SIZE = 1024 * 1024 * 10;

    /** Default number of ints written by {@link #writeInt()}. */
    private static final int numOfInt = 100000000;

    /** Default directory that receives the part files. */
    private static final String divPath = "E:\\divide";

    /** First key of every part file, mapped to that part file's path. */
    private final TreeMap<Integer, String> fileMap = new TreeMap<Integer, String>();

    /** Path of the big file that is written, split and searched. */
    private final String filePath;

    /** Directory in which {@link #splitFile()} creates the part files. */
    private final String divideDir;

    /** Size in bytes of one part file; always a positive multiple of 4. */
    private final int chunkBytes;

    /**
     * Creates a searcher using the default part directory and the default part size.
     *
     * @param path path of the big file
     */
    public SortBigFile(String path) {
        this(path, divPath, FILE_SIZE);
    }

    /**
     * Creates a searcher with an explicit part directory and part size.
     *
     * @param path       path of the big file
     * @param divideDir  directory that receives the part files (created on demand)
     * @param chunkBytes size in bytes of one part file; must be a positive multiple of 4
     * @throws IllegalArgumentException if {@code divideDir} is null or {@code chunkBytes}
     *                                  is not a positive multiple of 4
     */
    public SortBigFile(String path, String divideDir, int chunkBytes) {
        if (divideDir == null) {
            throw new IllegalArgumentException("divideDir must not be null");
        }
        if (chunkBytes <= 0 || chunkBytes % 4 != 0) {
            throw new IllegalArgumentException(
                    "chunkBytes must be a positive multiple of 4: " + chunkBytes);
        }
        this.filePath = path;
        this.divideDir = divideDir;
        this.chunkBytes = chunkBytes;
    }

    /** Writes the default amount of test data; see {@link #writeInt(int)}. */
    public void writeInt() {
        writeInt(numOfInt);
    }

    /**
     * Fills the big file with the ints {@code 0 .. count-1} through a read/write mapping.
     * The file is resized to exactly {@code 4 * count} bytes so that no stale data from
     * an earlier, longer file remains behind the written keys.
     *
     * @param count number of ints to write
     * @throws IllegalArgumentException if {@code count} is negative or {@code 4 * count}
     *                                  exceeds {@code Integer.MAX_VALUE}
     * @throws IllegalStateException    if the file cannot be written
     */
    public void writeInt(int count) {
        if (count < 0) {
            throw new IllegalArgumentException("count must not be negative: " + count);
        }
        long bytes = 4L * count;
        if (bytes > Integer.MAX_VALUE) {
            // FileChannel.map rejects regions larger than Integer.MAX_VALUE bytes.
            throw new IllegalArgumentException("count too large for a single mapping: " + count);
        }
        try {
            RandomAccessFile raf = new RandomAccessFile(filePath, "rw");
            try {
                if (raf.length() != bytes) {
                    raf.setLength(bytes);
                }
                if (count > 0) {
                    IntBuffer ib = raf.getChannel()
                            .map(FileChannel.MapMode.READ_WRITE, 0, bytes).asIntBuffer();
                    for (int i = 0; i < count; ++i) {
                        ib.put(i);
                    }
                }
            } finally {
                raf.close();
            }
        } catch (IOException e) {
            throw new IllegalStateException("Cannot write test data to " + filePath, e);
        }
    }

    /**
     * Splits the big file into part files of at most {@code chunkBytes} bytes each and
     * records the first key of every part in {@code fileMap}.
     *
     * <p>Each part is mapped separately. Part files are named
     * {@code <millis>-<index>.tmp}; the index keeps names unique even when several parts
     * are created within the same millisecond. Trailing bytes that do not form a complete
     * int are ignored, and no empty part file is produced.
     *
     * @throws IllegalStateException if the part directory cannot be created or an I/O
     *                               error occurs
     */
    public void splitFile() {
        fileMap.clear();
        File dir = new File(divideDir);
        if (!dir.isDirectory() && !dir.mkdirs()) {
            throw new IllegalStateException("Cannot create split directory " + divideDir);
        }
        long stamp = System.currentTimeMillis();
        try {
            RandomAccessFile source = new RandomAccessFile(filePath, "r");
            try {
                FileChannel fc = source.getChannel();
                long usableBytes = fc.size() / 4 * 4;
                int part = 0;
                for (long offset = 0; offset < usableBytes; offset += chunkBytes, ++part) {
                    long length = Math.min(chunkBytes, usableBytes - offset);
                    IntBuffer ints = fc.map(FileChannel.MapMode.READ_ONLY, offset, length)
                            .asIntBuffer();
                    String path = new File(dir, stamp + "-" + part + ".tmp").getPath();
                    writePart(path, ints);
                    // length >= 4 here, so the part holds at least one int.
                    fileMap.put(ints.get(0), path);
                }
            } finally {
                source.close();
            }
        } catch (IOException e) {
            throw new IllegalStateException(
                    "Cannot split " + filePath + " into " + divideDir, e);
        }
    }

    /** Writes the remaining ints of {@code ints} to {@code path} in big-endian order. */
    private static void writePart(String path, IntBuffer ints) throws IOException {
        DataOutputStream dos = new DataOutputStream(
                new BufferedOutputStream(new FileOutputStream(path)));
        try {
            while (ints.hasRemaining()) {
                dos.writeInt(ints.get());
            }
        } finally {
            dos.close();
        }
    }

    /**
     * Looks up {@code key} in the part files produced by {@link #splitFile()}.
     *
     * @param key key to look for
     * @return {@code "<part path>:first num"} if the key is the first int of a part,
     *         {@code "<part path>:<index> num"} if it is found at {@code index} inside a
     *         part, or {@code "find nothing"} otherwise
     * @throws IllegalStateException if the selected part file cannot be read
     */
    public String findKey(int key) {
        // The part with the greatest first key <= key is the only one that can hold it.
        Entry<Integer, String> candidate = fileMap.floorEntry(key);
        if (candidate == null) {
            return "find nothing";
        }
        String fpath = candidate.getValue();
        if (candidate.getKey().intValue() == key) {
            return fpath + ":first num";
        }
        int index = binarySearch(fpath, key);
        if (index == -1) {
            return "find nothing";
        }
        return fpath + ":" + index + " num";
    }

    /**
     * Loads a part file into memory and binary-searches it.
     *
     * @param path path of the part file
     * @param key  key to look for
     * @return index of {@code key} inside the part file, or -1 if it is absent
     */
    private int binarySearch(String path, int key) {
        int[] nums = readInts(path);
        int lo = 0;
        int hi = nums.length - 1;
        while (lo <= hi) {
            int mid = (lo + hi) >>> 1;
            if (nums[mid] == key) {
                return mid;
            }
            if (nums[mid] < key) {
                lo = mid + 1;
            } else {
                hi = mid - 1;
            }
        }
        return -1;
    }

    /** Reads every complete big-endian int stored in the file at {@code path}. */
    private static int[] readInts(String path) {
        File file = new File(path);
        int[] nums = new int[(int) (file.length() / 4)];
        try {
            DataInputStream in = new DataInputStream(
                    new BufferedInputStream(new FileInputStream(file)));
            try {
                for (int i = 0; i < nums.length; ++i) {
                    nums[i] = in.readInt();
                }
            } finally {
                in.close();
            }
        } catch (IOException e) {
            throw new IllegalStateException("Cannot read split file " + path, e);
        }
        return nums;
    }

    /**
     * Finds {@code key} by mapping the whole big file and scanning it from the start.
     * The scan stops at the first value greater than {@code key}. Because the file is
     * mapped in one piece it must not exceed {@code Integer.MAX_VALUE} bytes.
     *
     * @param key key to look for
     * @return index of {@code key} in the big file, or -1 if it is absent
     * @throws IllegalStateException if the file cannot be read
     */
    public int dirFind(int key) {
        try {
            RandomAccessFile raf = new RandomAccessFile(filePath, "r");
            try {
                FileChannel fc = raf.getChannel();
                IntBuffer ints = fc.map(FileChannel.MapMode.READ_ONLY, 0, fc.size())
                        .asIntBuffer();
                int index = 0;
                while (ints.hasRemaining()) {
                    int num = ints.get();
                    if (num == key) {
                        return index;
                    }
                    if (num > key) {
                        return -1;
                    }
                    ++index;
                }
                return -1;
            } finally {
                raf.close();
            }
        } catch (IOException e) {
            throw new IllegalStateException("Cannot scan " + filePath, e);
        }
    }

    /**
     * Writes the test file, then prints the time (in whole seconds) taken by the
     * split-and-search strategy and by the direct scan.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        final int key = 88732723;
        SortBigFile bigfile = new SortBigFile("E:\\test.tmp");

        bigfile.writeInt();

        // Split the file, then look the key up in the matching part.
        long start = System.currentTimeMillis();
        bigfile.splitFile();
        long withoutDiv = System.currentTimeMillis();
        System.out.println(bigfile.findKey(key));
        long end = System.currentTimeMillis();
        System.out.println("with divide file time:" + (end - start) / 1000);
        System.out.println("without divide file time:" + (end - withoutDiv) / 1000);

        System.out.println("---------------");
        // Scan the unsplit file directly.
        start = System.currentTimeMillis();
        System.out.println("Index:" + bigfile.dirFind(key));
        end = System.currentTimeMillis();
        System.out.println((end - start) / 1000);
    }
}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: