您的位置:首页 > 编程语言 > PHP开发

第一个“搜索引擎”【预处理】

2008-04-14 22:22 190 查看
在 Eclipse 中导入包的方法:先把包直接复制到工程中,然后在右键菜单的 Build Path 中将其加入即可。

下面开始我们的实例吧

package ch2.lucenedemo.preprocess;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Map;
import java.util.Set;
创造类

 

public class FilePreprocess
{
    public static void main(String[] args) 
    {
        String inputFile="d:/book.txt";
        
        String outputDir="d:/textfolder/";
        if(!new File(outputDir).exists())
            new File(outputDir).mkdirs();
        
        FilePreprocess filePreprocess=new FilePreprocess();
        filePreprocess.preprocess(new File(inputFile),outputDir);

    }
        …………
}
 

讲一下mkdirs()吧

mkdirs

public boolean mkdirs()

Creates the directory named by this abstract pathname, including any necessary but nonexistent parent directories. Note that if this operation fails it may have succeeded in creating some of the necessary parent directories.
 Returns:

true
if and only if the directory was created, along with all necessary parent directories;
false
otherwise
 

public static File charactorProcess(File file,String destFile) throws Exception
    {
        //创建一个输出流,用于写新文件
        BufferedWriter writer=new BufferedWriter(new FileWriter(destFile));
        //创建一个输入流,用于读取文件
        BufferedReader reader=new BufferedReader(new FileReader(file));
        //这个好像是把字节流转换为字符流,然后再读入
        String line=reader.readLine();
        
        while(line!=null)
        {
            if(!line.equals(" "))      // :   换行    :   回车换行 其实有没有这句无所谓
            {
                //调用replace方法替换所有的全角字符
                String newline=replace(line);
                //将替换好的String写入新的文件
                writer.write(newline);
                //写入行分割符号
                writer.newLine();
            }
            line=reader.readLine();
        }
        //关闭输入输出流
        reader.close();
        writer.close();
        //返回文件
        return new File(destFile);
    }
replace方法

 

private static String replace(String line){
        //创建一个HashMap存储全角字符和半角字符之间的对应关系;
        HashMap map=new HashMap();
        map.put(",",",");
        map.put("。",",");
        map.put("<","<");
        map.put(">",">");
        map.put("","|");
        map.put("《","《");
        map.put("》","》");
        map.put("[","[");
        map.put("]","]");
        map.put("?","?");
        map.put("“",""");
        map.put("”",""");
        map.put(":",":");
        map.put("(","(");
        map.put(")",")");
        map.put("【","[");
        map.put("】","]");
        map.put("、",",");
        map.put("——","-");
        map.put("~","~");
        map.put("!","!");
        map.put("‘","'");
        
        int length=line.length();
        for(int i=0;i<length;i++)
        {
            //逐个取得长度为1的String
            String charat=line.substring(i,i+1);
            //判断是否存在该charat;
            if(map.get(charat)!=null)
            {
                //如果存在,说明是全角字符,转换!
                line=line.replace(charat,(String)map.get(charat));
            }
        }
        //这个不能丢啊
        return line;
    }
切分文件的方法

 

public static void spiltToSmallFiles(File file,String outputpath)throws Exception
    {
        //计数器,
        int filePointer=0;
        int MAX_SIZE=10240;
        
        BufferedWriter writer=null;
        BufferedReader reader=new BufferedReader(new FileReader(file));
        StringBuffer buffer=new StringBuffer();
        String line=reader.readLine();
        //循环遍历读取的每行字符串
        while(line!=null)
        {
            buffer.append(line).append(" ");
            //判断缓冲区的长度是否大于最大长度
            if(buffer.toString().getBytes().length>=MAX_SIZE)
            {
                writer=new BufferedWriter(new FileWriter(outputpath+"output"+filePointer+".txt"));
                writer.write(buffer.toString());
                writer.close();
                filePointer++;
                //清空缓冲区,不能缺的
                buffer=new StringBuffer();
            }
            //继续读入行
            line=reader.readLine();
        }
        //最后直接将还没有读完的输入写入文件
        writer=new BufferedWriter(new FileWriter(outputpath+"output"+filePointer+".txt"));
        writer.write(buffer.toString());
        writer.close();
    }buffer.toString()返回数据的字符串形式
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息