您的位置:首页 > 编程语言 > Java开发

java提取文档纯文本

2018-04-12 08:06 519 查看
package com.linzl.cn.convert;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.sql.Timestamp;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;

import org.apache.tika.Tika;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.SAXException;

/**
 * Extracts the plain-text content of documents with Apache Tika.
 *
 * <p>Tika distributions can be downloaded from http://archive.apache.org/dist/tika/
 *
 * @author linzl
 */
public class GetPlainTextUtil {
    /** Directory holding the sample documents read by the no-argument methods. */
    private static final String BASE_DIR = "D:/测试目录/pureText/";

    /**
     * Extracts the text of the bundled sample archive {@code openWindow.zip}
     * through the {@link Tika} facade.
     *
     * @return the text produced by {@code Tika.parseToString}
     * @throws IOException if the sample file cannot be opened or read
     * @throws SAXException never thrown by this implementation; kept in the
     *         signature so existing callers keep compiling
     * @throws TikaException if Tika fails to parse the document
     */
    public String parseToString() throws IOException, SAXException,
            TikaException {
        return parseToString(new File(BASE_DIR + "openWindow.zip"));
    }

    /**
     * Extracts the text of an arbitrary file through the {@link Tika} facade.
     *
     * @param file the document to read
     * @return the text produced by {@code Tika.parseToString}
     * @throws IOException if the file cannot be opened or read
     * @throws TikaException if Tika fails to parse the document
     */
    public String parseToString(File file) throws IOException, TikaException {
        // Build the facade before opening the stream so that a failure here
        // cannot leave an open file handle behind.
        Tika tika = new Tika();
        try (InputStream stream = new FileInputStream(file)) {
            return tika.parseToString(stream);
        }
    }

    /**
     * Extracts the body text of the bundled sample template {@code 2007.xltx}
     * with an {@link AutoDetectParser}.
     *
     * <p>Other sample documents can be processed by passing them to
     * {@link #parseToPlainText(File)}.
     *
     * @return the text collected by the body content handler
     * @throws IOException if the sample file cannot be opened or read
     * @throws SAXException if the content handler reports an error
     * @throws TikaException if Tika fails to parse the document
     */
    public String parseToPlainText() throws IOException, SAXException,
            TikaException {
        return parseToPlainText(new File(BASE_DIR + "2007.xltx"));
    }

    /**
     * Extracts the body text of an arbitrary file with an
     * {@link AutoDetectParser} feeding a {@link BodyContentHandler}.
     *
     * @param file the document to read
     * @return the text collected by the body content handler
     * @throws IOException if the file cannot be opened or read
     * @throws SAXException if the content handler reports an error
     * @throws TikaException if Tika fails to parse the document
     */
    public String parseToPlainText(File file) throws IOException, SAXException,
            TikaException {
        // NOTE(review): per the Tika Javadoc the no-arg BodyContentHandler
        // buffers a bounded number of characters (100k) and raises a
        // SAXException beyond that -- confirm against the Tika version in use
        // and pass an explicit limit if larger documents are expected.
        BodyContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();
        AutoDetectParser parser = new AutoDetectParser();

        // All collaborators exist before the stream is opened, so nothing
        // between open and close can fail outside the guarded region.
        try (InputStream stream = new FileInputStream(file)) {
            parser.parse(stream, handler, metadata);
            return handler.toString();
        }
    }

    /**
     * Demo entry point: extracts the default sample document, prints its text
     * and then the elapsed time in milliseconds.
     *
     * @param args ignored
     */
    public static void main(String[] args) throws IOException, SAXException,
            TikaException {
        long start = System.currentTimeMillis();
        // The extracted plain text contains many line breaks and still needs
        // post-processing.
        String content = new GetPlainTextUtil().parseToPlainText();
        System.out.println(content);
        long end = System.currentTimeMillis();
        // parseToString() can be exercised the same way.
        System.out.println("时间:" + (end - start));
    }
}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: