使用 POI 读取 Word 文档并转换为 HTML
2015-08-21 12:09
686 查看
关于使用 Apache POI 读取 Word 文档的资料比较少,所以只能自己慢慢摸索。这篇文章属于比较基础的入门内容,主要介绍如何读取 Word 中的图片以及文字的各种样式(字体、字号、粗体、斜体)。如有不足之处,请各位多多指教!
转自:http://z276356445t.iteye.com/blog/963950
/** * */ package com.util; import java.io.BufferedWriter; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStream; import java.io.OutputStreamWriter; import org.apache.poi.hwpf.HWPFDocument; import org.apache.poi.hwpf.model.PicturesTable; import org.apache.poi.hwpf.usermodel.CharacterRun; import org.apache.poi.hwpf.usermodel.Picture; import org.apache.poi.hwpf.usermodel.Range; /** * * @author 张廷 下午10:36:40 * */ public class WordToHtml { /** * 回车符ASCII码 */ private static final short ENTER_ASCII = 13; /** * 空格符ASCII码 */ private static final short SPACE_ASCII = 32; /** * 水平制表符ASCII码 */ private static final short TABULATION_ASCII = 9; private String htmlText = ""; /** * 读取每个文字样式 * * @param fileName * @throws Exception */ public void getWordAndStyle(String fileName) throws Exception { FileInputStream in = new FileInputStream(new File(fileName)); HWPFDocument doc = new HWPFDocument(in); // 取得文档中字符的总数 int length = doc.characterLength(); // 创建图片容器 PicturesTable pTable = doc.getPicturesTable(); htmlText = "<html><head><title>" + doc.getSummaryInformation().getTitle() + "</title></head><body>"; // 创建临时字符串,好加以判断一串字符是否存在相同格式 String tempString = ""; for (int i = 0; i < length - 1; i++) { // 整篇文章的字符通过一个个字符的来判断,range为得到文档的范围 Range range = new Range(i, i + 1, doc); CharacterRun cr = range.getCharacterRun(0); if (pTable.hasPicture(cr)) { // 读写图片 this.readPicture(pTable, cr); } else { Range range2 = new Range(i + 1, i + 2, doc); // 第二个字符 CharacterRun cr2 = range2.getCharacterRun(0); // 当前字符 char currentChar = cr.text().charAt(0); // 判断是否为回车符 if (currentChar == ENTER_ASCII) tempString += "<br/>"; // 判断是否为空格符 else if (currentChar == SPACE_ASCII) tempString += " "; // 判断是否为水平制表符 else if (currentChar == TABULATION_ASCII) tempString += " "; // 比较前后2个字符是否具有相同的格式 boolean flag = compareCharStyle(cr, cr2); String fontStyle = "<span style='font-family:" + 
cr.getFontName() + ";font-size:" + cr.getFontSize() / 2 + "pt;"; if (cr.isBold()) fontStyle += "font-weight:bold;"; if (cr.isItalic()) fontStyle += "font-style:italic;"; if (flag && i != length - 2) tempString += currentChar; else if (!flag) { htmlText += fontStyle + "'>" + tempString + currentChar + "</span>"; tempString = ""; } else htmlText += fontStyle + "'>" + tempString + currentChar + "</span>"; } htmlText += "</body></html>"; this.writeFile(htmlText); } /** * 读写文档中的图片 * * @param pTable * @param cr * @throws Exception */ private void readPicture(PicturesTable pTable, CharacterRun cr) throws Exception { // 提取图片 Picture pic = pTable.extractPicture(cr, false); // 返回POI建议的图片文件名 String afileName = pic.suggestFullFileName(); OutputStream out = new FileOutputStream(new File("g:\\test" + File.separator + afileName)); pic.writeImageContent(out); htmlText += "<img src='g:\\test\\" + afileName + "'/>"; } private boolean compareCharStyle(CharacterRun cr1, CharacterRun cr2) { boolean flag = false; if (cr1.isBold() == cr2.isBold() && cr1.isItalic() == cr2.isItalic() && cr1.getFontName().equals(cr2.getFontName()) && cr1.getFontSize() == cr2.getFontSize()) { flag = true; } return flag; } /** * 写文件 * * @param s */ private void writeFile(String s) { FileOutputStream fos = null; BufferedWriter bw = null; try { File file = new File("g:\\abc.html"); fos = new FileOutputStream(file); bw = new BufferedWriter(new OutputStreamWriter(fos)); bw.write(s); } catch (FileNotFoundException fnfe) { fnfe.printStackTrace(); } catch (IOException ioe) { ioe.printStackTrace(); } finally { try { if (bw != null) bw.close(); if (fos != null) fos.close(); } catch (IOException ie) { } } } }
转自:http://z276356445t.iteye.com/blog/963950
相关文章推荐
- 前期预科html学习(二)
- pdf格式的电子如何转换成html
- HTML中的IE条件注释
- Web学习之HTML
- c#蜘蛛程序之HTML解析利器HtmlAgilityPack
- CDHTMLDialog调用注意
- xml理论学习总结
- XML和HTML重点小解
- HTML表单
- HTML【1】的学习,实用编程+代码
- XML与HTML
- Html标签
- 在字符串资源文件中添加HTML元素,直接使用字符串资源,HTML元素没起作用的解决办法
- html 标签总结
- 19、XHTML
- 18、HTML
- MVC的Views中使用递归生成Html【转】
- DOM(一)-04-(DHTML概述)
- 防微博内容展示,使用Html.fromHtml(),解决内容不能换行的问题
- BeautifulSoup中各种html解析器的比较及使用