小弟要将word数据解析出来导入数据库
小弟我要将有一定格式的word文档解析出来导入数据库,网上查到的方法有POI,JACOB,openoffice,JACOB只能运行在windows下,所以放弃,打算用POI,有没有哪位大神做过类似的?求指教。 --------------------编程问答-------------------- word文档? 做过EXCEL的,还不错 --------------------编程问答-------------------- excel我用poi还不错,word就纠结了 --------------------编程问答-------------------- 推荐openoffice,如果是docx的话poi是扛不住的,另外doc中的公式poi是很难搞定的,对于doc,openoffice解析不错,对docx公式需要自己写代码+openoffice才能搞定。。。 --------------------编程问答-------------------- 我要读取的word文档有固定的格式,都是文本,根据标题和内容分别存入数据库,poi不能实现吗? --------------------编程问答--------------------
都是文本的话,放在doc里应该没问题,poi官方说对docx支持不好,现在不知道是什么情况了,我没试过,我想都是文本的话应该可以 --------------------编程问答-------------------- 这个就三层结构:大标题,小标题,和内容,根据这三个内容存入不同的字段,能实现吗? --------------------编程问答-------------------- 现在我用的是poi 3.9最新版本,2007应该支持的。 --------------------编程问答-------------------- 可以。有两种方式:一种是将word转为html,然后解析html。这样有个好处是可以保留word中的样式,目前我用的就是这种。另一种是POI应该有提取段落文字的方法,不过应该没有样式,我没有试过,不过感觉应该可以。 --------------------编程问答-------------------- 既然楼主这么有诚意,我来贴2段代码:
读取word当中的内容
--------------------编程问答-------------------- 转换word为html文件
package com.lk.core.util.commons;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.usermodel.Picture;
import org.apache.poi.hwpf.usermodel.Range;
/**
 * Utility for reading the text and the embedded pictures of a Word document
 * through Apache POI's HWPF model. Only the Word 2003 (.doc) format is supported.
 *
 * <p>The document is parsed lazily, on the first call to any accessor. The
 * instance keeps the input stream it was given; call {@link #closeStream()}
 * when the utility is no longer needed.
 *
 * @author zero
 */
public class WordDocUtil {
    /**
     * Stream the document is read from.
     */
    private FileInputStream fileInputStream = null;
    /**
     * Parsed Word document; stays {@code null} until the first accessor call.
     */
    private HWPFDocument wordDoc = null;
    /**
     * Minimum size, in bytes, for an extracted picture to be kept by
     * {@link #getAllDocPicture()}. Some non-picture text boxes are reported as
     * pictures when reading a document; this threshold filters them out.
     */
    private static final int MIN_PICTURE_CONTEXT = 10000;

    private void setFileInputStream(FileInputStream fileInputStream) {
        this.fileInputStream = fileInputStream;
    }

    private void setWordDoc(HWPFDocument wordDoc) {
        this.wordDoc = wordDoc;
    }

    /**
     * Parses the document from the input stream if that has not happened yet.
     *
     * @throws NullPointerException if no input stream was supplied
     * @throws IOException          if the document cannot be read
     */
    private void initWordDoc() throws NullPointerException, IOException {
        if (fileInputStream == null) {
            throw new NullPointerException("File input stream is null!");
        }
        if (wordDoc == null) {
            this.setWordDoc(new HWPFDocument(fileInputStream));
        }
    }

    /**
     * Returns the overall range of the (lazily parsed) document.
     */
    private Range documentRange() throws NullPointerException, IOException {
        initWordDoc();
        return this.wordDoc.getRange();
    }

    /**
     * Returns every picture listed in the document's pictures table.
     */
    private List<Picture> allPictures() throws NullPointerException, IOException {
        initWordDoc();
        return this.wordDoc.getPicturesTable().getAllPictures();
    }

    /**
     * Returns the picture at {@code index}, rejecting indices outside the table.
     */
    private Picture pictureAt(int index) throws NullPointerException, IOException {
        List<Picture> pictures = allPictures();
        if (index < 0 || index > pictures.size() - 1) {
            throw new IndexOutOfBoundsException("picture index is out of range.");
        }
        return pictures.get(index);
    }

    /**
     * Creates a reader for the Word document at the given path.
     *
     * @param path path of the Word document
     * @throws FileNotFoundException if the file cannot be opened
     */
    public WordDocUtil(String path) throws FileNotFoundException {
        this.setFileInputStream(new FileInputStream(path));
    }

    /**
     * Creates a reader for the given Word document file.
     *
     * @param f the Word document file
     * @throws FileNotFoundException if the file cannot be opened
     */
    public WordDocUtil(File f) throws FileNotFoundException {
        this.setFileInputStream(new FileInputStream(f));
    }

    /**
     * Creates a reader on an already opened stream.
     *
     * @param is stream positioned at the start of a Word document
     */
    public WordDocUtil(FileInputStream is) {
        this.setFileInputStream(is);
    }

    /**
     * Returns the text of the document, paragraph after paragraph. Pictures and
     * other non-text content are not included.
     *
     * @param ignoreEmpty whether paragraphs that are empty after trimming are skipped
     * @return the concatenated paragraph texts
     * @throws NullPointerException if no input stream was supplied
     * @throws IOException          if the document cannot be read
     */
    public String getText(boolean ignoreEmpty) throws NullPointerException, IOException {
        // Resolve the range once instead of once per paragraph.
        Range range = documentRange();
        int len = range.numParagraphs();
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < len; i++) {
            String text = range.getParagraph(i).text();
            if (!ignoreEmpty || !text.trim().isEmpty()) {
                sb.append(text);
            }
        }
        return sb.toString();
    }

    /**
     * Returns the number of paragraphs in the document.
     *
     * @throws NullPointerException if no input stream was supplied
     * @throws IOException          if the document cannot be read
     */
    public int getDocParagraphsNum() throws NullPointerException, IOException {
        return documentRange().numParagraphs();
    }

    /**
     * Returns the text of one paragraph. Non-text content such as pictures is
     * not rendered correctly. Paragraph numbering starts at 0.
     *
     * @param index zero-based paragraph index
     * @return the text of that paragraph
     * @throws IndexOutOfBoundsException if {@code index} is negative or past the last paragraph
     * @throws NullPointerException      if no input stream was supplied
     * @throws IOException               if the document cannot be read
     */
    public String getParagraphsText(int index) throws NullPointerException, IOException {
        Range range = documentRange();
        int len = range.numParagraphs();
        if (index < 0 || index > len - 1) {
            throw new IndexOutOfBoundsException("paragraph index is out of range.");
        }
        return range.getParagraph(index).text();
    }

    /**
     * Returns the whole document as a list with one entry per paragraph.
     *
     * @return the paragraph texts in document order
     * @throws NullPointerException if no input stream was supplied
     * @throws IOException          if the document cannot be read
     */
    public List<String> getDocListText() throws NullPointerException, IOException {
        return getDocListText(null, true);
    }

    /**
     * Returns the document as a list with one entry per paragraph, optionally
     * starting at a marker paragraph.
     *
     * @param beginContext text of the paragraph to start from (compared against the
     *                     trimmed paragraph text); {@code null} means start at the
     *                     first paragraph
     * @param needContext  whether content must be returned: when the marker is not
     *                     found and this is {@code true}, the whole document is
     *                     returned instead of an empty list
     * @return the selected paragraph texts in document order
     * @throws NullPointerException if no input stream was supplied
     * @throws IOException          if the document cannot be read
     */
    public List<String> getDocListText(String beginContext, boolean needContext) throws NullPointerException,
            IOException {
        // Read every paragraph exactly once; the selection below works on this snapshot.
        Range range = documentRange();
        int len = range.numParagraphs();
        List<String> paragraphs = new ArrayList<String>(len);
        for (int i = 0; i < len; i++) {
            paragraphs.add(range.getParagraph(i).text());
        }
        if (beginContext == null) {
            return paragraphs;
        }
        for (int i = 0; i < len; i++) {
            if (paragraphs.get(i).trim().equals(beginContext)) {
                return new ArrayList<String>(paragraphs.subList(i, len));
            }
        }
        // Marker not found: fall back to the full content only when the caller insists.
        return needContext ? paragraphs : new ArrayList<String>();
    }

    /**
     * Returns the number of pictures in the document.
     *
     * @throws NullPointerException if no input stream was supplied
     * @throws IOException          if the document cannot be read
     */
    public int getDocPictureNum() throws NullPointerException, IOException {
        return allPictures().size();
    }

    /**
     * Writes one picture of the document to {@code fos}. Picture numbering starts at 0.
     *
     * <p>The stream is always closed by this method, whether the write succeeds
     * or fails, so the file handle is never leaked.
     *
     * @param fos   destination stream; closed before this method returns or throws
     * @param index zero-based picture index
     * @throws IndexOutOfBoundsException if {@code index} is negative or past the last picture
     * @throws NullPointerException      if no input stream was supplied
     * @throws IOException               if the document cannot be read or the picture cannot be written
     */
    public void getDocPicture(FileOutputStream fos, int index) throws NullPointerException, IOException {
        try {
            pictureAt(index).writeImageContent(fos);
        } finally {
            if (fos != null) {
                fos.close();
            }
        }
    }

    /**
     * Returns the content of one picture of the document as a byte array.
     *
     * @param index zero-based picture index
     * @return the picture content
     * @throws IndexOutOfBoundsException if {@code index} is negative or past the last picture
     * @throws NullPointerException      if no input stream was supplied
     * @throws IOException               if the document cannot be read
     */
    public byte[] getDocPicture(int index) throws NullPointerException, IOException {
        return pictureAt(index).getContent();
    }

    /**
     * Returns the content of every picture whose size is at least
     * {@code MIN_PICTURE_CONTEXT} bytes.
     *
     * @return the retained picture contents
     * @throws NullPointerException if no input stream was supplied
     * @throws IOException          if the document cannot be read
     */
    public List<byte[]> getAllDocPicture() throws NullPointerException, IOException {
        List<byte[]> retList = new ArrayList<byte[]>();
        for (Picture p : allPictures()) {
            byte[] picContext = p.getContent();
            if (picContext.length >= MIN_PICTURE_CONTEXT) {
                retList.add(picContext);
            }
        }
        // Workaround: pictures have been seen to come back in a different order than
        // they appear in the document. The current default assumption is that the
        // first picture is larger than the second, so swap them when it is not.
        if (retList.size() >= 2 && retList.get(0).length < retList.get(1).length) {
            byte[] first = retList.get(0);
            retList.set(0, retList.get(1));
            retList.set(1, first);
        }
        return retList;
    }

    /**
     * Closes the underlying input stream, if any. Failures to close are ignored.
     */
    public void closeStream() {
        if (this.fileInputStream != null) {
            try {
                this.fileInputStream.close();
            } catch (IOException ignored) {
                // Best-effort close: there is nothing useful the caller could do here.
            }
        }
    }

    /**
     * Last-resort cleanup of the input stream; callers should not rely on it and
     * should call {@link #closeStream()} themselves.
     */
    @Override
    protected void finalize() throws Throwable {
        closeStream();
        super.finalize();
    }
}
package com.lk.core.util.commons;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.List;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import org.apache.commons.io.output.ByteArrayOutputStream;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.PicturesManager;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.hwpf.usermodel.Picture;
import org.apache.poi.hwpf.usermodel.PictureType;
import org.w3c.dom.Document;
/**
* 利用POI插件,将word转换为html Copyright (c) 2013 by zero.
*/
public class Word2HtmlUtil {
// 从word中提取的图片存放的文件夹名称,该文件夹和html文件同级
private static final String IMAGE_FOLDER = "wordImage";
/**
* 将内容写入文件
*/
private static void writeFile(String content, String path) {
FileOutputStream fos = null;
BufferedWriter bw = null;
try {
try {
fos = new FileOutputStream(new File(path));
bw = new BufferedWriter(new OutputStreamWriter(fos));
bw.write(content);
} finally {
if (bw != null) {
bw.close();
}
if (fos != null) {
fos.close();
}
}
} catch (Exception e) {
e.printStackTrace();
}
}
/**
* 创建保存word图片的文件夹
*/
private static void createOutPutFolder(String htmlFolder) {
File folder = new File(htmlFolder);
if (!folder.exists() || !folder.isDirectory()) {
folder.mkdirs();
}
File imageFolder = new File(folder.getAbsoluteFile() + File.separator + IMAGE_FOLDER);
if (!imageFolder.exists() || !imageFolder.isDirectory()) {
imageFolder.mkdirs();
}
}
/**
* 将word文档转换为html格式
*/
public static void convert2Html(String fileName, String outPutFile) throws TransformerException,
IOException, ParserConfigurationException {
String folder = new File(outPutFile).getParent();
createOutPutFolder(folder);
HWPFDocument wordDocument = new HWPFDocument(new FileInputStream(fileName));
WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(DocumentBuilderFactory
.newInstance().newDocumentBuilder().newDocument());
// 用于保存word中的图片的名称
final List<String> picNames = new ArrayList<String>();
wordToHtmlConverter.setPicturesManager(new PicturesManager() {
public String savePicture(byte[] content, PictureType pictureType, String suggestedName,
float widthInches, float heightInches) {
// 保存图片名称
picNames.add(suggestedName);
// 图片文件存储的相对路径
return IMAGE_FOLDER + File.separator + suggestedName;
}
});
wordToHtmlConverter.processDocument(wordDocument);
List<Picture> pics = wordDocument.getPicturesTable().getAllPictures();
if (pics != null) {
for (int i = 0; i < pics.size(); i++) {
Picture pic = pics.get(i);
try {
pic.writeImageContent(new FileOutputStream(folder + File.separator + IMAGE_FOLDER
+ File.separator + pic.suggestFullFileName()));
} catch (FileNotFoundException e) {
e.printStackTrace();
}
}
}
Document htmlDocument = wordToHtmlConverter.getDocument();
ByteArrayOutputStream out = new ByteArrayOutputStream();
DOMSource domSource = new DOMSource(htmlDocument);
StreamResult streamResult = new StreamResult(out);
TransformerFactory tf = TransformerFactory.newInstance();
Transformer serializer = tf.newTransformer();
serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
serializer.setOutputProperty(OutputKeys.INDENT, "yes");
serializer.setOutputProperty(OutputKeys.METHOD, "html");
serializer.transform(domSource, streamResult);
out.close();
writeFile(new String(out.toByteArray()), outPutFile);
}
}
补充:Java , Java EE