小弟要将word数据解析出来导入数据库

小弟我要将有一定格式的word文档解析出来导入数据库，网上查的方法有POI,JACOB,openoffice。JACOB只能运行在windows下，所以放弃，打算用POI，有木有哪位大神做过类似的？求指教。 --------------------编程问答-------------------- word文档？做过EXCEL的，还不错 --------------------编程问答-------------------- excel我用poi还不错，word就纠结了 --------------------编程问答-------------------- 推荐openoffice，如果是docx的话poi是扛不住的，另doc中的公式poi是很难搞定的，对于doc openoffice解析不错，对docx公式需要自己写代码+openoffice能搞定。。。 --------------------编程问答--------------------

我要读取的word 文档是一定的格式，都是文本，根据标题和内容分别存入数据库，poi 不能实现吗？ --------------------编程问答--------------------

引用 4 楼 u013114085 的回复:

我要读取的word 文档是一定的格式，都是文本，根据标题和内容分别存入数据库，poi 不能实现吗？

都是文本的放doc应该没问题，poi官方说对docx支持不好，现在不知道是什么情况了，我没试过，我想都是文本的话应该可以 --------------------编程问答-------------------- 这个就三层结构，大标题，小标题，和内容，根据这三个内容存入不同的字段，能实现吗？ --------------------编程问答-------------------- 现在我用的是poi 3.9最新版本，2007应该支持的。 --------------------编程问答--------------------

引用 7 楼 u013114085 的回复:

现在我用的是poi 3.9最新版本，2007应该支持的。

可以。有两种方式：一种是将word转为html，然后解析html。这样有个好处是可以保留word中的样式，目前我用的就是这种。另一种它应该有提取段落文字的方法，不过应该没有样式，我没有试过，不过感觉应该可以。 --------------------编程问答-------------------- 既然楼主这么有诚意,我来贴2段代码:
读取word当中的内容



package com.lk.core.util.commons;



import java.io.File;

import java.io.FileInputStream;

import java.io.FileNotFoundException;

import java.io.FileOutputStream;

import java.io.IOException;

import java.util.ArrayList;

import java.util.List;



import org.apache.poi.hwpf.HWPFDocument;

import org.apache.poi.hwpf.usermodel.Picture;

import org.apache.poi.hwpf.usermodel.Range;



/**

 * 读取word文档的工具类,只支持word2003

 * 

 * @author zero

 * 

 */

public class WordDocUtil {

	/**

	 * 文件读取流对象

	 */

	private FileInputStream fileInputStream = null;

	/**

	 * word 文档对象

	 */

	private HWPFDocument wordDoc = null;

	/**

	 * 定义文档中图片的最小大小,因为在读取图片的时候, 出现一些非图片的文本框,也被认为是图片被读取出来了

	 */

	private static final int MIN_PICTURE_CONTEXT = 10000;



	private void setFileInputStream(FileInputStream fileInputStream) {

		this.fileInputStream = fileInputStream;

	}



	private void setWordDoc(HWPFDocument wordDoc) {

		this.wordDoc = wordDoc;

	}



	private void initWordDoc() throws NullPointerException, IOException {

		if (fileInputStream == null) {

			throw new NullPointerException("File input stream is null!");

		}



		if (wordDoc == null) {

			this.setWordDoc(new HWPFDocument(fileInputStream));

		}

	}



	/**

	 * 传入word 文档路径

	 * 

	 * @param path

	 * @throws FileNotFoundException

	 */

	public WordDocUtil(String path) throws FileNotFoundException {

		this.setFileInputStream(new FileInputStream(path));

	}



	/**

	 * 传入文件对象

	 * 

	 * @param f

	 * @throws FileNotFoundException

	 */

	public WordDocUtil(File f) throws FileNotFoundException {

		this.setFileInputStream(new FileInputStream(f));



	}



	/**

	 * 传入文件读取流

	 * 

	 * @param is

	 */

	public WordDocUtil(FileInputStream is) {

		this.setFileInputStream(is);

	}



	/**

	 * 返回word文档中的文字内容,不包含图片等信息

	 * 

	 * @return

	 * @param ignoreEmpty

	 *            是否忽略空内容的行

	 * @throws IOException

	 * @throws Exception

	 */

	public String getText(boolean ignoreEmpty) throws NullPointerException, IOException {

		initWordDoc();

		int len = this.getDocParagraphsNum();

		StringBuffer sb = new StringBuffer();

		String tmpStr = null;

		for (int i = 0; i < len; i++) {

			tmpStr = this.getParagraphsText(i);

			if (ignoreEmpty && !tmpStr.trim().isEmpty()) {

				sb.append(tmpStr);

			} else if (!ignoreEmpty) {

				sb.append(tmpStr);

			}

		}



		return sb.toString();

	}



	/**

	 * 返回文档中的段落数

	 * 

	 * @throws IOException

	 * @throws NullPointerException

	 * 

	 * @throws Exception

	 */

	public int getDocParagraphsNum() throws NullPointerException, IOException {

		initWordDoc();

		Range range = this.wordDoc.getRange();

		return range.numParagraphs();

	}



	/**

	 * 根据段落号获取段落内容,除文档之外的如图片等信息会显示不正确 段落号的起始为0

	 * 

	 * @return

	 * @throws IOException

	 * @throws NullPointerException

	 * @throws Exception

	 */

	public String getParagraphsText(int index) throws NullPointerException, IOException {

		initWordDoc();

		Range range = this.wordDoc.getRange();

		int len = range.numParagraphs();

		if (index > len - 1) {

			throw new IndexOutOfBoundsException("paragraph index is out of range.");

		}

		return range.getParagraph(index).text().toString();

	}



	/**

	 * 获取文档中的内容,按照段落返回list

	 * 

	 * @return

	 * @throws IOException

	 * @throws NullPointerException

	 */

	public List<String> getDocListText() throws NullPointerException, IOException {

		return getDocListText(null, true);

	}



	/**

	 * 获取文档中的内容,按照段落返回list

	 * 

	 * @param beginContext

	 *            开始段落的内容,从此段落开始读取数据

	 * @param needContext

	 *            : 是否一定需要返回内容,即如果没有匹配到头,则直接返回文件全内容,保证读取的准确性

	 * @return

	 * @throws IOException

	 * @throws NullPointerException

	 */

	public List<String> getDocListText(String beginContext, boolean needContext) throws NullPointerException,

			IOException {

		int len = getDocParagraphsNum();

		List<String> context = new ArrayList<String>();

		boolean findBegin = false;

		if (beginContext == null) {

			findBegin = true;

		}

		for (int i = 0; i < len; i++) {

			if (!findBegin && getParagraphsText(i).trim().equals(beginContext)) {

				findBegin = true;

			}

			if (findBegin) {

				context.add(getParagraphsText(i));

			}

		}



		if (context.size() == 0 && needContext) {

			for (int j = 0; j < len; j++) {

				context.add(getParagraphsText(j));

			}

		}



		return context;

	}



	/**

	 * 获取文档中图片的数量

	 * 

	 * @return

	 * @throws NullPointerException

	 * @throws IOException

	 */

	public int getDocPictureNum() throws NullPointerException, IOException {

		initWordDoc();

		return this.wordDoc.getPicturesTable().getAllPictures().size();

	}



	/**

	 * 获取文档中的word图片,写入到fos,图片索引从0开始

	 * 

	 * @param fos

	 * @throws IOException

	 * @throws NullPointerException

	 */

	public void getDocPicture(FileOutputStream fos, int index) throws NullPointerException, IOException {

		initWordDoc();

		List<Picture> pictures = this.wordDoc.getPicturesTable().getAllPictures();

		if (index > pictures.size() - 1) {

			throw new IndexOutOfBoundsException("picture index is out of range.");

		}



		Picture p = pictures.get(index);

		p.writeImageContent(fos);

		fos.close();

	}



	/**

	 * 获取word文档中的图片,可以直接写入流

	 * 

	 * @throws IOException

	 * @throws NullPointerException

	 * 

	 */

	public byte[] getDocPicture(int index) throws NullPointerException, IOException {

		initWordDoc();

		List<Picture> pictures = this.wordDoc.getPicturesTable().getAllPictures();

		if (index > pictures.size() - 1) {

			throw new IndexOutOfBoundsException("picture index is out of range.");

		}



		Picture p = pictures.get(index);

		return p.getContent();

	}



	/**

	 * 获取所有的文档图片,返回context

	 * 

	 * @throws IOException

	 * @throws NullPointerException

	 * 

	 */

	public List<byte[]> getAllDocPicture() throws NullPointerException, IOException {

		initWordDoc();

		List<Picture> pictures = this.wordDoc.getPicturesTable().getAllPictures();

		List<byte[]> retList = new ArrayList<byte[]>();

		for (Picture p : pictures) {

			byte[] picContext = p.getContent();

			if (picContext.length < MIN_PICTURE_CONTEXT) {

				continue;

			} else {

				retList.add(picContext);

			}

		}

		// 当前出现了图片乱序问题,即读取出来的图片顺序和实际的word里的图片顺序不一致

		// 当前这里默认设置下,第一个图片的size比第二个的大

		if (retList.size() >= 2) {

			if (retList.get(0).length < retList.get(1).length) {

				retList.add(0, retList.get(1));

				retList.remove(2);

			}

		}

		return retList;

	}

	

	public void closeStream() {

		if (this.fileInputStream != null) {

			try {

				this.fileInputStream.close();

			} catch (IOException e) {

			}

		}

	}



	@Override

	protected void finalize() throws Throwable {

		closeStream();

		super.finalize();

	}

}

--------------------编程问答-------------------- 转换word为html文件



package com.lk.core.util.commons;



import java.io.BufferedWriter;

import java.io.File;

import java.io.FileInputStream;

import java.io.FileNotFoundException;

import java.io.FileOutputStream;

import java.io.IOException;

import java.io.OutputStreamWriter;

import java.util.ArrayList;

import java.util.List;



import javax.xml.parsers.DocumentBuilderFactory;

import javax.xml.parsers.ParserConfigurationException;

import javax.xml.transform.OutputKeys;

import javax.xml.transform.Transformer;

import javax.xml.transform.TransformerException;

import javax.xml.transform.TransformerFactory;

import javax.xml.transform.dom.DOMSource;

import javax.xml.transform.stream.StreamResult;



import org.apache.commons.io.output.ByteArrayOutputStream;

import org.apache.poi.hwpf.HWPFDocument;

import org.apache.poi.hwpf.converter.PicturesManager;

import org.apache.poi.hwpf.converter.WordToHtmlConverter;

import org.apache.poi.hwpf.usermodel.Picture;

import org.apache.poi.hwpf.usermodel.PictureType;

import org.w3c.dom.Document;



/**

 * 利用POI插件,将word转换为html Copyright (c) 2013 by zero.

 */

public class Word2HtmlUtil {

	// 从word中提取的图片存放的文件夹名称,该文件夹和html文件同级

	private static final String IMAGE_FOLDER = "wordImage";



	/**

	 * 将内容写入文件

	 */

	private static void writeFile(String content, String path) {

		FileOutputStream fos = null;

		BufferedWriter bw = null;

		try {

			try {

				fos = new FileOutputStream(new File(path));

				bw = new BufferedWriter(new OutputStreamWriter(fos));

				bw.write(content);

			} finally {

				if (bw != null) {

					bw.close();

				}

				if (fos != null) {

					fos.close();

				}

			}

		} catch (Exception e) {

			e.printStackTrace();

		}

	}



	/**

	 * 创建保存word图片的文件夹

	 */

	private static void createOutPutFolder(String htmlFolder) {

		File folder = new File(htmlFolder);

		if (!folder.exists() || !folder.isDirectory()) {

			folder.mkdirs();

		}

		File imageFolder = new File(folder.getAbsoluteFile() + File.separator + IMAGE_FOLDER);

		if (!imageFolder.exists() || !imageFolder.isDirectory()) {

			imageFolder.mkdirs();

		}

	}



	/**

	 * 将word文档转换为html格式

	 */

	public static void convert2Html(String fileName, String outPutFile) throws TransformerException,

			IOException, ParserConfigurationException {

		String folder = new File(outPutFile).getParent();

		createOutPutFolder(folder);

		HWPFDocument wordDocument = new HWPFDocument(new FileInputStream(fileName));

		WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(DocumentBuilderFactory

				.newInstance().newDocumentBuilder().newDocument());

		// 用于保存word中的图片的名称

		final List<String> picNames = new ArrayList<String>();

		wordToHtmlConverter.setPicturesManager(new PicturesManager() {

			public String savePicture(byte[] content, PictureType pictureType, String suggestedName,

					float widthInches, float heightInches) {

				// 保存图片名称

				picNames.add(suggestedName);

				// 图片文件存储的相对路径

				return IMAGE_FOLDER + File.separator + suggestedName;

			}

		});

		wordToHtmlConverter.processDocument(wordDocument);

		List<Picture> pics = wordDocument.getPicturesTable().getAllPictures();

		if (pics != null) {

			for (int i = 0; i < pics.size(); i++) {

				Picture pic = pics.get(i);

				try {

					pic.writeImageContent(new FileOutputStream(folder + File.separator + IMAGE_FOLDER

							+ File.separator + pic.suggestFullFileName()));

				} catch (FileNotFoundException e) {

					e.printStackTrace();

				}

			}

		}

		Document htmlDocument = wordToHtmlConverter.getDocument();

		ByteArrayOutputStream out = new ByteArrayOutputStream();

		DOMSource domSource = new DOMSource(htmlDocument);

		StreamResult streamResult = new StreamResult(out);



		TransformerFactory tf = TransformerFactory.newInstance();

		Transformer serializer = tf.newTransformer();

		serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");

		serializer.setOutputProperty(OutputKeys.INDENT, "yes");

		serializer.setOutputProperty(OutputKeys.METHOD, "html");

		serializer.transform(domSource, streamResult);

		out.close();

		writeFile(new String(out.toByteArray()), outPutFile);

	}

}

补充：Java ,  Java EE