POI 关于对 ms word的读写代码

wanjianfei

浏览: 306137 次
性别:
来自: 北京

博主相关

博客

微博

相册

留言

关于我

文章分类

社区版块

存档分类

博客分类：

J2EE

Excel Apache F#

POI 关于对 ms word的读写代码

关键字: poi

read word:

Java代码

publicclassWordExtractor{
publicWordExtractor(){
}
publicStringextractText(InputStreamin)throwsIOException{
ArrayListtext=newArrayList();
POIFSFileSystemfsys=newPOIFSFileSystem(in);
DocumentEntryheaderProps=(DocumentEntry)fsys.getRoot().getEntry("WordDocument");
DocumentInputStreamdin=fsys.createDocumentInputStream("WordDocument");
byte[]header=newbyte[headerProps.getSize()];
din.read(header);
din.close();
//Prendeleinformazionidall'headerdeldocumento
intinfo=LittleEndian.getShort(header,0xa);
booleanuseTable1=(info&0x200)!=0;
//booleanuseTable1=true;
//Prendeinformazionidallapiecetable
intcomplexOffset=LittleEndian.getInt(header,0x1a2);
//intcomplexOffset=LittleEndian.getInt(header);
StringtableName=null;
if(useTable1){
tableName="1Table";
}else{
tableName="0Table";
}
DocumentEntrytable=(DocumentEntry)fsys.getRoot().getEntry(tableName);
byte[]tableStream=newbyte[table.getSize()];
din=fsys.createDocumentInputStream(tableName);
din.read(tableStream);
din.close();
din=null;
fsys=null;
table=null;
headerProps=null;
intmultiple=findText(tableStream,complexOffset,text);
StringBuffersb=newStringBuffer();
intsize=text.size();
tableStream=null;
for(intx=0;x<size;x++){
WordTextPiecenextPiece=(WordTextPiece)text.get(x);
intstart=nextPiece.getStart();
intlength=nextPiece.getLength();
booleanunicode=nextPiece.usesUnicode();
StringtoStr=null;
if(unicode){
toStr=newString(header,start,length*multiple,"UTF-16LE");
}else{
toStr=newString(header,start,length,"ISO-8859-1");
}
sb.append(toStr).append("");
}
returnsb.toString();
}
privatestaticintfindText(byte[]tableStream,intcomplexOffset,ArrayListtext)
throwsIOException{
//actualtext
intpos=complexOffset;
intmultiple=2;
//skipsthroughtheprmsbeforewereachthepiecetable.Thesecontaindata
//foractualfastsavedfiles
while(tableStream[pos]==1){
pos++;
intskip=LittleEndian.getShort(tableStream,pos);
pos+=2+skip;
}
if(tableStream[pos]!=2){
thrownewIOException("corruptedWordfile");
}else{
//parseoutthetextpieces
intpieceTableSize=LittleEndian.getInt(tableStream,++pos);
pos+=4;
intpieces=(pieceTableSize-4)/12;
for(intx=0;x<pieces;x++){
intfilePos=
LittleEndian.getInt(tableStream,pos+((pieces+1)*4)+(x*8)+2);
booleanunicode=false;
if((filePos&0x40000000)==0){
unicode=true;
}else{
unicode=false;
multiple=1;
filePos&=~(0x40000000);//givesmeFCindocstream
filePos/=2;
}
inttotLength=
LittleEndian.getInt(tableStream,pos+(x+1)*4)
-LittleEndian.getInt(tableStream,pos+(x*4));
WordTextPiecepiece=newWordTextPiece(filePos,totLength,unicode);
text.add(piece);
}
}
returnmultiple;
}
publicstaticvoidmain(String[]args){
WordExtractorw=newWordExtractor();
POIFSFileSystemps=newPOIFSFileSystem();
try{
Filefile=newFile("C:\\test.doc");
InputStreamin=newFileInputStream(file);
Strings=w.extractText(in);
System.out.println(s);
}catch(Exceptione){
e.printStackTrace();
}
}
}
/**
 * Describes one contiguous run of text: where it starts, how many
 * characters it holds and whether it is stored as Unicode.
 */
class WordTextPiece {
	private int _fcStart;
	private boolean _usesUnicode;
	private int _length;

	/**
	 * @param start   start offset of the piece
	 * @param length  length of the piece
	 * @param unicode true if the piece is stored as Unicode text
	 */
	public WordTextPiece(int start, int length, boolean unicode) {
		_usesUnicode = unicode;
		_length = length;
		_fcStart = start;
	}

	/** Returns true if the piece is stored as Unicode text. */
	public boolean usesUnicode() {
		return _usesUnicode;
	}

	/** Returns the start offset given to the constructor. */
	public int getStart() {
		return _fcStart;
	}

	/** Returns the length given to the constructor. */
	public int getLength() {
		return _length;
	}
}

/**
 * Extracts the plain text of a binary MS Word document stored in an OLE2
 * (POIFS) container.
 *
 * The extractor reads the "WordDocument" stream, uses the flag word at offset
 * 0xa to choose between the "1Table" and "0Table" streams, walks the piece
 * table found there and decodes every text piece from the "WordDocument"
 * bytes.
 */
public class WordExtractor {
	/** Message used for every structural inconsistency found while parsing. */
	private static final String CORRUPTED = "corrupted Word file";

	public WordExtractor() {
	}

	/**
	 * Returns the text of the Word document read from the given stream.
	 * Pieces are appended in piece-table order, each followed by one space.
	 *
	 * The stream is not closed by this method; closing it stays with the caller.
	 *
	 * @param in stream positioned at the start of an OLE2 Word file
	 * @return the concatenated text of all pieces
	 * @throws IOException if the stream cannot be read or the piece table is
	 *         inconsistent with the size of the streams
	 */
	public String extractText(InputStream in) throws IOException {
		POIFSFileSystem fsys = new POIFSFileSystem(in);
		byte[] header = readDocument(fsys, "WordDocument");

		// 0x1a2 is the highest header offset read below, so checking it
		// also covers the 16-bit flag word at 0xa.
		requireBytes(header, 0x1a2, 4);

		// Flag word of the document header: bit 0x200 selects the table stream.
		int info = LittleEndian.getShort(header, 0xa);
		boolean useTable1 = (info & 0x200) != 0;

		// Offset, inside the table stream, where the piece table data starts.
		int complexOffset = LittleEndian.getInt(header, 0x1a2);

		String tableName = useTable1 ? "1Table" : "0Table";
		byte[] tableStream = readDocument(fsys, tableName);

		ArrayList text = new ArrayList();
		findText(tableStream, complexOffset, text);

		StringBuffer sb = new StringBuffer();
		int size = text.size();
		for (int x = 0; x < size; x++) {
			WordTextPiece nextPiece = (WordTextPiece) text.get(x);
			int start = nextPiece.getStart();
			int length = nextPiece.getLength();

			String toStr;
			if (nextPiece.usesUnicode()) {
				// A Unicode piece always takes two bytes per character, no
				// matter how the other pieces of the document are encoded.
				int byteLength = length * 2;
				requireBytes(header, start, byteLength);
				toStr = new String(header, start, byteLength, "UTF-16LE");
			} else {
				requireBytes(header, start, length);
				toStr = new String(header, start, length, "ISO-8859-1");
			}
			sb.append(toStr).append(" ");
		}
		return sb.toString();
	}

	/**
	 * Reads a complete top-level document of the file system into memory.
	 * The document stream is closed even when reading fails.
	 */
	private static byte[] readDocument(POIFSFileSystem fsys, String name) throws IOException {
		DocumentEntry entry = (DocumentEntry) fsys.getRoot().getEntry(name);
		byte[] data = new byte[entry.getSize()];
		DocumentInputStream din = fsys.createDocumentInputStream(name);
		try {
			int filled = 0;
			// A single read() is not guaranteed to fill the buffer.
			while (filled < data.length) {
				int n = din.read(data, filled, data.length - filled);
				if (n <= 0) {
					throw new IOException(CORRUPTED);
				}
				filled += n;
			}
		} finally {
			din.close();
		}
		return data;
	}

	/**
	 * Fails with an IOException unless data holds count bytes starting at pos.
	 * Written so that the comparison itself cannot overflow.
	 */
	private static void requireBytes(byte[] data, int pos, int count) throws IOException {
		if (pos < 0 || count < 0 || pos > data.length - count) {
			throw new IOException(CORRUPTED);
		}
	}

	/**
	 * Parses the piece table that starts at complexOffset in the table stream
	 * and appends one WordTextPiece per entry to text.
	 *
	 * @throws IOException if the marker bytes or sizes do not fit the stream
	 */
	private static void findText(byte[] tableStream, int complexOffset, ArrayList text)
		throws IOException {
		int pos = complexOffset;
		requireBytes(tableStream, pos, 1);

		// Skip the blocks tagged 1 that precede the piece table (they carry
		// data for fast-saved files); each has a 16-bit size after the tag.
		while (tableStream[pos] == 1) {
			pos++;
			requireBytes(tableStream, pos, 2);
			int skip = LittleEndian.getShort(tableStream, pos);
			if (skip < 0) {
				throw new IOException(CORRUPTED);
			}
			pos += 2 + skip;
			requireBytes(tableStream, pos, 1);
		}
		if (tableStream[pos] != 2) {
			throw new IOException(CORRUPTED);
		}

		// Tag 2 is followed by the 32-bit size of the piece table.
		pos++;
		requireBytes(tableStream, pos, 4);
		int pieceTableSize = LittleEndian.getInt(tableStream, pos);
		pos += 4;
		requireBytes(tableStream, pos, pieceTableSize);

		// (pieces + 1) 4-byte character positions, then 8 bytes per piece.
		int pieces = (pieceTableSize - 4) / 12;
		for (int x = 0; x < pieces; x++) {
			int filePos =
				LittleEndian.getInt(tableStream, pos + ((pieces + 1) * 4) + (x * 8) + 2);
			boolean unicode = (filePos & 0x40000000) == 0;
			if (!unicode) {
				// Bit 0x40000000 marks 8-bit text; clearing it and halving
				// gives the byte offset in the document stream.
				filePos &= ~(0x40000000);
				filePos /= 2;
			}
			// Character count = difference of two consecutive positions.
			int totLength =
				LittleEndian.getInt(tableStream, pos + (x + 1) * 4)
					- LittleEndian.getInt(tableStream, pos + (x * 4));

			text.add(new WordTextPiece(filePos, totLength, unicode));
		}
	}

	/**
	 * Prints the text of a Word file to standard output.
	 *
	 * @param args optional path of the file; defaults to C:\test.doc
	 */
	public static void main(String[] args) {
		String path = (args != null && args.length > 0) ? args[0] : "C:\\test.doc";
		WordExtractor w = new WordExtractor();
		InputStream in = null;
		try {
			in = new FileInputStream(new File(path));
			String s = w.extractText(in);
			System.out.println(s);
		} catch (Exception e) {
			e.printStackTrace();
		} finally {
			if (in != null) {
				try {
					in.close();
				} catch (IOException ignored) {
					// nothing useful can be done if closing the input fails
				}
			}
		}
	}
}
/**
 * Immutable description of one contiguous run of text: its start offset,
 * its length and whether it is stored as Unicode.
 */
class WordTextPiece {
	private final int startOffset;
	private final int pieceLength;
	private final boolean unicodeText;

	/**
	 * @param start   start offset of the piece
	 * @param length  length of the piece
	 * @param unicode true if the piece is stored as Unicode text
	 */
	public WordTextPiece(int start, int length, boolean unicode) {
		this.startOffset = start;
		this.pieceLength = length;
		this.unicodeText = unicode;
	}

	/** Returns true if the piece is stored as Unicode text. */
	public boolean usesUnicode() {
		return this.unicodeText;
	}

	/** Returns the start offset given to the constructor. */
	public int getStart() {
		return this.startOffset;
	}

	/** Returns the length given to the constructor. */
	public int getLength() {
		return this.pieceLength;
	}

}

write word

Java代码

publicbooleanwriteWordFile(Stringpath,Stringcontent){
booleanw=false;
try{
//byteb[]=content.getBytes("ISO-8859-1");
byteb[]=content.getBytes();
ByteArrayInputStreambais=newByteArrayInputStream(b);
POIFSFileSystemfs=newPOIFSFileSystem();
DirectoryEntrydirectory=fs.getRoot();
DocumentEntryde=directory.createDocument("WordDocument",bais);
FileOutputStreamostream=newFileOutputStream(path);
fs.writeFilesystem(ostream);
bais.close();
ostream.close();
}catch(IOExceptione){
e.printStackTrace();
}
returnw;
}

	/**
	 * Writes the bytes of content into a "WordDocument" entry of a new POIFS
	 * container and saves that container to path.
	 *
	 * NOTE(review): the entry holds only the raw text bytes (platform default
	 * charset), not a Word binary structure; presumably this is why Word asks
	 * for an encoding when opening the result. Confirm before relying on it.
	 *
	 * @param path    file to create or overwrite
	 * @param content text whose bytes are stored in the document entry
	 * @return true if the file was written and closed without an I/O error,
	 *         false otherwise (the error is printed to standard error)
	 */
	public boolean writeWordFile(String path, String content) {
		boolean w = false;
		FileOutputStream ostream = null;
		try {
			//	byte b[] = content.getBytes("ISO-8859-1");
			byte b[] = content.getBytes();

			ByteArrayInputStream bais = new ByteArrayInputStream(b);

			POIFSFileSystem fs = new POIFSFileSystem();
			DirectoryEntry directory = fs.getRoot();
			directory.createDocument("WordDocument", bais);

			ostream = new FileOutputStream(path);
			fs.writeFilesystem(ostream);

			// Reached only when every step above succeeded.
			w = true;
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			// Close the file even when writing failed, so the handle is not leaked.
			if (ostream != null) {
				try {
					ostream.close();
				} catch (IOException e) {
					e.printStackTrace();
					w = false;
				}
			}
		}
		return w;
	}

写操作的代码仍有问题：生成的文件用 Word 打开时会提示选择字符编码。原因是这段代码只是把文本的原始字节直接写入 OLE 容器的 "WordDocument" 流，并没有生成 Word 二进制格式所需的文件结构。
希望有人能帮忙改进！

当然这几个jar是少不了的
poi-2.5.1-final-20040804.jar
poi-contrib-2.5.1-final-20040804.jar
poi-scratchpad-2.5.1-final-20040804.jar

09:21
浏览 (12760)
论坛浏览 (14400)
评论 (11)
分类: 心得
收藏
相关推荐

11 楼 cleanboxer 2007-06-18 回复

开发poi word 的那个哥们老早就不干了,好像就职于商业的公司,apache还招募人参与呢,2006的事, poi处理word 太弱,还得用vs6.0 c++, java的开源的那个word有更详细的文档,不过没研究过,处理excel还算凑合,
不过也问题比较多,不能区分单元格内内容的格式,或错误判断

10 楼 jlusdy 2007-06-15 回复

问个问题：用 Java 写 Word，哪个开源包比较好？
我看POI对word支持不太够啊

9 楼 andyandyandy 2007-04-02 回复

写word的少见,收了

8 楼 strongkill 2007-04-02 回复

http://jakarta.apache.org/poi/index.html

7 楼 dy.f 2007-03-15 回复

poi-2.5.1-final-20040804.jar
poi-contrib-2.5.1-final-20040804.jar
poi-scratchpad-2.5.1-final-20040804.jar

能提供这几个包的下载吗？

6 楼 transist 2007-03-06 回复

感谢楼主，这个word extractor能够比较好的支持中文。
原先我是使用nutch的word文本提取，但是相当大部分的中文word文档无法正确提取，到官方网站查看他们的解决方案，是这么说的“Document with 2-byte characters (that's how Chinese characters are probably stored) are not correctly handled by HWPF.”One more thing you need to consider: HWPF cannot handle "fast saved" Word files. If the documents you need to parse are "fast saved" this adds an extra level of complexity.

有点小问题，希望楼主有时间的时候帮忙大家修复一下，那就是有部分提取的文本前后有小方框的，我想应该是这些字符本不该被提取。

5 楼 sprite 2007-02-07 回复

希望楼主把表情符号关掉重新编辑一下让大家能欣赏到正确的代码

4 楼 fish922 2007-02-07 回复

写word的代码不对！

3 楼 minimu 2006-09-14 回复

收藏下

2 楼抛出异常的爱 2006-09-14 回复

代码里应该关了表情符号吧

1 楼 java虫 2006-09-14 回复

不错，收藏一下。论坛里这方面帖子不多

分享到：

《Velocity 模板使用指南》中文版 | js中innerHTML与innerText的用法与区别

2009-05-14 18:09
浏览 1097
评论(0)
查看更多

发表评论

您还没有登录,请您登录后再发表评论

最近访客更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

POI 关于对 ms word的读写代码

POI 关于对 ms word的读写代码

评论

评论

发表评论

相关推荐

最近访客 更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

POI 关于对 ms word的读写代码

POI 关于对 ms word的读写代码

评论

评论

发表评论

相关推荐

改变IT世界的11大Apache开源技术

批量替換

一篇关于web.xml配置的详细说明

打印web页面的指定区域

jspSmartUpload 学习

统计访问量

视频总结-servlet高级开发

log4j最佳配置备份

jsp 生成 图片验证码

SSH协议

单点登录

CAS构建和实现单点登录解决方案

单点登录解决方案

单点登陆的概念

单点登录系统SSO原理

单点登录sso的定义

JSP获取真实IP地址

JFreeChart API说明

正则表达式在JAVA中的应用

JFreeChart 中的数据源

最近访客更多访客>>

jsp 生成图片验证码