`
wanjianfei
  • 浏览: 306137 次
  • 性别: Icon_minigender_1
  • 来自: 北京
社区版块
存档分类
最新评论

POI 关于对 ms word的读写代码

    博客分类:
  • J2EE
阅读更多

POI 关于对 ms word的读写代码

关键字: poi

read word:
Java代码 复制代码
  1. publicclassWordExtractor{
  2. publicWordExtractor(){
  3. }
  4. publicStringextractText(InputStreamin)throwsIOException{
  5. ArrayListtext=newArrayList();
  6. POIFSFileSystemfsys=newPOIFSFileSystem(in);
  7. DocumentEntryheaderProps=(DocumentEntry)fsys.getRoot().getEntry("WordDocument");
  8. DocumentInputStreamdin=fsys.createDocumentInputStream("WordDocument");
  9. byte[]header=newbyte[headerProps.getSize()];
  10. din.read(header);
  11. din.close();
  12. //Prendeleinformazionidall'headerdeldocumento
  13. intinfo=LittleEndian.getShort(header,0xa);
  14. booleanuseTable1=(info&0x200)!=0;
  15. //booleanuseTable1=true;
  16. //Prendeinformazionidallapiecetable
  17. intcomplexOffset=LittleEndian.getInt(header,0x1a2);
  18. //intcomplexOffset=LittleEndian.getInt(header);
  19. StringtableName=null;
  20. if(useTable1){
  21. tableName="1Table";
  22. }else{
  23. tableName="0Table";
  24. }
  25. DocumentEntrytable=(DocumentEntry)fsys.getRoot().getEntry(tableName);
  26. byte[]tableStream=newbyte[table.getSize()];
  27. din=fsys.createDocumentInputStream(tableName);
  28. din.read(tableStream);
  29. din.close();
  30. din=null;
  31. fsys=null;
  32. table=null;
  33. headerProps=null;
  34. intmultiple=findText(tableStream,complexOffset,text);
  35. StringBuffersb=newStringBuffer();
  36. intsize=text.size();
  37. tableStream=null;
  38. for(intx=0;x<size;x++){
  39. WordTextPiecenextPiece=(WordTextPiece)text.get(x);
  40. intstart=nextPiece.getStart();
  41. intlength=nextPiece.getLength();
  42. booleanunicode=nextPiece.usesUnicode();
  43. StringtoStr=null;
  44. if(unicode){
  45. toStr=newString(header,start,length*multiple,"UTF-16LE");
  46. }else{
  47. toStr=newString(header,start,length,"ISO-8859-1");
  48. }
  49. sb.append(toStr).append("");
  50. }
  51. returnsb.toString();
  52. }
  53. privatestaticintfindText(byte[]tableStream,intcomplexOffset,ArrayListtext)
  54. throwsIOException{
  55. //actualtext
  56. intpos=complexOffset;
  57. intmultiple=2;
  58. //skipsthroughtheprmsbeforewereachthepiecetable.Thesecontaindata
  59. //foractualfastsavedfiles
  60. while(tableStream[pos]==1){
  61. pos++;
  62. intskip=LittleEndian.getShort(tableStream,pos);
  63. pos+=2+skip;
  64. }
  65. if(tableStream[pos]!=2){
  66. thrownewIOException("corruptedWordfile");
  67. }else{
  68. //parseoutthetextpieces
  69. intpieceTableSize=LittleEndian.getInt(tableStream,++pos);
  70. pos+=4;
  71. intpieces=(pieceTableSize-4)/12;
  72. for(intx=0;x<pieces;x++){
  73. intfilePos=
  74. LittleEndian.getInt(tableStream,pos+((pieces+1)*4)+(x*8)+2);
  75. booleanunicode=false;
  76. if((filePos&0x40000000)==0){
  77. unicode=true;
  78. }else{
  79. unicode=false;
  80. multiple=1;
  81. filePos&=~(0x40000000);//givesmeFCindocstream
  82. filePos/=2;
  83. }
  84. inttotLength=
  85. LittleEndian.getInt(tableStream,pos+(x+1)*4)
  86. -LittleEndian.getInt(tableStream,pos+(x*4));
  87. WordTextPiecepiece=newWordTextPiece(filePos,totLength,unicode);
  88. text.add(piece);
  89. }
  90. }
  91. returnmultiple;
  92. }
  93. publicstaticvoidmain(String[]args){
  94. WordExtractorw=newWordExtractor();
  95. POIFSFileSystemps=newPOIFSFileSystem();
  96. try{
  97. Filefile=newFile("C:\\test.doc");
  98. InputStreamin=newFileInputStream(file);
  99. Strings=w.extractText(in);
  100. System.out.println(s);
  101. }catch(Exceptione){
  102. e.printStackTrace();
  103. }
  104. }
  105. }
  106. classWordTextPiece{
  107. privateint_fcStart;
  108. privateboolean_usesUnicode;
  109. privateint_length;
  110. publicWordTextPiece(intstart,intlength,booleanunicode){
  111. _usesUnicode=unicode;
  112. _length=length;
  113. _fcStart=start;
  114. }
  115. publicbooleanusesUnicode(){
  116. return_usesUnicode;
  117. }
  118. publicintgetStart(){
  119. return_fcStart;
  120. }
  121. publicintgetLength(){
  122. return_length;
  123. }
  124. }
/**
 * Extracts the text of a binary MS Word document by reading its
 * "WordDocument" stream and the piece table stored in the "0Table" or
 * "1Table" stream of the same POIFS container.
 */
public class WordExtractor {

	/** Offset in the "WordDocument" stream of the 16-bit flag word. */
	private static final int FLAGS_OFFSET = 0xa;

	/** Flag bit: when set the "1Table" stream is used, otherwise "0Table". */
	private static final int USE_TABLE1_MASK = 0x200;

	/** Offset in the "WordDocument" stream of the 32-bit position, inside the table stream, where parsing starts. */
	private static final int COMPLEX_OFFSET_POS = 0x1a2;

	/** Bit of a piece's file position that marks the piece as one byte per character. */
	private static final int COMPRESSED_MASK = 0x40000000;

	/** Message used for every structural inconsistency found while parsing. */
	private static final String CORRUPTED = "corrupted Word file";

	public WordExtractor() {
	}

	/**
	 * Reads a Word document from {@code in} and returns its text.
	 * Every text piece is decoded on its own (UTF-16LE for two-byte pieces,
	 * ISO-8859-1 for one-byte pieces) and followed by a single space.
	 *
	 * @param in stream positioned at the start of the POIFS (OLE2) file; it is
	 *           not closed by this method
	 * @return the concatenated text of all pieces
	 * @throws IOException if the container cannot be read, a required stream
	 *                     is missing, or the piece table is inconsistent
	 */
	public String extractText(InputStream in) throws IOException {
		POIFSFileSystem fsys = new POIFSFileSystem(in);

		// Gets the information from the document header.
		byte[] header = readDocument(fsys, "WordDocument");
		requireBytes(header, COMPLEX_OFFSET_POS, 4);
		int info = LittleEndian.getShort(header, FLAGS_OFFSET);
		boolean useTable1 = (info & USE_TABLE1_MASK) != 0;

		// Gets the information from the piece table.
		int complexOffset = LittleEndian.getInt(header, COMPLEX_OFFSET_POS);
		String tableName = useTable1 ? "1Table" : "0Table";
		byte[] tableStream = readDocument(fsys, tableName);

		ArrayList text = new ArrayList();
		findText(tableStream, complexOffset, text);

		StringBuffer sb = new StringBuffer();
		int size = text.size();
		for (int x = 0; x < size; x++) {
			WordTextPiece nextPiece = (WordTextPiece) text.get(x);
			int start = nextPiece.getStart();
			int length = nextPiece.getLength();
			boolean unicode = nextPiece.usesUnicode();

			// The piece length is a character count. The byte count is decided
			// per piece: a single document-wide multiplier truncates the
			// two-byte pieces of documents that also contain one-byte pieces.
			long byteLength = unicode ? 2L * length : (long) length;
			if (start < 0 || length < 0 || start + byteLength > header.length) {
				throw new IOException(CORRUPTED);
			}

			// NOTE(review): one-byte pieces are decoded as ISO-8859-1 as before;
			// Word may actually store them as windows-1252 - confirm before changing.
			String charset = unicode ? "UTF-16LE" : "ISO-8859-1";
			sb.append(new String(header, start, (int) byteLength, charset)).append(" ");
		}
		return sb.toString();
	}

	/**
	 * Reads the complete contents of one top-level document of the container.
	 *
	 * @param fsys the opened container
	 * @param name name of the document entry below the root
	 * @return all bytes of that document
	 * @throws IOException if the entry does not exist or cannot be read in full
	 */
	private static byte[] readDocument(POIFSFileSystem fsys, String name) throws IOException {
		DocumentEntry entry = (DocumentEntry) fsys.getRoot().getEntry(name);
		byte[] data = new byte[entry.getSize()];
		DocumentInputStream din = fsys.createDocumentInputStream(name);
		try {
			// A single read() call is not guaranteed to fill the buffer.
			int filled = 0;
			while (filled < data.length) {
				int count = din.read(data, filled, data.length - filled);
				if (count <= 0) {
					throw new IOException(CORRUPTED);
				}
				filled += count;
			}
		} finally {
			din.close();
		}
		return data;
	}

	/**
	 * Fails with an {@link IOException} unless {@code count} bytes starting at
	 * {@code pos} lie inside {@code data}.
	 */
	private static void requireBytes(byte[] data, int pos, int count) throws IOException {
		if (pos < 0 || count < 0 || pos > data.length - count) {
			throw new IOException(CORRUPTED);
		}
	}

	/**
	 * Parses the piece table found in {@code tableStream} and appends one
	 * {@link WordTextPiece} per piece to {@code text}.
	 *
	 * @param tableStream   contents of the "0Table"/"1Table" stream
	 * @param complexOffset position in {@code tableStream} where parsing starts
	 * @param text          list receiving the pieces in table order
	 * @throws IOException if the data at {@code complexOffset} is not a
	 *                     well-formed piece table
	 */
	private static void findText(byte[] tableStream, int complexOffset, ArrayList text)
		throws IOException {
		int pos = complexOffset;
		requireBytes(tableStream, pos, 1);

		// Skips through the prms before we reach the piece table. These contain
		// data for actual fast saved files. Each one is a type byte of 1, a
		// 16-bit size, then that many bytes.
		while (tableStream[pos] == 1) {
			pos++;
			requireBytes(tableStream, pos, 2);
			int skip = LittleEndian.getShort(tableStream, pos);
			if (skip < 0) {
				// A negative size would move the cursor backwards.
				throw new IOException(CORRUPTED);
			}
			pos += 2 + skip;
			requireBytes(tableStream, pos, 1);
		}
		if (tableStream[pos] != 2) {
			throw new IOException(CORRUPTED);
		}

		// Parse out the text pieces: a 32-bit size, (pieces + 1) 32-bit
		// character positions, then one 8-byte descriptor per piece.
		pos++;
		requireBytes(tableStream, pos, 4);
		int pieceTableSize = LittleEndian.getInt(tableStream, pos);
		pos += 4;
		int pieces = (pieceTableSize - 4) / 12;
		if (pieces > 0) {
			// Last byte touched below is the end of the final descriptor's position field.
			requireBytes(tableStream, pos, 12 * pieces + 2);
		}

		int descriptors = pos + ((pieces + 1) * 4);
		for (int x = 0; x < pieces; x++) {
			// The file position sits 2 bytes into each descriptor.
			int filePos = LittleEndian.getInt(tableStream, descriptors + (x * 8) + 2);
			boolean unicode = (filePos & COMPRESSED_MASK) == 0;
			if (!unicode) {
				filePos &= ~COMPRESSED_MASK; // gives me FC in doc stream
				filePos /= 2;
			}
			int totLength =
				LittleEndian.getInt(tableStream, pos + (x + 1) * 4)
					- LittleEndian.getInt(tableStream, pos + (x * 4));

			text.add(new WordTextPiece(filePos, totLength, unicode));
		}
	}

	/**
	 * Prints the text of a Word file to standard output.
	 *
	 * @param args optional; {@code args[0]} is the file to read, defaulting to
	 *             {@code C:\test.doc} when absent
	 */
	public static void main(String[] args) {
		String path = (args != null && args.length > 0) ? args[0] : "C:\\test.doc";
		InputStream in = null;
		try {
			in = new FileInputStream(new File(path));
			String s = new WordExtractor().extractText(in);
			System.out.println(s);
		} catch (Exception e) {
			e.printStackTrace();
		} finally {
			if (in != null) {
				try {
					in.close();
				} catch (IOException ignored) {
					// Nothing useful can be done if closing the input fails.
				}
			}
		}
	}

}
/**
 * Value holder describing one piece of text: where it starts, how long it is,
 * and whether it is flagged as Unicode. All three values are fixed at
 * construction time.
 */
class WordTextPiece {
	/** Start position handed to the constructor. */
	private final int offset;
	/** Length handed to the constructor. */
	private final int charCount;
	/** Unicode flag handed to the constructor. */
	private final boolean wideChars;

	/**
	 * @param start   start position of the piece
	 * @param length  length of the piece
	 * @param unicode whether the piece is flagged as Unicode
	 */
	public WordTextPiece(int start, int length, boolean unicode) {
		this.offset = start;
		this.charCount = length;
		this.wideChars = unicode;
	}

	/** @return the Unicode flag given at construction */
	public boolean usesUnicode() {
		return this.wideChars;
	}

	/** @return the start position given at construction */
	public int getStart() {
		return this.offset;
	}

	/** @return the length given at construction */
	public int getLength() {
		return this.charCount;
	}

}


write word

Java代码 复制代码
  1. publicbooleanwriteWordFile(Stringpath,Stringcontent){
  2. booleanw=false;
  3. try{
  4. //byteb[]=content.getBytes("ISO-8859-1");
  5. byteb[]=content.getBytes();
  6. ByteArrayInputStreambais=newByteArrayInputStream(b);
  7. POIFSFileSystemfs=newPOIFSFileSystem();
  8. DirectoryEntrydirectory=fs.getRoot();
  9. DocumentEntryde=directory.createDocument("WordDocument",bais);
  10. FileOutputStreamostream=newFileOutputStream(path);
  11. fs.writeFilesystem(ostream);
  12. bais.close();
  13. ostream.close();
  14. }catch(IOExceptione){
  15. e.printStackTrace();
  16. }
  17. returnw;
  18. }
	/**
	 * Stores {@code content} as a document named "WordDocument" inside a new
	 * POIFS container and writes that container to {@code path}.
	 *
	 * @param path    file to create or overwrite
	 * @param content text whose bytes become the "WordDocument" stream
	 * @return {@code true} if the file was written and closed without an I/O
	 *         error; {@code false} otherwise (the stack trace is printed)
	 */
	public boolean writeWordFile(String path, String content) {
		boolean w = false;
		FileOutputStream ostream = null;
		try {
			// The bytes are stored unmodified, in the platform default charset.
			// NOTE(review): nothing here builds Word's own binary structures,
			// which is presumably why Word asks for an encoding when opening
			// the result - confirm, and consider a real Word writer instead.
			//	byte b[] = content.getBytes("ISO-8859-1");
			byte b[] = content.getBytes();

			POIFSFileSystem fs = new POIFSFileSystem();
			fs.getRoot().createDocument("WordDocument", new ByteArrayInputStream(b));

			ostream = new FileOutputStream(path);
			fs.writeFilesystem(ostream);

			// Close inside the try so a failure while closing is reported as
			// an unsuccessful write.
			ostream.close();
			ostream = null;
			w = true;
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			// Only reached with a non-null stream when writing failed.
			if (ostream != null) {
				try {
					ostream.close();
				} catch (IOException ignored) {
					// The original failure has already been reported above.
				}
			}
		}

		return w;
	}

写操作的代码还存在问题:它只是把文本的原始字节写入 "WordDocument" 流,并没有生成真正的 Word 二进制结构,所以用 Word 打开该文件时会提示选择字符编码。
希望能改进!


当然这几个jar是少不了的
poi-2.5.1-final-20040804.jar
poi-contrib-2.5.1-final-20040804.jar
poi-scratchpad-2.5.1-final-20040804.jar
评论
11 楼 cleanboxer 2007-06-18 回复
开发poi word 的那个哥们老早就不干了,好像就职于商业的公司,apache还招募人参与呢,2006的事, poi处理word 太弱,还得用vs6.0 c++, java的开源的那个word有更详细的文档,不过没研究过,处理excel还算凑合,
不过也问题比较多,不能区分单元格内内容的格式,或错误判断
10 楼 jlusdy 2007-06-15 回复
问个问题:用 Java 写 Word,哪个开源包比较好?
我看 POI 对 Word 的支持不太够啊
9 楼 andyandyandy 2007-04-02 回复
word的少见,收了
8 楼 strongkill 2007-04-02 回复
http://jakarta.apache.org/poi/index.html
7 楼 dy.f 2007-03-15 回复
poi-2.5.1-final-20040804.jar
poi-contrib-2.5.1-final-20040804.jar
poi-scratchpad-2.5.1-final-20040804.jar

能提供这几个包的下载吗?
6 楼 transist 2007-03-06 回复
感谢楼主,这个word extractor能够比较好的支持中文。
原先我是使用nutch的word文本提取,但是相当大部分的中文word文档无法正确提取,到官方网站查看他们的解决方案,是这么说的“Document with 2-byte characters (that's how Chinese characters are probably stored) are not correctly handled by HWPF.”One more thing you need to consider: HWPF cannot handle "fast saved" Word files. If the documents you need to parse are "fast saved" this adds an extra level of complexity.


有点小问题,希望楼主有时间的时候帮忙大家修复一下,那就是有部分提取的文本前后有小方框的,我想应该是这些字符本不该被提取。
5 楼 sprite 2007-02-07 回复
希望楼主把表情符号关掉 重新编辑一下 让大家能欣赏到正确的代码
4 楼 fish922 2007-02-07 回复
word的代码不对!
3 楼 minimu 2006-09-14 回复
收藏下
2 楼 抛出异常的爱 2006-09-14 回复
代码里应该关了表情符号吧
1 楼 java虫 2006-09-14 回复
不错,收藏一下。论坛里这方面帖子不多
分享到:
评论

相关推荐

Global site tag (gtag.js) - Google Analytics