标签:format size fileinput ttext sbo rgs 组件 extractor exist
poi读取段落demo
package com.ocr.word.poi;
import java.io.FileInputStream;
import java.util.List;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
/**
 * Demo that prints basic properties of every paragraph of a .docx document using the
 * Apache POI XWPF API.
 *
 * <p>For each paragraph the text, alignment, number of runs, style, numbering format and
 * word-wrap flag are written to standard output, one value per line, followed by a
 * separator line.
 */
public class ParagraphReader {
    /** Document that is read when no path is supplied on the command line. */
    private static final String DEFAULT_PATH = "D:\\word\\aaa.docx";

    /**
     * Reads the document and dumps its paragraphs to standard output.
     *
     * @param args optional; {@code args[0]}, when present, replaces the default document path
     */
    public static void main(String[] args) {
        String path = (args != null && args.length > 0) ? args[0] : DEFAULT_PATH;
        // try-with-resources closes the document and the stream even if reading fails.
        try (FileInputStream fis = new FileInputStream(path);
                XWPFDocument xdoc = new XWPFDocument(OPCPackage.open(fis))) {
            List<XWPFParagraph> paragraphList = xdoc.getParagraphs();
            for (XWPFParagraph paragraph : paragraphList) {
                System.out.println(paragraph.getText());
                System.out.println(paragraph.getAlignment());
                // println (not print) so the run count is not glued to the style on one line.
                System.out.println(paragraph.getRuns().size());
                System.out.println(paragraph.getStyle());
                // Returns numbering format for this paragraph, eg bullet or lowerLetter.
                System.out.println(paragraph.getNumFmt());
                System.out.println(paragraph.isWordWrapped());
                System.out.println("********************************************************************");
            }
        } catch (Exception ex) {
            // Demo entry point: report the failure and exit normally.
            ex.printStackTrace();
        }
    }
}
package com.ocr.word.poi;
import java.io.FileInputStream;
import java.util.Iterator;
import java.util.List;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xwpf.usermodel.IBodyElement;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.apache.poi.xwpf.usermodel.XWPFTable;
/**
 * Demo that walks the body elements of a .docx document with the Apache POI XWPF API and
 * prints the row count and every cell text of each table it encounters.
 */
public class TableReader {
    /** Document that is read when no path is supplied on the command line. */
    private static final String DEFAULT_PATH = "D:\\word\\aaa.docx";

    /**
     * Reads the document and dumps its tables to standard output.
     *
     * @param args optional; {@code args[0]}, when present, replaces the default document path
     */
    public static void main(String[] args) {
        String path = (args != null && args.length > 0) ? args[0] : DEFAULT_PATH;
        // try-with-resources closes the document and the stream even if reading fails.
        try (FileInputStream fis = new FileInputStream(path);
                XWPFDocument xdoc = new XWPFDocument(OPCPackage.open(fis))) {
            Iterator<IBodyElement> bodyElements = xdoc.getBodyElementsIterator();
            while (bodyElements.hasNext()) {
                IBodyElement element = bodyElements.next();
                if (!(element instanceof XWPFTable)) {
                    continue;
                }
                // Print only the table that this body element IS. Asking the element's body
                // for getTables() would return every table of the body, so each table
                // would be printed once per table element.
                XWPFTable table = (XWPFTable) element;
                System.out.println("Total Number of Rows of Table:" + table.getNumberOfRows());
                table.getRows().forEach(row ->
                        row.getTableCells().forEach(cell -> System.out.println(cell.getText())));
            }
        } catch (Exception ex) {
            // Demo entry point: report the failure and exit normally.
            ex.printStackTrace();
        }
    }
}
package com.ocr.word.poi;
import java.io.FileInputStream;
import java.util.Iterator;
import java.util.List;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.apache.poi.xwpf.usermodel.XWPFRun;
/**
 * Demo that prints character-formatting flags of every run in a .docx document using the
 * Apache POI XWPF API.
 *
 * <p>For each run the bold, highlighted and capitalized flags and the font size are printed,
 * one value per line; a separator line is printed after each paragraph.
 */
public class StyleReader {
    /** Document that is read when no path is supplied on the command line. */
    private static final String DEFAULT_PATH = "D:\\word\\aaa.docx";

    /**
     * Reads the document and dumps the run formatting to standard output.
     *
     * @param args optional; {@code args[0]}, when present, replaces the default document path
     */
    public static void main(String[] args) {
        String path = (args != null && args.length > 0) ? args[0] : DEFAULT_PATH;
        // try-with-resources closes the document and the stream even if reading fails.
        try (FileInputStream fis = new FileInputStream(path);
                XWPFDocument xdoc = new XWPFDocument(OPCPackage.open(fis))) {
            List<XWPFParagraph> paragraphList = xdoc.getParagraphs();
            for (XWPFParagraph paragraph : paragraphList) {
                for (XWPFRun rn : paragraph.getRuns()) {
                    System.out.println(rn.isBold());
                    System.out.println(rn.isHighlighted());
                    System.out.println(rn.isCapitalized());
                    System.out.println(rn.getFontSize());
                }
                System.out.println("********************************************************************");
            }
        } catch (Exception ex) {
            // Demo entry point: report the failure and exit normally.
            ex.printStackTrace();
        }
    }
}
package com.ocr.word;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import org.apache.poi.POIXMLDocument;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.apache.poi.xwpf.usermodel.XWPFRun;
import org.apache.poi.xwpf.usermodel.XWPFTable;
import org.apache.poi.xwpf.usermodel.XWPFTableCell;
import org.apache.poi.xwpf.usermodel.XWPFTableRow;
import org.springframework.util.FileCopyUtils;
/**
 * Helpers and a demo entry point for extracting text from Word documents with Apache POI:
 * the HWPF {@code WordExtractor} for {@code .doc} files and the XWPF API for {@code .docx}
 * files.
 */
public class Readword {
    /** Offset between a full-width character and its half-width counterpart (0xFEE0). */
    public static final char DBC_SBC_STEP = 65248;
    private final static String RESULT_DATA = "data";
    private final static String RESULT_ANGLE = "angle";
    private final static String RESULT_WORDS_INFO = "prism_wordsInfo";

    /**
     * Converts a string to a sequence of {@code \\uXXXX} escapes, one per UTF-16 code unit.
     *
     * <p>Each escape is zero-padded to four lowercase hex digits, so {@code "a"} becomes
     * {@code \\u0061}.
     *
     * @param str the text to convert; must not be {@code null}
     * @return the escaped text, empty when {@code str} is empty
     */
    public static String stringToUnicode(String str) {
        StringBuilder sb = new StringBuilder(str.length() * 6);
        for (int i = 0; i < str.length(); i++) {
            // %04x pads to four digits; unpadded hex would produce invalid escapes
            // for code units below 0x1000.
            sb.append(String.format("\\u%04x", (int) str.charAt(i)));
        }
        return sb.toString();
    }

    /**
     * Prints the text of a paragraph followed by its alignment name.
     *
     * <p>A paragraph without runs is reported as an empty line; a paragraph whose runs
     * contain no text prints nothing.
     *
     * @param paragraph the paragraph to print
     */
    private static void getParagraphText(XWPFParagraph paragraph) {
        // All runs that make up the paragraph.
        List<XWPFRun> runs = paragraph.getRuns();
        if (runs.isEmpty()) {
            System.out.println("按了回车(新段落)");
            return;
        }
        StringBuilder runText = new StringBuilder();
        for (XWPFRun run : runs) {
            runText.append(run.text());
        }
        if (runText.length() > 0) {
            runText.append(",对齐方式:").append(paragraph.getAlignment().name());
            System.out.println(runText);
        }
    }

    /**
     * Prints the content of every cell of a table, paragraph by paragraph.
     *
     * @param table the table to print
     */
    private static void getTabelText(XWPFTable table) {
        List<XWPFTableRow> rows = table.getRows();
        for (XWPFTableRow row : rows) {
            List<XWPFTableCell> cells = row.getTableCells();
            for (XWPFTableCell cell : cells) {
                // cell.getText() would be simpler but does not expose the alignment.
                // A cell behaves like a small document: it can itself hold paragraphs
                // and tables, so its paragraphs are printed one by one.
                List<XWPFParagraph> paragraphs = cell.getParagraphs();
                for (XWPFParagraph paragraph : paragraphs) {
                    getParagraphText(paragraph);
                }
            }
        }
    }

    /**
     * Extracts the text of a Word document, printing each paragraph as it is read, then the
     * concatenated text and finally a map of paragraph label to paragraph text.
     *
     * @param args optional; {@code args[0]}, when present, replaces the default document path
     */
    public static void main(String[] args) {
        String filePath = (args != null && args.length > 0) ? args[0] : "D:\\word\\aaa.doc";
        StringBuilder wordMapStr = new StringBuilder();
        // Insertion-ordered so paragraphs are listed in document order.
        Map<String, String> wordMap = new LinkedHashMap<>();
        try {
            if (filePath.endsWith(".doc")) {
                readDoc(filePath, wordMap, wordMapStr);
            } else if (filePath.endsWith(".docx")) {
                readDocx(filePath, wordMap, wordMapStr);
            }
        } catch (Exception e) {
            // Demo entry point: report the failure and still print what was collected.
            e.printStackTrace();
        }
        System.out.println("-->" + wordMapStr.toString());
        System.out.println(wordMap);
    }

    /** Collects the paragraphs of a legacy .doc file via the HWPF WordExtractor. */
    private static void readDoc(String filePath, Map<String, String> wordMap, StringBuilder wordMapStr)
            throws java.io.IOException {
        // try-with-resources closes the extractor and the stream even if extraction fails.
        try (InputStream fis = new FileInputStream(new File(filePath));
                WordExtractor wordExtractor = new WordExtractor(fis)) {
            int i = 1;
            for (String words : wordExtractor.getParagraphText()) {
                System.out.println(words);
                wordMap.put("DOC文档,第(" + i + ")段内容", words);
                wordMapStr.append(words).append("\n");
                i++;
            }
        }
    }

    /** Collects the paragraphs and table text of a .docx file from a temporary copy. */
    private static void readDocx(String filePath, Map<String, String> wordMap, StringBuilder wordMapStr)
            throws java.io.IOException {
        // A unique temp file avoids overwriting (and later deleting) an unrelated
        // "tempFile.docx" in the working directory.
        File tempCopy = File.createTempFile("readword-", ".docx");
        OPCPackage opcPackage = null;
        try {
            FileCopyUtils.copy(new File(filePath), tempCopy);
            opcPackage = POIXMLDocument.openPackage(tempCopy.getPath());
            XWPFDocument document = new XWPFDocument(opcPackage);
            int i = 1;
            for (XWPFParagraph paragraph : document.getParagraphs()) {
                String words = paragraph.getText();
                System.out.println(words);
                wordMap.put("DOCX文档,第(" + i + ")段内容", words + "\n");
                wordMapStr.append(words);
                i++;
            }
            for (XWPFTable table : document.getTables()) {
                wordMapStr.append(table.getText());
            }
        } finally {
            if (opcPackage != null) {
                // Close without saving so the file handle is released before deletion.
                opcPackage.revert();
            }
            if (!tempCopy.delete()) {
                tempCopy.deleteOnExit();
            }
        }
    }
}
标签:format size fileinput ttext sbo rgs 组件 extractor exist
原文地址:https://blog.51cto.com/4534309/2477847