标签:
本文介绍如何使用开源库 PDFBox 和 POI 读取多种格式文件(pdf、doc、docx、ppt、pptx、xls、xlsx)中的文本。
1. 所需 jar 包的 Maven 坐标:
<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>pdfbox</artifactId>
    <version>2.0.2</version>
</dependency>
<!-- ppt,xls,docx,pptx,xlsx-->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi</artifactId>
    <version>3.14</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-ooxml</artifactId>
    <version>3.14</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-ooxml-schemas</artifactId>
    <version>3.14</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-scratchpad</artifactId>
    <version>3.14</version>
</dependency>
<dependency>
    <groupId>org.apache.xmlbeans</groupId>
    <artifactId>xmlbeans</artifactId>
    <version>2.6.0</version>
</dependency>
<dependency>
    <groupId>dom4j</groupId>
    <artifactId>dom4j</artifactId>
    <version>1.6.1</version>
</dependency>
2. 读取各种文件格式的工具类,完整代码如下:
package cn.lcg.utils;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.text.NumberFormat;
import java.util.List;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.poi.hslf.extractor.PowerPointExtractor;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.xslf.usermodel.XMLSlideShow;
import org.apache.poi.xslf.usermodel.XSLFSlide;
import org.apache.poi.xslf.usermodel.XSLFSlideShow;
import org.apache.poi.xssf.usermodel.XSSFCell;
import org.apache.poi.xssf.usermodel.XSSFRow;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.openxmlformats.schemas.drawingml.x2006.main.CTRegularTextRun;
import org.openxmlformats.schemas.drawingml.x2006.main.CTTextBody;
import org.openxmlformats.schemas.drawingml.x2006.main.CTTextParagraph;
import org.openxmlformats.schemas.presentationml.x2006.main.CTGroupShape;
import org.openxmlformats.schemas.presentationml.x2006.main.CTShape;
import org.openxmlformats.schemas.presentationml.x2006.main.CTSlide;

/**
 * Static helpers that extract plain text from common document formats
 * (doc, docx, pdf, ppt, pptx, xls, xlsx) using Apache PDFBox and Apache POI.
 *
 * <p>Every method opens the file read-only through an input stream (or, for
 * PDF, through PDFBox's own loader) and releases all resources before
 * returning, including when extraction fails.
 *
 * @author yujian
 * @date 2016-10-12
 * @version 0.0.1
 */
public class FileFormat {

    /**
     * Reads the text of a legacy Word (.doc) file.
     *
     * @param filePath path of the .doc file
     * @return the text of the document's overall range
     * @throws Exception if the file cannot be opened or parsed
     */
    public static String getTextFromDoc(String filePath) throws Exception {
        // try-with-resources closes the stream even if parsing throws.
        try (FileInputStream fis = new FileInputStream(new File(filePath))) {
            HWPFDocument doc = new HWPFDocument(fis);
            Range range = doc.getRange();
            return range.text();
        }
    }

    /**
     * Reads the text of a Word 2007+ (.docx) file.
     *
     * @param filePath path of the .docx file
     * @return the text produced by {@link XWPFWordExtractor}
     * @throws IOException if the file cannot be opened or parsed
     */
    @SuppressWarnings("resource") // the extractor only wraps doc, which is closed below
    public static String getTextFromDocx(String filePath) throws IOException {
        try (FileInputStream in = new FileInputStream(filePath);
                XWPFDocument doc = new XWPFDocument(in)) {
            XWPFWordExtractor extractor = new XWPFWordExtractor(doc);
            return extractor.getText();
        }
    }

    /**
     * Reads the text of a PDF file.
     *
     * @param filePath path of the .pdf file
     * @return the text produced by {@link PDFTextStripper}
     * @throws IOException if the file cannot be opened or parsed
     */
    public static String getTextFromPDF(String filePath) throws IOException {
        // The document must be closed, otherwise its underlying file stays open.
        try (PDDocument document = PDDocument.load(new File(filePath))) {
            PDFTextStripper stripper = new PDFTextStripper();
            return stripper.getText(document);
        }
    }

    /**
     * Reads the text of a legacy PowerPoint (.ppt) file.
     *
     * @param filePath path of the .ppt file
     * @return the text produced by {@link PowerPointExtractor}
     * @throws IOException if the file cannot be opened or parsed
     */
    public static String getTextFromPPT(String filePath) throws IOException {
        try (FileInputStream in = new FileInputStream(filePath)) {
            PowerPointExtractor extractor = new PowerPointExtractor(in);
            try {
                return extractor.getText();
            } finally {
                extractor.close();
            }
        }
    }

    /**
     * Reads the text of a PowerPoint 2007+ (.pptx) file.
     *
     * <p>Concatenates, without separators, the text runs of every shape that
     * sits directly in a slide's shape tree and has a text body.
     *
     * @param filePath path of the .pptx file
     * @return the concatenated text runs (never {@code null})
     * @throws IOException if the file cannot be opened or parsed; runtime
     *         failures raised while walking the slides are wrapped so that
     *         callers see a single failure type instead of a {@code null} result
     */
    public static String getTextFromPPTX(String filePath) throws IOException {
        StringBuilder sb = new StringBuilder();
        try (FileInputStream in = new FileInputStream(filePath);
                XMLSlideShow slideShow = new XMLSlideShow(in)) {
            List<XSLFSlide> slides = slideShow.getSlides();
            for (XSLFSlide slide : slides) {
                CTSlide rawSlide = slide.getXmlObject();
                CTGroupShape shapeTree = rawSlide.getCSld().getSpTree();
                for (CTShape shape : shapeTree.getSpArray()) {
                    CTTextBody textBody = shape.getTxBody();
                    if (textBody == null) {
                        continue; // shape carries no text
                    }
                    for (CTTextParagraph paragraph : textBody.getPArray()) {
                        for (CTRegularTextRun run : paragraph.getRArray()) {
                            sb.append(run.getT());
                        }
                    }
                }
            }
        } catch (RuntimeException e) {
            throw new IOException("Failed to extract text from " + filePath, e);
        }
        return sb.toString();
    }

    /**
     * Reads the text of a legacy Excel (.xls) file.
     *
     * <p>Each existing cell contributes its converted value followed by a
     * single space; missing rows and cells are skipped.
     *
     * @param filePath path of the .xls file
     * @return the space-separated cell values of all sheets
     * @throws IOException if the file cannot be opened or parsed
     */
    public static String getTextFromxls(String filePath) throws IOException {
        StringBuilder content = new StringBuilder();
        try (FileInputStream in = new FileInputStream(filePath);
                HSSFWorkbook workbook = new HSSFWorkbook(in)) {
            for (int sheetIndex = 0; sheetIndex < workbook.getNumberOfSheets(); sheetIndex++) {
                HSSFSheet sheet = workbook.getSheetAt(sheetIndex);
                for (int rowIndex = 0; rowIndex <= sheet.getLastRowNum(); rowIndex++) {
                    HSSFRow row = sheet.getRow(rowIndex);
                    if (row == null) {
                        continue;
                    }
                    for (int cellnum = 0; cellnum < row.getLastCellNum(); cellnum++) {
                        HSSFCell cell = row.getCell(cellnum);
                        if (cell != null) {
                            // convertCell handles every cell type; reading a
                            // rich-text value directly fails on non-text cells.
                            content.append(convertCell(cell)).append(' ');
                        }
                    }
                }
            }
        }
        return content.toString();
    }

    /**
     * Reads the text of an Excel 2007+ (.xlsx) file.
     *
     * <p>Every cell position up to a row's last cell contributes a single
     * space, preceded by the converted cell value when it is non-empty.
     *
     * @param filePath path of the .xlsx file
     * @return the space-separated cell values of all sheets
     * @throws IOException if the file cannot be opened or parsed
     */
    public static String getTextFromxlsx(String filePath) throws IOException {
        StringBuilder content = new StringBuilder();
        // Open from a stream rather than by path: a path-opened package is
        // read-write and closing it would save it back over the source file.
        try (FileInputStream in = new FileInputStream(filePath);
                XSSFWorkbook workbook = new XSSFWorkbook(in)) {
            for (int sheetIndex = 0; sheetIndex < workbook.getNumberOfSheets(); sheetIndex++) {
                XSSFSheet sheet = workbook.getSheetAt(sheetIndex);
                for (int rowIndex = 0; rowIndex <= sheet.getLastRowNum(); rowIndex++) {
                    XSSFRow row = sheet.getRow(rowIndex);
                    if (row == null) {
                        continue;
                    }
                    for (int cellIndex = 0; cellIndex < row.getLastCellNum(); cellIndex++) {
                        XSSFCell cell = row.getCell(cellIndex);
                        if (cell != null) {
                            String value = convertCell(cell);
                            if (value.length() > 0) {
                                content.append(value);
                            }
                        }
                        content.append(" ");
                    }
                }
            }
        }
        return content.toString();
    }

    /**
     * Converts a spreadsheet cell to its trimmed textual form.
     *
     * <p>Numeric values are formatted with the default-locale
     * {@link NumberFormat} with grouping disabled; formula cells are rendered
     * from their cached result; unknown types yield an empty string.
     *
     * @param cell the cell to convert, may be {@code null}
     * @return the trimmed text of the cell, or an empty string for
     *         {@code null}, blank or unsupported cells
     */
    private static String convertCell(Cell cell) {
        if (cell == null) {
            return "";
        }

        int cellType = cell.getCellType();
        if (cellType == Cell.CELL_TYPE_FORMULA) {
            // Render a formula through the type of its last computed result.
            cellType = cell.getCachedFormulaResultType();
        }

        String cellValue;
        switch (cellType) {
            case Cell.CELL_TYPE_NUMERIC: {
                NumberFormat formatter = NumberFormat.getInstance();
                formatter.setGroupingUsed(false);
                cellValue = formatter.format(cell.getNumericCellValue());
                break;
            }
            case Cell.CELL_TYPE_STRING:
                cellValue = cell.getStringCellValue();
                break;
            case Cell.CELL_TYPE_BLANK:
                cellValue = cell.getStringCellValue();
                break;
            case Cell.CELL_TYPE_BOOLEAN:
                cellValue = Boolean.valueOf(cell.getBooleanCellValue()).toString();
                break;
            case Cell.CELL_TYPE_ERROR:
                cellValue = String.valueOf(cell.getErrorCellValue());
                break;
            default:
                cellValue = "";
        }
        return cellValue.trim();
    }
}
时间有限,这里就不逐行解释了;这些代码已在我的项目中正常运行,大家可以放心使用。
用java读取多种文件格式的文件(pdf,pptx,ppt,doc,docx..)
标签:
原文地址:http://www.cnblogs.com/yuliman/p/5975953.html