标签:
import java.io.BufferedWriter; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.File; import java.io.OutputStreamWriter; import java.util.HashSet; import org.apache.poi.hslf.HSLFSlideShow; import org.apache.poi.hslf.model.Slide; import org.apache.poi.hslf.model.TextRun; import org.apache.poi.hslf.usermodel.RichTextRun; import org.apache.poi.hslf.usermodel.SlideShow; import org.apache.poi.hwpf.HWPFDocument; import org.apache.poi.hwpf.extractor.WordExtractor; import org.apache.poi.hwpf.usermodel.CharacterRun; import org.apache.poi.hwpf.usermodel.Paragraph; import org.apache.poi.hwpf.usermodel.Range; import org.apache.poi.hwpf.usermodel.Section; import org.apache.poi.xslf.usermodel.XMLSlideShow; import org.apache.poi.xslf.usermodel.XSLFShape; import org.apache.poi.xslf.usermodel.XSLFSlide; import org.apache.poi.xslf.usermodel.XSLFTable; import org.apache.poi.xslf.usermodel.XSLFTableCell; import org.apache.poi.xslf.usermodel.XSLFTableRow; import org.apache.poi.xslf.usermodel.XSLFTextParagraph; import org.apache.poi.xslf.usermodel.XSLFTextRun; import org.apache.poi.xslf.usermodel.XSLFTextShape; public class read { private static InputStream is =null; public static void readFiles(String path){ File f = new File(path); File[] files = f.listFiles(); for(File everyfile : files){ StringBuilder sb = new StringBuilder(); if(everyfile.isDirectory()) continue; String Filename = everyfile.getName(); if(Filename.startsWith("~")) continue; if(!Filename.endsWith("doc")) continue; System.out.println(Filename); sb.append("###@@@").append(Filename.substring(0,Filename.lastIndexOf("."))).append("\n"); sb.append("----------------------").append("\n"); try { //输入文件流 is = new FileInputStream(everyfile); if(Filename.toLowerCase().endsWith("ppt")||Filename.toLowerCase().endsWith("pptm")){ try { XMLSlideShow pptx = new XMLSlideShow(is); is.close(); for(int x= 0 ; x< pptx.getSlides().length ; x++){ XSLFSlide slide = pptx.getSlides()[x]; if (slide.getShapes().length == 0) continue; String title = getTitle(slide); if(title != null) sb.append(title).append("\t").append("title##@@").append("\n"); for(XSLFShape shape : slide){ if(shape instanceof XSLFTextShape){ XSLFTextShape content = (XSLFTextShape)shape; for( XSLFTextParagraph ttp: content.getTextParagraphs()){ if(ttp.getText().equals(title)) continue; //用一个set统计到底有多少字体大小,如果只有一种字体大小,则直接添加paragraph HashSet<Float> sizeset = new HashSet<Float>(); for(XSLFTextRun tr : ttp.getTextRuns()){ if (tr.getText().trim().equals("")) continue; if(tr.getText().trim().equals(title)) continue; float size = (float) tr.getFontSize(); sizeset.add(size); } if(sizeset.size()!=1){ for(XSLFTextRun tr : ttp.getTextRuns()){ if (tr.getText().trim().equals("")) continue; if(tr.getText().trim().equals(title)) continue; String text = tr.getText(); float size = (float) tr.getFontSize(); sb.append(text.trim()).append("\t").append(size).append("##@@").append("\n"); } }else{ sb.append(ttp.getText().trim().replaceAll("[\\n\\r]", " ")).append("\t").append((float)sizeset.toArray()[0]).append("##@@").append("\n"); } } }else if(shape instanceof XSLFTable){ XSLFTable txShape = (XSLFTable)shape; for(XSLFTableRow row : txShape.getRows()){ for(XSLFTableCell cell: row.getCells()){ XSLFTextShape content = (XSLFTextShape)cell; for( XSLFTextParagraph ttp: content.getTextParagraphs()){ if(ttp.getText().equals(title)) continue; //用一个set统计到底有多少字体大小,如果只有一种字体大小,则直接添加paragraph HashSet<Float> sizeset = new HashSet<Float>(); for(XSLFTextRun tr : ttp.getTextRuns()){ if (tr.getText().trim().equals("")) continue; if(tr.getText().trim().equals(title)) continue; float size = (float) tr.getFontSize(); sizeset.add(size); } if(sizeset.size()!=1){ for(XSLFTextRun tr : ttp.getTextRuns()){ if (tr.getText().trim().equals("")) continue; if(tr.getText().trim().equals(title)) continue; String text = tr.getText(); float size = (float) tr.getFontSize(); sb.append(text.trim()).append("\t").append(size).append("##@@").append("\n"); } }else{ sb.append(ttp.getText().trim().replaceAll("[\\n\\r]", " ")).append("\t").append((float)sizeset.toArray()[0]).append("##@@").append("\n"); } } } } } } if(x!=pptx.getSlides().length-1) sb.append("----------------------").append("\n"); } } catch (IOException e) { e.printStackTrace(); } }else if(Filename.endsWith("ppt")){ try { SlideShow ss = new SlideShow(new HSLFSlideShow(is)); is.close(); for(int x = 0 ; x < ss.getSlides().length ; x ++){ Slide slide = ss.getSlides()[x]; if (slide.getShapes().length ==0) continue; String title = getTitle(slide); if(title != null) sb.append(title).append("\t").append("title##@@").append("\n"); for(TextRun tr : slide.getTextRuns()){ HashSet<Float> sizeset = new HashSet<Float>(); for(RichTextRun rtr : tr.getRichTextRuns()){ if (rtr.getText().trim().equals("")|| rtr.getText() ==null) continue; if(rtr.getText().trim().equals(title)) continue; sizeset.add((float)rtr.getFontSize()); } if(sizeset.size()!=1){ for(RichTextRun rtr : tr.getRichTextRuns()){ if (rtr.getText().trim().equals("") || rtr.getText() ==null) continue; if(rtr.getText().trim().equals(title)) continue; String text = rtr.getText(); float size = (float) rtr.getFontSize(); sb.append(text.trim()).append("\t").append(size).append("##@@").append("\n"); } }else { for(RichTextRun rtr : tr.getRichTextRuns()){ if (rtr.getText().trim().equals("")|| rtr.getText() ==null) continue; if(rtr.getText().trim().equals(title)) continue; sb.append(rtr.getText().trim()).append(" "); } sb.append("\t").append((float)sizeset.toArray()[0]).append("##@@").append("\n"); } } if(x!=ss.getSlides().length-1) sb.append("----------------------").append("\n"); } } catch (IOException e) { e.printStackTrace(); } }else if(Filename.endsWith("doc")){ try { HWPFDocument hwpf = new HWPFDocument(is); Range range = hwpf.getRange(); for (int x = 0; x < range.numSections(); x++) { Section s = range.getSection(x); for (int y = 0; y < s.numParagraphs(); y++) { Paragraph p = s.getParagraph(y); for (int z = 0; z < p.numCharacterRuns(); z++) { CharacterRun run = p.getCharacterRun(z); //字符串文本 String text = run.text().trim(); if(text ==null ||text == " "|| text=="") continue; sb.append(text.trim()).append("\t").append(run.getFontSize()).append("##@@").append("\n"); } } if (x != range.numSections()-1) sb.append("----------------------").append("\n"); } } catch (IOException e) { e.printStackTrace(); } } } catch (FileNotFoundException e) { e.printStackTrace(); } write(sb.toString()); } } public static String getTitle(XSLFSlide slide){ String title = null; if (slide.getTitle() != null && !slide.getTitle().trim().equals("")){ title= slide.getTitle().trim(); } return title; } public static String getTitle(Slide slide){ String title = null; if (slide.getTitle() != null && !slide.getTitle().trim().equals("")){ title= slide.getTitle().trim(); } return title; } static FileOutputStream fos =null; static OutputStreamWriter osw =null; static BufferedWriter bw =null; public static void write(String content){ File f = new File("ressss.csv"); try { fos = new FileOutputStream(f,true); osw = new OutputStreamWriter(fos,"utf-8"); bw = new BufferedWriter(osw); bw.write(content); bw.flush(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); }finally{ if(bw !=null){ try { bw.close(); } catch (IOException e) { e.printStackTrace(); } bw =null; } if(osw !=null){ try { osw.close(); } catch (IOException e) { e.printStackTrace(); } osw =null; } if(fos !=null){ try { fos.close(); } catch (IOException e) { e.printStackTrace(); } fos =null; } } } public static void main(String[] args) throws Exception { readFiles("C:\\Users\\ooon\\Desktop\\DKM_data\\DKM_data"); } }
标签:
原文地址:http://www.cnblogs.com/ooon/p/4828007.html