标签:xhtml jsoup 表达式 share stack com builder parser span
需要的jar包:Apache POI(poi、poi-scratchpad)、jsoup、Flying Saucer(xhtmlrenderer)及其依赖的iText;其中有一些是依赖包,可以使用maven下载。
doc文件转换为html文件
package com.gsww.sxzz.controller.service; import org.apache.poi.hwpf.HWPFDocument; import org.apache.poi.hwpf.converter.PicturesManager; import org.apache.poi.hwpf.converter.WordToHtmlConverter; import org.apache.poi.hwpf.usermodel.Picture; import org.apache.poi.hwpf.usermodel.PictureType; import org.jsoup.Jsoup; import org.w3c.dom.Document; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; import javax.xml.transform.OutputKeys; import javax.xml.transform.Transformer; import javax.xml.transform.TransformerException; import javax.xml.transform.TransformerFactory; import javax.xml.transform.dom.DOMSource; import javax.xml.transform.stream.StreamResult; import java.io.*; import java.util.List; /** * Created by Carey on 15-2-2. */ public class docTohtml { public static void main(String argv[]) { try { convert2Html("D:\\b.doc","D:\\1.html"); } catch (Exception e) { e.printStackTrace(); } } //输出html文件 public static void writeFile(String content, String path) { FileOutputStream fos = null; BufferedWriter bw = null; org.jsoup.nodes.Document doc = Jsoup.parse(content); String styleOld=doc.getElementsByTag("style").html(); //统一字体格式为宋体 styleOld=styleOld.replaceAll("font-family:.+(?=;\\b)", "font-family:SimSun"); doc.getElementsByTag("head").empty(); doc.getElementsByTag("head").append("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\"></meta>"); doc.getElementsByTag("head").append(" <style type=\"text/css\"></style>"); doc.getElementsByTag("style").append(styleOld); /*正则表达式查询字体内容:font-family:.+(?=;\b)*/ System.out.println(content); content=doc.html(); content=content.replace("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\">", "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\"></meta>"); try { File file = new File(path); fos = new FileOutputStream(file); bw = new BufferedWriter(new OutputStreamWriter(fos,"UTF-8")); bw.write(content); } catch 
(FileNotFoundException fnfe) { fnfe.printStackTrace(); } catch (IOException ioe) { ioe.printStackTrace(); } finally { try { if (bw != null) bw.close(); if (fos != null) fos.close(); } catch (IOException ie) { } } } //word 转 html public static void convert2Html(String fileName, String outPutFile) throws TransformerException, IOException, ParserConfigurationException { HWPFDocument wordDocument = new HWPFDocument(new FileInputStream(fileName));//WordToHtmlUtils.loadDoc(new FileInputStream(inputFile)); //兼容2007 以上版本 // XSSFWorkbook xssfwork=new XSSFWorkbook(new FileInputStream(fileName)); WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter( DocumentBuilderFactory.newInstance().newDocumentBuilder() .newDocument()); wordToHtmlConverter.setPicturesManager( new PicturesManager() { public String savePicture( byte[] content, PictureType pictureType, String suggestedName, float widthInches, float heightInches ) { return "test/"+suggestedName; } } ); wordToHtmlConverter.processDocument(wordDocument); //save pictures List pics=wordDocument.getPicturesTable().getAllPictures(); if(pics!=null){ for(int i=0;i<pics.size();i++){ Picture pic = (Picture)pics.get(i); System.out.println(); try { pic.writeImageContent(new FileOutputStream("D:/test/" + pic.suggestFullFileName())); } catch (FileNotFoundException e) { e.printStackTrace(); } } } Document htmlDocument = wordToHtmlConverter.getDocument(); ByteArrayOutputStream out = new ByteArrayOutputStream(); DOMSource domSource = new DOMSource(htmlDocument); StreamResult streamResult = new StreamResult(out); TransformerFactory tf = TransformerFactory.newInstance(); Transformer serializer = tf.newTransformer(); serializer.setOutputProperty(OutputKeys.ENCODING, "UTF-8"); serializer.setOutputProperty(OutputKeys.INDENT, "yes"); serializer.setOutputProperty(OutputKeys.METHOD, "HTML"); serializer.transform(domSource, streamResult); out.close(); writeFile(new String(out.toByteArray()), outPutFile); } }
遇到的问题:当doc转换为html时,图像(绘制)的线条不会被转换过来,只有在table表格中的才可以转换为span标签。如果要做下划线,可以放一个table单元格并只设定其下边框,这样就可以完美转换为html了。
将html转换为pdf
package com.gsww.sxzz.controller.service; import com.lowagie.text.pdf.BaseFont; import org.xhtmlrenderer.pdf.ITextFontResolver; import org.xhtmlrenderer.pdf.ITextRenderer; import java.io.File; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.OutputStream; /** * Created by Carey on 15-2-2. */ public class htmlToPdf { public boolean convertHtmlToPdf(String inputFile, String outputFile) { try { OutputStream os = new FileOutputStream(outputFile); ITextRenderer renderer = new ITextRenderer(); String url = new File(inputFile).toURI().toURL().toString(); renderer.setDocument(url); // 解决中文支持问题 ITextFontResolver fontResolver = renderer.getFontResolver(); /*fontResolver.addFont("C:\\Windows\\Fonts\\simsunb.ttf", BaseFont.IDENTITY_H, BaseFont.NOT_EMBEDDED); *///宋体文件的相对路径 fontResolver.addFont("C:\\Windows\\Fonts\\simsun.ttc", BaseFont.IDENTITY_H, BaseFont.NOT_EMBEDDED); renderer.getSharedContext().setBaseURL("file:/D:/"); renderer.layout(); renderer.createPDF(os); os.flush(); os.close(); } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } return true; } public static void main(String [] args){ htmlToPdf html2Pdf =new htmlToPdf(); try { html2Pdf.convertHtmlToPdf("D:\\1.html","D:\\index.pdf"); } catch (Exception e) { e.printStackTrace(); } } }
标签:xhtml jsoup 表达式 share stack com builder parser span
原文地址:http://www.cnblogs.com/gynbk/p/7230849.html