码迷,mamicode.com
首页 > 编程语言 > 详细

python_读取 doc,docx,pdf

时间:2017-04-01 01:27:23      阅读:845      评论:0      收藏:0      [点我收藏+]

标签:win32   graph   []   layout   pre   max   color   cli   docx   

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import docx

from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO

from win32com import client
import sys
# Python 2 idiom: reload ``sys`` and then make GB2312 the interpreter-wide
# default encoding used for implicit str/unicode conversions.
# NOTE(review): neither the ``reload`` builtin nor ``sys.setdefaultencoding``
# is available on Python 3 -- confirm this script is only run on Python 2.
reload(sys)
sys.setdefaultencoding('gb2312')  # the encoding name must be a string literal

def readDocx(docxPath):
    """Return the text of a .docx file, one paragraph per line.

    Args:
        docxPath: Path of the .docx file; passed to ``docx.Document``.

    Returns:
        The ``text`` of every entry in ``doc.paragraphs``, stripped of
        leading/trailing whitespace and joined with newline characters.
    """
    doc = docx.Document(docxPath)
    return '\n'.join(p.text.strip() for p in doc.paragraphs)
def readPdf(pdfPath):
    """Extract the text of a PDF file using pdfminer.

    Args:
        pdfPath: Path of the PDF file; opened in binary mode.

    Returns:
        The contents of the in-memory buffer that the ``TextConverter``
        wrote to while every page yielded by ``PDFPage.get_pages`` was
        processed (the converter is configured with the ``utf-8`` codec).
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    try:
        device = TextConverter(rsrcmgr, retstr, codec='utf-8',
                               laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # ``open`` instead of the Python-2-only ``file`` builtin; the
            # ``with`` block closes the handle even if a page fails to parse.
            with open(pdfPath, 'rb') as fp:
                # An empty page-number set with maxpages=0 presumably selects
                # every page -- TODO confirm against the pdfminer docs.
                pages = PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                          caching=True,
                                          check_extractable=True)
                for page in pages:
                    interpreter.process_page(page)
        finally:
            device.close()
        # Read the buffer only after the device is closed, as the original did.
        return retstr.getvalue()
    finally:
        retstr.close()
def readDoc(docPath):
    """Return the text of a legacy .doc file by converting it with MS Word.

    Word is driven through COM: the document is opened, saved to
    ``c:/temp.txt`` with save-format code 2, and that file is then read
    back and decoded as GBK.

    Args:
        docPath: Path of the .doc file; passed to ``Documents.Open``.

    Returns:
        The lines of the exported file, each decoded and stripped of
        leading/trailing whitespace, joined with newline characters.
    """
    tmpPath = 'c:/temp.txt'
    word = client.Dispatch('Word.Application')
    try:
        # Open the existing document.
        doc = word.Documents.Open(docPath)
        try:
            # 2 is presumably WdSaveFormat.wdFormatText (plain text) --
            # TODO confirm against the Word object-model reference.
            doc.SaveAs(tmpPath, 2)
        finally:
            doc.Close()
    finally:
        # Always shut Word down so a failed conversion does not leave a
        # Word process running.
        word.Quit()
    # Read raw bytes and decode explicitly; strip() removes the line
    # terminators regardless of whether they are "\n" or "\r\n".
    with open(tmpPath, 'rb') as f:
        return '\n'.join(line.decode('gbk').strip() for line in f)
if __name__ == '__main__':
    # Usage examples for the other two readers:
    #     print(readDocx('d:/1.docx'))
    #     print(readPdf('d:/3.pdf'))
    docValue = readDoc('d:/2.doc')
    # Single-argument call form prints identically on Python 2 and 3.
    print(docValue)

 

python_读取 doc,docx,pdf

标签:win32   graph   []   layout   pre   max   color   cli   docx   

原文地址:http://www.cnblogs.com/zy900406/p/6654017.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!