标签:
# -*- coding: utf-8 -*-
"""Mirror a directory tree, re-encoding GBK ``.txt`` files as UTF-8.

Every file below ``src`` is reproduced under ``ddn`` with the same relative
layout.  ``.txt`` files are converted to UTF-8 (files that already are UTF-8
are left untouched); every other file is copied verbatim.

The code deliberately sticks to constructs that behave the same on Python 2
and Python 3: files are handled as raw bytes and decoded/encoded explicitly.
"""
import codecs
import itertools
import os
import re
import shutil
import sys

src = "S:\\date\\before"  # source tree to convert; sub-folders are walked too
ddn = "S:\\date\\after"  # root of the converted tree


def ReadFileandSave(filepath, encoding="gbk"):
    """Print the byte offset of the marker "文件序号" in the first 100 lines.

    Debug helper: for each of the first 100 lines of *filepath* it prints the
    result of ``find`` on the raw line (``-1`` when the marker is absent).
    Despite its name it does not save anything.

    Args:
        filepath: Path of the file to scan.
        encoding: Encoding used to turn the marker into bytes before the
            search.  NOTE(review): the original searched raw lines for the
            marker in whatever encoding the script itself was saved in; GBK is
            assumed here because the script's inputs are GBK -- confirm.
    """
    marker = u"文件序号".encode(encoding)
    with open(filepath, "rb") as handle:
        # islice stops reading after line 100 instead of draining the file.
        for line in itertools.islice(handle, 100):
            print(line.find(marker))


def obtainFileType(filepath):
    """Guess the text encoding of *filepath* from its first line.

    A UTF-8 byte-order mark is recognised directly.  Otherwise the first line
    is passed to ``chardet`` when that package is importable.

    Args:
        filepath: Path of the file to inspect.

    Returns:
        ``"UTF-8-SIG"`` for a file starting with a UTF-8 BOM, otherwise the
        ``encoding`` entry reported by ``chardet.detect`` (which may be
        ``None``), or ``None`` when ``chardet`` is not installed.
    """
    with open(filepath, "rb") as handle:
        head = handle.readline()
    if head.startswith(codecs.BOM_UTF8):
        return "UTF-8-SIG"
    try:
        import chardet
    except ImportError:
        # Without chardet the caller falls back to its "unknown" handling.
        return None
    return chardet.detect(head)["encoding"]


def search(src, handler):
    """Walk *src* recursively and call *handler* for every non-directory entry.

    Args:
        src: Directory to walk.
        handler: Callable invoked as ``handler(path, ddn)``, where ``ddn`` is
            the module-level destination root at the time of the call.
    """
    for name in os.listdir(src):
        path = os.path.join(src, name)
        if os.path.isdir(path):
            search(path, handler)
        else:
            handler(path, ddn)


def _convert_text_file(sfn, dest_dir):
    """Write a UTF-8 version of the text file *sfn* into *dest_dir*.

    Raises:
        UnicodeError: The content is neither valid UTF-8 nor valid GBK.
        EnvironmentError: Reading or writing a file failed.
    """
    name = os.path.basename(sfn)
    detected = obtainFileType(sfn)

    if detected == "UTF-8-SIG":
        # Already UTF-8 (with BOM): copy unchanged.
        shutil.copy2(sfn, dest_dir)
        return

    with open(sfn, "rb") as handle:
        data = handle.read()

    if detected == "GB2312":
        # Web-page style files: fix the declared charset along with the bytes.
        data = re.sub(b"charset=gb2312", b"charset=utf-8", data)
        text = data.decode("gbk")
        out_path = os.path.join(dest_dir, "utf8_" + name)
    else:
        # Unknown type.  Content that is strictly valid UTF-8 (this includes
        # plain ASCII) is kept as it is; anything else is treated as GBK.
        try:
            text = data.decode("utf-8")
        except UnicodeDecodeError:
            text = data.decode("gbk")
        out_path = os.path.join(dest_dir, name)

    # Decoding happens before the destination is opened, so a failed
    # conversion does not leave an empty output file behind.
    with open(out_path, "wb") as handle:
        handle.write(text.encode("utf-8"))

    if detected == "GB2312":
        print(out_path)


def copy(sfn, ddn):
    """Copy or convert one file from the source tree into the destination tree.

    The destination directory is ``ddn`` plus the directory of *sfn* relative
    to the module-level ``src``; it is created when missing.  Files whose
    extension is ``.txt`` (case-insensitive) are converted to UTF-8, all other
    files are copied unchanged.

    Args:
        sfn: Path of the source file, located below ``src``.
        ddn: Root directory of the destination tree.

    Raises:
        SystemExit: A ``.txt`` file could not be read, decoded or written.
            The run is aborted, now with a message naming the file and a
            non-zero exit status.
    """
    relative_dir = os.path.relpath(os.path.dirname(sfn), src)
    dest_dir = os.path.normpath(os.path.join(ddn, relative_dir))
    if not os.path.isdir(dest_dir):
        os.makedirs(dest_dir)

    fn = os.path.basename(sfn)
    print("Processing file name  %s" % fn)

    if os.path.splitext(fn)[1].lower() != ".txt":
        # Not a text file: plain copy.
        shutil.copy2(sfn, dest_dir)
        return

    try:
        _convert_text_file(sfn, dest_dir)
    except (UnicodeError, EnvironmentError) as exc:
        sys.exit("Failed to convert %s: %s" % (sfn, exc))


if __name__ == "__main__":
    search(src, copy)
标签:
原文地址:http://www.cnblogs.com/hgonlywj/p/4842681.html