# coding=utf-8 # author:Jeffrey Ma # version:0.1 # build 2 # created on:2015年3月31日 # description: 1. 批量转换文件编码,从GBK GB2312编码转换到UTF-8编码 # 2. 支持指定目录下所有的文件的转换,包括子目录中的文件 # 3. 支持检测原始编码,对已经是UTF-8编码的文件,不做转换 # 4. 支持只转换指定扩展名的编码 # 5. 支持多线程转换和控制台输出 # 6. 支持控制台显示线程池的状态 # 7. 支持日志记录 # usage: python gbk2utf8.py -s [文件路径] # args : 文件的绝对路径 # notes : 转换前请备份原始文件,转换后的文件会覆盖原文件。 from __future__ import division import sys import os import getopt import logging import logging.config import Queue import threadpool import threading from threading import Thread from multiprocessing.dummy import Pool as ThreadPool import chardet import curses import time import locale locale.setlocale(locale.LC_ALL, "") global logger global stdscr global pool stdscr = curses.initscr() def GBK2UTF8(filename): threadName = threading.currentThread().getName() f = open(filename, ‘rb‘) s = f.read() f.close() encodingName = chardet.detect(s)[‘encoding‘] str = ""; if (encodingName.startswith(‘GB‘)): # GBK码,需要转换 try: gbkContent = s.decode(encodingName) utf8Content = gbkContent.encode(‘utf-8‘) f = open(filename, ‘w‘) f.write(utf8Content) f.close() except UnicodeDecodeError: str = "%s: %s, %s 转换出错" % (threadName, filename, encodingName) # logger.error("%s: %s, %s 转换出错" % (threadName, filename, encodingName)) # logger.error(‘%s: decoe error %s‘ % (threadName, UnicodeDecodeError.reason)) pass str = "%s: %s, %s 转换done" % (threadName, filename, encodingName) else: # 已经是UTF-8不需要转换 str = "%s: %s, %s 已经是UTF-8不需要转换" % (threadName, filename, encodingName) return {"tName": threadName, "encodingName": encodingName,"filename":filename, "result":str} def initLogger(): global logger # 日志初始化 LOG_FILENAME = ‘logging.conf‘ logging.config.fileConfig(LOG_FILENAME) logger = logging.getLogger("GBK2UTF8") # 测试代码 # logger.debug("debug message") # logger.info("info message") # logger.warn("warn message") # logger.error("error message") # logger.critical("critical message") def main(): initLogger() shortargs = ‘s:d‘ longargs = [‘src=‘, ‘dest‘] try: opts, args = getopt.getopt(sys.argv[1:], shortargs, longargs) except getopt.GetoptError, err: # print help information and exit: print str(err) # will print something like "option -a not recognized" # usage() print "Usage: python gbk2utf8.py -s [file full path]" return # sys.exit(2) srcPath = None destPath = None for o, a in opts: if o in ("-s", "--src"): srcPath = a elif o in ("-d", "--dest"): destPath = a else: assert False, "unhandled option" if (srcPath != None and os.path.exists(srcPath) and os.path.isdir(srcPath)): doWork(srcPath) def doWork(sPath): # Make the Pool of workers global pool pool = threadpool.ThreadPool(10) extFilters = [‘xml‘, ‘java‘, ‘js‘, ‘txt‘, ‘css‘, ‘php‘, ‘html‘, ‘htm‘, ‘tpl‘] i = 0 arrFiles = [] for root, dirs, files in os.walk(sPath): for file in files: # print root # print file i = i+1 sFilePath = root + os.sep + file extension = os.path.splitext(sFilePath)[1][1:] if (extension in extFilters): arrFiles.append(sFilePath) else: logger.info(‘Skipping %s‘ % sFilePath) print ‘waiting...job‘ curses.noecho() curses.cbreak() requests = threadpool.makeRequests(GBK2UTF8, arrFiles, print_result) [pool.putRequest(req) for req in requests] #close the pool and wait for the work to finish pool.wait() curses.nocbreak() curses.echo() curses.endwin() print ‘end job‘ def print_result(request, result): try: idx = 0 for t in pool.workers: idx = idx+1 if(t.getName() == result["tName"]): break if idx > 0: y, x = stdscr.getmaxyx() # stdscr.deleteln() text = result["result"] textLen = len(text) text = text.ljust(x) stdscr.addstr(idx, 0, text) stdscr.refresh() logger.info(text) except curses.error: pass if __name__ == ‘__main__‘: main()
# coding=utf-8 # author:Jeffrey Ma # version:0.1 # build 2 # created on:2015年3月31日 # description: 1. 批量转换文件编码,从GBK GB2312编码转换到UTF-8编码 # 2. 支持指定目录下所有的文件的转换,包括子目录中的文件 # 3. 支持检测原始编码,对已经是UTF-8编码的文件,不做转换 # 4. 支持只转换指定扩展名的编码 # 5. 支持多线程转换和控制台输出 # 6. 支持控制台显示线程池的状态 # 7. 支持日志记录 # usage: python gbk2utf8.py -s [文件路径] # args : 文件的绝对路径 # notes : 转换前请备份原始文件,转换后的文件会覆盖原文件。 from __future__ import division import sys import os import getopt import logging import logging.config import Queue import threadpool import threading from threading import Thread from multiprocessing.dummy import Pool as ThreadPool import chardet import curses import time import locale locale.setlocale(locale.LC_ALL, "") global logger global stdscr global pool stdscr = curses.initscr() def GBK2UTF8(filename): threadName = threading.currentThread().getName() f = open(filename, 'rb') s = f.read() f.close() encodingName = chardet.detect(s)['encoding'] str = ""; if (encodingName.startswith('GB')): # GBK码,需要转换 try: gbkContent = s.decode(encodingName) utf8Content = gbkContent.encode('utf-8') f = open(filename, 'w') f.write(utf8Content) f.close() except UnicodeDecodeError: str = "%s: %s, %s 转换出错" % (threadName, filename, encodingName) # logger.error("%s: %s, %s 转换出错" % (threadName, filename, encodingName)) # logger.error('%s: decoe error %s' % (threadName, UnicodeDecodeError.reason)) pass str = "%s: %s, %s 转换done" % (threadName, filename, encodingName) else: # 已经是UTF-8不需要转换 str = "%s: %s, %s 已经是UTF-8不需要转换" % (threadName, filename, encodingName) return {"tName": threadName, "encodingName": encodingName,"filename":filename, "result":str} def initLogger(): global logger # 日志初始化 LOG_FILENAME = 'logging.conf' logging.config.fileConfig(LOG_FILENAME) logger = logging.getLogger("GBK2UTF8") # 测试代码 # logger.debug("debug message") # logger.info("info message") # logger.warn("warn message") # logger.error("error message") # logger.critical("critical message") def main(): initLogger() shortargs = 's:d' longargs = ['src=', 'dest'] try: opts, args = getopt.getopt(sys.argv[1:], shortargs, longargs) except getopt.GetoptError, err: # print help information and exit: print str(err) # will print something like "option -a not recognized" # usage() print "Usage: python gbk2utf8.py -s [file full path]" return # sys.exit(2) srcPath = None destPath = None for o, a in opts: if o in ("-s", "--src"): srcPath = a elif o in ("-d", "--dest"): destPath = a else: assert False, "unhandled option" if (srcPath != None and os.path.exists(srcPath) and os.path.isdir(srcPath)): doWork(srcPath) def doWork(sPath): # Make the Pool of workers global pool pool = threadpool.ThreadPool(10) extFilters = ['xml', 'java', 'js', 'txt', 'css', 'php', 'html', 'htm', 'tpl'] i = 0 arrFiles = [] for root, dirs, files in os.walk(sPath): for file in files: # print root # print file i = i+1 sFilePath = root + os.sep + file extension = os.path.splitext(sFilePath)[1][1:] if (extension in extFilters): arrFiles.append(sFilePath) else: logger.info('Skipping %s' % sFilePath) print 'waiting...job' curses.noecho() curses.cbreak() requests = threadpool.makeRequests(GBK2UTF8, arrFiles, print_result) [pool.putRequest(req) for req in requests] #close the pool and wait for the work to finish pool.wait() curses.nocbreak() curses.echo() curses.endwin() print 'end job' def print_result(request, result): try: idx = 0 for t in pool.workers: idx = idx+1 if(t.getName() == result["tName"]): break if idx > 0: y, x = stdscr.getmaxyx() # stdscr.deleteln() text = result["result"] textLen = len(text) text = text.ljust(x) stdscr.addstr(idx, 0, text) stdscr.refresh() logger.info(text) except curses.error: pass if __name__ == '__main__': main()
多线程批量转换文件编码, 从GBK, GB2312编码转换到UTF-8编码(Python)
原文地址:http://blog.csdn.net/vieri_ch/article/details/44831783