码迷,mamicode.com
首页 > 编程语言 > 详细

多线程批量转换文件编码, 从GBK, GB2312编码转换到UTF-8编码(Python)

时间:2015-04-02 15:09:24      阅读:214      评论:0      收藏:0      [点我收藏+]

标签:python   utf-8   文件编码转换   多线程   

# coding=utf-8
# author:Jeffrey Ma
# version:0.1
# build 2
# created on:2015年3月31日
# description:  1. 批量转换文件编码,从GBK GB2312编码转换到UTF-8编码
#               2. 支持指定目录下所有的文件的转换,包括子目录中的文件
#               3. 支持检测原始编码,对已经是UTF-8编码的文件,不做转换
#               4. 支持只转换指定扩展名的编码
#               5. 支持多线程转换和控制台输出
#               6. 支持控制台显示线程池的状态
#               7. 支持日志记录
# usage: python gbk2utf8.py  -s [文件路径]
# args : 文件的绝对路径
# notes : 转换前请备份原始文件,转换后的文件会覆盖原文件。

from __future__ import division
import sys
import os
import getopt
import logging
import logging.config
import Queue
import threadpool
import threading
from threading import Thread
from multiprocessing.dummy import Pool as ThreadPool
import chardet
import curses
import time
import locale

# Adopt the user's default locale (the empty string selects it) before curses
# starts; presumably needed so the non-ASCII status text displays correctly.
locale.setlocale(locale.LC_ALL, "")

# Module-level state shared by the functions in this file:
#   logger - assigned by initLogger()
#   pool   - assigned by doWork()
#   stdscr - the curses window created right here
# ``global`` statements at module scope have no effect, so none are used;
# the functions that assign these names declare them ``global`` themselves.
#
# NOTE: initscr() runs at import time, so the terminal enters curses mode as
# soon as this module is loaded; doWork() is what calls curses.endwin().
stdscr = curses.initscr()

def GBK2UTF8(filename):
    """Convert one file from a GB* encoding (GBK, GB2312, ...) to UTF-8 in place.

    The file's encoding is guessed with ``chardet``.  Only files whose
    detected encoding name starts with ``GB`` are rewritten; every other file
    is left untouched.

    Args:
        filename: Path of the file to inspect and, if needed, overwrite.

    Returns:
        A dict with the keys ``tName`` (name of the executing thread),
        ``encodingName`` (the detected encoding, possibly ``None``),
        ``filename`` and ``result`` (a human-readable status message).
    """
    threadName = threading.current_thread().name

    with open(filename, 'rb') as src:
        raw = src.read()

    # chardet may report None when it cannot guess the encoding (for example
    # on an empty file), so the name must be checked before using it.
    encodingName = chardet.detect(raw)['encoding']

    if encodingName and encodingName.startswith('GB'):
        try:
            # Transcode fully in memory first so that a decoding failure
            # never truncates the original file.
            utf8Content = raw.decode(encodingName).encode('utf-8')
        except UnicodeDecodeError:
            message = "%s: %s, %s 转换出错" % (threadName, filename, encodingName)
        else:
            # Binary mode: the payload is already encoded bytes and must not
            # go through newline translation.
            with open(filename, 'wb') as dst:
                dst.write(utf8Content)
            message = "%s: %s, %s 转换done" % (threadName, filename, encodingName)
    else:
        # Anything that is not detected as GB* is reported as not needing
        # conversion.
        message = "%s: %s, %s 已经是UTF-8不需要转换" % (threadName, filename, encodingName)

    return {
        "tName": threadName,
        "encodingName": encodingName,
        "filename": filename,
        "result": message,
    }

def initLogger():
    """Configure logging from ``logging.conf`` and bind the global ``logger``.

    The configuration file name is a relative path, so it is looked up in the
    current working directory.  After the configuration is loaded, the
    module-level ``logger`` is set to the logger named ``GBK2UTF8``.
    """
    global logger
    logging.config.fileConfig('logging.conf')
    logger = logging.getLogger("GBK2UTF8")

def main():
    """Parse the command line and convert the files under the ``-s`` directory.

    Options:
        -s, --src PATH  Directory whose files are converted (required).
        -d, --dest      Flag without an argument; it is parsed but has no
                        effect on the conversion.

    A usage message is printed, and nothing is converted, when the options
    cannot be parsed or when the source path is missing or not a directory.
    """
    usage = "Usage: python gbk2utf8.py -s [file full path]"

    initLogger()
    try:
        opts, _args = getopt.getopt(sys.argv[1:], 's:d', ['src=', 'dest'])
    except getopt.GetoptError as err:
        # The error text looks like "option -a not recognized".
        print(str(err))
        print(usage)
        return

    srcPath = None
    for opt, value in opts:
        if opt in ("-s", "--src"):
            srcPath = value
        # -d/--dest is the only other option getopt can return here; it is
        # deliberately ignored because files are converted in place.

    if srcPath is None or not os.path.isdir(srcPath):
        # Tell the user why nothing happened instead of exiting silently.
        print(usage)
        return

    doWork(srcPath)

def doWork(sPath):
    """Convert every whitelisted file below ``sPath`` using a thread pool.

    Walks ``sPath`` recursively, queues one GBK2UTF8 request per file whose
    extension is in the whitelist (compared case-insensitively), logs every
    skipped file, and blocks until all requests have finished.  print_result
    is registered as the per-request callback.

    Args:
        sPath: Root directory to scan.

    Side effects:
        Rebinds the global ``pool``, switches the terminal into
        noecho/cbreak mode while the pool is working, and always ends curses
        mode before returning or propagating an exception.
    """
    global pool
    pool = threadpool.ThreadPool(10)

    extFilters = ('xml', 'java', 'js', 'txt', 'css', 'php', 'html', 'htm', 'tpl')
    arrFiles = []
    for root, _dirs, names in os.walk(sPath):
        for name in names:
            sFilePath = os.path.join(root, name)
            # Lower-case the extension so that e.g. "README.TXT" is converted.
            extension = os.path.splitext(sFilePath)[1][1:].lower()
            if extension in extFilters:
                arrFiles.append(sFilePath)
            else:
                logger.info('Skipping %s', sFilePath)
    print('waiting...job')

    curses.noecho()
    curses.cbreak()
    try:
        for req in threadpool.makeRequests(GBK2UTF8, arrFiles, print_result):
            pool.putRequest(req)
        # Block until every queued request has been processed.
        pool.wait()
    finally:
        # Restore the terminal even if the wait is interrupted or fails.
        curses.nocbreak()
        curses.echo()
        curses.endwin()

    print('end job')

def print_result(request, result):
    """Thread-pool callback: log a finished conversion and show it on screen.

    The status message is always written to the log.  It is additionally
    drawn on the screen row that belongs to the worker thread which produced
    it: row N (1-based) corresponds to the N-th entry of ``pool.workers``.

    Args:
        request: The completed work request (not used).
        result: The dict returned by GBK2UTF8; its ``tName`` and ``result``
            entries are read.
    """
    text = result["result"]

    # Log first, and log the unpadded message, so that a drawing failure can
    # never drop the record or fill the log with trailing blanks.
    logger.info(text)

    for row, worker in enumerate(pool.workers, 1):
        if worker.getName() == result["tName"]:
            break
    else:
        # No worker carries that name: there is no row to draw on.
        return

    try:
        _height, width = stdscr.getmaxyx()
        # Pad to the full width so leftovers of a longer, older message on
        # the same row are overwritten.
        stdscr.addstr(row, 0, text.ljust(width))
        stdscr.refresh()
    except curses.error:
        # Drawing is best effort (e.g. the row does not fit the window).
        pass

# Run the converter only when this file is executed as a script.
if __name__ == '__main__':
    main()







# coding=utf-8
# author:Jeffrey Ma
# version:0.1
# build 2
# created on:2015年3月31日
# description:  1. 批量转换文件编码,从GBK GB2312编码转换到UTF-8编码
#               2. 支持指定目录下所有的文件的转换,包括子目录中的文件
#               3. 支持检测原始编码,对已经是UTF-8编码的文件,不做转换
#               4. 支持只转换指定扩展名的编码
#               5. 支持多线程转换和控制台输出
#               6. 支持控制台显示线程池的状态
#               7. 支持日志记录
# usage: python gbk2utf8.py  -s [文件路径]
# args : 文件的绝对路径
# notes : 转换前请备份原始文件,转换后的文件会覆盖原文件。

from __future__ import division
import sys
import os
import getopt
import logging
import logging.config
import Queue
import threadpool
import threading
from threading import Thread
from multiprocessing.dummy import Pool as ThreadPool
import chardet
import curses
import time
import locale

# Adopt the user's default locale (the empty string selects it) before curses
# starts; presumably needed so the non-ASCII status text displays correctly.
locale.setlocale(locale.LC_ALL, "")

# Module-level state shared by the functions in this file:
#   logger - assigned by initLogger()
#   pool   - assigned by doWork()
#   stdscr - the curses window created right here
# ``global`` statements at module scope have no effect, so none are used;
# the functions that assign these names declare them ``global`` themselves.
#
# NOTE: initscr() runs at import time, so the terminal enters curses mode as
# soon as this module is loaded; doWork() is what calls curses.endwin().
stdscr = curses.initscr()

def GBK2UTF8(filename):
    """Convert one file from a GB* encoding (GBK, GB2312, ...) to UTF-8 in place.

    The file's encoding is guessed with ``chardet``.  Only files whose
    detected encoding name starts with ``GB`` are rewritten; every other file
    is left untouched.

    Args:
        filename: Path of the file to inspect and, if needed, overwrite.

    Returns:
        A dict with the keys ``tName`` (name of the executing thread),
        ``encodingName`` (the detected encoding, possibly ``None``),
        ``filename`` and ``result`` (a human-readable status message).
    """
    threadName = threading.current_thread().name

    with open(filename, 'rb') as src:
        raw = src.read()

    # chardet may report None when it cannot guess the encoding (for example
    # on an empty file), so the name must be checked before using it.
    encodingName = chardet.detect(raw)['encoding']

    if encodingName and encodingName.startswith('GB'):
        try:
            # Transcode fully in memory first so that a decoding failure
            # never truncates the original file.
            utf8Content = raw.decode(encodingName).encode('utf-8')
        except UnicodeDecodeError:
            message = "%s: %s, %s 转换出错" % (threadName, filename, encodingName)
        else:
            # Binary mode: the payload is already encoded bytes and must not
            # go through newline translation.
            with open(filename, 'wb') as dst:
                dst.write(utf8Content)
            message = "%s: %s, %s 转换done" % (threadName, filename, encodingName)
    else:
        # Anything that is not detected as GB* is reported as not needing
        # conversion.
        message = "%s: %s, %s 已经是UTF-8不需要转换" % (threadName, filename, encodingName)

    return {
        "tName": threadName,
        "encodingName": encodingName,
        "filename": filename,
        "result": message,
    }

def initLogger():
    """Configure logging from ``logging.conf`` and bind the global ``logger``.

    The configuration file name is a relative path, so it is looked up in the
    current working directory.  After the configuration is loaded, the
    module-level ``logger`` is set to the logger named ``GBK2UTF8``.
    """
    global logger
    logging.config.fileConfig('logging.conf')
    logger = logging.getLogger("GBK2UTF8")

def main():
    """Parse the command line and convert the files under the ``-s`` directory.

    Options:
        -s, --src PATH  Directory whose files are converted (required).
        -d, --dest      Flag without an argument; it is parsed but has no
                        effect on the conversion.

    A usage message is printed, and nothing is converted, when the options
    cannot be parsed or when the source path is missing or not a directory.
    """
    usage = "Usage: python gbk2utf8.py -s [file full path]"

    initLogger()
    try:
        opts, _args = getopt.getopt(sys.argv[1:], 's:d', ['src=', 'dest'])
    except getopt.GetoptError as err:
        # The error text looks like "option -a not recognized".
        print(str(err))
        print(usage)
        return

    srcPath = None
    for opt, value in opts:
        if opt in ("-s", "--src"):
            srcPath = value
        # -d/--dest is the only other option getopt can return here; it is
        # deliberately ignored because files are converted in place.

    if srcPath is None or not os.path.isdir(srcPath):
        # Tell the user why nothing happened instead of exiting silently.
        print(usage)
        return

    doWork(srcPath)

def doWork(sPath):
    """Convert every whitelisted file below ``sPath`` using a thread pool.

    Walks ``sPath`` recursively, queues one GBK2UTF8 request per file whose
    extension is in the whitelist (compared case-insensitively), logs every
    skipped file, and blocks until all requests have finished.  print_result
    is registered as the per-request callback.

    Args:
        sPath: Root directory to scan.

    Side effects:
        Rebinds the global ``pool``, switches the terminal into
        noecho/cbreak mode while the pool is working, and always ends curses
        mode before returning or propagating an exception.
    """
    global pool
    pool = threadpool.ThreadPool(10)

    extFilters = ('xml', 'java', 'js', 'txt', 'css', 'php', 'html', 'htm', 'tpl')
    arrFiles = []
    for root, _dirs, names in os.walk(sPath):
        for name in names:
            sFilePath = os.path.join(root, name)
            # Lower-case the extension so that e.g. "README.TXT" is converted.
            extension = os.path.splitext(sFilePath)[1][1:].lower()
            if extension in extFilters:
                arrFiles.append(sFilePath)
            else:
                logger.info('Skipping %s', sFilePath)
    print('waiting...job')

    curses.noecho()
    curses.cbreak()
    try:
        for req in threadpool.makeRequests(GBK2UTF8, arrFiles, print_result):
            pool.putRequest(req)
        # Block until every queued request has been processed.
        pool.wait()
    finally:
        # Restore the terminal even if the wait is interrupted or fails.
        curses.nocbreak()
        curses.echo()
        curses.endwin()

    print('end job')

def print_result(request, result):
    """Thread-pool callback: log a finished conversion and show it on screen.

    The status message is always written to the log.  It is additionally
    drawn on the screen row that belongs to the worker thread which produced
    it: row N (1-based) corresponds to the N-th entry of ``pool.workers``.

    Args:
        request: The completed work request (not used).
        result: The dict returned by GBK2UTF8; its ``tName`` and ``result``
            entries are read.
    """
    text = result["result"]

    # Log first, and log the unpadded message, so that a drawing failure can
    # never drop the record or fill the log with trailing blanks.
    logger.info(text)

    for row, worker in enumerate(pool.workers, 1):
        if worker.getName() == result["tName"]:
            break
    else:
        # No worker carries that name: there is no row to draw on.
        return

    try:
        _height, width = stdscr.getmaxyx()
        # Pad to the full width so leftovers of a longer, older message on
        # the same row are overwritten.
        stdscr.addstr(row, 0, text.ljust(width))
        stdscr.refresh()
    except curses.error:
        # Drawing is best effort (e.g. the row does not fit the window).
        pass

# Run the converter only when this file is executed as a script.
if __name__ == '__main__':
    main()




多线程批量转换文件编码, 从GBK, GB2312编码转换到UTF-8编码(Python)

标签:python   utf-8   文件编码转换   多线程   

原文地址:http://blog.csdn.net/vieri_ch/article/details/44831783

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!