python之pypinyin

时间：2018-10-16 22:00:26 阅读：323 评论：0 收藏：0 [点我收藏+]

python 汉字拼音库　pypinyin

这个库很好用，也很简单，源码里带有中文注释。下面是源码，看注释就可以大致明白各个方法的意思

#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import unicode_literals

from copy import deepcopy
from itertools import chain

from pypinyin.compat import text_type, callable_check
from pypinyin.constants import (
    PHRASES_DICT, PINYIN_DICT,
    RE_HANS, Style
)
from pypinyin.contrib import mmseg
from pypinyin.utils import simple_seg, _replace_tone2_style_dict_to_default
from pypinyin.style import auto_discover, convert as convert_style

# Run the style package's discovery hook once, at import time.
# NOTE(review): presumably this registers the built-in style converters that
# convert_style dispatches to -- confirm in pypinyin.style.
auto_discover()


def seg(hans):
    """Split ``hans`` into the segments that are later converted one by one.

    The text is first cut by ``simple_seg``.  A resulting chunk is cut a
    second time with ``mmseg.seg.cut`` only when it matches ``RE_HANS`` and
    ``PHRASES_DICT`` is non-empty; every other chunk is kept as it is.

    :param hans: text to segment
    :return: list of segments
    :rtype: list
    """
    segments = []
    for chunk in simple_seg(hans):
        # Chunks without pinyin skip the second pass, and so does everything
        # when the phrase dictionary is empty (phrase support disabled).
        if RE_HANS.match(chunk) and PHRASES_DICT:
            segments.extend(mmseg.seg.cut(chunk))
        else:
            segments.append(chunk)
    return segments


def load_single_dict(pinyin_dict, style='default'):
    """Load a user-defined single-character pinyin dictionary.

    The entries are merged into the module-wide ``PINYIN_DICT``; existing
    entries with the same key are overwritten.

    :param pinyin_dict: single-character dictionary, e.g.
                        ``{0x963F: u"ā,ē"}``
    :param style: notation used by the values of ``pinyin_dict``.
                  ``'default'`` and ``'tone2'`` are supported; ``'tone2'``
                  values are converted to the default notation before
                  being stored.
    :type pinyin_dict: dict
    """
    if style == 'tone2':
        for k, v in pinyin_dict.items():
            PINYIN_DICT[k] = _replace_tone2_style_dict_to_default(v)
    else:
        PINYIN_DICT.update(pinyin_dict)

    # NOTE(review): presumably refreshes the segmenter so that it sees the
    # updated dictionaries -- confirm in pypinyin.contrib.mmseg.
    mmseg.retrain(mmseg.seg)


def load_phrases_dict(phrases_dict, style='default'):
    """Load a user-defined phrase pinyin dictionary.

    The entries are merged into the module-wide ``PHRASES_DICT``; existing
    entries with the same key are overwritten.

    :param phrases_dict: phrase dictionary, e.g.
                         ``{u"阿爸": [[u"ā"], [u"bà"]]}``
    :param style: notation used by the values of ``phrases_dict``.
                  ``'default'`` and ``'tone2'`` are supported; ``'tone2'``
                  values are converted to the default notation before
                  being stored.
    :type phrases_dict: dict
    """
    if style == 'tone2':
        for k, value in phrases_dict.items():
            # One inner list of readings per character of the phrase.
            PHRASES_DICT[k] = [
                list(map(_replace_tone2_style_dict_to_default, pys))
                for pys in value
            ]
    else:
        PHRASES_DICT.update(phrases_dict)

    # NOTE(review): presumably refreshes the segmenter so that it sees the
    # updated dictionaries -- confirm in pypinyin.contrib.mmseg.
    mmseg.retrain(mmseg.seg)


def to_fixed(pinyin, style, strict=True):
    """Format a tone-marked pinyin according to the requested style.

    Thin wrapper around ``convert_style``.

    :param pinyin: a single pinyin
    :param style: pinyin style
    :param strict: whether to strictly follow the Scheme for the Chinese
                   Phonetic Alphabet when handling initials and finals
    :return: the pinyin string formatted in the given style
    :rtype: unicode
    """
    # The input itself is handed over as ``default``.
    # NOTE(review): presumably that is what comes back when no converter
    # handles ``style`` -- confirm in pypinyin.style.convert.
    options = dict(style=style, strict=strict, default=pinyin)
    return convert_style(pinyin, **options)


def _handle_nopinyin_char(chars, errors='default'):
    """Apply the ``errors`` policy to text that has no pinyin.

    :param chars: one or more characters without pinyin
    :param errors: a callable, or one of ``'default'``, ``'ignore'``,
                   ``'replace'``
    :return: * callable ``errors``: whatever ``errors(chars)`` returns
             * ``'default'``: ``chars`` unchanged
             * ``'ignore'``: ``None``
             * ``'replace'``: the hexadecimal code points of ``chars``
               joined into one string (``'☆☆'`` => ``'26062606'``)
             * any other value: ``None``
    """
    if callable_check(errors):
        return errors(chars)

    if errors == 'default':
        return chars
    elif errors == 'ignore':
        return None
    elif errors == 'replace':
        # A single join covers the one-character case too, and yields an
        # empty string for empty input instead of failing inside ord().
        return ''.join(text_type('%x' % ord(x)) for x in chars)
    # Unknown policy: same result as 'ignore'.
    return None


def handle_nopinyin(chars, errors='default'):
    """Apply the ``errors`` policy and always return a list.

    :param chars: one or more characters without pinyin
    :param errors: policy passed on to ``_handle_nopinyin_char``
    :return: ``[]`` when the policy produced a falsy result (``None``,
             empty string, empty list), the result itself when it is
             already a list, otherwise the result wrapped in a list
    :rtype: list
    """
    py = _handle_nopinyin_char(chars, errors=errors)
    if not py:
        return []
    if isinstance(py, list):
        return py
    return [py]


def single_pinyin(han, style, heteronym, errors='default', strict=True):
    """Convert a single character to pinyin.

    :param han: a single Chinese character
    :param style: pinyin style
    :param heteronym: when false only the first reading listed in
                      ``PINYIN_DICT`` is returned; when true all readings
                      are returned, without duplicates
    :param errors: how to handle a character that has no pinyin, see
                   :py:func:`~pypinyin.pinyin`
    :param strict: whether to strictly follow the Scheme for the Chinese
                   Phonetic Alphabet when handling initials and finals
    :return: list of pinyin; a heteronym may contribute several items
    :rtype: list
    """
    num = ord(han)
    # Characters missing from the dictionary go through the errors policy.
    if num not in PINYIN_DICT:
        return handle_nopinyin(han, errors=errors)

    pys = PINYIN_DICT[num].split(',')  # all readings of the character
    if not heteronym:
        return [to_fixed(pys[0], style, strict=strict)]

    # Distinct tone-marked readings can collapse to the same string in a
    # style without tone marks, so remember what was already emitted.
    # TODO: add test for the de-duplication
    seen = set()
    pinyins = []
    for i in pys:
        py = to_fixed(i, style, strict=strict)
        if py in seen:
            continue
        seen.add(py)
        pinyins.append(py)
    return pinyins


def phrase_pinyin(phrase, style, heteronym, errors='default', strict=True):
    """Convert a phrase to pinyin.

    A phrase found in ``PHRASES_DICT`` uses the first reading stored there
    for each character; ``heteronym`` and ``errors`` are not consulted on
    that path.  Any other phrase is converted character by character with
    :func:`single_pinyin`, and characters yielding an empty result are
    dropped.

    :param phrase: the phrase
    :param style: pinyin style
    :param heteronym: whether to return all readings of a character
    :param errors: how to handle characters that have no pinyin
    :param strict: whether to strictly follow the Scheme for the Chinese
                   Phonetic Alphabet when handling initials and finals
    :return: list of pinyin lists, one inner list per character
    :rtype: list
    """
    if phrase in PHRASES_DICT:
        # Fresh lists are built, so the shared dictionary entry is never
        # modified and no defensive copy is required.
        return [
            [to_fixed(item[0], style=style, strict=strict)]
            for item in PHRASES_DICT[phrase]
        ]

    py = []
    for i in phrase:
        single = single_pinyin(i, style=style, heteronym=heteronym,
                               errors=errors, strict=strict)
        if single:
            py.append(single)
    return py


def _pinyin(words, style, heteronym, errors, strict=True):
    """Convert one segment produced by the segmentation step.

    :param words: a segment that contains either only Chinese characters
                  or only non-Chinese characters, never a mixture
    :return: list of pinyin lists
    """
    # Segments matching RE_HANS are looked up as a phrase.
    if RE_HANS.match(words):
        return phrase_pinyin(words, style=style, heteronym=heteronym,
                             errors=errors, strict=strict)

    # Everything else goes through the errors policy; an empty result
    # contributes nothing.
    fallback = handle_nopinyin(words, errors=errors)
    return [fallback] if fallback else []


def pinyin(hans, style=Style.TONE, heteronym=False,
           errors='default', strict=True):
    """Convert Chinese characters to pinyin.

    :param hans: a string of Chinese characters ( ``'你好吗'`` ) or a list
                 of strings ( ``['你好', '吗']`` ).
                 To use a word segmentation module of your own choice,
                 segment the text yourself and pass the resulting list.
    :type hans: unicode string or list of unicode strings
    :param style: pinyin style, :py:attr:`~pypinyin.Style.TONE` by default.
                  See :class:`~pypinyin.Style` for all styles.
    :param errors: how to handle characters that have no pinyin

                   * ``'default'``: keep the original characters
                   * ``'ignore'``: drop the characters
                   * ``'replace'``: replace them by their unicode code
                     point without the leading ``\\u``
                     (``'\\u90aa'`` => ``'90aa'``)
                   * a callable: when ``errors`` is callable it is invoked
                     as ``func(char)``::

                         def foobar(char):
                             return 'a'
                         pinyin('あ', errors=foobar)

    :param heteronym: whether to return all readings of a heteronym
    :param strict: whether to strictly follow the Scheme for the Chinese
                   Phonetic Alphabet when handling initials and finals,
                   see :ref:`strict`
    :return: list of pinyin lists
    :rtype: list

    :raise AssertionError: documented upstream for input that is not a
                           unicode string; the check is not performed in
                           this function itself

    Usage::

      >>> from pypinyin import pinyin, Style
      >>> import pypinyin
      >>> pinyin('中心')
      [['zhōng'], ['xīn']]
      >>> pinyin('中心', heteronym=True)  # heteronym mode
      [['zhōng', 'zhòng'], ['xīn']]
      >>> pinyin('中心', style=Style.FIRST_LETTER)  # choose a style
      [['z'], ['x']]
      >>> pinyin('中心', style=Style.TONE2)
      [['zho1ng'], ['xi1n']]
      >>> pinyin('中心', style=Style.CYRILLIC)
      [['чжун1'], ['синь1']]
    """
    # Segment the input: a single string directly, a list item by item.
    if isinstance(hans, text_type):
        han_list = seg(hans)
    else:
        han_list = chain.from_iterable(seg(x) for x in hans)
    pys = []
    for words in han_list:
        pys.extend(_pinyin(words, style, heteronym, errors, strict=strict))
    return pys


def slug(hans, style=Style.NORMAL, heteronym=False, separator='-',
         errors='default', strict=True):
    """Build a slug string.

    All pinyin returned by :py:func:`~pypinyin.pinyin` are flattened and
    joined with ``separator``.

    :param hans: Chinese characters
    :type hans: unicode or list
    :param style: pinyin style, :py:attr:`~pypinyin.Style.NORMAL` by
                  default. See :class:`~pypinyin.Style` for all styles.
    :param heteronym: whether to return all readings of a heteronym
    :param separator: string placed between two pinyin
    :param errors: how to handle characters that have no pinyin, see
                   :py:func:`~pypinyin.pinyin`
    :param strict: whether to strictly follow the Scheme for the Chinese
                   Phonetic Alphabet when handling initials and finals,
                   see :ref:`strict`
    :return: the slug string

    :raise AssertionError: documented upstream for input that is not a
                           unicode string; the check is not performed in
                           this function itself

    ::

      >>> import pypinyin
      >>> from pypinyin import Style
      >>> pypinyin.slug('中国人')
      'zhong-guo-ren'
      >>> pypinyin.slug('中国人', separator=' ')
      'zhong guo ren'
      >>> pypinyin.slug('中国人', style=Style.FIRST_LETTER)
      'z-g-r'
      >>> pypinyin.slug('中国人', style=Style.CYRILLIC)
      'чжун1-го2-жэнь2'
    """
    pys = pinyin(hans, style=style, heteronym=heteronym,
                 errors=errors, strict=strict)
    return separator.join(chain.from_iterable(pys))


def lazy_pinyin(hans, style=Style.NORMAL, errors='default', strict=True):
    """Return a flat list of pinyin without heteronyms.

    Unlike :py:func:`~pypinyin.pinyin`, which returns one list per
    character, every item of the result is a plain string, and each
    character contributes a single reading.

    :param hans: Chinese characters
    :type hans: unicode or list
    :param style: pinyin style, :py:attr:`~pypinyin.Style.NORMAL` by
                  default. See :class:`~pypinyin.Style` for all styles.
    :param errors: how to handle characters that have no pinyin, see
                   :py:func:`~pypinyin.pinyin`
    :param strict: whether to strictly follow the Scheme for the Chinese
                   Phonetic Alphabet when handling initials and finals,
                   see :ref:`strict`
    :return: list of pinyin (e.g. ``['zhong', 'guo', 'ren']``)
    :rtype: list

    :raise AssertionError: documented upstream for input that is not a
                           unicode string; the check is not performed in
                           this function itself

    Usage::

      >>> from pypinyin import lazy_pinyin, Style
      >>> import pypinyin
      >>> lazy_pinyin('中心')
      ['zhong', 'xin']
      >>> lazy_pinyin('中心', style=Style.TONE)
      ['zhōng', 'xīn']
      >>> lazy_pinyin('中心', style=Style.FIRST_LETTER)
      ['z', 'x']
      >>> lazy_pinyin('中心', style=Style.TONE2)
      ['zho1ng', 'xi1n']
      >>> lazy_pinyin('中心', style=Style.CYRILLIC)
      ['чжун1', 'синь1']
    """
    pys = pinyin(hans, style=style, heteronym=False,
                 errors=errors, strict=strict)
    return list(chain.from_iterable(pys))

View Code

在写代码的时候还是会经常用到的，比如商品排列，姓名按照首字母罗列

１．先讲讲简单的方法解析

from pypinyin import pinyin, lazy_pinyin, Style, load_phrases_dict, load_single_dict
from pypinyin.style import register

print(pinyin('你好'))  # [['nǐ'], ['hǎo']]
print(pinyin('中心', heteronym=True))  # heteronym mode: [['zhōng', 'zhòng'], ['xīn']]
print(pinyin('中心', style=Style.FIRST_LETTER))  # choose a style, here first letters: [['z'], ['x']]
print(pinyin('中心', style=Style.TONE2, heteronym=True))  # [['zho1ng', 'zho4ng'], ['xi1n']]
print(lazy_pinyin('中心'))  # heteronyms not considered: ['zhong', 'xin']

# ---------- Handling characters that have no pinyin
# default (the default behaviour): no processing, returned as is
print(lazy_pinyin('你好☆☆'))  # ['ni', 'hao', '☆☆']
# ignore: drop the characters
print(lazy_pinyin('你好☆☆', errors='ignore'))  # ['ni', 'hao']
# replace: replace by the unicode code point without the leading \u
print(lazy_pinyin('你好☆☆', errors='replace'))  # ['ni', 'hao', '26062606']
# callable: a callback that receives the character(s) without pinyin;
# supported return types: unicode, list ([unicode, ...]) or None
print(lazy_pinyin('你好☆☆', errors=lambda x: 'star'))  # ['ni', 'hao', 'star']

# ---------- Custom pinyin dictionaries
print(lazy_pinyin('还没', style=Style.TONE2))
load_phrases_dict({'桔子': [['jú'], ['zǐ']]})  # add the phrase "桔子"; any phrase can be defined this way
print(lazy_pinyin('桔子', style=Style.TONE2))

load_single_dict({ord('还'): 'hái,huán'})  # change the order of the readings of "还"
print(lazy_pinyin('还没', style=Style.TONE2))


###########自定义拼音风格
@register('kiss')
def kiss(mypinyin, **kwargs):
    """Custom pinyin style registered under the name ``'kiss'``.

    :param mypinyin: the pinyin to format
    :param kwargs: accepted but not used here
    :return: ``mypinyin`` prefixed with ``'?? '``
    """
    # NOTE(review): the "??" prefix looks like an emoji lost to a bad
    # encoding -- confirm against the original article before changing it.
    return '?? {0}'.format(mypinyin)


# Select the custom style by the name it was registered under.
print(lazy_pinyin('么么哒', style='kiss'))

View Code

２．一个简单功能 demo：将列表中各项的 value 值按照拼音排序，并按拼音首字母分组

[{"name":'啊你'},{"name":'比拟'},{"name":'赐你'}]

from pypinyin import lazy_pinyin
def hansToPinYinName(nameMaps, nameKey='name'):
    """Group records by the first letter of the pinyin of one field.

    :param nameMaps: iterable of dicts, each holding a Chinese string
                     under ``nameKey``
    :param nameKey: key of the field whose pinyin is used for sorting and
                    grouping
    :return: list of ``{'firChar': letter, 'students': [record, ...]}``
             dicts.  Records are visited in ascending pinyin order and
             groups are created on first occurrence, so on Python 3.7+
             (insertion-ordered dicts) the groups come out sorted too.
             Records whose pinyin is empty are grouped under ``''``.
    :rtype: list
    """
    # Pair every record with the concatenated pinyin of the chosen field.
    pyin_name_tuples = [[''.join(lazy_pinyin(nameMap[nameKey])), nameMap]
                        for nameMap in nameMaps]
    print('pyin_name_tuples', pyin_name_tuples)
    superDict = {}

    # Sort on the pinyin only; the record dicts themselves are not orderable.
    ss = sorted(pyin_name_tuples, key=lambda x: x[0])
    print('ss', ss)

    for item in ss:
        print('item', item)
        # Slicing instead of indexing tolerates an empty pinyin string.
        firChar = item[0][:1]
        nameMap = item[1]
        group = superDict.setdefault(
            firChar, {'firChar': firChar, 'students': []})
        group['students'].append(nameMap)
    return list(superDict.values())

res = hansToPinYinName([{"name": '阿狗'}, {"name": '弼马温'}, {"name": '次郎'}])
print(res)


打印结果：
pyin_name_tuples [['agou', {'name': '阿狗'}], ['bimawen', {'name': '弼马温'}], ['cilang', {'name': '次郎'}]]
ss [['agou', {'name': '阿狗'}], ['bimawen', {'name': '弼马温'}], ['cilang', {'name': '次郎'}]]
item ['agou', {'name': '阿狗'}]
item ['bimawen', {'name': '弼马温'}]
item ['cilang', {'name': '次郎'}]
[{'firChar': 'a', 'students': [{'name': '阿狗'}]}, {'firChar': 'b', 'students': [{'name': '弼马温'}]}, {'firChar': 'c', 'students': [{'name': '次郎'}]}]

３．简单一点的例子

def PYinNameMap(nameMaps, nameKey='name'):
    """Sort records by the pinyin of one field.

    :param nameMaps: iterable of dicts, each holding a Chinese string
                     under ``nameKey``
    :param nameKey: key of the field whose pinyin is used for sorting
    :return: the records in ascending order of their pinyin
    :rtype: list
    """
    # Pair every record with the concatenated pinyin of the chosen field.
    pyin_names = [[''.join(lazy_pinyin(nameMap[nameKey])), nameMap]
                  for nameMap in nameMaps]
    print('pyin_names', pyin_names)
    # Sort on the pinyin only; the record dicts themselves are not orderable.
    pyin_names_sorted = sorted(pyin_names, key=lambda x: x[0])
    print('pyin_names_sorted', pyin_names_sorted)
    return [pyin_name[1] for pyin_name in pyin_names_sorted]

res = PYinNameMap([{"name": '阿狗'}, {"name": '弼马温'}, {"name": '次郎'}])
print(res)

打印结果：
pyin_names_sorted [['agou', {'name': '阿狗'}], ['bimawen', {'name': '弼马温'}], ['cilang', {'name': '次郎'}]]
[{'name': '阿狗'}, {'name': '弼马温'}, {'name': '次郎'}]

这两个例子都是用到　lazy_pinyin　方法，其次就是一些数据处理方式

python之pypinyin

标签：tle 代码没有 foo type 避免排列模块 star

原文地址：https://www.cnblogs.com/zzy-9318/p/9800643.html

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年07月29日 (22)
2021年07月28日 (40)
2021年07月27日 (32)
2021年07月26日 (79)
2021年07月23日 (29)
2021年07月22日 (30)
2021年07月21日 (42)
2021年07月20日 (16)
2021年07月19日 (90)
2021年07月16日 (35)

周排行

python之pypinyin

python 汉字拼音库 pypinyin

这个库还是很好用的，这个库还是很简单的，中文注解，下面是源码，看注释就可以大致明白方法的意思

python 汉字拼音库　pypinyin