【python】入门学习（十）

时间：2014-09-04 16:44:29 阅读：326 评论：0 收藏：0 [点我收藏+]

标签：style blog color os io ar for art div

#入门学习系列的内容均是在学习《Python编程入门（第3版）》时的学习笔记

统计一个文本文档的信息，并输出出现频率最高的10个单词

#text.py
# Characters that survive normalization: the lowercase ASCII letters,
# the space, the hyphen and the apostrophe.
# Built from a single string so that no element can be lost to a missing
# comma (adjacent string literals are silently concatenated by Python).
keep = set("abcdefghijklmnopqrstuvwxyz -'")
# Normalize the text.
def normalize(s):
    """Convert s to a normalized string.

    The string is lowercased and every character that is not a member of
    the module-level set ``keep`` is dropped.  The relative order of the
    surviving characters is unchanged.
    """
    # str.join builds the result in one pass instead of repeated `+=`.
    return ''.join(c for c in s.lower() if c in keep)

# Get basic information about a text file.
def file_stats(fname):
    """Print statistics for the given file.

    Reads the whole file named fname and prints its character count, its
    number of newline characters, and its number of words after
    normalization.  Returns None.
    """
    # The context manager closes the file even if read() raises.
    with open(fname, 'r') as f:
        s = f.read()
    num_chars = len(s)
    # Counts '\n' characters, so a last line without a trailing newline
    # is not included.
    num_lines = s.count('\n')
    num_words = len(normalize(s).split())
    print("The file %s has:" % fname)
    print("  %s characters" % num_chars)
    print("  %s lines" % num_lines)
    print("  %s words" % num_words)

# Convert a string into a word-frequency dictionary.
def make_freq_dict(s):
    """Count how often each word occurs in s.

    s is passed through normalize() and split on whitespace; the returned
    dictionary maps each resulting word to its number of occurrences.
    """
    counts = {}
    for word in normalize(s).split():
        counts[word] = counts.get(word, 0) + 1
    return counts

# Get basic information about a text file, plus its most frequent words.
def file_stats2(fname):
    """Print statistics for the given file.

    Reads the whole file named fname and prints its character count, its
    number of newline characters, its total word count (after
    normalization) and the ten most frequent words.  Returns None.
    """
    # The context manager closes the file even if read() raises.
    with open(fname, 'r') as f:
        s = f.read()
    num_chars = len(s)
    # Counts '\n' characters, so a last line without a trailing newline
    # is not included.
    num_lines = s.count('\n')
    d = make_freq_dict(s)
    num_words = sum(d.values())
    # (count, word) pairs in descending order; ties on count are broken by
    # the word itself, also in descending order.
    lst = sorted(((count, w) for w, count in d.items()), reverse=True)
    print("The file %s has:" % fname)
    print("  %s characters" % num_chars)
    print("  %s lines" % num_lines)
    print("  %s words" % num_words)
    print("\nThe top 10 most frequent words are:")
    # Only the first ten entries are shown, matching the heading above.
    for i, (count, word) in enumerate(lst[:10], 1):
        print('%2s. %4s %s' % (i, count, word))

>>> file_stats2('a.txt')
The file a.txt has:
  12927 characters
  297 lines
  1645 words

The top 10 most frequent words are:
 1.   62 to
 2.   62 the
 3.   47 is
 4.   42 a
 5.   41 of
 6.   40 it
 7.   36 that
 8.   35 and
 9.   32 as
10.   24 so

进一步完善的代码：

#text.py
# Characters that survive normalization: the lowercase ASCII letters,
# the space, the hyphen and the apostrophe.
# Built from a single string so that no element can be lost to a missing
# comma (adjacent string literals are silently concatenated by Python).
keep = set("abcdefghijklmnopqrstuvwxyz -'")
# Normalize the text.
def normalize(s):
    """Convert s to a normalized string.

    The string is lowercased and every character that is not a member of
    the module-level set ``keep`` is dropped.  The relative order of the
    surviving characters is unchanged.
    """
    # str.join builds the result in one pass instead of repeated `+=`.
    return ''.join(c for c in s.lower() if c in keep)

# Get basic information about a text file.
def file_stats(fname):
    """Print statistics for the given file.

    Reads the whole file named fname and prints its character count, its
    number of newline characters, and its number of words after
    normalization.  Returns None.
    """
    # The context manager closes the file even if read() raises.
    with open(fname, 'r') as f:
        s = f.read()
    num_chars = len(s)
    # Counts '\n' characters, so a last line without a trailing newline
    # is not included.
    num_lines = s.count('\n')
    num_words = len(normalize(s).split())
    print("The file %s has:" % fname)
    print("  %s characters" % num_chars)
    print("  %s lines" % num_lines)
    print("  %s words" % num_words)

# Convert a string into a word-frequency dictionary.
def make_freq_dict(s):
    """Count how often each word occurs in s.

    s is passed through normalize() and split on whitespace; the returned
    dictionary maps each resulting word to its number of occurrences.
    """
    counts = {}
    for word in normalize(s).split():
        counts[word] = counts.get(word, 0) + 1
    return counts

# Get basic information about a text file, plus word-level statistics.
def file_stats2(fname):
    """Print statistics for the given file.

    Reads the whole file named fname and prints: its character count, its
    number of newline characters, its total word count (after
    normalization), how many words occur exactly once, how many distinct
    words there are, the average length of the distinct words, and the
    ten most frequent words.  Returns None.
    """
    # The context manager closes the file even if read() raises.
    with open(fname, 'r') as f:
        s = f.read()
    num_chars = len(s)
    # Counts '\n' characters, so a last line without a trailing newline
    # is not included.
    num_lines = s.count('\n')
    d = make_freq_dict(s)
    # Each key of d is one distinct word.
    num_different_words = len(d)
    num_words = sum(d.values())
    # The average is taken over distinct words, not over all occurrences.
    # Guard against a file with no words to avoid dividing by zero.
    if num_different_words:
        words_average_length = sum(len(w) for w in d) / num_different_words
    else:
        words_average_length = 0.0
    num_once = sum(1 for count in d.values() if count == 1)
    # (count, word) pairs in descending order; ties on count are broken by
    # the word itself, also in descending order.
    lst = sorted(((count, w) for w, count in d.items()), reverse=True)
    print("The file %s has:" % fname)
    print("  %s characters" % num_chars)
    print("  %s lines" % num_lines)
    print("  %s words" % num_words)
    print("  %s words appreance one time" % num_once)
    print("  %s different words" % num_different_words)
    print("  %s average length" % words_average_length)
    print("\nThe top 10 most frequent words are:")
    for i, (count, word) in enumerate(lst[:10], 1):
        print('%2s. %4s %s' % (i, count, word))

def main():
    """Print the statistics of the file 'a.txt' via file_stats2."""
    file_stats2('a.txt')


# Run main() only when the file is executed as a script, not on import.
if __name__ == '__main__':
    main()

>>> ================================ RESTART ================================
>>> 
The file a.txt has:
  12927 characters
  297 lines
  1645 words
  515 words appreance one time
  699 different words
  6.539341917024321 average length

The top 10 most frequent words are:
 1.   62 to
 2.   62 the
 3.   47 is
 4.   42 a
 5.   41 of
 6.   40 it
 7.   36 that
 8.   35 and
 9.   32 as
10.   24 so

【python】入门学习（十）

标签：style blog color os io ar for art div

原文地址：http://www.cnblogs.com/dplearning/p/3956242.html

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年07月29日 (22)
2021年07月28日 (40)
2021年07月27日 (32)
2021年07月26日 (79)
2021年07月23日 (29)
2021年07月22日 (30)
2021年07月21日 (42)
2021年07月20日 (16)
2021年07月19日 (90)
2021年07月16日 (35)

周排行