欢迎您访问程序员文章站,本站旨在为大家提供、分享程序员计算机编程知识!
您现在的位置是: 首页

Python读书笔记009:文本统计

程序员文章站 2022-07-11 09:45:59
...

文本文件的统计数据:

>>> s = ' A long time ago, in a galaxy far, far away...'
>>> len(s)
46
>>> s.split()
['A', 'long', 'time', 'ago,', 'in', 'a', 'galaxy', 'far,', 'far', 'away...']
>>> t = ' a long time ago in a galaxy far far away'
>>> t.split()
['a', 'long', 'time', 'ago', 'in', 'a', 'galaxy', 'far', 'far', 'away']
>>> len(t.split())
10
>>> set(t.split())
{'in', 'away', 'ago', 'far', 'a', 'galaxy', 'time', 'long'}
>>> len(set(t.split()))
8

保留想要的字母

将字符串转换成小写:

>>> s = "I'd like a copy!"
>>> s.lower()
"i'd like a copy!"

删除不想要的字符:


>>> s = "I'd like a copy!"
>>> s.replace('!','')
"I'd like a copy"
>>> s.replace("'",'')
'Id like a copy!'
>>> s.replace("'",' ')
'I d like a copy!'

# Characters that survive normalization: the lowercase ASCII letters
# plus the space, the hyphen and the apostrophe.
keep = set("abcdefghijklmnopqrstuvwxyz -'")

def normalize(s):
    '''
    Convert s to a normalized string.

    s is lowercased and every character that is not in the
    module-level set keep is dropped. Newlines and tabs are not
    in keep, so they are removed (not replaced by a space).

    s: the string to normalize.
    Returns the normalized string, which may be empty.
    '''
    # str.join assembles the result in one pass instead of growing a
    # string with += on every kept character.
    return ''.join(c for c in s.lower() if c in keep)
>>> s = "I'd like a copy!"
>>> normalize(s)
"i'd like a copy"

文本统计:

# Characters that survive normalization: the lowercase ASCII letters
# plus the space, the hyphen and the apostrophe.
keep = set("abcdefghijklmnopqrstuvwxyz -'")

def normalize(s):
    '''
    Convert s to a normalized string.

    s is lowercased and every character that is not in the
    module-level set keep is dropped. Newlines and tabs are not
    in keep, so they are removed (not replaced by a space).

    s: the string to normalize.
    Returns the normalized string, which may be empty.
    '''
    # str.join assembles the result in one pass instead of growing a
    # string with += on every kept character.
    return ''.join(c for c in s.lower() if c in keep)

def make_freq_dict(s):
    '''
    Returns a dictionary whose keys
    are the words of s, and whose
    values are the counts of those
    words.

    s is split on whitespace first and each piece is then passed
    through normalize(). Splitting first matters because
    normalize() drops newlines and tabs: normalizing the whole
    text up front would fuse the last word of a line with the
    first word of the next one. Pieces that normalize to the
    empty string (e.g. pure punctuation) are not counted.
    '''
    d = {}
    for token in s.split():
        w = normalize(token)
        if w:
            d[w] = d.get(w, 0) + 1
    return d

def print_file_stats(fname):
    '''
    Print statistics for the given file.

    fname: path of the text file to read.

    Prints the number of characters, the number of newline
    characters, the total number of words counted by
    make_freq_dict(), and the ten most frequent words.
    Returns None.
    '''
    # The with-block closes the file as soon as it has been read.
    with open(fname, 'r') as f:
        s = f.read()
    num_chars = len(s)
    num_lines = s.count('\n')
    d = make_freq_dict(s)
    num_words = sum(d.values())

    # (count, word) pairs, highest count first; equal counts are
    # ordered by word in reverse alphabetical order.
    lst = sorted(((d[w], w) for w in d), reverse=True)

    # Report the fname parameter rather than a global variable.
    print("The file '%s' has" % fname)
    print("    %s characters" % num_chars)
    print("    %s lines"      % num_lines)
    print("    %s words"      % num_words)
    print("\nThe top 10 most frequent words are:")

    for i, (count, word) in enumerate(lst[:10], 1):
        print('%2s. %2s %s' % (i, count, word))





>>> frame="e://Python//The Babes.txt"
>>> print_file_stats(frame)
The file 'e://Python//The Babes.txt' has
    148319 characters
    3118 lines
    23817 words

The top 10 most frequent words are:
 1. 1253 the
 2. 746 and
 3. 675 to
 4. 657 of
 5. 496 her
 6. 436 a
 7. 383 in
 8. 352 she
 9. 261 you
10. 259 daph