Python读书笔记009:文本统计
程序员文章站
2022-07-11 09:45:59
...
文本文件的统计数据:
>>> s = ' A long time ago, in a galaxy far, far away...'
>>> len(s)
46
>>> s.split()
['A', 'long', 'time', 'ago,', 'in', 'a', 'galaxy', 'far,', 'far', 'away...']
>>> t = ' a long time ago in a galaxy far far away'
>>> t.split()
['a', 'long', 'time', 'ago', 'in', 'a', 'galaxy', 'far', 'far', 'away']
>>> len(t.split())
10
>>> set(t.split())
{'in', 'away', 'ago', 'far', 'a', 'galaxy', 'time', 'long'}
>>> len(set(t.split()))
8
保留想要的字母
将字符串转换成小写:
>>> s = "I'd like a copy!"
>>> s.lower()
"i'd like a copy!"
删除不想要的字符:
>>> s = "I'd like a copy!"
>>> s.replace('!','')
"I'd like a copy"
>>> s.replace("'",'')
'Id like a copy!'
>>> s.replace("'",' ')
'I d like a copy!'
# Characters that survive normalization: the lowercase ASCII letters plus
# space, hyphen and apostrophe (so contractions such as "i'd" stay intact).
keep = {'a', 'b', 'c', 'd', 'e', 'f',
        'g', 'h', 'i', 'j', 'k', 'l',
        'm', 'n', 'o', 'p', 'q', 'r',
        's', 't', 'u', 'v', 'w', 'x',
        'y', 'z', ' ', '-', "'"}


def normalize(s):
    """Return a normalized copy of s.

    The string is lowercased and every character that is not in ``keep``
    is removed; the remaining characters stay in their original order.
    """
    # str.join builds the result in one pass instead of repeated `+=`.
    return ''.join(c for c in s.lower() if c in keep)
>>> s = "I'd like a copy!"
>>> normalize(s)
"i'd like a copy"
文本统计:
# Characters that survive normalization: the lowercase ASCII letters plus
# space, hyphen and apostrophe (so contractions such as "i'd" stay intact).
keep = {'a', 'b', 'c', 'd', 'e', 'f',
        'g', 'h', 'i', 'j', 'k', 'l',
        'm', 'n', 'o', 'p', 'q', 'r',
        's', 't', 'u', 'v', 'w', 'x',
        'y', 'z', ' ', '-', "'"}


def normalize(s):
    """Return a normalized copy of s.

    The string is lowercased and every character that is not in ``keep``
    is removed; the remaining characters stay in their original order.
    """
    # str.join builds the result in one pass instead of repeated `+=`.
    return ''.join(c for c in s.lower() if c in keep)
def make_freq_dict(s):
    """Return a dictionary mapping each word of s to its count.

    Words are the whitespace-separated tokens of s, each passed through
    normalize(). Tokens that normalize to an empty string (for example
    pure punctuation or digits) are not counted.
    """
    d = {}
    # Split the raw text first: normalize() keeps the space character but
    # drops newlines and tabs, so normalizing the whole string before
    # splitting would glue the last word of one line onto the first word
    # of the next.
    for token in s.split():
        w = normalize(token)
        if w:
            d[w] = d.get(w, 0) + 1
    return d
def print_file_stats(fname):
    """Print statistics for the given file.

    Reads the text file at path fname and prints its character count, the
    number of newline characters, the total word count, and the ten most
    frequent words as counted by make_freq_dict().
    """
    # The context manager closes the file even if reading fails.
    with open(fname, 'r') as f:
        s = f.read()

    num_chars = len(s)
    num_lines = s.count('\n')

    d = make_freq_dict(s)
    num_words = sum(d.values())

    # (count, word) pairs, highest count first; equal counts are ordered by
    # word in reverse alphabetical order.
    lst = sorted(((count, word) for word, count in d.items()), reverse=True)

    # Report the path that was actually read (the fname parameter).
    print("The file '%s' has" % fname)
    print(" %s characters" % num_chars)
    print(" %s lines" % num_lines)
    print(" %s words" % num_words)

    print("\nThe top 10 most frequent words are:")
    for i, (count, word) in enumerate(lst[:10], start=1):
        print('%2s. %2s %s' % (i, count, word))
>>> frame="e://Python//The Babes.txt"
>>> print_file_stats(frame)
The file 'e://Python//The Babes.txt' has
148319 characters
3118 lines
23817 words
The top 10 most frequent words are:
1. 1253 the
2. 746 and
3. 675 to
4. 657 of
5. 496 her
6. 436 a
7. 383 in
8. 352 she
9. 261 you
10. 259 daph
下一篇: 今天老爸和老妈吵架了