部分代码3
程序员文章站
2024-03-20 10:29:22
...
#!/usr/bin/env python
#-*- coding:utf-8 -*-
#author: Enoch time:2018/10/30 0030
import re
import time
from collections import Counter
import os
import sys
import cProfile
###################################################################################
#Name:CountPhrases
#Inputs:file name, words per phrase (k), number of phrases to print (top_n)
#outputs:None (prints the most frequent phrases and the elapsed time)
#Author: Thomas
#Date:2018.10.22
###################################################################################
def CountPhrases(file_name, k, top_n=2):
    """Print the most frequent k-word phrases found in a text file.

    The text is lower-cased and every whitespace run is collapsed to a single
    space.  It is then cut into "sentences": runs of two or more letter-only
    words separated by single spaces.  Any other character (punctuation,
    digits, ...) ends a run, so a phrase never spans such a character.
    Every window of ``k`` consecutive words inside a sentence counts as one
    occurrence of that phrase.

    Args:
        file_name: Path of the text file to analyse.
        k: Number of words per phrase.  Values below 1 produce no phrases.
        top_n: How many of the most frequent phrases to print (default 2).

    Returns:
        None.  One row is printed per phrase, showing the phrase and its
        share of all counted phrases, followed by a line with the elapsed
        time in seconds (file reading and counting, excluding the printing).
    """
    # time.clock() was removed in Python 3.8; perf_counter() is its replacement.
    t0 = time.perf_counter()

    with open(file_name) as f:
        txt = f.read()
    txt = re.sub(r'\s+', ' ', txt.lower())

    counts = Counter()
    if k > 0:
        # One sliding-window pass per sentence yields every k-word phrase
        # exactly once and can never cross a sentence boundary.
        for match in re.finditer(r'(?:[a-z]+ )+[a-z]+', txt):
            words = match.group(0).split(' ')
            for start in range(len(words) - k + 1):
                counts[' '.join(words[start:start + k])] += 1

    total = sum(counts.values())
    # Most frequent first; ties are broken alphabetically by phrase.
    ranked = sorted(counts.items(), key=lambda item: (-item[1], item[0]))

    t1 = time.perf_counter()

    # `ranked` is empty whenever `total` is 0, so the division is safe.
    for phrase, freq in ranked[:top_n]:
        print("|\t{:15}|{:<11.2%}|".format(phrase, freq / total))
    print(t1 - t0)
# Script entry: report the most frequent two-word phrases of the novel.
CountPhrases(file_name='../gone_with_the_wind.txt', k=2)
上一篇: 二分查找几种小小的变形
下一篇: web前端学习笔记--Dom