欢迎您访问程序员文章站!本站旨在为大家提供和分享程序员计算机编程知识!
您现在的位置是: 首页

部分代码4

程序员文章站 2024-03-20 10:28:46
...

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#author: Enoch time:2018/10/30 0030

import re
import time
from collections import Counter

###################################################################################
#Name:CountVerbPre
#Inputs:text file name, verb table file name, preposition list file name
#outputs:None (prints the two most frequent verb-preposition phrases and the elapsed time)
#Author: Thomas
#Date:2018.10.22
###################################################################################
def _adjacent_word_pairs(text):
    """Return every (word, next_word) pair found inside runs of plain words.

    The text is lower-cased and whitespace is collapsed to single spaces.
    A "run" is two or more ``[a-z]+`` words separated by single spaces; any
    other character (punctuation, digits, ...) ends the run, so pairs never
    span such a break.
    """
    text = re.sub(r'\s+', ' ', text.lower())
    pairs = []
    for run in re.finditer(r'(?:[a-z]+ )+[a-z]+', text):
        words = run.group(0).split(' ')
        pairs.extend(zip(words, words[1:]))
    return pairs


def _load_verb_forms(path):
    """Read a verb table and return a dict mapping each form to its base verb.

    Each useful line looks like ``base -> form1,form2,...``. Lines without
    the `` -> `` separator (for example a trailing blank line) are skipped
    instead of raising ``ValueError``.
    """
    forms = {}
    with open(path) as f:
        for line in f:
            base, arrow, variants = line.rstrip('\r\n').partition(' -> ')
            if not arrow:
                continue
            base = base.strip()
            for variant in variants.split(','):
                variant = variant.strip()
                if variant:
                    forms[variant] = base
            # Assigned last so the base verb always maps to itself.
            forms[base] = base
    return forms


def CountVerbPre(file_name, verbName, preName, topN=2):
    """Print the most frequent "verb preposition" phrases found in a text file.

    Adjacent word pairs are taken from the text; a pair counts when its first
    word is a verb form listed in the verb table and its second word is listed
    in the preposition file. Verb forms are normalized to their base verb
    before counting.

    Args:
        file_name: path of the text file to analyse.
        verbName: path of the verb table, one ``base -> form1,form2`` per line.
        preName: path of the preposition list, one preposition per line.
        topN: how many phrases to print (default 2, the original behavior);
            ``None`` prints all of them.

    Returns:
        None. For each printed phrase one line ``|<tab>phrase|ratio|`` is
        written to stdout, followed by one line with the elapsed seconds.
    """
    # time.clock() was removed in Python 3.8; perf_counter() is its replacement.
    started = time.perf_counter()

    with open(file_name) as f:
        pairCounts = Counter(_adjacent_word_pairs(f.read()))
    with open(preName) as f:
        prepositions = {line.strip() for line in f}
    verbDic = _load_verb_forms(verbName)

    # NOTE(review): the denominator is the number of DISTINCT word pairs, while
    # the numerator sums occurrences, so the printed ratio can exceed 100%.
    # Kept as in the original; confirm whether occurrences were intended.
    totalNum = len(pairCounts)

    dicNum = Counter()
    for (verb, pre), count in pairCounts.items():
        if verb in verbDic and pre in prepositions:
            dicNum[verbDic[verb] + ' ' + pre] += count

    # Highest count first; ties are ordered alphabetically by phrase.
    ranked = sorted(dicNum.items(), key=lambda item: (-item[1], item[0]))
    elapsed = time.perf_counter() - started

    for phrase, count in ranked[:topN]:
        print("|\t{:15}|{:<11.2%}|".format(phrase, count / totalNum))
    print(elapsed)

if __name__ == "__main__":
    # NOTE(review): the published listing had typographic quotes (a syntax
    # error) and a "…" character in each path; they are presumed to be
    # relative "../" paths -- confirm against the real data location.
    CountVerbPre('../gone_with_the_wind.txt', '../Verbs.txt', '../prepositions.txt')

相关标签: ASE