[单链表]统计文本中英文单词出现次数,并输出前200个单词及其出现次数
程序员文章站
2022-05-28 19:38:04
...
统计文本中各英文单词出现次数,并按词频逆序排列。
#define _CRT_SECURE_NO_WARNINGS
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <ctype.h>
#define SIZE 50
// One node of the singly linked word list.
typedef struct words
{
char str[SIZE]; // word text; the array is SIZE bytes, so SIZE-1 characters plus the terminating '\0'
int count; // number of times this word has been seen
struct words *next; // following node, or NULL at the end of the list
}Words;
/*
 * Create the list head (a sentinel node that never holds a real word).
 *
 * Returns the newly allocated head with an empty string, a zero count and
 * no successor, or NULL when the allocation fails. The caller owns the
 * node and releases the whole list with free_Word().
 */
Words* InitWord(void)
{
    /* The cast is kept so the file still builds when compiled as C++. */
    Words *headWord = (Words*)malloc(sizeof *headWord);
    if (headWord == NULL)
    {
        return NULL;
    }
    headWord->str[0] = '\0';
    headWord->count = 0;
    headWord->next = NULL;
    return headWord;
}
/*
 * Insert a new word directly after the list head (head insertion) with an
 * occurrence count of 1.
 *
 * headWord: sentinel head of the list; must not be NULL.
 * str:      NUL-terminated word to store. Text that does not fit in the
 *           node's fixed-size buffer is truncated instead of overflowing it.
 *
 * If memory cannot be allocated the list is left unchanged and a message
 * is written to stderr.
 */
void AddWord(Words *headWord,const char *str)
{
    size_t len;
    /* The cast is kept so the file still builds when compiled as C++. */
    Words *newWord = (Words*)malloc(sizeof *newWord);
    if (newWord == NULL)
    {
        fprintf(stderr, "AddWord: out of memory, word \"%s\" not stored\n", str);
        return;
    }

    /* Bounded copy: never write past the end of newWord->str. */
    len = strlen(str);
    if (len >= sizeof newWord->str)
    {
        len = sizeof newWord->str - 1;
    }
    memcpy(newWord->str, str, len);
    newWord->str[len] = '\0';
    newWord->count = 1;

    /* Link in front of the current first node (which may be NULL). */
    newWord->next = headWord->next;
    headWord->next = newWord;
}
/*
 * Look a word up in the list and count one more occurrence of it.
 *
 * Returns 1 when the word has been accounted for: either it was found and
 * its count was incremented, or the list was empty and the word was
 * inserted here through AddWord(). Returns 0 when the list is non-empty
 * and does not contain the word; the caller is then expected to add it.
 */
int CheckStr(Words *headWord,const char *str)
{
    Words *node;

    /* Empty list: the very first word is stored right away. */
    if (headWord->next == NULL)
    {
        AddWord(headWord, str);
        return 1;
    }

    for (node = headWord->next; node != NULL; node = node->next)
    {
        if (strcmp(node->str, str) == 0)
        {
            node->count++; /* known word, one more occurrence */
            return 1;
        }
    }

    return 0; /* not in the list */
}
/*
 * ReadWord() has no buffer-size parameter; the callers in this file pass a
 * char[SIZE] buffer. The fallback below only takes effect when this reader
 * is built without the file-level definition of SIZE.
 */
#ifndef SIZE
#define SIZE 50
#endif

/*
 * Read the next word from fp, starting at byte offset *plen.
 *
 * A word starts with a letter and continues with letters and apostrophes
 * (as in "I'm"). Any other byte before the word is skipped; the first such
 * byte after the word ends it and is consumed.
 *
 * fp:   stream opened for reading.
 * plen: in: offset to start reading from; out: offset just past what was
 *       consumed. Only updated when a word was read.
 * str:  receives the NUL-terminated word. At most SIZE-1 characters are
 *       stored; the rest of an overlong word is read and discarded, so the
 *       buffer is never overrun and the tail is not reported as a new word.
 *
 * Returns the new offset (non-zero) when a word was read, 0 when no further
 * word exists or the stream could not be positioned.
 * Offsets are carried in an int, so very large files are not supported.
 */
int ReadWord(FILE *fp,int *plen,char *str)
{
    unsigned char ch; /* unsigned so the value is always valid for isalpha() */
    int i = 0;        /* number of characters stored so far; 0 = no word yet */
    long offset;

    if (fseek(fp, *plen, SEEK_SET) != 0)
    {
        return 0;
    }

    while (fread(&ch, sizeof ch, 1, fp) == 1)
    {
        /* An apostrophe only counts inside a word, never as its first byte. */
        int is_word_char = isalpha(ch) || (i > 0 && ch == '\'');

        if (!is_word_char)
        {
            if (i == 0)
            {
                continue; /* still skipping separators before the word */
            }
            break;        /* separator after the word: the word is complete */
        }

        if (i < SIZE - 1)
        {
            str[i++] = (char)ch;
        }
        /* else: the word is longer than the buffer; drop the extra byte */
    }

    if (i == 0)
    {
        return 0; /* reached the end of the file without finding a word */
    }
    str[i] = '\0';

    offset = ftell(fp);
    if (offset <= 0)
    {
        return 0; /* position unknown: stop rather than re-read from a bad offset */
    }
    *plen = (int)offset;
    return *plen;
}
/*
 * Read every word of the named file into the list: a word already in the
 * list gets its count incremented, a new word is added with count 1.
 *
 * headWord: sentinel head of the list to fill.
 * name:     path of the text file to read.
 *
 * If the file cannot be opened, the reason is reported on stderr and the
 * list is left untouched. The file is only ever opened for reading: a
 * failed open must not create an empty file or truncate an existing one.
 */
void ReadFile(Words *headWord,char *name)
{
    FILE *fp = fopen(name, "r");
    int len = 0;     /* current read offset inside the file */
    char str[SIZE];  /* receives one word per ReadWord() call */

    if (fp == NULL)
    {
        perror(name);
        return;
    }

    while (ReadWord(fp, &len, str) != 0)
    {
        if (CheckStr(headWord, str) == 0) /* word not in the list yet */
        {
            AddWord(headWord, str);
        }
    }
    fclose(fp);
}
/*
 * Exchange the payload (word text and count) of two list nodes.
 * The nodes stay where they are in the list; only their contents move.
 *
 * Passing NULL or the same node twice is a no-op. Swapping a node with
 * itself would otherwise copy a string onto itself, and strcpy() on
 * overlapping objects is undefined behaviour.
 */
void SwapWord(Words *posWord,Words *posNext)
{
    char tmpWord[SIZE];
    int tmpCount;

    if (posWord == NULL || posNext == NULL || posWord == posNext)
    {
        return;
    }

    /* swap the word text */
    strcpy(tmpWord, posWord->str);
    strcpy(posWord->str, posNext->str);
    strcpy(posNext->str, tmpWord);

    /* swap the matching occurrence counts */
    tmpCount = posWord->count;
    posWord->count = posNext->count;
    posNext->count = tmpCount;
}
/*
 * Sort the list by count, largest first, using a bubble sort that swaps
 * node contents through SwapWord() instead of relinking nodes.
 *
 * Sorting stops as soon as a full pass performs no swap.
 */
void WordsSort(Words *headWord)
{
    Words *settled = NULL; /* first node of the tail that is already in place */

    if (headWord->next == NULL)
    {
        return; /* nothing to sort */
    }

    while (settled != headWord->next)
    {
        int swapped = 0;
        Words *left = headWord->next;
        Words *right = left->next;

        for (; right != NULL; left = left->next, right = right->next)
        {
            if (left->count < right->count)
            {
                SwapWord(left, right);
                swapped = 1;
            }
            /* Reached the settled tail: it grows by one node, pass is over. */
            if (right->next == settled)
            {
                settled = right;
                break;
            }
        }

        if (!swapped)
        {
            break; /* a pass without swaps means the list is ordered */
        }
    }
}
/*
 * Print a ranking table of the words in list order: a title line with the
 * given name, a column header, then at most the first 200 entries, one per
 * line, followed by a blank line. Prints nothing for an empty list.
 */
void PrintWords(Words *headWord,const char *name)
{
    Words *cur;
    int shown;

    if (headWord->next == NULL)
    {
        return;
    }

    printf("\t<<%s>>中出现的最高频率的单词:\n",name);
    printf("\t单词:\t\t\t\t\t\t出现次数:\n");

    for (cur = headWord->next, shown = 0; cur != NULL && shown < 200; cur = cur->next, shown++)
    {
        printf("\t%-50s%d\n", cur->str, cur->count);
    }

    printf("\n");
}
/*
 * Release every node of the list, starting with the given node.
 * Accepts NULL, in which case nothing happens.
 */
void free_Word(Words* headNode)
{
    while (headNode != NULL)
    {
        Words *following = headNode->next; /* remember the successor before freeing */
        free(headNode);
        headNode = following;
    }
}
/*
 * Count the words of a text file and print the most frequent ones.
 *
 * Usage: program [file]
 * With no argument the first built-in sample file name is used.
 * Returns 0 on success, EXIT_FAILURE when the list head cannot be allocated.
 */
int main(int argc, char *argv[])
{
    /* Sample inputs; index 0 is the default. */
    char bookName[][100] = {"小王子.txt","CountWords.cpp","test.txt","Harry Potter and The Half-Blood Prince.txt" };
    char *fileName = (argc > 1) ? argv[1] : bookName[0];

    Words *headWord = InitWord(); /* create the list head */
    if (headWord == NULL)
    {
        fprintf(stderr, "out of memory\n");
        return EXIT_FAILURE;
    }

    ReadFile(headWord, fileName);   /* collect words and counts */
    WordsSort(headWord);            /* order by count, largest first */
    PrintWords(headWord, fileName); /* print the ranking */
    free_Word(headWord);            /* release the list */
    return 0;
}
文件下载:
小王子.txt
链接:https://wwa.lanzous.com/icWhOe8z34j
Harry Potter and The Half-Blood Prince.txt
链接:https://wwa.lanzous.com/inix8e8z33i