英文单词统计
程序员文章站
2024-02-23 23:29:58
...
前言
本实现使用状态机和环形缓冲区统计英文单词数量。状态机分为“在单词外部”和“在单词内部”两个状态;由 ‘'’、‘-’ 这类连接符连接起来的部分(例如 don't、well-known)统计为一个单词,行末换行时用连接符 ‘-’ 断开的单词也统计为一个单词。
一、代码
代码如下(示例):
#include <stdio.h>
#include <string.h>
/* Absolute difference between a and b. Arguments are evaluated more than once. */
#define ABS(a, b) (((a) > (b)) ? ((a) - (b)) : ((b) - (a)))
/* Distance from read index r forward to write index w in a ring of `size` slots. */
#define RW_OFFSET(r, w, size) (((w) >= (r)) ? ((w) - (r)) : ((w) + (size) - (r)))
/* Advance ring index rw by one slot in place, wrapping from size - 1 back to 0. */
#define RW_IDX_INC(rw, size) (((rw) >= ((size) - 1)) ? ((rw) = 0) : ((rw)++))
/* Index cnt slots after rw, wrapping at size. Expects 0 <= rw < size and cnt >= 0. */
#define RW_IDX_NEXT(rw, cnt, size) (((rw) + ((cnt) % (size))) % (size))
/* Index cnt slots before rw, wrapping at size. Expects 0 <= rw < size and cnt >= 0. */
#define RW_IDX_LAST(rw, cnt, size) (((rw) + (size) - ((cnt) % (size))) % (size))
/* Number of slots in the character ring buffer. */
#define MAX_SIZE (10u)
/* States of the word-counting state machine. */
typedef enum
{
    OUT_WORD, /* value 0: the read position is outside any word */
    IN_WORD   /* value 1: the read position is inside a word */
} STATE_MACH;

/* Current state of the scanner; scanning starts outside a word. */
STATE_MACH state_mach = OUT_WORD;
/**
 * Report whether a character is an English letter.
 *
 * @param ch character to classify
 * @return 1 if ch lies in 'a'..'z' or 'A'..'Z', otherwise 0
 */
int is_alphabet(char ch)
{
    if ((ch >= 'a') && (ch <= 'z'))
    {
        return 1;
    }
    if ((ch >= 'A') && (ch <= 'Z'))
    {
        return 1;
    }
    return 0;
}
/**
 * Count the English words in the text file named by argv[1] and print the total.
 *
 * A word begins at the first letter seen while outside a word and ends at the
 * first character that is neither a letter nor an accepted connector:
 *   - '\'' or '-' immediately followed by a letter (e.g. "don't", "well-known");
 *   - '-' followed by a line break ("\n" or "\r\n") and then a letter, i.e. a
 *     word hyphenated across two lines.
 * Characters are staged in a small ring buffer so the connector rules can look
 * a few characters ahead of the read position.
 *
 * @param argc number of command-line arguments; must be at least 2
 * @param argv argv[1] is the path of the file to scan
 * @return 0 on success, -1 if the path is missing or the file cannot be opened
 */
int main(int argc, char* argv[])
{
    /* Elements are int, not char, so the EOF value returned by fgetc() can
       never be mistaken for a data byte such as 0xFF. */
    int ch_arr[MAX_SIZE];
    int r_idx = 0, w_idx = 0;
    int eof_seen = 0;
    int sum_word = 0;
    FILE *fp = NULL;

    if (argc < 2)
    {
        printf("usage: %s <file>\n", (argc > 0) ? argv[0] : "word_count");
        return -1;
    }

    fp = fopen(argv[1], "r");
    if (NULL == fp)
    {
        printf("%s[%d] fopen file %s is failed\n", __FUNCTION__, __LINE__, argv[1]);
        return -1;
    }

    for (;;)
    {
        int cur;
        int skip = 1; /* characters consumed by this iteration */

        /* Keep at least MAX_SIZE / 2 characters buffered until EOF has been
           stored. The deepest look-ahead below reads r_idx + 3, so
           MAX_SIZE / 2 must be at least 4. */
        while (!eof_seen && (RW_OFFSET(r_idx, w_idx, MAX_SIZE) < (MAX_SIZE / 2)))
        {
            int c = fgetc(fp);

            ch_arr[w_idx] = c;
            RW_IDX_INC(w_idx, MAX_SIZE);
            if (EOF == c)
            {
                eof_seen = 1;
            }
        }

        cur = ch_arr[r_idx];
        if (EOF == cur)
        {
            break;
        }

        switch (state_mach)
        {
        case OUT_WORD:
            if (is_alphabet((char)cur))
            {
                state_mach = IN_WORD;
                sum_word++;
            }
            break;
        case IN_WORD:
            if (!is_alphabet((char)cur))
            {
                if (('\'' == cur) || ('-' == cur))
                {
                    /* cur is not EOF, so the slot after it holds either a
                       data character or the EOF marker. Each deeper slot is
                       read only after the previous one proved to be data. */
                    int next = ch_arr[(r_idx + 1) % MAX_SIZE];

                    if (is_alphabet((char)next))
                    {
                        skip = 2; /* connector plus the letter after it */
                    }
                    else if ('-' == cur)
                    {
                        int brk = 1; /* offset of the expected '\n' */

                        if ('\r' == next)
                        {
                            brk = 2;
                            next = ch_arr[(r_idx + 2) % MAX_SIZE];
                        }
                        if (('\n' == next)
                            && is_alphabet((char)ch_arr[(r_idx + brk + 1) % MAX_SIZE]))
                        {
                            skip = brk + 2; /* '-', line break, first letter */
                        }
                    }
                }
                if (1 == skip)
                {
                    state_mach = OUT_WORD; /* no connector matched: word ends */
                }
            }
            break;
        default:
            break;
        }

        r_idx = (int)((r_idx + skip) % MAX_SIZE);
    }

    fclose(fp);
    printf("%s[%d] file %s word has %d\n", __FUNCTION__, __LINE__, argv[1], sum_word);
    return 0;
}