词法分析器的实现
程序员文章站
2022-05-28 17:05:58
...
文章链接:https://codemouse.online/archives/2020-03-13175439
词法分析器
结构体
/* One lexical token: a numeric category plus the text it was built from. */
struct Token_type {
    int   type;   // token category code
    char *seman;  // token text
};
类型定义
/* Token category codes (the values stored in a token's `type` field). */
#define BEGIN 1   /* keyword "begin" */
#define NUMB  2   /* number literal */
#define IDEN  3   /* identifier */
#define PLUS  4   /* '+' */
#define MULT  5   /* '*' */
#define ASS   6   /* ':=' */
#define READ  7   /* keyword "read" */
#define WRITE 8   /* keyword "write" */
#define SEMI  9   /* ';' */
#define OPEN  10  /* '(' */
#define CLOSE 11  /* ')' */
#define END   12  /* keyword "end" */
/*
 * <stdio.h> already defines EOF. Redefining a macro with a different value is
 * ill-formed, so the library definition is removed first. From here on EOF is
 * the lexer's end-of-input token code, NOT the stdio end-of-file indicator:
 * do not compare fgetc()/getc() results against it in this file.
 */
#ifdef EOF
#undef EOF
#endif
#define EOF   13  /* end of input */
判断字符类型
/*
 * Character classification helpers. They are macros, so the argument may be
 * evaluated more than once: pass only side-effect-free expressions.
 */
#define is_end_of_input(ch)    ('\0' == (ch))
#define is_lc_letter(ch)       ((ch) >= 'a' && (ch) <= 'z')
#define is_uc_letter(ch)       ((ch) >= 'A' && (ch) <= 'Z')
#define is_letter(ch)          (is_uc_letter(ch) || is_lc_letter(ch))
#define is_digit(ch)           ((ch) >= '0' && (ch) <= '9')
#define is_letter_or_digit(ch) (is_letter(ch) || is_digit(ch))
#define is_operator(ch)        ('+' == (ch) || '-' == (ch) || '*' == (ch))
/* Layout: any non-NUL character that compares <= ' ' (space). */
#define is_layout(ch)          ((ch) != '\0' && (ch) <= ' ')
全局变量
Token_type Token; //作为临时词语接收变量
char prog_file[4096] = { 0 }; // 文件内的所有文字
int fp = -1; // 字符下标
char ch; // 取出的字符
vector<Token_type> vec_Token; // 接收词语的容器
获取下一个字符
/* Advances the read position by one and loads that character into `ch`. */
void next_char(void)
{
    fp += 1;
    ch = prog_file[fp];
}
跳过空白等无用字符,取下一个有效字符
/*
 * Fetches the next character that is not layout: at least one character is
 * always consumed, and fetching repeats for as long as is_layout(ch) holds.
 */
void next_avail_char(void)
{
    do
    {
        next_char();
    } while (is_layout(ch));
}
获取程序代码
void initFile()
{
char *p = prog_file;
FILE *file = fopen("toy.toyL", "r");
if (file == NULL)
{
perror("open file fail\n");
exit(-1);
}
int fileLen = 0;
fseek(file, 0, SEEK_END);
fileLen = ftell(file);
fseek(file, 0, SEEK_SET);
for (int i = 0; i < fileLen -1; i++)
{
*p++ = fgetc(file);
}
ch = prog_file[0];
printf("%s\n", prog_file);
fclose(file);
}
解析标识符和关键字
/*
 * Scans an identifier or keyword that starts at the current character `ch`.
 *
 * Letters and digits are consumed with next_char() and copied into a
 * malloc'ed, NUL-terminated string stored in Token.seman. Token.type becomes
 * BEGIN, END, READ or WRITE for the matching keyword and IDEN otherwise.
 * Before returning, fp is stepped back by one so that the character which
 * ended the name is fetched again by the next call to next_char().
 *
 * The buffer grows on demand, so names of any length are stored safely.
 * Exits with status -1 if memory cannot be allocated.
 */
void recognize_name(void)
{
    size_t capacity = 16;
    size_t np = 0;
    char *name = (char*)malloc(capacity);
    if (name == NULL)
    {
        perror("malloc");
        exit(-1);
    }

    do
    {
        // Always keep one spare byte for the terminating '\0'.
        if (np + 1 >= capacity)
        {
            capacity *= 2;
            char *grown = (char*)realloc(name, capacity);
            if (grown == NULL)
            {
                perror("realloc");
                free(name);
                exit(-1);
            }
            name = grown;
        }
        name[np++] = ch;
        next_char();
    } while (is_letter(ch) || is_digit(ch));
    name[np] = '\0';

    // Decide whether the text is a reserved word or a plain identifier.
    if (!strcmp(name, "begin"))
        Token.type = BEGIN;
    else if (!strcmp(name, "end"))
        Token.type = END;
    else if (!strcmp(name, "read"))
        Token.type = READ;
    else if (!strcmp(name, "write"))
        Token.type = WRITE;
    else
        Token.type = IDEN;
    Token.seman = name;

    // Un-read the lookahead character that terminated the name.
    fp--;
}
解析数字
/*
 * Scans an unsigned run of decimal digits that starts at the current
 * character `ch`.
 *
 * The digits are consumed with next_char() and copied into a malloc'ed,
 * NUL-terminated string stored in Token.seman; Token.type is set to NUMB.
 * Before returning, fp is stepped back by one so that the character which
 * ended the number is fetched again by the next call to next_char().
 *
 * The buffer grows on demand, so numbers of any length are stored safely.
 * Exits with status -1 if memory cannot be allocated.
 */
void recognize_number(void)
{
    size_t capacity = 16;
    size_t dsp = 0;
    char *digits = (char*)malloc(capacity);
    if (digits == NULL)
    {
        perror("malloc");
        exit(-1);
    }

    do
    {
        // Always keep one spare byte for the terminating '\0'.
        if (dsp + 1 >= capacity)
        {
            capacity *= 2;
            char *grown = (char*)realloc(digits, capacity);
            if (grown == NULL)
            {
                perror("realloc");
                free(digits);
                exit(-1);
            }
            digits = grown;
        }
        digits[dsp++] = ch;
        next_char();
    } while (is_digit(ch));
    digits[dsp] = '\0';

    Token.type = NUMB;
    Token.seman = digits;

    // Un-read the lookahead character that terminated the number.
    fp--;
}
识别下一个词语并填充词语结构体
/*
 * Recognizes the next token and appends it to vec_Token.
 *
 * Layout characters are skipped first. Digits and letters are handed to
 * recognize_number() / recognize_name(); every other character must be one of
 * the fixed symbols below, otherwise error() is called.
 */
void next_token(void)
{
    next_avail_char();

    // Variable-length tokens are built by the dedicated recognizers.
    if (is_digit(ch) || is_letter(ch))
    {
        if (is_digit(ch))
            recognize_number();
        else
            recognize_name();
        vec_Token.push_back(Token);
        return;
    }

    // Fixed tokens: pick the spelling and the type code, then copy once.
    char *text = (char*)malloc(sizeof(char) * 10);
    const char *lexeme = NULL;
    int kind = 0;

    switch (ch) {
    case '+':
        lexeme = "+";
        kind = PLUS;
        break;
    case '*':
        lexeme = "*";
        kind = MULT;
        break;
    case ':':
        // ':' is only valid as the first half of ':='.
        next_char();
        if (ch != '=')
            error();
        lexeme = ":=";
        kind = ASS;
        break;
    case ';':
        lexeme = ";";
        kind = SEMI;
        break;
    case '(':
        lexeme = "(";
        kind = OPEN;
        break;
    case ')':
        lexeme = ")";
        kind = CLOSE;
        break;
    case '\0':
        lexeme = "EOF";
        kind = EOF;
        break;
    default:
        error();
    }

    if (lexeme != NULL)
    {
        strcpy(text, lexeme);
        Token.seman = text;
        Token.type = kind;
    }
    vec_Token.push_back(Token);
}
将词语类型宏对应的名称写入字符串
/*
 * Writes the symbolic name of token type `label` into `str`.
 *
 * @param str    destination buffer; must hold at least 6 bytes
 *               (the longest name is 5 characters plus the terminator).
 * @param label  one of the token type codes (BEGIN ... EOF).
 * @return       number of characters written, not counting the terminator.
 *               For an unknown label an empty string is written and 0 is
 *               returned, so the function always returns a defined value.
 */
int print_hong(char *str,int label)
{
    const char *name;
    switch (label)
    {
    case BEGIN: name = "BEGIN"; break;
    case NUMB:  name = "NUMB";  break;
    case IDEN:  name = "IDEN";  break;
    case PLUS:  name = "PLUS";  break;
    case MULT:  name = "MULT";  break;
    case ASS:   name = "ASS";   break;
    case READ:  name = "READ";  break;
    case WRITE: name = "WRITE"; break;
    case SEMI:  name = "SEMI";  break;
    case OPEN:  name = "OPEN";  break;
    case CLOSE: name = "CLOSE"; break;
    case END:   name = "END";   break;
    case EOF:   name = "EOF";   break;
    default:    name = "";      break;
    }
    strcpy(str, name);
    return (int)strlen(name);
}
主函数运行,将词语分析后打印
/*
 * Entry point: loads the program, tokenizes it, then prints one line per
 * token in the form `[index]<TAB> (TYPE ,"text")` and releases the token text.
 */
int main()
{
    initFile();

    // Scan until the end-of-input token itself has been produced. Testing
    // `ch` instead would stop early when the input ends right after a name
    // or number, because their lookahead already holds '\0'.
    do
    {
        next_token();
    } while (vec_Token.back().type != EOF);

    for (size_t i = 0; i < vec_Token.size(); i++)
    {
        // The longest type name is 5 characters, so 16 bytes is ample.
        char label[16] = { 0 };
        print_hong(label, vec_Token[i].type);

        // Print directly instead of assembling the line in a fixed buffer,
        // so a long token text cannot overflow anything.
        printf("[%d]\t (%s ,\"%s\")\n", (int)i + 1, label, vec_Token[i].seman);

        free(vec_Token[i].seman);
    }
    return 0;
}
完整代码
// 练习.cpp : 此文件包含 "main" 函数。程序执行将在此处开始并结束。
//
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <vector>
using namespace std;
/* One lexical token: a numeric category plus the text it was built from. */
struct Token_type {
    int   type;   // token category code
    char *seman;  // token text
};
/* Token category codes (the values stored in a token's `type` field). */
#define BEGIN 1   /* keyword "begin" */
#define NUMB  2   /* number literal */
#define IDEN  3   /* identifier */
#define PLUS  4   /* '+' */
#define MULT  5   /* '*' */
#define ASS   6   /* ':=' */
#define READ  7   /* keyword "read" */
#define WRITE 8   /* keyword "write" */
#define SEMI  9   /* ';' */
#define OPEN  10  /* '(' */
#define CLOSE 11  /* ')' */
#define END   12  /* keyword "end" */
/*
 * <stdio.h> already defines EOF. Redefining a macro with a different value is
 * ill-formed, so the library definition is removed first. From here on EOF is
 * the lexer's end-of-input token code, NOT the stdio end-of-file indicator:
 * do not compare fgetc()/getc() results against it in this file.
 */
#ifdef EOF
#undef EOF
#endif
#define EOF   13  /* end of input */
/*
 * Character classification helpers. They are macros, so the argument may be
 * evaluated more than once: pass only side-effect-free expressions.
 */
#define is_end_of_input(ch)    ('\0' == (ch))
#define is_lc_letter(ch)       ((ch) >= 'a' && (ch) <= 'z')
#define is_uc_letter(ch)       ((ch) >= 'A' && (ch) <= 'Z')
#define is_letter(ch)          (is_uc_letter(ch) || is_lc_letter(ch))
#define is_digit(ch)           ((ch) >= '0' && (ch) <= '9')
#define is_letter_or_digit(ch) (is_letter(ch) || is_digit(ch))
#define is_operator(ch)        ('+' == (ch) || '-' == (ch) || '*' == (ch))
/* Layout: any non-NUL character that compares <= ' ' (space). */
#define is_layout(ch)          ((ch) != '\0' && (ch) <= ' ')
Token_type Token;
char prog_file[4096] = { 0 };
int fp = -1;
char ch;
vector<Token_type> vec_Token;
/* Aborts the lexer: terminates the process with exit status -2. */
void error(void)
{
    const int lexical_error_status = -2;
    exit(lexical_error_status);
}
/* Advances the read position by one and loads that character into `ch`. */
void next_char(void)
{
    fp += 1;
    ch = prog_file[fp];
}
/*
 * Fetches the next character that is not layout: at least one character is
 * always consumed, and fetching repeats for as long as is_layout(ch) holds.
 */
void next_avail_char(void)
{
    do
    {
        next_char();
    } while (is_layout(ch));
}
// 先获取代码
void initFile()
{
char *p = prog_file;
FILE *file = fopen("toy.toyL", "r");
if (file == NULL)
{
perror("open file fail\n");
exit(-1);
}
int fileLen = 0;
fseek(file, 0, SEEK_END);
fileLen = ftell(file);
fseek(file, 0, SEEK_SET);
for (int i = 0; i < fileLen -1; i++)
{
*p++ = fgetc(file);
}
ch = prog_file[0];
printf("%s\n", prog_file);
fclose(file);
}
/*
 * Scans an unsigned run of decimal digits that starts at the current
 * character `ch`.
 *
 * The digits are consumed with next_char() and copied into a malloc'ed,
 * NUL-terminated string stored in Token.seman; Token.type is set to NUMB.
 * Before returning, fp is stepped back by one so that the character which
 * ended the number is fetched again by the next call to next_char().
 *
 * The buffer grows on demand, so numbers of any length are stored safely.
 * Exits with status -1 if memory cannot be allocated.
 */
void recognize_number(void)
{
    size_t capacity = 16;
    size_t dsp = 0;
    char *digits = (char*)malloc(capacity);
    if (digits == NULL)
    {
        perror("malloc");
        exit(-1);
    }

    do
    {
        // Always keep one spare byte for the terminating '\0'.
        if (dsp + 1 >= capacity)
        {
            capacity *= 2;
            char *grown = (char*)realloc(digits, capacity);
            if (grown == NULL)
            {
                perror("realloc");
                free(digits);
                exit(-1);
            }
            digits = grown;
        }
        digits[dsp++] = ch;
        next_char();
    } while (is_digit(ch));
    digits[dsp] = '\0';

    Token.type = NUMB;
    Token.seman = digits;

    // Un-read the lookahead character that terminated the number.
    fp--;
}
/*
 * Scans an identifier or keyword that starts at the current character `ch`.
 *
 * Letters and digits are consumed with next_char() and copied into a
 * malloc'ed, NUL-terminated string stored in Token.seman. Token.type becomes
 * BEGIN, END, READ or WRITE for the matching keyword and IDEN otherwise.
 * Before returning, fp is stepped back by one so that the character which
 * ended the name is fetched again by the next call to next_char().
 *
 * The buffer grows on demand, so names of any length are stored safely.
 * Exits with status -1 if memory cannot be allocated.
 */
void recognize_name(void)
{
    size_t capacity = 16;
    size_t np = 0;
    char *name = (char*)malloc(capacity);
    if (name == NULL)
    {
        perror("malloc");
        exit(-1);
    }

    do
    {
        // Always keep one spare byte for the terminating '\0'.
        if (np + 1 >= capacity)
        {
            capacity *= 2;
            char *grown = (char*)realloc(name, capacity);
            if (grown == NULL)
            {
                perror("realloc");
                free(name);
                exit(-1);
            }
            name = grown;
        }
        name[np++] = ch;
        next_char();
    } while (is_letter(ch) || is_digit(ch));
    name[np] = '\0';

    // Decide whether the text is a reserved word or a plain identifier.
    if (!strcmp(name, "begin"))
        Token.type = BEGIN;
    else if (!strcmp(name, "end"))
        Token.type = END;
    else if (!strcmp(name, "read"))
        Token.type = READ;
    else if (!strcmp(name, "write"))
        Token.type = WRITE;
    else
        Token.type = IDEN;
    Token.seman = name;

    // Un-read the lookahead character that terminated the name.
    fp--;
}
/*
 * Recognizes the next token and appends it to vec_Token.
 *
 * Layout characters are skipped first. Digits and letters are handed to
 * recognize_number() / recognize_name(); every other character must be one of
 * the fixed symbols below, otherwise error() is called.
 */
void next_token(void)
{
    next_avail_char();

    // Variable-length tokens are built by the dedicated recognizers.
    if (is_digit(ch) || is_letter(ch))
    {
        if (is_digit(ch))
            recognize_number();
        else
            recognize_name();
        vec_Token.push_back(Token);
        return;
    }

    // Fixed tokens: pick the spelling and the type code, then copy once.
    char *text = (char*)malloc(sizeof(char) * 10);
    const char *lexeme = NULL;
    int kind = 0;

    switch (ch) {
    case '+':
        lexeme = "+";
        kind = PLUS;
        break;
    case '*':
        lexeme = "*";
        kind = MULT;
        break;
    case ':':
        // ':' is only valid as the first half of ':='.
        next_char();
        if (ch != '=')
            error();
        lexeme = ":=";
        kind = ASS;
        break;
    case ';':
        lexeme = ";";
        kind = SEMI;
        break;
    case '(':
        lexeme = "(";
        kind = OPEN;
        break;
    case ')':
        lexeme = ")";
        kind = CLOSE;
        break;
    case '\0':
        lexeme = "EOF";
        kind = EOF;
        break;
    default:
        error();
    }

    if (lexeme != NULL)
    {
        strcpy(text, lexeme);
        Token.seman = text;
        Token.type = kind;
    }
    vec_Token.push_back(Token);
}
/*
 * Writes the symbolic name of token type `label` into `str`.
 *
 * @param str    destination buffer; must hold at least 6 bytes
 *               (the longest name is 5 characters plus the terminator).
 * @param label  one of the token type codes (BEGIN ... EOF).
 * @return       number of characters written, not counting the terminator.
 *               For an unknown label an empty string is written and 0 is
 *               returned, so the function always returns a defined value.
 */
int print_hong(char *str,int label)
{
    const char *name;
    switch (label)
    {
    case BEGIN: name = "BEGIN"; break;
    case NUMB:  name = "NUMB";  break;
    case IDEN:  name = "IDEN";  break;
    case PLUS:  name = "PLUS";  break;
    case MULT:  name = "MULT";  break;
    case ASS:   name = "ASS";   break;
    case READ:  name = "READ";  break;
    case WRITE: name = "WRITE"; break;
    case SEMI:  name = "SEMI";  break;
    case OPEN:  name = "OPEN";  break;
    case CLOSE: name = "CLOSE"; break;
    case END:   name = "END";   break;
    case EOF:   name = "EOF";   break;
    default:    name = "";      break;
    }
    strcpy(str, name);
    return (int)strlen(name);
}
/*
 * Entry point: loads the program, tokenizes it, then prints one line per
 * token in the form `[index]<TAB> (TYPE ,"text")` and releases the token text.
 */
int main()
{
    initFile();

    // Scan until the end-of-input token itself has been produced. Testing
    // `ch` instead would stop early when the input ends right after a name
    // or number, because their lookahead already holds '\0'.
    do
    {
        next_token();
    } while (vec_Token.back().type != EOF);

    for (size_t i = 0; i < vec_Token.size(); i++)
    {
        // The longest type name is 5 characters, so 16 bytes is ample.
        char label[16] = { 0 };
        print_hong(label, vec_Token[i].type);

        // Print directly instead of assembling the line in a fixed buffer,
        // so a long token text cannot overflow anything.
        printf("[%d]\t (%s ,\"%s\")\n", (int)i + 1, label, vec_Token[i].seman);

        free(vec_Token[i].seman);
    }
    return 0;
}
上一篇: 分析器错误消息: 未能加载类型