欢迎您访问程序员文章站,本站旨在为大家提供、分享程序员计算机编程知识!
您现在的位置是: 首页

词法分析器的实现

程序员文章站 2022-05-28 17:05:58
...

文章链接:https://codemouse.online/archives/2020-03-13175439

词法分析器

结构体

// One lexical token produced by the scanner.
struct Token_type {
	int type;     // token type code (one of the token macros such as BEGIN, NUMB, ...)
	char *seman;  // the token's text
};

类型定义

#define BEGIN	1	/* 定义begin的宏名*/
#define NUMB	2	/* 定义num.的宏名*/
#define IDEN	3	/* 定义id的宏名*/
#define PLUS	4	/* 定义'+'的宏名*/
#define MULT	5	/* 定义'*'的宏名*/
#define ASS		6	/* 定义':='的宏名*/
#define READ	7	/* 定义read的宏名:*/
#define WRITE	8	/* 定义write的宏名*/
#define SEMI	9	/* 定义';'的宏名.*/
#define OPEN	10	/* 定义'('的宏名*/
#define CLOSE	11	/* 定义') '的宏名*/
#define END		12	/* 定义end的宏名*/
#define EOF		13	/* 文件结束符的宏名*/

判断字符类型

/* Character-classification helpers. Every macro may evaluate its argument more than once. */
#define is_end_of_input(ch)		('\0' == (ch))
#define is_lc_letter(ch)		((ch) >= 'a' && (ch) <= 'z')
#define is_uc_letter(ch)		((ch) >= 'A' && (ch) <= 'Z')
#define is_letter(ch)			(is_uc_letter(ch) || is_lc_letter(ch))
#define is_digit(ch)			((ch) >= '0' && (ch) <= '9')
#define is_letter_or_digit(ch)	(is_letter(ch) || is_digit(ch))
#define is_operator(ch)			('+' == (ch) || '-' == (ch) || '*' == (ch))
/* Layout: any character not above ' ' except the terminating NUL. */
#define is_layout(ch)			((ch) != '\0' && (ch) <= ' ')

全局变量

Token_type Token; //作为临时词语接收变量
char prog_file[4096] = { 0 }; // 文件内的所有文字
int fp = -1; // 字符下标
char ch; // 取出的字符
vector<Token_type> vec_Token; // 接收词语的容器

获取下一个字符

// Step the cursor forward by one and load the character found there into ch.
void next_char(void) {
	fp += 1;
	ch = prog_file[fp];
}

去除无用字符

// Fetch the next character, then keep fetching while the current one is layout
// (as classified by is_layout). Stops on the first non-layout character,
// which includes the terminating NUL.
void next_avail_char(void) {
	do {
		next_char();
	} while (is_layout(ch));
}

获取程序代码

void initFile()
{
	char *p = prog_file;
	FILE *file = fopen("toy.toyL", "r");
	if (file == NULL)
	{
		perror("open file fail\n");
		exit(-1);
	}
	int fileLen = 0;
	fseek(file, 0, SEEK_END);
	fileLen = ftell(file);
	fseek(file, 0, SEEK_SET);
	for (int i = 0; i < fileLen -1; i++)
	{
		*p++ = fgetc(file);
	}
	ch = prog_file[0];
	printf("%s\n", prog_file);
	fclose(file);
}

解析标识符和关键字

void recognize_name(void) 
{
	char *name = (char*)malloc(sizeof(char)*10);
	int np = 0; 
	name[np++] = ch;
	next_char();
	while (is_letter(ch) || is_digit(ch)) 
	{
		name[np++] = ch; 
		next_char();
	}
	name[np] = '\0';
    // 判断是变量还是关键字
	if (!strcmp(name, "begin"))
		Token.type = BEGIN;
	else if (!strcmp(name, "end"))
		Token.type = END;
	else if (!strcmp(name, "read"))
		Token.type = READ;
	else if (!strcmp(name, "write"))
		Token.type = WRITE;
	else
		Token.type = IDEN;
	Token.seman = name;
	fp--;
}

解析数字

void recognize_number(void) 
{
	char* digits = (char*)malloc(sizeof(char) * 10);
	int dsp = 0; 
	digits[dsp++] = ch;
	next_char();
	while (is_digit(ch)) {
		digits[dsp++] = ch; next_char();
	}
	digits[dsp] = '\0';
	Token.type = NUMB; 
	Token.seman = digits;
	fp--;
}

设置好词语结构体

// Scan the next token from the input and append it to vec_Token.
//
// Layout is skipped first. A digit hands over to recognize_number, a letter
// to recognize_name; otherwise the character must be one of the fixed
// punctuation tokens or the terminating NUL (reported as an "EOF" token).
// Any other character is passed to error().
void next_token(void)
{
	next_avail_char();

	if (is_digit(ch))
	{
		recognize_number();
	}
	else if (is_letter(ch))
	{
		recognize_name();
	}
	else
	{
		const char *text = NULL;	// spelling of the matched fixed token, if any
		int kind = 0;				// its token type code

		switch (ch) {
		case '+':  text = "+";   kind = PLUS;  break;
		case '*':  text = "*";   kind = MULT;  break;
		case ':':
			// ':' is only accepted as the first half of ":=".
			next_char();
			if (ch != '=')
				error();
			text = ":=";
			kind = ASS;
			break;
		case ';':  text = ";";   kind = SEMI;  break;
		case '(':  text = "(";   kind = OPEN;  break;
		case ')':  text = ")";   kind = CLOSE; break;
		case '\0': text = "EOF"; kind = EOF;   break;
		default:
			error();
			break;
		}

		if (text != NULL)
		{
			char *copy = (char*)malloc(sizeof(char) * 10);
			strcpy(copy, text);
			Token.seman = copy;
			Token.type = kind;
		}
	}

	vec_Token.push_back(Token);
}

将宏写入字符串

// Write the symbolic name of a token type code into a character buffer.
//
// str   - destination buffer; needs room for the name plus its terminating
//         NUL (the longest name written is "UNKNOWN", i.e. 8 bytes)
// label - token type code (BEGIN, NUMB, ..., EOF)
//
// Returns the number of characters written, not counting the NUL.
// A code that matches none of the token macros yields "UNKNOWN" instead of
// leaving str untouched and returning an indeterminate value.
int print_hong(char *str,int label)
{
	const char *name;

	switch (label)
	{
	case BEGIN:	name = "BEGIN";	break;
	case NUMB:	name = "NUMB";	break;
	case IDEN:	name = "IDEN";	break;
	case PLUS:	name = "PLUS";	break;
	case MULT:	name = "MULT";	break;
	case ASS:	name = "ASS";	break;
	case READ:	name = "READ";	break;
	case WRITE:	name = "WRITE";	break;
	case SEMI:	name = "SEMI";	break;
	case OPEN:	name = "OPEN";	break;
	case CLOSE:	name = "CLOSE";	break;
	case END:	name = "END";	break;
	case EOF:	name = "EOF";	break;
	default:	name = "UNKNOWN";	break;
	}

	strcpy(str, name);
	return (int)strlen(name);
}

主函数运行,将词语分析后打印

// Program entry point: load the program, tokenize it, then print the tokens.
//
// Tokens are scanned until the current character is the terminating NUL.
// Each token is printed on its own line as
//     [index]<TAB> (TYPE ,"text")
// and its text buffer is freed right after printing. Returns 0.
int main()
{
	initFile();
	while (ch != '\0')
	{
		next_token();
	}
	for (size_t i = 0; i < vec_Token.size(); i++)
	{
		// Large enough for every name print_hong writes.
		char type_name[16] = { 0 };
		print_hong(type_name, vec_Token[i].type);
		// Print directly instead of assembling the line in a fixed-size
		// buffer, so long token text cannot overflow anything.
		printf("[%d]\t (%s ,\"%s\")\n", static_cast<int>(i + 1), type_name, vec_Token[i].seman);
		free(vec_Token[i].seman);
		vec_Token[i].seman = NULL;	// do not leave a dangling pointer in the container
	}
	return 0;
}

完整代码

// 练习.cpp : 此文件包含 "main" 函数。程序执行将在此处开始并结束。
//

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <vector>
using namespace std;

// One lexical token produced by the scanner.
struct Token_type {
	int type;     // token type code (one of the token macros such as BEGIN, NUMB, ...)
	char *seman;  // the token's text
};



#define BEGIN	1	/* 定义begin的宏名*/
#define NUMB	2	/* 定义num.的宏名*/
#define IDEN	3	/* 定义id的宏名*/
#define PLUS	4	/* 定义'+'的宏名*/
#define MULT	5	/* 定义'*'的宏名*/
#define ASS		6	/* 定义':='的宏名*/
#define READ	7	/* 定义read的宏名:*/
#define WRITE	8	/* 定义write的宏名*/
#define SEMI	9	/* 定义';'的宏名.*/
#define OPEN	10	/* 定义'('的宏名*/
#define CLOSE	11	/* 定义') '的宏名*/
#define END		12	/* 定义end的宏名*/
#define EOF		13	/* 文件结束符的宏名*/



/* Character-classification helpers. Every macro may evaluate its argument more than once. */
#define is_end_of_input(ch)		('\0' == (ch))
#define is_lc_letter(ch)		((ch) >= 'a' && (ch) <= 'z')
#define is_uc_letter(ch)		((ch) >= 'A' && (ch) <= 'Z')
#define is_letter(ch)			(is_uc_letter(ch) || is_lc_letter(ch))
#define is_digit(ch)			((ch) >= '0' && (ch) <= '9')
#define is_letter_or_digit(ch)	(is_letter(ch) || is_digit(ch))
#define is_operator(ch)			('+' == (ch) || '-' == (ch) || '*' == (ch))
/* Layout: any character not above ' ' except the terminating NUL. */
#define is_layout(ch)			((ch) != '\0' && (ch) <= ' ')



Token_type Token;
char prog_file[4096] = { 0 };
int fp = -1;
char ch;
vector<Token_type> vec_Token;


// Abort scanning after a lexical error.
//
// Prints a short diagnostic to stderr so the failure is not silent, then
// terminates the process with exit status -2. Never returns.
void error()
{
	fprintf(stderr, "lexical error: unrecognized input\n");
	exit(-2);
}

// Step the cursor forward by one and load the character found there into ch.
void next_char(void) {
	fp += 1;
	ch = prog_file[fp];
}

// Fetch the next character, then keep fetching while the current one is layout
// (as classified by is_layout). Stops on the first non-layout character,
// which includes the terminating NUL.
void next_avail_char(void) {
	do {
		next_char();
	} while (is_layout(ch));
}


// 先获取代码
void initFile()
{
	char *p = prog_file;
	FILE *file = fopen("toy.toyL", "r");
	if (file == NULL)
	{
		perror("open file fail\n");
		exit(-1);
	}
	int fileLen = 0;
	fseek(file, 0, SEEK_END);
	fileLen = ftell(file);
	fseek(file, 0, SEEK_SET);
	for (int i = 0; i < fileLen -1; i++)
	{
		*p++ = fgetc(file);
	}
	ch = prog_file[0];
	printf("%s\n", prog_file);
	fclose(file);
}

void recognize_number(void) 
{
	char* digits = (char*)malloc(sizeof(char) * 10);
	int dsp = 0; 
	digits[dsp++] = ch;
	next_char();
	while (is_digit(ch)) {
		digits[dsp++] = ch; next_char();
	}
	digits[dsp] = '\0';
	Token.type = NUMB; 
	Token.seman = digits;
	fp--;
}

void recognize_name(void) 
{
	char *name = (char*)malloc(sizeof(char)*10);
	int np = 0; 
	name[np++] = ch;
	next_char();
	while (is_letter(ch) || is_digit(ch)) 
	{
		name[np++] = ch; 
		next_char();
	}
	name[np] = '\0';
	if (!strcmp(name, "begin"))
		Token.type = BEGIN;
	else if (!strcmp(name, "end"))
		Token.type = END;
	else if (!strcmp(name, "read"))
		Token.type = READ;
	else if (!strcmp(name, "write"))
		Token.type = WRITE;
	else
		Token.type = IDEN;
	Token.seman = name;
	fp--;
}

// Scan the next token from the input and append it to vec_Token.
//
// Layout is skipped first. A digit hands over to recognize_number, a letter
// to recognize_name; otherwise the character must be one of the fixed
// punctuation tokens or the terminating NUL (reported as an "EOF" token).
// Any other character is passed to error().
void next_token(void)
{
	next_avail_char();

	if (is_digit(ch))
	{
		recognize_number();
	}
	else if (is_letter(ch))
	{
		recognize_name();
	}
	else
	{
		const char *text = NULL;	// spelling of the matched fixed token, if any
		int kind = 0;				// its token type code

		switch (ch) {
		case '+':  text = "+";   kind = PLUS;  break;
		case '*':  text = "*";   kind = MULT;  break;
		case ':':
			// ':' is only accepted as the first half of ":=".
			next_char();
			if (ch != '=')
				error();
			text = ":=";
			kind = ASS;
			break;
		case ';':  text = ";";   kind = SEMI;  break;
		case '(':  text = "(";   kind = OPEN;  break;
		case ')':  text = ")";   kind = CLOSE; break;
		case '\0': text = "EOF"; kind = EOF;   break;
		default:
			error();
			break;
		}

		if (text != NULL)
		{
			char *copy = (char*)malloc(sizeof(char) * 10);
			strcpy(copy, text);
			Token.seman = copy;
			Token.type = kind;
		}
	}

	vec_Token.push_back(Token);
}




// Write the symbolic name of a token type code into a character buffer.
//
// str   - destination buffer; needs room for the name plus its terminating
//         NUL (the longest name written is "UNKNOWN", i.e. 8 bytes)
// label - token type code (BEGIN, NUMB, ..., EOF)
//
// Returns the number of characters written, not counting the NUL.
// A code that matches none of the token macros yields "UNKNOWN" instead of
// leaving str untouched and returning an indeterminate value.
int print_hong(char *str,int label)
{
	const char *name;

	switch (label)
	{
	case BEGIN:	name = "BEGIN";	break;
	case NUMB:	name = "NUMB";	break;
	case IDEN:	name = "IDEN";	break;
	case PLUS:	name = "PLUS";	break;
	case MULT:	name = "MULT";	break;
	case ASS:	name = "ASS";	break;
	case READ:	name = "READ";	break;
	case WRITE:	name = "WRITE";	break;
	case SEMI:	name = "SEMI";	break;
	case OPEN:	name = "OPEN";	break;
	case CLOSE:	name = "CLOSE";	break;
	case END:	name = "END";	break;
	case EOF:	name = "EOF";	break;
	default:	name = "UNKNOWN";	break;
	}

	strcpy(str, name);
	return (int)strlen(name);
}

// Program entry point: load the program, tokenize it, then print the tokens.
//
// Tokens are scanned until the current character is the terminating NUL.
// Each token is printed on its own line as
//     [index]<TAB> (TYPE ,"text")
// and its text buffer is freed right after printing. Returns 0.
int main()
{
	initFile();
	while (ch != '\0')
	{
		next_token();
	}
	for (size_t i = 0; i < vec_Token.size(); i++)
	{
		// Large enough for every name print_hong writes.
		char type_name[16] = { 0 };
		print_hong(type_name, vec_Token[i].type);
		// Print directly instead of assembling the line in a fixed-size
		// buffer, so long token text cannot overflow anything.
		printf("[%d]\t (%s ,\"%s\")\n", static_cast<int>(i + 1), type_name, vec_Token[i].seman);
		free(vec_Token[i].seman);
		vec_Token[i].seman = NULL;	// do not leave a dangling pointer in the container
	}
	return 0;
}