c编译器的实现,利用flex实现字符串的识别,利用bison实现ast语法树的构建
程序员文章站
2024-01-02 14:43:40
无意中在github上发现一个很有意思的项目。它利用flex实现了字符串的识别,利用bison实现了ast语法树的构建,最后直接利用ast进行计算和识别。
比如,它的lex文件是...
无意中在github上发现一个很有意思的项目。它利用flex实现了字符串的识别,利用bison实现了ast语法树的构建,最后直接利用ast进行计算和识别。
比如,它的lex文件是这么写的,
/***********************************/ /* File: cmm.l version 2.0 */ /* Flex version for CMM */ /* CMM Interpreter Construction */ /***********************************/ %option noyywrap %{ #include "globals.h" #include "util.h" #include "scan.h" #include "cmm.tab.h" %} digit [0-9] int_num {digit}+ real_num {digit}+"."{digit}* letter [a-zA-Z] identifier {letter}+({letter}|{digit}|_)*({letter}+|{digit}+)|{letter}+ whitespace [ \t]* %% "if" { return IF;} "else" { return ELSE;} "while" { return WHILE;} "int" { yylval.dataType = Int; return INT;} "real" { yylval.dataType = Real; return REAL;} "bool" { yylval.dataType = Bool; return BOOL;} "read" { return READ;} "write" { return WRITE;} "main" { return MAIN;} {int_num} { yylval.intval = atoi(yytext); return INT_VALUE;} {real_num} { yylval.realval = atof(yytext); return REAL_VALUE;} {identifier} { yylval.idName = copyString(yytext); return ID;} {whitespace} {/*Do nothing.*/} "+" { return PLUS;} "-" { return SUB;} "*" { return MUL;} "/" { return DIV;} "<" { yylval.binaryOperator = 1; return REL_OP;} ">" { yylval.binaryOperator = 2; return REL_OP;} "<=" { yylval.binaryOperator = 3; return REL_OP;} ">=" { yylval.binaryOperator = 4; return REL_OP;} "==" { yylval.binaryOperator = 5; return REL_OP;} "<>" { yylval.binaryOperator = 6; return REL_OP;} "&&" { yylval.binaryOperator = 7; return REL_OP;} "||" { yylval.binaryOperator = 8; return REL_OP;} "=" { return ASSIGN;} "(" { return LPAREN;} ")" { return RPAREN;} ";" { return SEMI;} "{" { return LBRACE;} "}" { return RBRACE;} "[" { return LBRACKET;} "]" { return RBRACKET;} "," { return COMMA;} "//" { char c = input(); while(c != '\n') { if (c == EOF) break; c = input(); } lineno++; } "/*" { char c; int flag = 1; do { c = input(); entry1: if (c == EOF) break; if (c == '\n') lineno++; if (c == '*') { c = input(); if (c == '/') flag = 0; else goto entry1; } } while (flag); } \n {lineno++;} . {yyerror("Mystery character %s\n", yytext);return ERROR;} %% /* 用于语法分析时初始化词法分析接口 */ void iniLexer(void) { static int firstTime = TRUE; lineno = 0; if (firstTime) { firstTime = FALSE; lineno++; yyin = source; yyout = listing; } } /* 词法分析器专用 TokenType getToken(void) { static int firstTime = TRUE; TokenType currentToken; if (firstTime) { firstTime = FALSE; lineno++; yyin = source; yyout = listing; } currentToken = yylex(); strncpy(tokenString,yytext,MAXTOKENLEN); if (TraceScan) { fprintf(listing,"\t%d: ",lineno); printToken(currentToken); } return currentToken; }*/
它的yacc文件是这么写的,
/***********************************/ /* File: cmm.y */ /* Bison grammar file about CMM */ /* CMM Interpreter Construction */ /***********************************/ /*预计有1个移进/归约冲突*/ %expect 1 %{ #include "globals.h" #include "util.h" #include "scan.h" #include "parse.h" static TreeNode * savedTree; /* stores syntax tree for later return */ %} %union{ struct treeNode * node; int intval; double realval; char * idName; int binaryOperator; int dataType; } %token INT_VALUE %token REAL_VALUE %token ID %token INT REAL BOOL /* 优先级声明 */ %right ASSIGN %left PLUS SUB %left MUL DIV %nonassoc REL_OP %nonassoc UMINUS /* 声明文法中用到的tokens */ %token IF ELSE WHILE READ WRITE MAIN %token LPAREN RPAREN SEMI LBRACE RBRACE LBRACKET RBRACKET COMMA %token ASSIGN %token NEWLINE ERROR %type stmt_list stmt %type if_stmt decl_stmt compound_stmt while_stmt assign_stmt read_stmt write_stmt %type exp factor bin_exp %type type_spec %start program %% /* CMM文法 */ program : stmt_list { savedTree = $1;} ; stmt_list : { $$ = NULL; } | stmt_list stmt { TreeNode * t = $1; if (t != NULL) { while (t->sibling != NULL){ t = t->sibling;} t->sibling = $2; $$ = $1; } else $$ = $2; } ; stmt : if_stmt { $$ = $1; } | decl_stmt SEMI { $$ = $1; } | compound_stmt { $$ = $1; } | while_stmt { $$ = $1; } | assign_stmt SEMI { $$ = $1; } | read_stmt SEMI { $$ = $1; } | write_stmt SEMI { $$ = $1; } | error { $$ = NULL; } ; compound_stmt : LBRACE stmt_list RBRACE { $$ = newStmtNode(CompoundK); $$->child[0] = $2; $$->lineno = lineno; } ; decl_stmt : type_spec ID { $$ = newStmtNode(DeclK); $$->attr.name = $2; /* 数组长度为0代表非数组 */ $$->arrayLength = 0; $$->type = $1; $$->lineno = lineno; } | type_spec ID LBRACKET INT_VALUE RBRACKET { $$ = newStmtNode(DeclK); $$->attr.name = $2; $$->arrayLength = $4; $$->type = $1; if($$->type == Int) { int int_array_temp[$4]; $$->array.intArray = int_array_temp; } else if($$->type == Real) { double real_array_temp[$4]; $$->array.realArray = real_array_temp; } $$->lineno = lineno; } ; type_spec : INT { $$ = $1; } | REAL { $$ = $1; } | BOOL { $$ = $1; } ; if_stmt : IF LPAREN bin_exp RPAREN stmt { $$ = newStmtNode(IfK); $$->child[0] = $3; $$->child[1] = $5; $$->child[2] = NULL; $$->lineno = lineno; } | IF LPAREN bin_exp RPAREN stmt ELSE stmt { $$ = newStmtNode(IfK); $$->child[0] = $3; $$->child[1] = $5; $$->child[2] = $7; $$->lineno = lineno; } ; while_stmt : WHILE LPAREN bin_exp RPAREN stmt { $$ = newStmtNode(WhileK); $$->child[0] = $3; $$->child[1] = $5; $$->lineno = lineno; } ; assign_stmt : ID ASSIGN exp { $$ = newStmtNode(AssignK); $$->attr.name = $1; $$->child[0] = $3; $$->child[1] = NULL; $$->lineno = lineno; } | ID LBRACKET exp RBRACKET ASSIGN exp { $$ = newStmtNode(AssignK); $$->attr.name = $1; /* child[1]不为NULL 表示引用的变量是一个数组元素 */ /* child[0] 是数组元素的下标 */ $$->child[0] = $3; $$->child[1] = $6; $$->lineno = lineno; } ; read_stmt : READ LPAREN ID RPAREN { $$ = newStmtNode(ReadK); $$->attr.name = $3; $$->child[0] = NULL; $$->lineno = lineno; } | READ LPAREN ID LBRACKET exp RBRACKET RPAREN { $$ = newStmtNode(ReadK); $$->attr.name = $3; $$->child[0] = $5; $$->lineno = lineno; } ; write_stmt : WRITE LPAREN exp RPAREN { $$ = newStmtNode(WriteK); $$->child[0] = $3; $$->lineno = lineno; } ; exp : factor { $$ = $1; } | bin_exp { $$ = $1; } ; factor : INT_VALUE { $$ = newExpNode(IntValueK); $$->value.int_val = $1; $$->type = Int; $$->lineno = lineno; } | REAL_VALUE { $$ = newExpNode(RealValueK); $$->value.real_val = $1; $$->type = Real; $$->lineno = lineno; } | LPAREN exp RPAREN { $$ = $2; } | ID { $$ = newExpNode(IdK); $$->attr.name = $1; /* child[0]为NULL代表该变量不是数组,用作解释时 */ $$->child[0] = NULL; $$->lineno = lineno; } | ID LBRACKET exp RBRACKET { $$ = newExpNode(IdK); $$->attr.name = $1; $$->child[0] = $3; $$->lineno = lineno; } | error { $$ = NULL; } ; bin_exp : /* 关系运算符 */ exp REL_OP exp { $$ = newExpNode(OpK); $$->child[0] = $1; $$->child[1] = $3; $$->attr.op = $2; $$->lineno = lineno; } /* 算数运算符 */ | exp PLUS exp { $$ = newExpNode(OpK); $$->child[0] = $1; $$->child[1] = $3; $$->attr.op = PLUS; $$->lineno = lineno; } | exp SUB exp { $$ = newExpNode(OpK); $$->child[0] = $1; $$->child[1] = $3; $$->attr.op = SUB; $$->lineno = lineno; } | exp MUL exp { $$ = newExpNode(OpK); $$->child[0] = $1; $$->child[1] = $3; $$->attr.op = MUL; $$->lineno = lineno; } | exp DIV exp { $$ = newExpNode(OpK); $$->child[0] = $1; $$->child[1] = $3; $$->attr.op = DIV; $$->lineno = lineno; } | SUB exp %prec UMINUS { $$ = newExpNode(OpK); $$->child[0] = $2; $$->child[1] = NULL; $$->attr.op = UMINUS; $$->lineno = lineno; } ; %% int yyerror(char * message) { fprintf(listing,"Syntax error at line %d: %s\n",lineno,message); fprintf(listing,"Current token: %s",tokenString); printToken(yychar); Error = TRUE; return 0; } /* 与主函数交互的语法分析函数 */ TreeNode * parse(void) { iniLexer(); yyparse(); return savedTree; }
编译也非常简单,
#The makefile for interpreter based on Flex and Bison CC = gcc FLAG = -w interpreter: main.o util.o cmm.tab.o cmm.lex.o symtab.o analyze.o $(CC) $(FLAG) -o ../bin\&test/interpreter main.o util.o cmm.tab.o cmm.lex.o symtab.o analyze.o main.o: main.c parse.h analyze.h $(CC) $(FLAG) -c main.c cmm.tab.o:cmm.tab.c parse.h $(CC) $(FLAG) -c cmm.tab.c cmm.tab.c:cmm.y bison -d cmm.y cmm.lex.o:cmm.lex.c cmm.tab.h $(CC) $(FLAG) -c cmm.lex.c cmm.lex.c:cmm.l flex -o cmm.lex.c cmm.l symtab.o:symtab.c symtab.h globals.h $(CC) $(FLAG) -c symtab.c analyze.o:analyze.c globals.h symtab.h analyze.h $(CC) $(FLAG) -c analyze.c util.o:util.c $(CC) $(FLAG) -c util.c clean: -rm *.o
最后测试的时候,就是直接利用./interpreter [file_name]就可以了。
有兴趣的同学可以把这个项目下载下来,多看看,了解一下如果利用flex&bison快速实现一个脚本语言。