不知不觉，写了一个编译器（一）

程序员文章站 2022-07-12 12:31:35
...
首先，假如我们自己建立了一门语言，如何才能让它能够编译运行呢。
第一步，我们要识别语言中的字符串，他们可能是：关键字，字符，数字等。因此，我们要进行词法分析（LexicalAnalysis）。比如下列表格：我们要将它们归类，并告诉我们的编译器，这些是什么。
不知不觉，写了一个编译器（一）
我们在这里可以使用自动状态机的模型，当scan 一个字符时，每次进行判断是否构成关键字，或者数字，或者仅仅是一个字符串，或者是运算符。我们在代码里将他们进行分类，为后面的编译服务：
不知不觉，写了一个编译器（一）
然后，我们就可以动手写一个scanning的program了，假设我们的语言如下：
不知不觉，写了一个编译器（一）
进而，我们针对每一个字符判断即可。下面，根据代码来讲解：
import java.util.*;

public class Scanner {

	@SuppressWarnings("serial")
	public static class LexicalException extends Exception {

		int pos;

		public LexicalException(String message, int pos) {
			super(message);
			this.pos = pos;
		}

		public int getPos() {
			return pos;
		}

	}

	public static enum Kind {
		IDENTIFIER, INTEGER_LITERAL, BOOLEAN_LITERAL, STRING_LITERAL, 
		KW_x/* x */, KW_X/* X */, KW_y/* y */, KW_Y/* Y */, KW_r/* r */, 
		KW_R/* R */, KW_a/* a */, KW_A/* A */, KW_Z/* Z */, KW_DEF_X/* DEF_X */, 
		KW_DEF_Y/* DEF_Y */, KW_SCREEN/* SCREEN */, KW_cart_x/* cart_x */, KW_cart_y/* cart_y */,
		KW_polar_a/* polar_a */, KW_polar_r/* polar_r */, KW_abs/* abs */, KW_sin/* sin */,
		KW_cos/* cos */, KW_atan/* atan */, KW_log/* log */, KW_image/* image */, KW_int/* int */,
		KW_boolean/* boolean */, KW_url/* url */, KW_file/* file */, OP_ASSIGN/* = */, OP_GT/* > */,
		OP_LT/* < */, OP_EXCL/* ! */, OP_Q/* ? */, OP_COLON/* : */, OP_EQ/* == */, OP_NEQ/* != */, 
		OP_GE/* >= */, OP_LE/* <= */, OP_AND/* & */, OP_OR/* | */, OP_PLUS/* + */, OP_MINUS/* - */,
		OP_TIMES/* * */, OP_DIV/* / */, OP_MOD/* % */, OP_POWER/* ** */, OP_AT/* @ */,
		OP_RARROW/* -> */, OP_LARROW/* <- */, LPAREN/* ( */, RPAREN/* ) */, LSQUARE/* [ */, 
		RSQUARE/* ] */, SEMI/* ; */, COMMA/* , */, EOF;
	}
	HashMap<String,Kind> map_keywords=new HashMap<String,Kind>();//建立关键词的表
	HashMap<Character,Kind> map_op=new HashMap<Character,Kind>();//建立运算符的表


	public static enum State {
		START, IN_DIGIT, IN_IDENT, IN_STRING, IN_COMMENT;
	}//这个是我们的状态机的五个状态：起始，数字，变量，字母，评论

	public class Token {
		public final Kind kind;
		public final int pos;
		public final int length;
		public final int line;
		public final int pos_in_line;//每一个token(指对象)，有五个属性
		public Token(Kind kind, int pos, int length, int line, int pos_in_line) {
			super();
			this.kind = kind;
			this.pos = pos;
			this.length = length;
			this.line = line;
			this.pos_in_line = pos_in_line;
		}

		public String getText() {
			if (kind == Kind.STRING_LITERAL) {
				return chars2String(chars, pos, length);
			} else
				return String.copyValueOf(chars, pos, length);
		}

		/**
		 * To get the text of a StringLiteral, we need to remove the enclosing "
		 * characters and convert escaped characters to the represented
		 * character. For example the two characters \ t in the char array
		 * should be converted to a single tab character in the returned String
		 * 
		 * @param chars
		 * @param pos
		 * @param length
		 * @return
		 */
		private String chars2String(char[] chars, int pos, int length) {
			StringBuilder sb = new StringBuilder();
			for (int i = pos + 1; i < pos + length - 1; ++i) {// omit initial
																// and final "
				char ch = chars[i];
				if (ch == '\\') { // handle escape
					i++;
					ch = chars[i];
					switch (ch) {
					case 'b':
						sb.append('\b');
						break;
					case 't':
						sb.append('\t');
						break;
					case 'f':
						sb.append('\f');
						break;
					case 'r':
						sb.append('\r'); // for completeness, line termination
											// chars not allowed in String
											// literals
						break;
					case 'n':
						sb.append('\n'); // for completeness, line termination
											// chars not allowed in String
											// literals
						break;
					case '\"':
						sb.append('\"');
						break;
					case '\'':
						sb.append('\'');
						break;
					case '\\':
						sb.append('\\');
						break;
					default:
						assert false;
						break;
					}
				} else {
					sb.append(ch);
				}
			}
			return sb.toString();
		}

		/**
		 * precondition: 这是一个数字
		 * 
		 * 返回数字值
		 */
		public int intVal() {
			assert kind == Kind.INTEGER_LITERAL;
			return Integer.valueOf(String.copyValueOf(chars, pos, length));
		}

		public String toString() {
			return "[" + kind + "," + String.copyValueOf(chars, pos, length) + "," + pos + "," + length + "," + line
					+ "," + pos_in_line + "]";
		}

		/**
		 * Since we overrode equals, we need to override hashCode.
		 * https://docs.oracle.com/javase/8/docs/api/java/lang/Object.html#equals-java.lang.Object-
		 * 
		 * Both the equals and hashCode method were generated by eclipse
		 * 
		 */
		@Override
		public int hashCode() {
			final int prime = 31;
			int result = 1;
			result = prime * result + getOuterType().hashCode();
			result = prime * result + ((kind == null) ? 0 : kind.hashCode());
			result = prime * result + length;
			result = prime * result + line;
			result = prime * result + pos;
			result = prime * result + pos_in_line;
			return result;
		}

		/*
		 建立自己的equal函数，用于覆盖map的cha'zh
		 * 
		 */
		@Override
		public boolean equals(Object obj) {
			if (this == obj)
				return true;
			if (obj == null)
				return false;
			if (getClass() != obj.getClass())
				return false;
			Token other = (Token) obj;
			if (!getOuterType().equals(other.getOuterType()))
				return false;
			if (kind != other.kind)
				return false;
			if (length != other.length)
				return false;
			if (line != other.line)
				return false;
			if (pos != other.pos)
				return false;
			if (pos_in_line != other.pos_in_line)
				return false;
			return true;
		}

		/**
		 * used in equals to get the Scanner object this Token is associated
		 * with.
		 * 
		 * @return
		 */
		private Scanner getOuterType() {
			return Scanner.this;
		}

	}

	/**
	 * Extra character added to the end of the input characters to simplify the
	 * Scanner.
	 */
	static final char EOFchar = 0;

	/**
	 * The list of tokens created by the scan method.
	 */
	final ArrayList<Token> tokens;

	/**
	 * An array of characters representing the input. These are the characters
	 * from the input string plus and additional EOFchar at the end.
	 */
	final char[] chars;

	/**
	 * position of the next token to be returned by a call to nextToken
	 */
	private int nextTokenPos = 0;

	Scanner(String inputString) {
		int numChars = inputString.length();
		this.chars = Arrays.copyOf(inputString.toCharArray(), numChars + 1); // input
		chars[numChars] = EOFchar;
		tokens = new ArrayList<Token>();
	}

	/**
	 * Method to scan the input and create a list of Tokens.
	 * 
	 * If an error is encountered during scanning, throw a LexicalException.
	 * 
	 * @return
	 * @throws LexicalException
	 */
	
	public Scanner scan() throws LexicalException {
		/* 定义我们的关键词*/
		map_keywords.put("x",Kind.KW_x);
		map_keywords.put("X",Kind.KW_X);
		map_keywords.put("y",Kind.KW_y);
		map_keywords.put("Y",Kind.KW_Y);
		map_keywords.put("r",Kind.KW_r);
		map_keywords.put("R",Kind.KW_R);
		map_keywords.put("a",Kind.KW_a);
		map_keywords.put("A",Kind.KW_A);
		map_keywords.put("Z",Kind.KW_Z);
		map_keywords.put("DEF_X",Kind.KW_DEF_X);
		map_keywords.put("DEF_Y",Kind.KW_DEF_Y);
		map_keywords.put("SCREEN",Kind.KW_SCREEN);
		map_keywords.put("cart_x",Kind.KW_cart_x);
		map_keywords.put("cart_y",Kind.KW_cart_y);
		map_keywords.put("polar_a",Kind.KW_polar_a);
		map_keywords.put("polar_r",Kind.KW_polar_r);
		map_keywords.put("abs",Kind.KW_abs);
		map_keywords.put("sin",Kind.KW_sin);
		map_keywords.put("cos",Kind.KW_cos);
		map_keywords.put("atan",Kind.KW_atan);
		map_keywords.put("log",Kind.KW_log);
		map_keywords.put("image",Kind.KW_image);
		map_keywords.put("int",Kind.KW_int);
		map_keywords.put("boolean",Kind.KW_boolean);
		map_keywords.put("url",Kind.KW_url);
		map_keywords.put("file",Kind.KW_file);
		map_op.put('?', Kind.OP_Q);
		map_op.put(':', Kind.OP_COLON);
		map_op.put('&', Kind.OP_AND);
		map_op.put('|', Kind.OP_OR);
		map_op.put('+', Kind.OP_PLUS);
		map_op.put('@', Kind.OP_AT);
		map_op.put('%', Kind.OP_MOD);
		map_op.put(';', Kind.SEMI);
		map_op.put(',', Kind.COMMA);
		map_op.put('(', Kind.LPAREN);
		map_op.put(')', Kind.RPAREN);
		map_op.put('[', Kind.LSQUARE);
		map_op.put(']', Kind.RSQUARE);
	    int pos = 0;
	    int line = 1;
	    int sum=0;
	    String temp="";
	    State state = State.START;
	    int startPos = 1; 
	    int pos_temp=0;//自动状态机开始，在其中定义好每一个字符扫入后的状态转移路线。笨一点就用switch_case实现就可以了		
	    while (pos <= chars.length) {
	        char ch=EOFchar;
	        switch (state) {
	            case START: { //开始态
	            	ch = pos < chars.length ? chars[pos] : EOFchar;
	            	switch (ch) {
	                case '=': {
	                	if(chars[pos+1]=='='){
	                		tokens.add(new Token(Kind.OP_EQ,pos, 2,line,startPos));
	                		pos+=2;startPos+=2;
	                		}
	                	else{
	                		tokens.add(new Token(Kind.OP_ASSIGN,pos, 1,line,startPos));
	                		pos++;startPos++;
	                		}
	                	}break;
	                case '-': {
	                	if(chars[pos+1]=='>'){
	                		tokens.add(new Token(Kind.OP_RARROW,pos, 2,line,startPos));
	                		pos+=2;startPos+=2;
	                	}
	                	else{
	                		tokens.add(new Token(Kind.OP_MINUS,pos, 1,line,startPos));
	                		pos++;startPos++;
	                	}
	                }break;
	                case '>': {
	                	if(chars[pos+1]=='='){
	                		tokens.add(new Token(Kind.OP_GE,pos, 2,line,startPos));
	                		pos+=2;startPos+=2;
	                	}
	                	else{
	                		tokens.add(new Token(Kind.OP_GT,pos, 1,line,startPos));
	                		pos++;startPos++;
	                	}
	                }break;
	                case '!': {
	                	if(chars[pos+1]=='='){
	                		tokens.add(new Token(Kind.OP_NEQ,pos, 2,line,startPos));
	                		pos+=2;startPos+=2;
	                	}
	                	else{
	                		tokens.add(new Token(Kind.OP_EXCL,pos, 1,line,startPos));
	                		pos++;startPos++;
	                	}
	                }break;
	                case '<': {
	                	if(chars[pos+1]=='='){
	                		tokens.add(new Token(Kind.OP_LE,pos, 2,line,startPos));
	                		pos+=2;startPos+=2;
	                	}
	                	else if(chars[pos+1]=='-'){
	                		tokens.add(new Token(Kind.OP_LARROW,pos, 2,line,startPos));
	                		pos+=2;startPos+=2;
	                	}
	                	else{
	                		tokens.add(new Token(Kind.OP_LT,pos, 1,line,startPos));
	                		pos++;startPos++;
	                	}
	                }break;
	                case '*': {
	                	if(chars[pos+1]=='*'){
	                		tokens.add(new Token(Kind.OP_POWER,pos, 2,line,startPos));
	                		pos+=2;startPos+=2;
	                	}
	                	else{
	                		tokens.add(new Token(Kind.OP_TIMES,pos, 1,line,startPos));
	                		pos++;startPos++;
	                	}
	                }break;
	                case '/': {
	                	if(chars[pos+1]=='/'){
	                		state=State.IN_COMMENT;
	                		pos_temp=pos;pos+=2;startPos+=2;
	                	}
	                	else{
	                		tokens.add(new Token(Kind.OP_DIV,pos, 1,line,startPos));
	                		pos++;startPos++;
	                	}
	                }break;
	                case '\r':{
	                	if(chars[pos+1]=='\n'){
	                		line++;startPos=1;pos+=2;
	                	}
	                	else{
	                		line++;pos++;startPos=1;
	                	}
	                }break;
	                case '0': {tokens.add(new Token(Kind.INTEGER_LITERAL,pos, 1,line,startPos));pos++;startPos++;}break;
	                case '\n':{line++;pos++;startPos=1;}break;
	                case '\"':{state=State.IN_STRING;pos_temp=pos;pos++;}break;
	                case EOFchar:{pos++;}break;
	                default: {
	                	if(map_op.containsKey(ch)){tokens.add(new Token(map_op.get(ch), pos, 1,line,startPos));pos++;startPos++;} 
	                	else if (Character.isDigit(ch)) {sum=0;state = State.IN_DIGIT;pos_temp=pos;} 
	                    else if (Character.isJavaIdentifierStart(ch)) {
	                         state = State.IN_IDENT;pos_temp=pos;temp="";
	                    } 
	                    else if (Character.isWhitespace(ch)){
	                    	 pos++;startPos++;
	                    }
	                    else {throw new LexicalException("Illegal Character",pos); }
	                  }
	                }
	            	// switch (ch)
	            }  break;
	            case IN_DIGIT: {//数字态
	            	ch = pos < chars.length ? chars[pos] : EOFchar;
	            	if(Character.isDigit(ch)){temp+=ch;pos++;}
	            	else{
	            		for (int i = 0; i < temp.length(); i++) {   
	                        sum  = sum * 10 + (temp.charAt(i) - '0');
	                        if (sum < 0)  throw new LexicalException("Integeal too big!",pos_temp);
	                    }
	            		tokens.add(new Token(Kind.INTEGER_LITERAL, pos_temp, pos - pos_temp,line,startPos));
	            		startPos+=pos-pos_temp;
	                    pos_temp=0;
	                    temp="";
	            		state=State.START;
	            		
	            	}
	            	
	            }break;
	            case IN_IDENT: {//变量
	            	ch = pos < chars.length ? chars[pos] : EOFchar;	
	            	if (Character.isLetterOrDigit(ch) || ch=='$' || ch=='_') {
	            		temp+=ch;
	            		pos++;
	            		}
	            	else {
	            		
	            		if(map_keywords.containsKey(temp)){
	            			tokens.add(new Token(map_keywords.get(temp), pos_temp, pos - pos_temp,line,startPos));
	            		}
	            		else if(temp.equals("true")||temp.equals("false")){
	            			tokens.add(new Token(Kind.BOOLEAN_LITERAL, pos_temp, pos - pos_temp,line,startPos));
	            			
	            		}
	            		else{
	                    tokens.add(new Token(Kind.IDENTIFIER, pos_temp, pos - pos_temp,line,startPos));
	            		}
	                    state = State.START;
	                    startPos+=pos-pos_temp;
	                    temp="";
	                    pos_temp=0;
	                    }  
	            }  break;
	            case IN_STRING: {//字符串
	            	ch = pos < chars.length ? chars[pos] : EOFchar;
	            	//System.out.println(ch);
	            	if (ch=='\n'||ch=='\t'||ch=='\f'||ch=='\r'){
	            		throw new LexicalException("Illegal Character for String Literal",pos);	
	            	}
	            	else if(ch=='\\'){
	            		if(chars[pos+1]=='n'||chars[pos+1]=='t'||chars[pos+1]=='b'||chars[pos+1]=='f'||chars[pos+1]=='r'||chars[pos+1]=='\"'||chars[pos+1]=='\''||chars[pos+1]=='\\'){
	            			temp=temp+ch+chars[pos+1];pos+=2;
	            		}
	            		else{
	            			throw new LexicalException("Illegal Character for String Literal",pos+1);	
	            		}
	            	}
	            	else if (ch!=EOFchar&&ch!='\"') {
	            		temp+=ch;
	            		pos++;
	            		}
	            	else {
	            		if(ch!='\"') {throw new LexicalException("Missing \" in the end of String",pos);}
	            		else{
	                    tokens.add(new Token(Kind.STRING_LITERAL, pos_temp, pos - pos_temp+1,line,startPos));
	                    pos++;
	                    state = State.START;
	                    startPos+=pos-pos_temp;
	                    temp="";}
	                    }  
	            }  break;
	            case IN_COMMENT: {//评论
	            	ch = pos < chars.length ? chars[pos] : EOFchar;
	            	if (ch!='\n'&&ch!='\r'&&ch!=EOFchar) {
	            		pos++;
	            		}
	            	else {
	            		state=State.START;
	                    startPos+=pos-pos_temp;
	                    }
	            }  break;
	            default:  throw new LexicalException("ILLEGAL CHARACTER",pos); 
	        }// switch(state)
	    } // while
	    tokens.add(new Token(Kind.EOF, pos-2, 0,line,startPos));
	    return this;
	}

	/**
	 * Returns true if the internal interator has more Tokens
	 * 
	 * @return
	 */
	public boolean hasTokens() {
		return nextTokenPos < tokens.size();
	}

	/**
	 * Returns the next Token and updates the internal iterator so that the next
	 * call to nextToken will return the next token in the list.
	 * 
	 * It is the callers responsibility to ensure that there is another Token.
	 * 
	 * Precondition: hasTokens()
	 * 
	 * @return
	 */
	public Token nextToken() {
		return tokens.get(nextTokenPos++);
	}

	/**
	 * Returns the next Token, but does not update the internal iterator. This
	 * means that the next call to nextToken or peek will return the same Token
	 * as returned by this methods.
	 * 
	 * It is the callers responsibility to ensure that there is another Token.
	 * 
	 * Precondition: hasTokens()
	 * 
	 * @return next Token.
	 */
	public Token peek() {
		return tokens.get(nextTokenPos);
	}

	/**
	 * Resets the internal iterator so that the next call to peek or nextToken
	 * will return the first Token.
	 */
	public void reset() {
		nextTokenPos = 0;
	}

	/**
	 * Returns a String representation of the list of Tokens
	 */
	public String toString() {
		StringBuffer sb = new StringBuffer();
		sb.append("Tokens:\n");
		System.out.println(tokens.size());
		for (int i = 0; i < tokens.size(); i++) {
			sb.append(tokens.get(i)).append('\n');
		}
		return sb.toString();
	}

}
上一篇： shell脚本实战2
下一篇： Spring AMQP 整合RabbitMQ是如何消费消息的
不知不觉，写了一个编译器（一）

程序员写了一个新手都写不出的低级bug，被骂惨了。

学习较底层编程：动手写一个C语言编译器

默默简单的写了一个模板引擎

使用C#写了一个可以推算火车票身份证号码的小程序

想起以前写的一个爬虫，然后就用C#WinForm写了一个下载小说的软件，比较简单

自己写了一个php检测文件编码的函数

用css3+js写了一个钟表

Python 写了一个 PDF 转换器，以后再也不用花钱转了

python开发一个解析protobuf文件的简单编译器

NDK clang编译器的一个bug