编译原理(三、词法分析)
程序员文章站
2024-03-03 20:27:04
...
编译程序是在单词的级别上来分析和翻译源程序的。词法分析的任务是:从左至右逐个字符串地对源程序进行扫描,产生一个个的单词符号,把作为字符串的源程序改造成为单词符号串的中间程序。
词法分析器的功能是输入源程序,输出单词符号。单词符号是一个程序语言的基本语法符号。程序语言的单词符号一般可分为下列五种。
(1)关键字是由程序语言定义的具有固定意义的标识符。有时称这些标识符为保留字或基本字。
(2)标识符用来表示各种名字,如变量名、数组名、过程名等等。
(3)常数常数的类型一般有整形、实型、布尔型、文字型等等。
(4)运算符如+、-、*、/等。
(5)界符如逗号、分号、括号等。
此法分析器输出的单词符号通常表示为下列的二元式:
(单词种别,单词符号的属性值)
要进行词法分析,首先要对源文件进行预处理,即消除源文件中的空格、回车符、换行符等。
过程为:源程序→预处理→词法分析
对源文件进行预处理,输出文本文件,作为词法分析的输入:
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.RandomAccessFile;
/**
* 功能:对源程序进行预处理<br>
*
* 输入:文本txt格式源程序。<br>
* 输出:去掉所有注释(单行注释、多行注释)、空格、回车、换行符后的文本文件。<br>
* 参数:fileName,源文件绝对路径<br>
* targetFileName,输出文件绝对路径<br>
* 作者:zjl<br>
* 时间:2018年3月19日17:31:01<br>
*/
public class SourceDispose {
public static void pretreatment(String fileName, String targetFileName) throws IOException {
int dquo = 34; //双引号
int squo = 39; //逗号
int[] annoleft = { 47, 42 };//多行注释左/*
int[] annoright = { 42, 47 }; //多行注释右*/
int[] doublefslash = { 47, 47 }; //单行双斜线注释
int[] enterline = { 13, 10 }; //13回车,10换行
File sourceFile = new File(fileName);
if (!sourceFile.exists()) {
System.out.println("文件不存在");
return;
}
FileInputStream fis = new FileInputStream(sourceFile);
File targetFile = new File(targetFileName);
RandomAccessFile fos = new RandomAccessFile(targetFile, "rw");
int pre = 0;
int current = 0;
int flag = -1;
/*
* flag:
* -1: default;
* 0:双斜线
* 1:多行注释
* 3: in single quotation;
* 4: in double quotation;
* */
while((current = fis.read())!=-1) {//未到达文件结尾,循环。
switch(flag){
case 0:{
//检索到单行注释后,略过内容,直至回车换行,转至default
if ((pre == enterline[0] && current == enterline[1])) {flag = -1;}
}
break;
case 1:{
//检索到多行注释左部,略过内容,直至检索到多行注释右部分,转至default
if ((pre == annoright[0] && current == annoright[1])) {flag = -1;}
}
break;
case 3:{
if (current == squo) {flag = -1;}
if (current!=9 && current!=10 && current!=13) {
fos.write((char)current);
}
}
break;
case 4:{
if (current == dquo) {flag = -1;}
if (current!=9 && current!=10 && current!=13) {
fos.write((char)current);
}
}
break;
default:{
//9制表符 10换行 32空格 13回车
if (current!=9 && current!=10 && current!=32 && current!=13) {
fos.write((char)current);
}
//单行双斜线注释
if (pre == doublefslash[0] && current == doublefslash[1]) {
flag = 0;
fos.seek(fos.getFilePointer()-2); //文件指针重定位
/*seek();
* Sets the file-pointer offset, measured from the beginning of this file,
* at which the next read or write occurs. The offset may be set beyond
* the end of the file. Setting the offset beyond the end of the file
* does not change the file length. The file length will change only by
* writing after the offset has been set beyond the end of the file.
*/
}else if (pre == annoleft[0] && current == annoleft[1]) {
//检索到单行注释左部分/*
flag = 1;
fos.seek(fos.getFilePointer()-2);
}else if (current == squo) {
flag = 3;
}else if (current == dquo) {
flag = 4;
}
}
break;
}
pre = current;
}
fis.close();
fos.close();
System.out.println("源文件预处理成功");
}
}
import java.io.IOException;
public class SourceDisposeTest {
public static void main(String[] args){
try {
/*SourceDispose.pretreatment("C:\\Users\\Administrator\\Desktop\\MyString.txt",
"C:\\Users\\Administrator\\Desktop\\output.txt");
*/
SourceDispose.pretreatment("C:\\Users\\Administrator\\Desktop\\BinarySearch.txt",
"C:\\Users\\Administrator\\Desktop\\BinarySearchOutput.txt");
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
LexicalAnalyzer.LexicalAnalyze("C:\\Users\\Administrator\\Desktop\\BinarySearchOutput.txt",
"C:\\Users\\Administrator\\Desktop\\WordsOutput.txt");
}
}
源文件预处理,效果。
词法分析器,预处理的输出文件作为词法分析的输入:
package com;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.RandomAccessFile;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
public class WordAnalysis {
public static int length = 0;
static WordsType sym = WordsType.SYM_IDENTIFIER;// 识别出来的字符串的类型
static String id = ""; // 识别出来的字符串
static Integer countID = 1; // 记录标识符
static Integer countNUM = 1;// 记录数组
static Map<String, Integer> comMap = new HashMap<String, Integer>();// 普通类型的列表
static Map<String, Integer> idMap = new HashMap<String, Integer>();// 标识符
static Map<String, Integer> numMap = new HashMap<String, Integer>();// 数字
public enum WordsType {
SYM_IDENTIFIER, // 标识符
SYM_NUMBER, // 常数
SYM_PLUS, // +
SYM_MINUS, // -
SYM_TIMES, // *
SYM_SLASH, // /
SYM_ODD, // odd
SYM_EQU, // =
SYM_NEQ, // <>
SYM_LES, // <
SYM_LEQ, // <=
SYM_GTR, // >
SYM_GEQ, // >=
SYM_LPAREN, // (
SYM_RPAREN, // )
SYM_COMMA, // ,
SYM_SEMICOLON, // ;
SYM_PERIOD, // .
SYM_BECOMES, // :=
SYM_BEGIN, // begin
SYM_END, // end
SYM_IF, // if
SYM_THEN, // then
SYM_WHILE, // while
SYM_DO, // do
SYM_CONST, // const
SYM_VAR, // var
SYM_CALL, // call
SYM_PROCEDURE; // procedure
public String toString(WordsType wt) {
String word = "";
switch (wt) {
case SYM_IDENTIFIER:
word = "id";
break;
case SYM_PLUS:
word = "+";
break;
case SYM_TIMES:
word = "*";
break;
case SYM_MINUS:
word = "-";
break;
case SYM_SLASH:
word = "/";
break;
case SYM_LPAREN:
word = "(";
break;
case SYM_RPAREN:
word = ")";
break;
case SYM_NUMBER:
word = "num";
break;
}
return word;
}
}
// len为开始位置
public static WordsType analysis(char[] temp, int len) {
String sys = "";
length = len;
while (temp[length] == '\t' || temp[length] == ' ')
length++; // 过滤制表符,空格
// 字母序列
if (length < temp.length && isLetter(temp[length])) {
while (length < temp.length && temp[length] != ' '
&& isLetter(temp[length])) {
sys += temp[length];
length++;
}
id = sys;
WordsType wt = reserve(id);
if (length == temp.length) {
comMap.put(id, null);
}
if (length < temp.length && !isLetter(temp[length])) {
if (wt == WordsType.SYM_IDENTIFIER) {
if (!idMap.containsKey(id)) {
idMap.put(id, countID);// 标识符列表。如果标识符已经存在则不再添加
countID++;
}
} else
comMap.put(id, null);
}
return wt;
}
// 数字序列
if (length < temp.length && isDigit(temp[length])) {
while (length < temp.length && temp[length] != ' '
&& isDigit(temp[length])) {
sys += temp[length];
length++;
}
if (length < temp.length && !isDigit(temp[length])) {
id = sys;
sym = WordsType.SYM_NUMBER;
if (!numMap.containsKey(id)) {
numMap.put(id, countNUM); // 数字列表
countNUM++;
}
return sym;
}
}
// 运算符
if (length < temp.length)
return caculator(temp);
return null;
}
// 判断基本运算
public static WordsType caculator(char[] temp) {
sym = null;
int len = length;
length++;
id = temp[len] + "";
switch (temp[len]) {
case '+':
sym = WordsType.SYM_PLUS;
break;
case '-':
sym = WordsType.SYM_MINUS;
break;
case '*':
sym = WordsType.SYM_TIMES;
break;
case '/':
sym = WordsType.SYM_SLASH;
break;
case '(':
sym = WordsType.SYM_LPAREN;
break;
case ')':
sym = WordsType.SYM_RPAREN;
break;
case ',':
sym = WordsType.SYM_COMMA;
break;
case ';':
sym = WordsType.SYM_SEMICOLON;
break;
case '.':
sym = WordsType.SYM_PERIOD;
break;
case '=':
sym = WordsType.SYM_BECOMES;
break;
}
if (temp[len] == '>' && (len + 1 < temp.length) && temp[len + 1] == '=') {
length++;
id += temp[len + 1];
sym = WordsType.SYM_GEQ;
} else if (temp[len] == '>' && (len + 1 < temp.length)
&& temp[len + 1] != '=') {
sym = WordsType.SYM_GTR;
} else if (temp[len] == '<' && (len + 1 < temp.length)
&& temp[len + 1] != '=') {
sym = WordsType.SYM_LES;
} else if (temp[len] == '<' && (len + 1 < temp.length)
&& temp[len + 1] == '=') {
length++;
id += temp[len + 1];
sym = WordsType.SYM_LEQ;
}
return sym;
}
// 判断单个字符是否为字母
public static boolean isLetter(char a) {
if ((a >= 'a' && a <= 'z') || (a >= 'A' && a <= 'Z'))
return true;
return false;
}
// 判断单个字符是否为数字
public static boolean isDigit(char a) {
if (a >= '0' && a <= '9')
return true;
return false;
}
// 保留字
public static WordsType reserve(String str) {
if ("begin".equals(str))
return WordsType.SYM_BEGIN;
else if ("end".equals(str))
return WordsType.SYM_END;
else if ("const".equals(str))
return WordsType.SYM_CONST;
else if ("if".equals(str))
return WordsType.SYM_IF;
else if ("then".equals(str))
return WordsType.SYM_THEN;
else if ("while".equals(str))
return WordsType.SYM_WHILE;
else if ("do".equals(str))
return WordsType.SYM_DO;
else if ("var".equals(str))
return WordsType.SYM_VAR;
else if ("procedure".equals(str))
return WordsType.SYM_PROCEDURE;// 保留字
else
return WordsType.SYM_IDENTIFIER;// 标识符
}
public static List<String> getInputStream(String filename)
throws IOException {
List<String> list = new ArrayList<String>();
FileInputStream fis = new FileInputStream(filename);
BufferedReader br = new BufferedReader(new InputStreamReader(fis));
String buffer = "";
while ((buffer = br.readLine()) != null) {
list.add(buffer);
}
return list;
}
public static void output(String str) {
length = 0;// 初始化
char[] chs = str.toCharArray();
while (length < str.length()) {
WordsType wt = analysis(chs, length);
if (wt == WordsType.SYM_NUMBER)
//System.out.println(wt + ":" + id + ",\t\t\t位置:" + numMap.get(id));
System.out.println("位置:"+numMap.get(id)+"\t\t\t"+wt+":"+id);
else if (wt == WordsType.SYM_IDENTIFIER)
//System.out.println(wt + ":" + id + ",\t\t\t位置:" + idMap.get(id));
System.out.println("位置:"+idMap.get(id)+"\t\t\t"+wt+":"+id);
else
//System.out.println(wt + ":" + id + ",\t\t\t位置:" + comMap.get(id));
System.out.println("位置:"+comMap.get(id)+"\t\t\t"+wt+":"+id);
}
}
public static void main(String[] args) throws IOException {
List<String> list = getInputStream("C:\\Users\\Administrator\\Desktop\\BinarySearchOutput.txt");
for (int i = 0; i < list.size(); i++) {
output(list.get(i));
System.out.println("******end*******");
}
}
}
词法分析效果:
其他章节:
一、引论
二、高级语言及其语法描述
语法分析之消除左递归、消除回溯
三、词法分析
四、语法分析
算符优先分析 如有错误之处,欢迎指正。