在上一篇文章中已经介绍了读文件的操作,那么这一篇文章中将会仔细解释词法分析。
在源文件中解析出的单词流必须识别为保留字,标识符,常量,操作符和界符五大类
1.显然我们需要列举出所有的保留字,而这里与保留字相似的那么就是标识符,在C语言中,保留字都是以小写字母开头,而且其中的字母只能是小写字母,而标识符的第一个字母则必须为字符(小写大写皆可)后面可以接大小写字母和字符 ‘_’, 在我写的这个编译器中,标识符不能超过100,在C语言中的标识符定义的长度大小远远大于此。
2.对于常量,这里需要注意的是整型和浮点型常量。
3.运算符按照的是下面的表:
C语言运算符表
运算符按照优先级大小由上向下排列,在同一行的运算符具有相同优先级。第二行是所有的一元运算符。
|
|
|
|
| () [] -> . | 括号(函数等),数组,两种结构成员访问 |
|
| ! ~ ++ -- + -
* & |
否定,按位否定,增量,减量,正负号,
间接,取地址 |
|
| * / % | 乘,除,取模 |
|
| + - | 加,减 |
|
| << >> | 左移,右移 |
|
| < <= >= > | 小于,小于等于,大于等于,大于 |
|
| == != | 等于,不等于 |
|
| & | 按位与 |
|
| ^ | 按位异或 |
|
| | | 按位或 |
|
| && | 逻辑与 |
|
| || | 逻辑或 |
|
| ? : | 条件 |
|
| = += -= *= /=
&= ^= |= <<= >>= |
各种赋值 |
|
| , | 逗号(顺序) |
|
4.界符:“;”“{}”,单引号,双引号
接下来我介绍的是对保留字的归类,为了查找方便,将保留字按照a-z的顺序排好,依据数组的下标定位,减少寻找的时间
/*
* keyword.h
*
* Created on: Jun 12, 2014
*
*/
#ifndef KEYWORD_H_
#define KEYWORD_H_
struct keyword{
char *keyName;
};
static struct keyword key__[]={
{"__int64"},
{"end"}
};
static struct keyword key_A[]={
{"auto"},
{"end"}
};
static struct keyword key_B[]={
{"break"},
{"end"}
};
static struct keyword key_C[]={
{"case"},
{"char"},
{"const"},
{"continue"},
{"end"}
};
static struct keyword key_D[]={
{"default"},
{"do"},
{"double"},
{"end"}
};
static struct keyword key_E[]={
{"else"},
{"enum"},
{"extern"},
{"end"}
};
static struct keyword key_F[]={
{"float"},
{"for"},
{"end"}
};
static struct keyword key_G[]={
{"goto"},
{"end"}
};
static struct keyword key_H[]={
{"end"}
};
static struct keyword key_I[]={
{"if"},
{"int"},
{"end"}
};
static struct keyword key_J[]={
{"end"}
};
static struct keyword key_K[]={
{"end"}
};
static struct keyword key_L[]={
{"long"},
{"end"}
};
static struct keyword key_M[]={
{"end"}
};
static struct keyword key_N[]={
{"end"}
};
static struct keyword key_O[]={
{"end"}
};
static struct keyword key_P[]={
{"end"}
};
static struct keyword key_Q[]={
{"end"}
};
static struct keyword key_R[]={
{"register"},
{"return"},
{"end"}
};
static struct keyword key_S[]={
{"short"},
{"signed"},
{"sizeof"},
{"static"},
{"struct"},
{"switch"},
{"end"}
};
static struct keyword key_T[]={
{"typedef"},
{"end"}
};
static struct keyword key_U[]={
{"union"},
{"unsigned"},
{"end"}
};
static struct keyword key_V[]={
{"void"},
{"volatile"},
{"end"}
};
static struct keyword key_W[]={
{"while"},
{"end"}
};
static struct keyword key_X[]={
{"end"}
};
static struct keyword key_Y[]={
{"end"}
};
static struct keyword key_Z[]={
{"end"}
};
// size is 27
static struct keyword *keywords[]={
key__,key_A,key_B,key_C,key_D,key_E,
key_F,key_G,key_H,key_I,key_J,key_K,
key_L,key_M,key_N,key_O,key_P,key_Q,
key_R,key_S,key_T,key_U,key_V,key_W,
key_X,key_Y,key_Z
};
#endif /* KEYWORD_H_ */
下面是词法分析的源码;
/* * lex.h * * Created on: Jun 13, 2014 * */ #include "input.h" #include "keyword.h" #define isDigit(c) (c>='0' && c<='9') #define isUpperLetter(c) (c>='A' && c <='Z') #define isLowerLetter(c) (c>='a' && c<='z') #define isLetter(c) (isUpperLetter || isLowerLetter)
/*
* lex.c
*
* Created on: Jun 13, 2014
*
*/
#include "zcc.h"
#include "lex.h"
#define curr source.cursor
int getToken() {
char a[100];
int a_length, i, flag;
/*
*skip ' ','\n' and '\b'
*/
while (*curr == ' ' || *curr == 10 || *curr == 9) {
curr++;
if (*curr == END_OF_FILE) {
return -1;
}
}
/* name or keyword on first is a-z */
a_length=0;
if (*curr >= 'a' && *curr <= 'z') {
IDAndKey:
a_length = 0;
do {
a[a_length++] = *curr++;
} while ( isDigit(*curr) || isUpperLetter(*curr) || isLowerLetter(*curr)
|| *curr == '_');
a[a_length] = '\0';
i = 0;
flag = 0;
if (*a - 'a' <= 26 && *a - 'a' >= 0) {
while (strcmp(keywords[*a - 'a' + 1][i].keyName, "end") != 0) {
if (strcmp(keywords[*a - 'a' + 1][i].keyName, a) == 0) {
flag = 1;
break;
}
i++;
}
if (flag == 1) {
printf("keyword is %s\n", a);
return 1;
} else {
printf("Identify is %s\n", a);
return 1;
}
} else {
printf("Identify is %s\n", a);
return 1;
}
} else if (isUpperLetter(*curr)) {
goto IDAndKey;
} else if (isDigit(*curr)) {
a_length = 0;
do {
a[a_length++] = *curr++;
} while (isDigit(*curr));
//float number
if (*curr == '.') {
do {
a[a_length++] = *curr++;
} while (isDigit(*curr));
a[a_length] = '\0';
printf("float number is %s\n", a);
return 1;
} else {
// number
a[a_length] = '\0';
printf("number is %s\n", a);
return 1;
}
/*
* Operator begin
* */
} else if (*curr == '<') {
a[a_length++] = *curr++;
if (*curr == '<') {
a[a_length++] = *curr++;
lastOperatorDeal:
a[a_length] = '\0';
printf("Operator is %s\n", a);
return 1;
} else if (*curr == '=') {
a[a_length++] = *curr++;
goto lastOperatorDeal;
} else {
goto lastOperatorDeal;
}
} else if (*curr == '>') {
a[a_length++] = *curr++;
if (*curr == '>') {
a[a_length++] = *curr++;
goto lastOperatorDeal;
} else if (*curr == '=') {
a[a_length++] = *curr++;
goto lastOperatorDeal;
} else {
goto lastOperatorDeal;
}
} else if (*curr == '=') {
a[a_length++] = *curr++;
if (*curr == '=') {
a[a_length++] = *curr++;
goto lastOperatorDeal;
} else {
goto lastOperatorDeal;
}
} else if (*curr == '(') {
singleOperator:
a[a_length++] = *curr++;
goto lastOperatorDeal;
} else if (*curr == ')') {
goto singleOperator;
} else if (*curr == '[') {
goto singleOperator;
} else if (*curr == ']') {
goto singleOperator;
} else if (*curr == '-') {
a[a_length++] = *curr++;
if (*curr == '>') {
a[a_length++] = *curr++;
goto lastOperatorDeal;
} else if (*curr == '-') {
a[a_length++] = *curr++;
goto lastOperatorDeal;
} else if (*curr == '=') {
a[a_length++] = *curr++;
goto lastOperatorDeal;
} else {
goto lastOperatorDeal;
}
}else if(*curr=='.'){
goto singleOperator;
}else if(*curr=='!'){
a[a_length++]=*curr++;
if(*curr=='='){
goto singleOperator;
}else{
goto lastOperatorDeal;
}
}else if(*curr=='~'){
goto singleOperator;
}else if(*curr=='+'){
a[a_length++]=*curr++;
if(*curr=='+'){
goto singleOperator;
}else if(*curr=='='){
goto singleOperator;
}else {
goto lastOperatorDeal;
}
}else if(*curr=='-'){
a[a_length++]=*curr++;
if(*curr=='-'){
goto singleOperator;
}else if(*curr=='='){
goto singleOperator;
}else {
goto lastOperatorDeal;
}
}else if(*curr=='*'){
a[a_length++]=*curr++;
if(*curr=='='){
goto singleOperator;
}else{
goto lastOperatorDeal;
}
}else if(*curr=='&'){
a[a_length++]=*curr++;
if(*curr=='&'){
goto singleOperator;
}else if(*curr=='='){
goto singleOperator;
}else{
goto lastOperatorDeal;
}
}else if(*curr=='/'){
a[a_length++]=*curr++;
if(*curr=='='){
goto singleOperator;
}if(*curr=='/'){
// skip line
while(*curr!='\n'){
if(*curr==END_OF_FILE)
return -1;
curr++;
}
}else if(*curr=='*'){
curr++;
// skip "/**/"
while(*curr!=END_OF_FILE)
{
if(*curr=='*' && *(curr+1)=='/'){
curr+=2;
break;
}
curr++;
}
}else{
goto lastOperatorDeal;
}
}else if(*curr=='%'){
a[a_length++]=*curr++;
if(*curr=='d'){
goto singleOperator;
}else if(*curr=='c'){
goto singleOperator;
}else if(*curr=='f'){
goto singleOperator;
}else if(*curr=='l'){
a[a_length++]=*curr++;
if(*curr=='d')
goto singleOperator;
else if(*curr=='f')
goto singleOperator;
else
goto singleOperator;
}
}else if(*curr=='^'){
a[a_length++]=*curr++;
if(*curr=='='){
goto singleOperator;
}else{
goto lastOperatorDeal;
}
}else if(*curr=='|'){
a[a_length++]=*curr++;
if(*curr=='|'){
goto singleOperator;
}else if(*curr=='='){
goto singleOperator;
}else{
goto lastOperatorDeal;
}
}else if(*curr=='?'){
goto singleOperator;
}else if(*curr==':'){
goto singleOperator;
}else if(*curr==','){
goto singleOperator;
}else if(*curr=='\\'){
a[a_length++]=*curr++;
if(*curr=='n'){
goto singleOperator;
}else {
goto lastOperatorDeal;
}
}
/*
* Operator end
* */
/*
* delimiter begin
* */
else if(*curr=='{'){
singleDelimiter:
a[a_length++]=*curr++;
a[a_length]='\0';
printf("Delimiter is %s\n", a);
return 1;
}else if(*curr=='}'){
goto singleDelimiter;
}else if(*curr==';'){
goto singleDelimiter;
}else if(*curr=='\''){
goto singleDelimiter;
}else if(*curr=='\"'){
goto singleDelimiter;
}
}
这里实现了将单词分成五类流,并将单词打印出来,在后面的语法分析中将会使用到这里的单词流结果。
忘了说了,我将自己写的编译器命名为:ZCC,头文件都包含在zcc.h中(*^__^*) 嘻嘻……,想写个类似与gcc 一样神奇的玩意。
最后看测试文档:
struct Student{
int a;
char* name;
}
int main()
{
int a=123;
float a2=1.2345677;
int b=1+3;
for(int i=0; i < 100; i++)
a+=i;
printf("%d\n", a);
return 0;
}
测试结果:
keyword is struct
Identify is Student
Delimiter is {
keyword is int
Identify is a
Delimiter is ;
keyword is char
Operator is *
Identify is name
Delimiter is ;
Delimiter is }
keyword is int
Identify is main
Operator is (
Operator is )
Delimiter is {
keyword is int
Identify is a
Operator is =
number is 123
Delimiter is ;
keyword is float
Identify is a2
Operator is =
float number is 1.2345677
Delimiter is ;
keyword is int
Identify is b
Operator is =
number is 1
Operator is +
number is 3
Delimiter is ;
keyword is for
Operator is (
keyword is int
Identify is i
Operator is =
number is 0
Delimiter is ;
Identify is i
Operator is <
number is 100
Delimiter is ;
Identify is i
Operator is ++
Operator is )
Identify is a
Operator is +=
Identify is i
Delimiter is ;
Identify is printf
Operator is (
Delimiter is "
Operator is %d
Operator is \n
Delimiter is "
Operator is ,
Identify is a
Operator is )
Delimiter is ;
keyword is return
number is 0
Delimiter is ;
Delimiter is }
做到这里,可以告一小段落了,接下来做的事情就是语法分析。
原文地址:http://blog.csdn.net/q745401990/article/details/30748269