动手写个词法分析器

词法分析是编译的第一步，是将字节流转换为 Token 流（标记流）的过程。

这里实现的是一个 C 语言的词法分析，词法规则采用的是 ANSI C grammar, Lex specification

这里的实现主要参考的是 https://github.com/cesanta/mjs/blob/master/mjs/src/mjs_tok.c

mjs 是一个适用于单片机的 JavaScript 解释器，小巧，低资源占用。

定义结构体！

Token

除了单字符操作符以外，其余所有的记号，无论是多字节操作符，还是关键字，都将通过枚举类型进行定义。

💡

单字符操作符 ";" "{" "}" "," ":" "=" "(" ")" "[" "]" "." "&" "!" "~" "-" "+" "*" "/" "%" "<" ">" "^" "|" "?"

关键字也是标识符，但也不于标识符，是被保留的标识符。这里单独区分出来。


enum {
    TOK_EOF,  // 读到了尽头
    TOK_INVALID,  // 出现了词法错误

    TOK_RIGHT_ASSIGN,
    TOK_LEFT_ASSIGN,
    TOK_ADD_ASSIGN,
    TOK_SUB_ASSIGN,
    TOK_MUL_ASSIGN,
    TOK_DIV_ASSIGN,
    TOK_MOD_ASSIGN,
    TOK_AND_ASSIGN,
    TOK_XOR_ASSIGN,
    TOK_OR_ASSIGN,
    TOK_RIGHT_OP,
    TOK_LEFT_OP,
    TOK_INC_OP,
    TOK_DEC_OP,
    TOK_PTR_OP,
    TOK_AND_OP,
    TOK_OR_OP,
    TOK_LE_OP,
    TOK_GE_OP,
    TOK_EQ_OP,
    TOK_NE_OP,
    TOK_CONSTANT,
    TOK_STRING_LITERAL,
    TOK_ELLIPSIS,

    TOK_AUTO = 200, // 为了避免和'+' , '-' 等 ascii 码片冲突
    // 此外 200 以上都属于标识符 (IDENTIFIER)
    TOK_BREAK,
    TOK_CASE,
    TOK_CHAR,
    TOK_CONST,
    TOK_CONTINUE,
    TOK_DEFAULT,
    TOK_DO,
    TOK_DOUBLE,
    TOK_ELSE,
    TOK_ENUM,
    TOK_EXTERN,
    TOK_FLOAT,
    TOK_FOR,
    TOK_GOTO,
    TOK_IF,
    TOK_INT,
    TOK_LONG,
    TOK_REGISTER,
    TOK_RETURN,
    TOK_SHORT,
    TOK_SIGNED,
    TOK_SIZEOF,
    TOK_STATIC,
    TOK_STRUCT,
    TOK_SWITCH,
    TOK_TYPEDEF,
    TOK_UNION,
    TOK_UNSIGNED,
    TOK_VOID,
    TOK_VOLATILE,
    TOK_WHILE,
    TOK_IDENTIFIER
};

下面定义一个 Token 流元素的结构。

tok 对于单字符操作符，这里存储的就是字符本身 ( ASCII 码 ) ，其他的存储的是枚举类型的数值。

len 该标记的长度比如对于 struct 长度就是 5 , == 长度就是 2 , "asdf" 长度就是 6 。

ptr 该标记的在源码字符串中的起始位置。

line_num 该标记所在的行号。


typedef struct token_s {
    int tok; 
    int len;
    char *ptr;
    int line_num; // 行号
} Tok;

Lexer

该结构体存储了词法分析器的状态

cur 指针指向当前正在处理的字符（相对于源码）

tok 表示正在解析的标记

line_num 毫无疑问的是行号


struct lexer_s {
    char *cur;
    Tok tok;
    int line_num;
};

初始化一个词法分析器需要传入待分析的代码作为输入。


Lexer lex_init(const char *code) {
    struct lexer_s *l = calloc(sizeof(struct lexer_s), 1);
    l->cur = code;
    return l;
}

主要结构

定义next方法每次产生一个新的 Tok 并将其返回，词法分析的过程就是不断调用 next 方法直到其返回的 Tok 的种类是 TOK_EOF 。


main(){
// ...
	while ((tok = lex_next(l))->tok != TOK_EOF) {
	   // do something with tok
	}
// ...
}

next 方法终于工作是，通过当前字符以及当前字符的下一个字符，或者当前字符的后几个字符，来确定是哪种类型的标记，并将其返回。所以整体结构是许多 IF-ELSE 的堆叠。但需要注意的是，像空行、空格、注释等都是无法产生合法标记流的字符，需要将他们跳过。


static Tok *next(struct lexer_s *l) {
    int tok = TOK_INVALID;

    skip_spaces_and_comments(l);
    l->tok.ptr = l->cur;
    l->tok.line_num = l->line_num;

    if (*l->cur == '\0') {
        tok = TOK_EOF;
    } else if (is_digit(*l->cur)) {
        tok = get_num(l);
		} else if ...

    if (l->cur != '\0') l->cur++;
    l->tok.tok = tok;
    return &l->tok;

}

skip_spaces_and_comments 跳过无效字符

无效字符有三种

空白字符（空格、制表符、换行符等）正则: [ \t\v\n\f]

单行注释以 // 开头，直到行末

多行注释以 /* 开头直到 */


static bool is_space(int c) {
    return c == ' ' || c == '\r' || c == '\n' || c == '\t' || c == '\f' ||
           c == '\v';
}
static void skip_spaces_and_comments(struct lexer_s *l) {
    const char *pos;
    do {
        pos = l->cur;
        // 处理空白字符
        while (is_space(l->cur[0])) {
            if (l->cur[0] == '\n') l->line_num++;
            l->cur++;
        }

        // 处理单行注释
        if (l->cur[0] == '/' && l->cur[1] == '/') {
            while (l->cur[0] != '\0' && l->cur[0] != '\n') l->cur++;
        }

        // 处理多行注释
        if (l->cur[0] == '/' && l->cur[1] == '*') {
            l->cur += 2;
            while (l->cur[0] != '\0') {
                if (l->cur[0] == '\n') l->line_num++;
                if (l->cur[0] == '*' && l->cur[1] == '/') {
                    l->cur += 2;
                    break;
                }
                l->cur++;
            }
        }
    } while (pos < l->cur);
}

is_alpha & get_identifier & is_keyword 处理标识符和关键字

再定义两个简洁直观的辅助函数 is_alpha 和 is_digit 用于判断当前字符是否为字母和数字。


static bool is_alpha(int c) {
    return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
}

static bool is_digit(int c) {
    return c >= '0' && c <= '9';
}

想一想 C 语言中的标识符的规则是什么，也就是变量的合法名称，以字母或下划线开头，后面可以跟随下划线、字母和数字，用正则表达式表示就是 [a-zA-Z_]([[0-9a-zA-Z_]])*

在 next 函数中添加这么一条判断，如果当前字符符合标识符规则，那么就获取一个标识符。


if (is_alpha(*l->cur) || l->cur[0] == '_') {
   tok = get_identifier(l);
   int tok2 = is_keyword(l->tok.ptr, l->tok.len);
   if (tok2) tok = tok2;


static int get_identifier(struct lexer_s *l) {
    l->cur++;
    while (l->cur[0] != '\0' && (is_alpha(*l->cur) || 
					is_digit(*l->cur) || l->cur[0] == '_')) l->cur++;
    l->tok.len = l->cur - l->tok.ptr;
    l->cur--;
    return TOK_IDENTIFIER;
}

我们知道，关键字也是标识符，使用 is_keyword 加以区分。


static int is_keyword(const char *s, int len) {
    const char *reserved[] =
            {"auto", "break", "case", "char", "const", "continue", "default", "do", "double", "else",
             "enum", "extern", "float", "for", "goto", "if", "int", "long", "register", "return",
             "short", "signed", "sizeof", "static", "struct", "switch", "typedef", "union", "unsigned",
             "void", "volatile", "while", NULL};
    if (!is_alpha(s[0])) return 0;
    for (int i = 0; reserved[i] != NULL; i++) {
        if (len == (int) strlen(reserved[i]) && strncmp(s, reserved[i], len) == 0)
            return i + TOK_AUTO;
    }
    return 0;
}

将得到的标识符去与关键字列表进行判等匹配，由于reserved中的关键字列表顺序和 enum 中关键字顺序一致，使用 i + TOK_AUTO 进行偏移简化代码书写。

处理数字常量

C 语言中数字常量有一下几种形式

'a' 字符常量

12343 整数类型

0x123 16 进制数字
0b1101 2 进制数字

3.14 浮点数字

123e+2f 科学计数法


if (is_digit(*l->cur)) {
        tok = get_num(l);
} else if (*l->cur == '\'') {
        tok = get_char(l);


static int get_num(struct lexer_s *l) {
    strtod(l->cur, &l->cur);
    l->tok.len = l->cur - (char *) l->tok.ptr;
    l->cur--;
    return TOK_CONSTANT;
}
// 对于 '\0' 这种形式，两个单引号之间未必只有一个字符
static int get_char(struct lexer_s *l) {
    l->cur++;
    if (l->cur[0] == '\\') l->cur++; // skip \0 ...
    if (l->cur[0] != '\0') l->cur++;
    if (l->cur[0] == '\'') {
        l->tok.len = l->cur - l->tok.ptr+1;
        return TOK_CONSTANT;
    }
    return TOK_INVALID;
}

由于这里只做标记的分割，只需要知道这个标记到底有多长，不做具体类型的区分，故使用 strtod 可以同时处理整数、浮点数（含科学计数法形式）。

处理字符串常量

字符串常量就是以双引号包围的字符串。


else if (*l->cur == '\"') {
        tok = get_string_literal(l);


static int get_string_literal(struct lexer_s *l) {
    l->cur++;
    while (l->cur[0] != '\0' && l->cur[0] != '\"') {
        if (l->cur[0] == '\\' && l->cur[1] != '\0' &&
						// 遇到转译符要跳过两个，避免触发 l->cur[0] != '\"' 错误滴结束循环
            (l->cur[1] == '\"' || strchr("bfnrtv\\", l->cur[1]) != NULL)) {
            l->cur += 2;
        } else {
            l->cur++;
        }
    }
    l->tok.len = l->cur - l->tok.ptr + 1;
    return TOK_STRING_LITERAL;
}

最后处理单目双目运算符

这里需要向后看1之多个字符，定义以下辅助函数。


static bool is_token_2(struct lexer_s *l, const char tok[2]) {
    return l->cur[0] == tok[0] && l->cur[1] == tok[1];
}

static bool is_token_3(struct lexer_s *l, const char tok[3]) {
    return l->cur[0] == tok[0] && l->cur[1] == tok[1] && l->cur[2] == tok[2];
}

对于单字符操作符处理比较简单


if (strchr(";{},:=()[].&!~-+*/%<>^|?", *l->cur) != NULL) {
        tok = *l->cur;
}

多字符操作符也大同小异


if (is_token_2(l, "+=")) {
            tok = TOK_ADD_ASSIGN;
            l->cur += 1;
        } else if (is_token_2(l, "-=")) {
            tok = TOK_SUB_ASSIGN;
            l->cur += 1;
        } else if (is_token_2(l, "*=")) {
            tok = TOK_MUL_ASSIGN;
            l->cur += 1;
        } else if (is_token_2(l, "/=")) {
            tok = TOK_DIV_ASSIGN;
            l->cur += 1;
        } else if (is_token_2(l, "%=")) {
            tok = TOK_MOD_ASSIGN;
            l->cur += 1;
        } else if (is_token_2(l, "&=")) {
            tok = TOK_AND_ASSIGN;
            l->cur += 1;
        } else if (is_token_2(l, "^=")) {
            tok = TOK_XOR_ASSIGN;
            l->cur += 1;
        } else if (is_token_2(l, "|=")) {
            tok = TOK_OR_ASSIGN;
            l->cur += 1;
        } else if (is_token_2(l, ">>")) {
            tok = TOK_RIGHT_OP;
            l->cur += 1;
        } else if (is_token_2(l, "<<")) {
            tok = TOK_LEFT_OP;
            l->cur += 1;
        } else if (is_token_2(l, "++")) {
            tok = TOK_INC_OP;
            l->cur += 1;
        } else if (is_token_2(l, "--")) {
            tok = TOK_DEC_OP;
            l->cur += 1;
        } else if (is_token_2(l, "->")) {
            tok = TOK_PTR_OP;
            l->cur += 1;
        } else if (is_token_2(l, "&&")) {
            tok = TOK_AND_OP;
            l->cur += 1;
        } else if (is_token_2(l, "||")) {
            tok = TOK_OR_OP;
            l->cur += 1;
        } else if (is_token_2(l, "<=")) {
            tok = TOK_LE_OP;
            l->cur += 1;
        } else if (is_token_2(l, ">=")) {
            tok = TOK_GE_OP;
            l->cur += 1;
        } else if (is_token_2(l, "==")) {
            tok = TOK_EQ_OP;
            l->cur += 1;
        } else if (is_token_3(l, "...")) {
            tok = TOK_ELLIPSIS;
            l->cur += 2;
        } else if (is_token_3(l, ">>=")) {
            tok = TOK_RIGHT_ASSIGN;
            l->cur += 2;
        } else if (is_token_3(l, "<<=")) {
            tok = TOK_LEFT_ASSIGN;
            l->cur += 2;
        }

完整的 next 函数


static Tok *next(struct lexer_s *l) {
    int tok = TOK_INVALID;

    skip_spaces_and_comments(l);
    l->tok.ptr = l->cur;
    l->tok.line_num = l->line_num;

    if (*l->cur == '\0') {
        tok = TOK_EOF;
    } else if (is_digit(*l->cur)) {
        tok = get_num(l);

    } else if (*l->cur == '\'') {
        tok = get_char(l);
    } else if (*l->cur == '\"') {
        tok = get_string_literal(l);
    } else if (is_alpha(*l->cur) || l->cur[0] == '_') {
        tok = get_identifier(l);
        int tok2 = is_keyword(l->tok.ptr, l->tok.len);
        if (tok2) tok = tok2;
    } else if (strchr(";{},:=()[].&!~-+*/%<>^|?", *l->cur) != NULL) {
        tok = *l->cur;
        if (is_token_2(l, "+=")) {
            tok = TOK_ADD_ASSIGN;
            l->cur += 1;
        } else if (is_token_2(l, "-=")) {
            tok = TOK_SUB_ASSIGN;
            l->cur += 1;
        } else if (is_token_2(l, "*=")) {
            tok = TOK_MUL_ASSIGN;
            l->cur += 1;
        } else if (is_token_2(l, "/=")) {
            tok = TOK_DIV_ASSIGN;
            l->cur += 1;
        } else if (is_token_2(l, "%=")) {
            tok = TOK_MOD_ASSIGN;
            l->cur += 1;
        } else if (is_token_2(l, "&=")) {
            tok = TOK_AND_ASSIGN;
            l->cur += 1;
        } else if (is_token_2(l, "^=")) {
            tok = TOK_XOR_ASSIGN;
            l->cur += 1;
        } else if (is_token_2(l, "|=")) {
            tok = TOK_OR_ASSIGN;
            l->cur += 1;
        } else if (is_token_2(l, ">>")) {
            tok = TOK_RIGHT_OP;
            l->cur += 1;
        } else if (is_token_2(l, "<<")) {
            tok = TOK_LEFT_OP;
            l->cur += 1;
        } else if (is_token_2(l, "++")) {
            tok = TOK_INC_OP;
            l->cur += 1;
        } else if (is_token_2(l, "--")) {
            tok = TOK_DEC_OP;
            l->cur += 1;
        } else if (is_token_2(l, "->")) {
            tok = TOK_PTR_OP;
            l->cur += 1;
        } else if (is_token_2(l, "&&")) {
            tok = TOK_AND_OP;
            l->cur += 1;
        } else if (is_token_2(l, "||")) {
            tok = TOK_OR_OP;
            l->cur += 1;
        } else if (is_token_2(l, "<=")) {
            tok = TOK_LE_OP;
            l->cur += 1;
        } else if (is_token_2(l, ">=")) {
            tok = TOK_GE_OP;
            l->cur += 1;
        } else if (is_token_2(l, "==")) {
            tok = TOK_EQ_OP;
            l->cur += 1;
        } else if (is_token_3(l, "...")) {
            tok = TOK_ELLIPSIS;
            l->cur += 2;
        } else if (is_token_3(l, ">>=")) {
            tok = TOK_RIGHT_ASSIGN;
            l->cur += 2;
        } else if (is_token_3(l, "<<=")) {
            tok = TOK_LEFT_ASSIGN;
            l->cur += 2;
        }
        l->tok.len = l->cur - l->tok.ptr + 1;

    }

    if (l->cur != '\0') l->cur++;
    l->tok.tok = tok;
    return &l->tok;

}