一直以来对编译器/解释器等都较有兴趣。我非科班出身,当初还在大学时,只是马马虎虎看完了《编译原理》之类教材,上机非常少,对龙书之类圣经也只是浅尝辄止而已。工作至今,基本已将编译原理相关知识忘记得差不多了,可能也就还对譬如预处理词法分析语法分析 AST 生成等基础性的概念还有点印象罢。
约 1 年多前,我也有想法搞一套基于简化的 Pascal 语法的带类型的脚本语言“编译器”(PaxCompiler 之类可能太复杂了),并将此脚本语言编写的脚本与 Golang 交互起来。当然这只是我个人的业余兴趣而已,至于是否会付诸行动、能搞成怎样都是未知。而选择 Pascal 作为参考改编语言的原因,其一我比较喜欢它的语言设计,其二它曾是我某段时间内的工作语言所以感情成分使然,其三较之诸如 Python、Lua 我更喜欢带类型的脚本语言(TypeScript?我不太喜欢 JavaScript 的语法...),当然,Pascal 的语法形式也确实比较方便为之开发编译器/解释器。
而短期内,个人恐怕没有太多精力去啃龙书之类,于是索性,看点基础资料且按此系列教程之类慢慢温习并从 tokenizer 开始一步步实现自己的 EcoPascal——即便最终,它只是个玩具脚本语言而已。
近 2 天趁有空,粗略看了前文所述教程的前两章,并以 Golang 重写了这两章里的解释程序(代码写得有些粗放)。
第一章:
package interpreter import ( "fmt" ) // Token types // // EOF (end-of-file) token is used to indicate that // there is no more input left for lexical analysis type TokenType int const ( cTokenTypeOfNone TokenType = iota cTokenTypeOfInteger cTokenTypeOfPlusSign cTokenTypeOfEOF ) type token struct { t TokenType // token type: INTEGER, PLUS, or EOF v interface{} // token value: 0, 1, 2. 3, 4, 5, 6, 7, 8, 9, ‘+‘, or None } func newToken(t TokenType, v interface{}) token { return token{ t: t, v: v, } } type Interpreter struct { text []rune // client string input, e.g. "3+5" pos int // an index into text currToken token // current token instance } func New() *Interpreter { return &Interpreter{ text: []rune(""), pos: 0, currToken: newToken(cTokenTypeOfNone, nil), } } func convToDigit(c rune) (int, bool) { if ‘0‘ <= c && c <= ‘9‘ { return int(c - ‘0‘), true } return 0, false } // Lexical analyzer (also known as scanner or tokenizer) // // This method is responsible for breaking a sentence apart into tokens. // One token at a time. func (self *Interpreter) getNextToken() token { text := self.text // is self.pos index past the end of the self.text ? // if so, then return EOF token because there is no more // input left to convert into tokens if self.pos > len(text)-1 { return newToken(cTokenTypeOfEOF, nil) } // get a character at the position self.pos and decide // what token to create based on the single character // var currChar interface{} = text[self.pos] currChar := text[self.pos] // if the character is a digit then convert it to // integer, create an INTEGER token, increment self.pos // index to point to the next character after the digit, // and return the INTEGER token if v, ok := convToDigit(text[self.pos]); ok { self.pos += 1 return newToken(cTokenTypeOfInteger, v) } if currChar == ‘+‘ { self.pos += 1 return newToken(cTokenTypeOfPlusSign, ‘+‘) } panic(fmt.Sprintf("Error parsing input1: %s", string(self.text))) } // compare the current token type with the passed token type // and if they match then "eat" the current token // and assign the next token to the self.currToken, // otherwise raise an exception. func (self *Interpreter) eat(tokenType TokenType) { if self.currToken.t == tokenType { self.currToken = self.getNextToken() return } panic(fmt.Sprintf("Error parsing input: %s", self.text)) } // parse "INTEGER PLUS INTEGER" func (self *Interpreter) Parse(s string) int { self.text = []rune(s) self.pos = 0 // set current token to the first token taken from the input self.currToken = self.getNextToken() // we expect the current token to be a single-digit integer left := self.currToken self.eat(cTokenTypeOfInteger) // we expect the current token to be a ‘+‘ token // op := self.currToken self.eat(cTokenTypeOfPlusSign) // we expect the current token to be a single-digit integer right := self.currToken self.eat(cTokenTypeOfInteger) // after the above call the self.current_token is set to EOF token. // at this point INTEGER PLUS INTEGER sequence of tokens // has been successfully found and the method can just // return the result of adding two integers, thus // effectively interpreting client input return left.v.(int) + right.v.(int) }
第二章:
package interpreter import ( "fmt" "github.com/ecofast/rtl/sysutils" ) // Token types // // EOF (end-of-file) token is used to indicate that // there is no more input left for lexical analysis type TokenType int const ( cTokenTypeOfNone TokenType = iota cTokenTypeOfInteger cTokenTypeOfPlusSign cTokenTypeOfMinusSign cTokenTypeOfEOF ) type token struct { t TokenType // token type: INTEGER, PLUS, MINUS, or EOF v interface{} // token value: non-negative integer value, ‘+‘, ‘-‘, or None } func newToken(t TokenType, v interface{}) token { return token{ t: t, v: v, } } type Interpreter struct { text []rune // client string input, e.g. "3 + 5", "12 - 5", etc pos int // an index into text currToken token // current token instance currChar rune } func New() *Interpreter { return &Interpreter{ text: []rune(""), pos: 0, currToken: newToken(cTokenTypeOfNone, nil), currChar: 0, } } func isDigit(c rune) bool { if ‘0‘ <= c && c <= ‘9‘ { return true } return false } func convToDigit(c rune) (int, bool) { if ‘0‘ <= c && c <= ‘9‘ { return int(c - ‘0‘), true } return 0, false } func isSpace(c rune) bool { if ‘ ‘ == c { return true } return false } // Advance the ‘pos‘ pointer and set the ‘currChar‘ variable func (self *Interpreter) advance() { self.pos += 1 if self.pos > len(self.text)-1 { self.currChar = 0 } else { self.currChar = self.text[self.pos] } } func (self *Interpreter) skipWhiteSpace() { for self.currChar != 0 && isSpace(self.currChar) { self.advance() } } // Return a (multidigit) integer consumed from the input func (self *Interpreter) integer() int { ret := "" for self.currChar != 0 && isDigit(self.currChar) { ret += string(self.currChar) self.advance() } return sysutils.StrToInt(ret) } // Lexical analyzer (also known as scanner or tokenizer) // // This method is responsible for breaking a sentence apart into tokens. func (self *Interpreter) getNextToken() token { for self.currChar != 0 { if isSpace(self.currChar) { self.skipWhiteSpace() continue } if isDigit(self.currChar) { return newToken(cTokenTypeOfInteger, self.integer()) } if self.currChar == ‘+‘ { self.advance() return newToken(cTokenTypeOfPlusSign, ‘+‘) } if self.currChar == ‘-‘ { self.advance() return newToken(cTokenTypeOfMinusSign, ‘-‘) } panic(fmt.Sprintf("Error parsing input1: %s", string(self.text))) } return newToken(cTokenTypeOfEOF, nil) } // compare the current token type with the passed token type // and if they match then "eat" the current token // and assign the next token to the self.currToken, // otherwise raise an exception. func (self *Interpreter) eat(tokenType TokenType) { if self.currToken.t == tokenType { self.currToken = self.getNextToken() return } panic(fmt.Sprintf("Error parsing input: %s", self.text)) } // parse "INTEGER PLUS INTEGER" or "INTEGER MINUS INTEGER" func (self *Interpreter) Parse(s string) int { self.text = []rune(s) self.pos = 0 self.currChar = self.text[self.pos] // set current token to the first token taken from the input self.currToken = self.getNextToken() // we expect the current token to be an integer left := self.currToken self.eat(cTokenTypeOfInteger) // we expect the current token to be either a ‘+‘ or ‘-‘ op := self.currToken if op.t == cTokenTypeOfPlusSign { self.eat(cTokenTypeOfPlusSign) } else { self.eat(cTokenTypeOfMinusSign) } // we expect the current token to be an integer right := self.currToken self.eat(cTokenTypeOfInteger) // after the above call the self.current_token is set to EOF token. // at this point either the INTEGER PLUS INTEGER or // the INTEGER MINUS INTEGER sequence of tokens // has been successfully found and the method can just // return the result of adding or subtracting two integers, thus // effectively interpreting client input if op.t == cTokenTypeOfPlusSign { return left.v.(int) + right.v.(int) } return left.v.(int) - right.v.(int) }
有了“核心”解释程序,使用起来就很简单了:
package main import ( "fmt" "part1/interpreter" ) func main() { fmt.Println("Let‘s Build A Simple Interpreter - Part 1") parser := interpreter.New() s := "" for { if n, err := fmt.Scan(&s); n == 0 || err != nil { return } fmt.Println(parser.Parse(s)) } }
本兴趣项目已托管至 Github,比较可能会不定期慢慢更新。