12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768 |
- package zygo
- import (
- "bytes"
- "errors"
- "fmt"
- "io"
- "regexp"
- "strconv"
- "unicode/utf8"
- )
// TokenType identifies the lexical category of a Token.
type TokenType int

// Token categories. The numeric values are assigned by iota, so the
// order of this list is significant and must not be rearranged.
const (
	TokenTypeEmpty TokenType = iota
	TokenLParen
	TokenRParen
	TokenLSquare
	TokenRSquare
	TokenLCurly
	TokenRCurly
	TokenDot
	TokenQuote
	TokenBacktick
	TokenTilde
	TokenTildeAt
	TokenSymbol
	TokenBool
	TokenDecimal
	TokenHex
	TokenOct
	TokenBinary
	TokenFloat
	TokenChar
	TokenString
	TokenCaret
	TokenColonOperator
	TokenThreadingOperator
	TokenBackslash
	TokenDollar
	TokenDotSymbol
	TokenFreshAssign
	TokenBacktickString
	TokenComment
	TokenBeginBlockComment
	TokenEndBlockComment
	TokenSemicolon
	TokenSymbolColon
	TokenComma
	TokenEnd
)

// Token is one lexeme: its category plus whatever text was captured
// for it (which may be empty for purely structural tokens).
type Token struct {
	typ TokenType
	str string
}

// EndTk is the token handed out when there is nothing left to lex.
var EndTk = Token{typ: TokenEnd}

// String renders the token as text. Structural tokens print as their
// fixed spelling; hex, octal and binary literals get their radix prefix
// put back in front of the stored digits; char tokens are printed via
// strconv.Quote. Every other type (TokenDot included) prints its stored
// text unchanged.
func (t Token) String() string {
	switch t.typ {
	// Tokens whose text depends on the stored payload.
	case TokenHex:
		return "0x" + t.str
	case TokenOct:
		return "0o" + t.str
	case TokenBinary:
		return "0b" + t.str
	case TokenChar:
		return strconv.Quote(t.str)

	// Brackets.
	case TokenLParen:
		return "("
	case TokenRParen:
		return ")"
	case TokenLSquare:
		return "["
	case TokenRSquare:
		return "]"
	case TokenLCurly:
		return "{"
	case TokenRCurly:
		return "}"

	// Quoting and macro-template punctuation.
	case TokenQuote:
		return "'"
	case TokenBacktick:
		return "`"
	case TokenCaret:
		return "^"
	case TokenTilde:
		return "~"
	case TokenTildeAt:
		return "~@"

	// Fixed-spelling operators.
	case TokenColonOperator:
		return ":"
	case TokenThreadingOperator:
		return "->"
	case TokenBackslash:
		return "\\"
	case TokenDollar:
		return "$"
	}
	return t.str
}
// LexerState names the mode the lexer's rune-at-a-time state machine
// is currently in.
type LexerState int

// Lexer states. Values are assigned by iota; keep the order stable.
const (
	// LexerNormal is the default state, dispatching on each rune.
	LexerNormal LexerState = iota
	// LexerCommentLine: inside a // comment, until the next newline.
	LexerCommentLine
	// LexerStrLit: inside a double-quoted string literal.
	LexerStrLit
	// LexerStrEscaped: just saw a backslash inside a string literal.
	LexerStrEscaped
	// LexerUnquote: just saw '~'; the next rune decides ~ versus ~@.
	LexerUnquote
	// LexerBacktickString: inside a backtick-delimited string.
	LexerBacktickString
	// LexerFreshAssignOrColon: just saw ':'; the next rune decides
	// between ':=' and a plain ':'.
	LexerFreshAssignOrColon
	// LexerFirstFwdSlash: just saw '/'; could be start of // comment or /*.
	LexerFirstFwdSlash
	// LexerCommentBlock: inside a /* ... */ comment.
	LexerCommentBlock
	// LexerCommentBlockAsterisk: saw '*' inside a block comment; could be
	// the end of the block comment (*/).
	LexerCommentBlockAsterisk
	// LexerBuiltinOperator: saw the first rune of a possible one- or
	// two-rune operator (or the '-' of a negative number).
	LexerBuiltinOperator
)
- type Lexer struct {
- parser *Parser
- state LexerState
- prevrune rune
- tokens []Token
- buffer *bytes.Buffer
- prevToken Token
- prevPrevToken Token
- stream io.RuneScanner
- next []io.RuneScanner
- linenum int
- }
- func (lexer *Lexer) AppendToken(tok Token) {
- lexer.tokens = append(lexer.tokens, tok)
- lexer.prevPrevToken = lexer.prevToken
- lexer.prevToken = tok
- }
- func (lexer *Lexer) PrependToken(tok Token) {
- lexer.tokens = append([]Token{tok}, lexer.tokens...)
- }
- func NewLexer(p *Parser) *Lexer {
- return &Lexer{
- parser: p,
- tokens: make([]Token, 0, 10),
- buffer: new(bytes.Buffer),
- state: LexerNormal,
- linenum: 1,
- }
- }
- func (lexer *Lexer) Linenum() int {
- return lexer.linenum
- }
- func (lex *Lexer) Reset() {
- lex.stream = nil
- lex.tokens = lex.tokens[:0]
- lex.state = LexerNormal
- lex.linenum = 1
- lex.buffer.Reset()
- }
- func (lex *Lexer) EmptyToken() Token {
- return Token{}
- }
- func (lex *Lexer) Token(typ TokenType, str string) Token {
- t := Token{
- typ: typ,
- str: str,
- }
- return t
- }
// Regular expressions used to classify a complete atom. All of them are
// compiled once at package initialization.
var (
	BoolRegex    = regexp.MustCompile("^(true|false)$")
	DecimalRegex = regexp.MustCompile("^-?[0-9]+$")
	HexRegex     = regexp.MustCompile("^0x[0-9a-fA-F]+$")
	OctRegex     = regexp.MustCompile("^0o[0-7]+$")
	BinaryRegex  = regexp.MustCompile("^0b[01]+$")

	// SymbolRegex = regexp.MustCompile("^[^'#]+$")
	// (Sigil) symbols can begin with #, $, ?, but
	// sigils cannot appear later in any symbol.
	// Symbols cannot contain whitespace nor `~`, `@`, `(`, `)`, `[`, `]`,
	// `{`, `}`, `'`, `#`, `^`, `\`, `|`, `%`, `"`, `;`. They can optionally
	// end in `:`.
	// Nor, obviously, can symbols contain backticks, "`".
	// Symbols cannot start with a number. DotSymbols cannot have a number
	// as the first character after '.'
	SymbolRegex = regexp.MustCompile(`^[#$?]?[^#$?':;\\~@\[\]{}\^|"()%0-9,&][^'#:;\\~@\[\]{}\^|"()%,&*\-]*[:]?$`)

	// dot symbol examples: `.`, `.a`, `.a.b`, `.a.b.c`
	// dot symbol non-examples: `.a.`, `..`
	DotSymbolRegex = regexp.MustCompile(`^[.]$|^([.][^'#:;\\~@\[\]{}\^|"()%.0-9,][^'#:;\\~@\[\]{}\^|"()%.,*+\-]*)+$|^[^'#:;\\~@\[\]{}\^|"()%.0-9,][^'#:;\\~@\[\]{}\^|"()%.,*+\-]*([.][^'#:;\\~@\[\]{}\^|"()%.0-9,][^'#:;\\~@\[\]{}\^|"()%.,*+\-]*)+$`)

	// DotPartsRegex is unanchored: it matches one dot-separated part at
	// a time inside a larger string.
	DotPartsRegex = regexp.MustCompile(`[.]?[^'#:;\\~@\[\]{}\^|"()%.0-9,][^'#:;\\~@\[\]{}\^|"()%.,]*`)

	// CharRegex matches a single-quoted character, optionally preceded
	// by a backslash inside the quotes.
	CharRegex = regexp.MustCompile("^'\\\\?.'$")

	// FloatRegex matches `1.`, `1.5`, `.5`, `1e5`, `1.5e-3` and their
	// negated forms. Every alternative carries its own `^` anchor:
	// `|` binds more loosely than concatenation, so a single leading `^`
	// would anchor only the first alternative and let atoms such as
	// `abc.5` or `x1e5` be classified as floats.
	FloatRegex = regexp.MustCompile("^-?([0-9]+\\.[0-9]*)$|^-?(\\.[0-9]+)$|^-?([0-9]+(\\.[0-9]*)?[eE](-?[0-9]+))$")

	// ComplexRegex accepts the same forms as FloatRegex with an optional
	// trailing `i`; each alternative is anchored for the same reason.
	ComplexRegex = regexp.MustCompile("^-?([0-9]+\\.[0-9]*)i?$|^-?(\\.[0-9]+)i?$|^-?([0-9]+(\\.[0-9]*)?[eE](-?[0-9]+))i?$")

	// BuiltinOpRegex matches the one- and two-rune builtin operators.
	BuiltinOpRegex = regexp.MustCompile(`^(\+\+|\-\-|\+=|\-=|=|==|:=|\+|\-|\*|<|>|<=|>=|<-|->|\*=|/=|\*\*|!|!=|<!)$`)
)
// StringToRunes decodes str as UTF-8 and returns its code points in
// order. Each invalid byte decodes to utf8.RuneError and consumes one
// byte. The result is an empty, non-nil slice for the empty string.
func StringToRunes(str string) []rune {
	decoded := make([]rune, 0, utf8.RuneCountInString(str))
	for pos := 0; pos < len(str); {
		r, width := utf8.DecodeRuneInString(str[pos:])
		decoded = append(decoded, r)
		pos += width
	}
	return decoded
}
// EscapeChar maps the rune that follows a backslash to the character
// the escape sequence denotes. Supported escapes are n, r, a, t, and the
// self-escaping characters backslash, double quote, single quote and #.
// Any other rune yields a space together with an
// "invalid escape sequence" error.
func EscapeChar(char rune) (rune, error) {
	switch char {
	case 'n':
		return '\n', nil
	case 'r':
		return '\r', nil
	case 'a':
		return '\a', nil
	case 't':
		return '\t', nil
	case '\\', '"', '\'', '#':
		// These characters stand for themselves once escaped.
		return char, nil
	}
	return ' ', errors.New("invalid escape sequence")
}

// DecodeChar decodes a character literal such as 'a' or '\n' (single
// quotes included) and returns the character as a string.
//
// Malformed input is reported as an error rather than causing a panic:
// atom must be at least two runes long, must start and end with a single
// quote, and must contain either one rune or a backslash followed by one
// rune that EscapeChar accepts. On error the returned string is empty.
func DecodeChar(atom string) (string, error) {
	runes := []rune(atom)
	n := len(runes)
	// Guard the slicing below: stripping the quotes needs two runes.
	if n < 2 || runes[0] != '\'' || runes[n-1] != '\'' {
		return "", errors.New("not a char literal")
	}

	switch inner := runes[1 : n-1]; len(inner) {
	case 1:
		return string(inner[0]), nil
	case 2:
		// A two-rune body is only valid as an escape sequence.
		if inner[0] != '\\' {
			return "", errors.New("not a char literal")
		}
		char, err := EscapeChar(inner[1])
		if err != nil {
			return "", err
		}
		return string(char), nil
	}
	return "", errors.New("not a char literal")
}
- func (x *Lexer) DecodeAtom(atom string) (Token, error) {
- if atom == "&" {
- return x.Token(TokenSymbol, "&"), nil
- }
- if atom == "\\" {
- return x.Token(TokenBackslash, ""), nil
- }
- if BoolRegex.MatchString(atom) {
- return x.Token(TokenBool, atom), nil
- }
- if DecimalRegex.MatchString(atom) {
- return x.Token(TokenDecimal, atom), nil
- }
- if HexRegex.MatchString(atom) {
- return x.Token(TokenHex, atom[2:]), nil
- }
- if OctRegex.MatchString(atom) {
- return x.Token(TokenOct, atom[2:]), nil
- }
- if BinaryRegex.MatchString(atom) {
- return x.Token(TokenBinary, atom[2:]), nil
- }
- if FloatRegex.MatchString(atom) {
- return x.Token(TokenFloat, atom), nil
- }
- if DotSymbolRegex.MatchString(atom) {
- //Q("matched DotSymbolRegex '%v'", atom)
- return x.Token(TokenDotSymbol, atom), nil
- }
- if BuiltinOpRegex.MatchString(atom) {
- return x.Token(TokenSymbol, atom), nil
- }
- if atom == ":" {
- return x.Token(TokenSymbol, atom), nil
- } else if SymbolRegex.MatchString(atom) {
- ////Q("matched symbol regex, atom='%v'", atom)
- n := len(atom)
- if atom[n-1] == ':' {
- ////Q("matched symbol regex with colon, atom[:n-1]='%v'", atom[:n-1])
- return x.Token(TokenSymbolColon, atom[:n-1]), nil
- }
- return x.Token(TokenSymbol, atom), nil
- }
- if CharRegex.MatchString(atom) {
- char, err := DecodeChar(atom)
- if err != nil {
- return x.EmptyToken(), err
- }
- return x.Token(TokenChar, char), nil
- }
- return x.EmptyToken(), fmt.Errorf("Unrecognized atom: '%s'", atom)
- }
- func (lexer *Lexer) dumpBuffer() error {
- n := lexer.buffer.Len()
- if n <= 0 {
- return nil
- }
- tok, err := lexer.DecodeAtom(lexer.buffer.String())
- if err != nil {
- return err
- }
- lexer.buffer.Reset()
- lexer.AppendToken(tok)
- return nil
- }
- // with block comments, we've got to tell
- // the parser about them, so it can recognize
- // when another line is needed to finish a
- // block comment.
- func (lexer *Lexer) dumpComment() {
- str := lexer.buffer.String()
- lexer.buffer.Reset()
- lexer.AppendToken(lexer.Token(TokenComment, str))
- }
- func (lexer *Lexer) dumpString() {
- str := lexer.buffer.String()
- lexer.buffer.Reset()
- lexer.AppendToken(lexer.Token(TokenString, str))
- }
- func (lexer *Lexer) dumpBacktickString() {
- str := lexer.buffer.String()
- lexer.buffer.Reset()
- lexer.AppendToken(lexer.Token(TokenBacktickString, str))
- }
- func (x *Lexer) DecodeBrace(brace rune) Token {
- switch brace {
- case '(':
- return x.Token(TokenLParen, "")
- case ')':
- return x.Token(TokenRParen, "")
- case '[':
- return x.Token(TokenLSquare, "")
- case ']':
- return x.Token(TokenRSquare, "")
- case '{':
- return x.Token(TokenLCurly, "")
- case '}':
- return x.Token(TokenRCurly, "")
- }
- return EndTk
- }
- func (lexer *Lexer) LexNextRune(r rune) error {
- top:
- switch lexer.state {
- case LexerCommentBlock:
- //Q("lexer.state = LexerCommentBlock")
- if r == '\n' {
- _, err := lexer.buffer.WriteRune('\n')
- if err != nil {
- return err
- }
- lexer.dumpComment()
- // stay in LexerCommentBlock
- return nil
- }
- if r == '*' {
- lexer.state = LexerCommentBlockAsterisk
- return nil
- }
- case LexerCommentBlockAsterisk:
- //Q("lexer.state = LexerCommentBlockAsterisk")
- if r == '/' {
- _, err := lexer.buffer.WriteString("*/")
- if err != nil {
- return err
- }
- lexer.dumpComment()
- lexer.AppendToken(lexer.Token(TokenEndBlockComment, ""))
- lexer.state = LexerNormal
- return nil
- }
- _, err := lexer.buffer.WriteRune('*')
- if err != nil {
- return err
- }
- lexer.state = LexerCommentBlock
- goto writeRuneToBuffer
- case LexerFirstFwdSlash:
- //Q("lexer.state = LexerFirstFwdSlash")
- if r == '/' {
- err := lexer.dumpBuffer()
- if err != nil {
- return err
- }
- lexer.state = LexerCommentLine
- _, err = lexer.buffer.WriteString("//")
- return err
- }
- if r == '*' {
- err := lexer.dumpBuffer()
- if err != nil {
- return err
- }
- _, err = lexer.buffer.WriteString("/*")
- if err != nil {
- return err
- }
- lexer.state = LexerCommentBlock
- lexer.AppendToken(lexer.Token(TokenBeginBlockComment, ""))
- return nil
- }
- lexer.state = LexerBuiltinOperator
- lexer.prevrune = '/'
- err := lexer.dumpBuffer() // don't mix with token before the /
- if err != nil {
- return err
- }
- goto top // process the unknown rune r
- case LexerCommentLine:
- //Q("lexer.state = LexerCommentLine")
- if r == '\n' {
- //Q("lexer.state = LexerCommentLine sees end of line comment: '%s', going to LexerNormal", string(lexer.buffer.Bytes()))
- lexer.dumpComment()
- lexer.state = LexerNormal
- return nil
- }
- case LexerBacktickString:
- if r == '`' {
- lexer.dumpBacktickString()
- lexer.state = LexerNormal
- return nil
- }
- lexer.buffer.WriteRune(r)
- return nil
- case LexerStrLit:
- if r == '\\' {
- lexer.state = LexerStrEscaped
- return nil
- }
- if r == '"' {
- lexer.dumpString()
- lexer.state = LexerNormal
- return nil
- }
- lexer.buffer.WriteRune(r)
- return nil
- case LexerStrEscaped:
- char, err := EscapeChar(r)
- if err != nil {
- return err
- }
- lexer.buffer.WriteRune(char)
- lexer.state = LexerStrLit
- return nil
- case LexerUnquote:
- if r == '@' {
- lexer.AppendToken(lexer.Token(TokenTildeAt, ""))
- } else {
- lexer.AppendToken(lexer.Token(TokenTilde, ""))
- lexer.buffer.WriteRune(r)
- }
- lexer.state = LexerNormal
- return nil
- case LexerFreshAssignOrColon:
- lexer.state = LexerNormal
- // there was a ':' followed by either '=' or something other than '=',
- // so proceed to process the normal ':' actions.
- if lexer.buffer.Len() == 0 {
- if r == '=' {
- lexer.AppendToken(lexer.Token(TokenFreshAssign, ":="))
- return nil
- }
- }
- if r == '=' {
- err := lexer.dumpBuffer()
- if err != nil {
- return err
- }
- lexer.AppendToken(lexer.Token(TokenFreshAssign, ":="))
- return nil
- } else {
- // but still allow ':' to be a token terminator at the end of a word.
- _, err := lexer.buffer.WriteRune(':')
- if err != nil {
- return err
- }
- err = lexer.dumpBuffer()
- if err != nil {
- return err
- }
- goto top // process the unknown rune r in Normal mode
- }
- case LexerBuiltinOperator:
- //Q("in LexerBuiltinOperator")
- lexer.state = LexerNormal
- // three cases: negative number, one rune operator, two rune operator
- first := string(lexer.prevrune)
- atom := fmt.Sprintf("%c%c", lexer.prevrune, r)
- //Q("in LexerBuiltinOperator, first='%s', atom='%s'", first, atom)
- // are we a negative number -1 or -.1 rather than ->, --, -= operator?
- if lexer.prevrune == '-' {
- if FloatRegex.MatchString(atom) || DecimalRegex.MatchString(atom) {
- //Q("'%s' is the beginning of a negative number", atom)
- _, err := lexer.buffer.WriteString(atom)
- if err != nil {
- return err
- }
- return nil
- } else {
- //Q("atom was not matched by FloatRegex: '%s'", atom)
- }
- }
- if BuiltinOpRegex.MatchString(atom) {
- //Q("2 rune atom in builtin op '%s', first='%s'", atom, first)
- // 2 rune op
- lexer.AppendToken(lexer.Token(TokenSymbol, atom))
- return nil
- }
- //Q("1 rune atom in builtin op '%s', first='%s'", atom, first)
- lexer.AppendToken(lexer.Token(TokenSymbol, first))
- goto top // still have to parse r in normal
- case LexerNormal:
- switch r {
- case '*':
- fallthrough
- case '+':
- fallthrough
- case '-':
- fallthrough
- case '<':
- fallthrough
- case '>':
- fallthrough
- case '=':
- fallthrough
- case '!':
- err := lexer.dumpBuffer()
- if err != nil {
- return err
- }
- lexer.state = LexerBuiltinOperator
- lexer.prevrune = r
- return nil
- case '/':
- lexer.state = LexerFirstFwdSlash
- return nil
- case '`':
- if lexer.buffer.Len() > 0 {
- return errors.New("Unexpected backtick")
- }
- lexer.state = LexerBacktickString
- return nil
- case '"':
- if lexer.buffer.Len() > 0 {
- return errors.New("Unexpected quote")
- }
- lexer.state = LexerStrLit
- return nil
- case ';':
- err := lexer.dumpBuffer()
- if err != nil {
- return err
- }
- lexer.AppendToken(lexer.Token(TokenSemicolon, ";"))
- return nil
- case ',':
- err := lexer.dumpBuffer()
- if err != nil {
- return err
- }
- lexer.AppendToken(lexer.Token(TokenComma, ","))
- return nil
- // colon terminates a keyword symbol, e.g. in `mykey: "myvalue"`;
- // mykey is the symbol.
- // Exception: unless it is the := operator for fresh assigment.
- case ':':
- lexer.state = LexerFreshAssignOrColon
- // won't know if it is ':' alone or ':=' for sure
- // until we get the next rune
- return nil
- // likewise &
- case '&':
- err := lexer.dumpBuffer()
- if err != nil {
- return err
- }
- lexer.AppendToken(lexer.Token(TokenSymbol, "&"))
- return nil
- case '%': // replaces ' as our quote shorthand
- if lexer.buffer.Len() > 0 {
- return errors.New("Unexpected % quote")
- }
- lexer.AppendToken(lexer.Token(TokenQuote, ""))
- return nil
- // caret '^' replaces backtick '`' as the start of a macro template, so
- // we can use `` as in Go for verbatim strings (strings with newlines, etc).
- case '^':
- if lexer.buffer.Len() > 0 {
- return errors.New("Unexpected ^ caret")
- }
- lexer.AppendToken(lexer.Token(TokenCaret, ""))
- return nil
- case '~':
- if lexer.buffer.Len() > 0 {
- return errors.New("Unexpected tilde")
- }
- lexer.state = LexerUnquote
- return nil
- case '(':
- fallthrough
- case ')':
- fallthrough
- case '[':
- fallthrough
- case ']':
- fallthrough
- case '{':
- fallthrough
- case '}':
- err := lexer.dumpBuffer()
- if err != nil {
- return err
- }
- lexer.AppendToken(lexer.DecodeBrace(r))
- return nil
- case '\n':
- lexer.linenum++
- fallthrough
- case ' ':
- fallthrough
- case '\t':
- fallthrough
- case '\r':
- err := lexer.dumpBuffer()
- if err != nil {
- return err
- }
- return nil
- } // end switch r in LexerNormal state
- } // end switch lexer.state
- writeRuneToBuffer:
- _, err := lexer.buffer.WriteRune(r)
- if err != nil {
- return err
- }
- return nil
- }
- // peekNextToken returns EndTk when it has nothing left.
- func (lexer *Lexer) peekNextToken() (tok Token, err error) {
- if lexer.stream == nil {
- if !lexer.PromoteNextStream() {
- return EndTk, nil
- }
- }
- for len(lexer.tokens) == 0 {
- r, _, err := lexer.stream.ReadRune()
- if err != nil {
- if lexer.PromoteNextStream() {
- continue
- } else {
- return EndTk, nil
- }
- }
- err = lexer.LexNextRune(r)
- if err != nil {
- return EndTk, err
- }
- }
- tok = lexer.tokens[0]
- return tok, nil
- }
- func (lexer *Lexer) GetNextToken() (tok Token, err error) {
- /*
- Q("\n in GetNextToken()\n")
- defer func() {
- Q("\n done with GetNextToken() -> returning tok='%v', err=%v. lexer.buffer.String()='%s'\n",
- tok, err, lexer.buffer.String())
- }()
- */
- tok, err = lexer.peekNextToken()
- if err != nil || tok.typ == TokenEnd {
- return EndTk, err
- }
- lexer.tokens = lexer.tokens[1:]
- return tok, nil
- }
- func (lex *Lexer) PromoteNextStream() (ok bool) {
- /*
- Q("entering PromoteNextStream()!\n")
- defer func() {
- Q("done with PromoteNextStream, promoted=%v\n", ok)
- }()
- */
- if len(lex.next) == 0 {
- return false
- }
- //Q("Promoting next stream!\n")
- lex.stream = lex.next[0]
- lex.next = lex.next[1:]
- return true
- }
- func (lex *Lexer) AddNextStream(s io.RuneScanner) {
- // in case we still have input available,
- // save new stuff for later
- lex.next = append(lex.next, s)
- if lex.stream == nil {
- lex.PromoteNextStream()
- } else {
- _, _, err := lex.stream.ReadRune()
- if err == nil {
- lex.stream.UnreadRune()
- // still have input available
- return
- } else {
- lex.PromoteNextStream()
- }
- }
- }
|