lexer.go

package glisp

import (
    "bytes"
    "errors"
    "io"
    "regexp"
    "strconv"
    "unicode/utf8"
)
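// TokenType identifies the syntactic kind of a token.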
type TokenType int

const (
    TokenLParen TokenType = iota
    TokenRParen
    TokenLSquare
    TokenRSquare
    TokenLCurly
    TokenRCurly
    TokenDot
    TokenQuote
    TokenBacktick
    TokenTilde
    TokenTildeAt
    TokenSymbol
    TokenBool
    TokenDecimal
    TokenHex
    TokenOct
    TokenBinary
    TokenFloat
    TokenChar
    TokenString
    TokenEnd
)
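// Token is a single lexeme: its kind plus the associated text, if any.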
type Token struct {
    typ TokenType
    str string
}
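// String renders a token back into (approximately) its source form.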
func (t Token) String() string {
    switch t.typ {
    case TokenLParen:
        return "("
    case TokenRParen:
        return ")"
    case TokenLSquare:
        return "["
    case TokenRSquare:
        return "]"
    case TokenLCurly:
        return "{"
    case TokenRCurly:
        return "}"
    case TokenDot:
        return "."
    case TokenQuote:
        return "'"
    case TokenBacktick:
        return "`"
    case TokenTilde:
        return "~"
    case TokenTildeAt:
        return "~@"
    case TokenHex:
        return "0x" + t.str
    case TokenOct:
        return "0o" + t.str
    case TokenBinary:
        return "0b" + t.str
    case TokenChar:
        quoted := strconv.Quote(t.str)
        return "#" + quoted[1:len(quoted)-1]
    }
    return t.str
}
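// LexerState is the mode the lexer state machine is currently in.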
type LexerState int

const (
    LexerNormal LexerState = iota
    LexerComment
    LexerStrLit
    LexerStrEscaped
    LexerUnquote
)
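// Lexer incrementally converts a rune stream into a queue of tokens.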
type Lexer struct {
    state    LexerState
    tokens   []Token
    buffer   *bytes.Buffer
    stream   io.RuneReader
    linenum  int
    finished bool
}
var (
    BoolRegex    = regexp.MustCompile("^(true|false)$")
    DecimalRegex = regexp.MustCompile("^-?[0-9]+$")
    HexRegex     = regexp.MustCompile("^0x[0-9a-fA-F]+$")
    OctRegex     = regexp.MustCompile("^0o[0-7]+$")
    BinaryRegex  = regexp.MustCompile("^0b[01]+$")
    SymbolRegex  = regexp.MustCompile("^[^'#]+$")
    CharRegex    = regexp.MustCompile("^#\\\\?.$")
    // The alternatives are wrapped in one group so that the leading "-?"
    // and the ^...$ anchors apply to every branch; unanchored branches
    // would otherwise let atoms like "foo.5bar" match as floats.
    FloatRegex = regexp.MustCompile("^-?(([0-9]+\\.[0-9]*)|(\\.[0-9]+)|([0-9]+(\\.[0-9]*)?[eE](-?[0-9]+)))$")
)
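// StringToRunes decodes a UTF-8 string into its component runes;
// it behaves like the built-in conversion []rune(str).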
func StringToRunes(str string) []rune {
    b := []byte(str)
    runes := make([]rune, 0)
    for len(b) > 0 {
        r, size := utf8.DecodeRune(b)
        runes = append(runes, r)
        b = b[size:]
    }
    return runes
}
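// EscapeChar maps the character that follows a backslash in a string or
// character literal to the rune it denotes.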
func EscapeChar(char rune) (rune, error) {
    switch char {
    case 'n':
        return '\n', nil
    case 'r':
        return '\r', nil
    case 'a':
        return '\a', nil
    case 't':
        return '\t', nil
    case '\\':
        return '\\', nil
    case '"':
        return '"', nil
    case '\'':
        return '\'', nil
    case '#':
        return '#', nil
    }
    return ' ', errors.New("invalid escape sequence")
}
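// DecodeChar decodes a character literal: two runes for a plain char
// such as #a, three for an escaped one such as #\n.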
func DecodeChar(atom string) (string, error) {
    runes := StringToRunes(atom)
    if len(runes) == 3 {
        char, err := EscapeChar(runes[2])
        return string(char), err
    }
    if len(runes) == 2 {
        return string(runes[1:2]), nil
    }
    return "", errors.New("not a char literal")
}
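// DecodeAtom classifies a bare atom by trying each literal pattern in
// order; anything without a '#' or quote falls back to a symbol, and
// '#'-prefixed atoms are checked as character literals.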
func DecodeAtom(atom string) (Token, error) {
    if atom == "." {
        return Token{TokenDot, ""}, nil
    }
    if BoolRegex.MatchString(atom) {
        return Token{TokenBool, atom}, nil
    }
    if DecimalRegex.MatchString(atom) {
        return Token{TokenDecimal, atom}, nil
    }
    if HexRegex.MatchString(atom) {
        return Token{TokenHex, atom[2:]}, nil
    }
    if OctRegex.MatchString(atom) {
        return Token{TokenOct, atom[2:]}, nil
    }
    if BinaryRegex.MatchString(atom) {
        return Token{TokenBinary, atom[2:]}, nil
    }
    if FloatRegex.MatchString(atom) {
        return Token{TokenFloat, atom}, nil
    }
    if SymbolRegex.MatchString(atom) {
        return Token{TokenSymbol, atom}, nil
    }
    if CharRegex.MatchString(atom) {
        char, err := DecodeChar(atom)
        if err != nil {
            return Token{}, err
        }
        return Token{TokenChar, char}, nil
    }
    return Token{}, errors.New("unrecognized atom")
}
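// dumpBuffer decodes any atom accumulated in the buffer and appends the
// resulting token to the queue.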
func (lexer *Lexer) dumpBuffer() error {
    if lexer.buffer.Len() <= 0 {
        return nil
    }
    tok, err := DecodeAtom(lexer.buffer.String())
    if err != nil {
        return err
    }
    lexer.buffer.Reset()
    lexer.tokens = append(lexer.tokens, tok)
    return nil
}
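// dumpString flushes the buffer as a string-literal token.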
func (lexer *Lexer) dumpString() {
    str := lexer.buffer.String()
    lexer.buffer.Reset()
    lexer.tokens = append(lexer.tokens, Token{TokenString, str})
}
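// DecodeBrace maps a bracket rune to its delimiter token.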
func DecodeBrace(brace rune) Token {
    switch brace {
    case '(':
        return Token{TokenLParen, ""}
    case ')':
        return Token{TokenRParen, ""}
    case '[':
        return Token{TokenLSquare, ""}
    case ']':
        return Token{TokenRSquare, ""}
    case '{':
        return Token{TokenLCurly, ""}
    case '}':
        return Token{TokenRCurly, ""}
    }
    return Token{TokenEnd, ""}
}
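// LexNextRune feeds one rune through the lexer state machine, appending
// any completed tokens to the queue. Comments run to end of line, string
// bodies accumulate in the buffer, and '~' is held back one rune to
// distinguish unquote (~) from unquote-splicing (~@).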
func (lexer *Lexer) LexNextRune(r rune) error {
    if lexer.state == LexerComment {
        if r == '\n' {
            lexer.state = LexerNormal
        }
        return nil
    }
    if lexer.state == LexerStrLit {
        if r == '\\' {
            lexer.state = LexerStrEscaped
            return nil
        }
        if r == '"' {
            lexer.dumpString()
            lexer.state = LexerNormal
            return nil
        }
        lexer.buffer.WriteRune(r)
        return nil
    }
    if lexer.state == LexerStrEscaped {
        char, err := EscapeChar(r)
        if err != nil {
            return err
        }
        lexer.buffer.WriteRune(char)
        lexer.state = LexerStrLit
        return nil
    }
    if lexer.state == LexerUnquote {
        if r == '@' {
            lexer.tokens = append(
                lexer.tokens, Token{TokenTildeAt, ""})
        } else {
            lexer.tokens = append(
                lexer.tokens, Token{TokenTilde, ""})
            lexer.buffer.WriteRune(r)
        }
        lexer.state = LexerNormal
        return nil
    }
    if r == '"' {
        if lexer.buffer.Len() > 0 {
            return errors.New("unexpected double quote")
        }
        lexer.state = LexerStrLit
        return nil
    }
    if r == ';' {
        lexer.state = LexerComment
        return nil
    }
    if r == '\'' {
        if lexer.buffer.Len() > 0 {
            return errors.New("unexpected quote")
        }
        lexer.tokens = append(lexer.tokens, Token{TokenQuote, ""})
        return nil
    }
    if r == '`' {
        if lexer.buffer.Len() > 0 {
            return errors.New("unexpected backtick")
        }
        lexer.tokens = append(lexer.tokens, Token{TokenBacktick, ""})
        return nil
    }
    if r == '~' {
        if lexer.buffer.Len() > 0 {
            return errors.New("unexpected tilde")
        }
        lexer.state = LexerUnquote
        return nil
    }
    if r == '(' || r == ')' || r == '[' || r == ']' || r == '{' || r == '}' {
        err := lexer.dumpBuffer()
        if err != nil {
            return err
        }
        lexer.tokens = append(lexer.tokens, DecodeBrace(r))
        return nil
    }
    if r == ' ' || r == '\n' || r == '\t' || r == '\r' {
        if r == '\n' {
            lexer.linenum++
        }
        err := lexer.dumpBuffer()
        if err != nil {
            return err
        }
        return nil
    }
    _, err := lexer.buffer.WriteRune(r)
    if err != nil {
        return err
    }
    return nil
}
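// PeekNextToken returns the next token without consuming it, reading
// runes from the stream as needed. Any read error is treated as end of
// input, after which TokenEnd is returned.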
func (lexer *Lexer) PeekNextToken() (Token, error) {
    if lexer.finished {
        return Token{TokenEnd, ""}, nil
    }
    for len(lexer.tokens) == 0 {
        r, _, err := lexer.stream.ReadRune()
        if err != nil {
            // Stream exhausted: flush any atom still in the buffer.
            // The dumpBuffer error must be checked here; ignoring it
            // would leave the token queue empty and panic below.
            lexer.finished = true
            if lexer.buffer.Len() > 0 {
                if err := lexer.dumpBuffer(); err != nil {
                    return Token{TokenEnd, ""}, err
                }
                return lexer.tokens[0], nil
            }
            return Token{TokenEnd, ""}, nil
        }
        err = lexer.LexNextRune(r)
        if err != nil {
            return Token{TokenEnd, ""}, err
        }
    }
    tok := lexer.tokens[0]
    return tok, nil
}
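// GetNextToken returns the next token and consumes it.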
func (lexer *Lexer) GetNextToken() (Token, error) {
    tok, err := lexer.PeekNextToken()
    if err != nil || tok.typ == TokenEnd {
        return Token{TokenEnd, ""}, err
    }
    lexer.tokens = lexer.tokens[1:]
    return tok, nil
}
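// NewLexerFromStream creates a lexer that reads runes from stream,
// starting at line 1.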
func NewLexerFromStream(stream io.RuneReader) *Lexer {
    return &Lexer{
        tokens:   make([]Token, 0, 10),
        buffer:   new(bytes.Buffer),
        state:    LexerNormal,
        stream:   stream,
        linenum:  1,
        finished: false,
    }
}
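// Linenum reports the current line number, counted by newline runes seen.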
func (lexer *Lexer) Linenum() int {
    return lexer.linenum
}
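
// Usage sketch (not part of the original file; the input text is
// hypothetical): drive the lexer from any io.RuneReader, such as a
// strings.Reader, and pull tokens until TokenEnd:
//
//	lexer := NewLexerFromStream(strings.NewReader(`(def x "hi") ; comment`))
//	for {
//		tok, err := lexer.GetNextToken()
//		if err != nil {
//			panic(err)
//		}
//		if tok.typ == TokenEnd {
//			break
//		}
//		fmt.Println(tok) // prints each token's source form
//	}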