lexer.go 5.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294
  1. package lexer
  2. import (
  3. "fmt"
  4. "strings"
  5. "unicode"
  6. "unicode/utf8"
  7. )
  8. type Pos int
  9. type Item struct {
  10. Type ItemType
  11. Pos Pos
  12. Value string
  13. }
  14. type ItemType int
  15. const (
  16. ItemError ItemType = iota
  17. ItemEOF
  18. ItemLeftParen
  19. ItemRightParen
  20. ItemLeftVect
  21. ItemRightVect
  22. ItemIdent
  23. ItemString
  24. ItemChar
  25. ItemFloat
  26. ItemInt
  27. ItemComplex
  28. ItemQuote
  29. ItemQuasiQuote
  30. ItemUnquote
  31. ItemUnquoteSplice
  32. )
  33. const EOF = -1
  34. type stateFn func(*Lexer) stateFn
  35. type Lexer struct {
  36. name string
  37. input string
  38. state stateFn
  39. pos Pos
  40. start Pos
  41. width Pos
  42. lastPos Pos
  43. items chan Item
  44. parenDepth int
  45. vectDepth int
  46. }
  47. // next returns the next rune in the input.
  48. func (l *Lexer) next() rune {
  49. if int(l.pos) >= len(l.input) {
  50. l.width = 0
  51. return EOF
  52. }
  53. r, w := utf8.DecodeRuneInString(l.input[l.pos:])
  54. l.width = Pos(w)
  55. l.pos += l.width
  56. return r
  57. }
  58. // peek returns but does not consume the next rune in the input.
  59. func (l *Lexer) peek() rune {
  60. r := l.next()
  61. l.backup()
  62. return r
  63. }
  64. // backup steps back one rune. Can only be called once per call of next.
  65. func (l *Lexer) backup() {
  66. l.pos -= l.width
  67. }
  68. // emit passes an Item back to the client.
  69. func (l *Lexer) emit(t ItemType) {
  70. l.items <- Item{t, l.start, l.input[l.start:l.pos]}
  71. l.start = l.pos
  72. }
  73. func (l *Lexer) ignore() {
  74. l.start = l.pos
  75. }
  76. // accept consumes the next rune if it's from the valid set.
  77. func (l *Lexer) accept(valid string) bool {
  78. if strings.IndexRune(valid, l.next()) >= 0 {
  79. return true
  80. }
  81. l.backup()
  82. return false
  83. }
  84. // acceptRun consumes a run of runes from the valid set.
  85. func (l *Lexer) acceptRun(valid string) {
  86. for strings.IndexRune(valid, l.next()) >= 0 {
  87. }
  88. l.backup()
  89. }
  90. func (l *Lexer) errorf(format string, args ...interface{}) stateFn {
  91. l.items <- Item{ItemError, l.start, fmt.Sprintf(format, args...)}
  92. return nil
  93. }
  94. func (l *Lexer) NextItem() Item {
  95. item := <-l.items
  96. l.lastPos = item.Pos
  97. return item
  98. }
  99. func Lex(name, input string) *Lexer {
  100. l := &Lexer{
  101. name: name,
  102. input: input,
  103. items: make(chan Item),
  104. }
  105. go l.run()
  106. return l
  107. }
  108. func (l *Lexer) run() {
  109. for l.state = lexWhitespace; l.state != nil; {
  110. l.state = l.state(l)
  111. }
  112. close(l.items)
  113. }
  114. func lexLeftVect(l *Lexer) stateFn {
  115. l.emit(ItemLeftVect)
  116. return lexWhitespace
  117. }
  118. func lexRightVect(l *Lexer) stateFn {
  119. l.emit(ItemRightVect)
  120. return lexWhitespace
  121. }
  122. // lexes an open parenthesis
  123. func lexLeftParen(l *Lexer) stateFn {
  124. l.emit(ItemLeftParen)
  125. return lexWhitespace
  126. }
  127. func lexWhitespace(l *Lexer) stateFn {
  128. for r := l.next(); isSpace(r) || r == '\n'; l.next() {
  129. r = l.peek()
  130. }
  131. l.backup()
  132. l.ignore()
  133. switch r := l.next(); {
  134. case r == EOF:
  135. l.emit(ItemEOF)
  136. return nil
  137. case r == '(':
  138. return lexLeftParen
  139. case r == ')':
  140. return lexRightParen
  141. case r == '[':
  142. return lexLeftVect
  143. case r == ']':
  144. return lexRightVect
  145. case r == '"':
  146. return lexString
  147. case r == '+' || r == '-' || ('0' <= r && r <= '9'):
  148. return lexNumber
  149. case r == ';':
  150. return lexComment
  151. case isAlphaNumeric(r):
  152. return lexIdentifier
  153. default:
  154. panic(fmt.Sprintf("don't know what to do with: %q", r))
  155. }
  156. }
  157. func lexString(l *Lexer) stateFn {
  158. for r := l.next(); r != '"'; r = l.next() {
  159. if r == '\\' {
  160. r = l.next()
  161. }
  162. if r == EOF {
  163. return l.errorf("unterminated quoted string")
  164. }
  165. }
  166. l.emit(ItemString)
  167. return lexWhitespace
  168. }
  169. func lexIdentifier(l *Lexer) stateFn {
  170. for r := l.next(); isAlphaNumeric(r); r = l.next() {
  171. }
  172. l.backup()
  173. l.emit(ItemIdent)
  174. return lexWhitespace
  175. }
  176. // lex a close parenthesis
  177. func lexRightParen(l *Lexer) stateFn {
  178. l.emit(ItemRightParen)
  179. return lexWhitespace
  180. }
  181. // lex a comment, comment delimiter is known to be already read
  182. func lexComment(l *Lexer) stateFn {
  183. i := strings.Index(l.input[l.pos:], "\n")
  184. l.pos += Pos(i)
  185. l.ignore()
  186. return lexWhitespace
  187. }
  188. func lexNumber(l *Lexer) stateFn {
  189. if !l.scanNumber() {
  190. return l.errorf("bad number syntax: %q", l.input[l.start:l.pos])
  191. }
  192. if l.start+1 == l.pos {
  193. return lexIdentifier
  194. }
  195. if sign := l.peek(); sign == '+' || sign == '-' {
  196. // Complex: 1+2i. No spaces, must end in 'i'.
  197. if !l.scanNumber() || l.input[l.pos-1] != 'i' {
  198. return l.errorf("bad number syntax: %q", l.input[l.start:l.pos])
  199. }
  200. l.emit(ItemComplex)
  201. } else if strings.ContainsRune(l.input[l.start:l.pos], '.') {
  202. l.emit(ItemFloat)
  203. } else {
  204. l.emit(ItemInt)
  205. }
  206. return lexWhitespace
  207. }
  208. func (l *Lexer) scanNumber() bool {
  209. // Optional leading sign.
  210. l.accept("+-")
  211. // Is it hex?
  212. digits := "0123456789"
  213. if l.accept("0") && l.accept("xX") {
  214. digits = "0123456789abcdefABCDEF"
  215. }
  216. l.acceptRun(digits)
  217. if l.accept(".") {
  218. l.acceptRun(digits)
  219. }
  220. if l.accept("eE") {
  221. l.accept("+-")
  222. l.acceptRun("0123456789")
  223. }
  224. // Is it imaginary?
  225. l.accept("i")
  226. // Next thing mustn't be alphanumeric.
  227. if r := l.peek(); isAlphaNumeric(r) {
  228. l.next()
  229. return false
  230. }
  231. return true
  232. }
  233. // isSpace reports whether r is a space character.
  234. func isSpace(r rune) bool {
  235. return r == ' ' || r == '\t'
  236. }
  237. // isEndOfLine reports whether r is an end-of-line character.
  238. func isEndOfLine(r rune) bool {
  239. return r == '\r' || r == '\n'
  240. }
  241. // isAlphaNumeric reports whether r is a valid rune for an identifier.
  242. func isAlphaNumeric(r rune) bool {
  243. return r == '>' || r == '<' || r == '=' || r == '-' || r == '+' || r == '*' || r == '&' || r == '_' || r == '/' || unicode.IsLetter(r) || unicode.IsDigit(r)
  244. }
  245. func debug(msg string) {
  246. fmt.Println(msg)
  247. }