lexer.go 5.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305
  1. package lexer
  2. import (
  3. "fmt"
  4. "strings"
  5. "unicode"
  6. "unicode/utf8"
  7. )
  8. type Pos int
  9. type Item struct {
  10. Type ItemType
  11. Pos Pos
  12. Value string
  13. }
  14. type ItemType int
  15. const (
  16. ItemError ItemType = iota
  17. ItemEOF
  18. ItemLeftParen
  19. ItemRightParen
  20. ItemLeftVect
  21. ItemRightVect
  22. ItemIdent
  23. ItemString
  24. ItemChar
  25. ItemFloat
  26. ItemInt
  27. ItemComplex
  28. ItemQuote
  29. ItemQuasiQuote
  30. ItemUnquote
  31. ItemUnquoteSplice
  32. )
  33. const EOF = -1
  34. type stateFn func(*Lexer) stateFn
  35. type Lexer struct {
  36. name string
  37. input string
  38. state stateFn
  39. pos Pos
  40. start Pos
  41. width Pos
  42. lastPos Pos
  43. items chan Item
  44. parenDepth int
  45. vectDepth int
  46. }
  47. // next returns the next rune in the input.
  48. func (l *Lexer) next() rune {
  49. if int(l.pos) >= len(l.input) {
  50. l.width = 0
  51. return EOF
  52. }
  53. r, w := utf8.DecodeRuneInString(l.input[l.pos:])
  54. l.width = Pos(w)
  55. l.pos += l.width
  56. return r
  57. }
  58. // peek returns but does not consume the next rune in the input.
  59. func (l *Lexer) peek() rune {
  60. r := l.next()
  61. l.backup()
  62. return r
  63. }
  64. // backup steps back one rune. Can only be called once per call of next.
  65. func (l *Lexer) backup() {
  66. l.pos -= l.width
  67. }
  68. // emit passes an Item back to the client.
  69. func (l *Lexer) emit(t ItemType) {
  70. l.items <- Item{t, l.start, l.input[l.start:l.pos]}
  71. l.start = l.pos
  72. }
  73. func (l *Lexer) ignore() {
  74. l.start = l.pos
  75. }
  76. // accept consumes the next rune if it's from the valid set.
  77. func (l *Lexer) accept(valid string) bool {
  78. if strings.IndexRune(valid, l.next()) >= 0 {
  79. return true
  80. }
  81. l.backup()
  82. return false
  83. }
  84. // acceptRun consumes a run of runes from the valid set.
  85. func (l *Lexer) acceptRun(valid string) {
  86. for strings.IndexRune(valid, l.next()) >= 0 {
  87. }
  88. l.backup()
  89. }
  90. func (l *Lexer) errorf(format string, args ...interface{}) stateFn {
  91. l.items <- Item{ItemError, l.start, fmt.Sprintf(format, args...)}
  92. return nil
  93. }
  94. func (l *Lexer) NextItem() Item {
  95. item := <-l.items
  96. l.lastPos = item.Pos
  97. return item
  98. }
  99. func Lex(name, input string) *Lexer {
  100. l := &Lexer{
  101. name: name,
  102. input: input,
  103. items: make(chan Item),
  104. }
  105. go l.run()
  106. return l
  107. }
  108. func (l *Lexer) run() {
  109. for l.state = lexWhitespace; l.state != nil; {
  110. l.state = l.state(l)
  111. }
  112. close(l.items)
  113. }
  114. func lexLeftVect(l *Lexer) stateFn {
  115. l.emit(ItemLeftVect)
  116. return lexWhitespace
  117. }
  118. func lexRightVect(l *Lexer) stateFn {
  119. l.emit(ItemRightVect)
  120. return lexWhitespace
  121. }
  122. // lexes an open parenthesis
  123. func lexLeftParen(l *Lexer) stateFn {
  124. l.emit(ItemLeftParen)
  125. return lexWhitespace
  126. }
  127. func lexWhitespace(l *Lexer) stateFn {
  128. for r := l.next(); isSpace(r) || r == '\n'; l.next() {
  129. r = l.peek()
  130. }
  131. l.backup()
  132. l.ignore()
  133. switch r := l.next(); {
  134. case r == EOF:
  135. l.emit(ItemEOF)
  136. return nil
  137. case r == '(':
  138. return lexLeftParen
  139. case r == ')':
  140. return lexRightParen
  141. case r == '[':
  142. return lexLeftVect
  143. case r == ']':
  144. return lexRightVect
  145. case r == '"':
  146. return lexString
  147. case r == '+' || r == '-' || ('0' <= r && r <= '9'):
  148. return lexNumber
  149. case r == ';':
  150. return lexComment
  151. case isAlphaNumeric(r):
  152. return lexIdentifier
  153. default:
  154. panic(fmt.Sprintf("don't know what to do with: %q", r))
  155. }
  156. }
  157. func lexString(l *Lexer) stateFn {
  158. Loop:
  159. for {
  160. switch l.next() {
  161. case '\\':
  162. if r := l.next(); r != EOF {
  163. break
  164. }
  165. fallthrough
  166. case EOF:
  167. return l.errorf("unterminated quoted string")
  168. case '"':
  169. break Loop
  170. }
  171. }
  172. l.emit(ItemString)
  173. return lexWhitespace
  174. }
  175. func lexIdentifier(l *Lexer) stateFn {
  176. Loop:
  177. for {
  178. switch r := l.next(); {
  179. case isAlphaNumeric(r):
  180. // absorb it!
  181. default:
  182. l.backup()
  183. break Loop
  184. }
  185. }
  186. l.emit(ItemIdent)
  187. return lexWhitespace
  188. }
  189. // lex a close parenthesis
  190. func lexRightParen(l *Lexer) stateFn {
  191. l.emit(ItemRightParen)
  192. return lexWhitespace
  193. }
  194. // lex a comment, comment delimiter is known to be already read
  195. func lexComment(l *Lexer) stateFn {
  196. i := strings.Index(l.input[l.pos:], "\n")
  197. l.pos += Pos(i)
  198. l.ignore()
  199. return lexWhitespace
  200. }
  201. func lexNumber(l *Lexer) stateFn {
  202. if !l.scanNumber() {
  203. return l.errorf("bad number syntax: %q", l.input[l.start:l.pos])
  204. }
  205. if sign := l.peek(); sign == '+' || sign == '-' {
  206. // Complex: 1+2i. No spaces, must end in 'i'.
  207. if !l.scanNumber() || l.input[l.pos-1] != 'i' {
  208. return l.errorf("bad number syntax: %q", l.input[l.start:l.pos])
  209. }
  210. l.emit(ItemComplex)
  211. } else if strings.ContainsRune(l.input[l.start:l.pos], '.') {
  212. l.emit(ItemFloat)
  213. } else {
  214. l.emit(ItemInt)
  215. }
  216. return lexWhitespace
  217. }
  218. func (l *Lexer) scanNumber() bool {
  219. // Optional leading sign.
  220. l.accept("+-")
  221. // Is it hex?
  222. digits := "0123456789"
  223. if l.accept("0") && l.accept("xX") {
  224. digits = "0123456789abcdefABCDEF"
  225. }
  226. l.acceptRun(digits)
  227. if l.accept(".") {
  228. l.acceptRun(digits)
  229. }
  230. if l.accept("eE") {
  231. l.accept("+-")
  232. l.acceptRun("0123456789")
  233. }
  234. // Is it imaginary?
  235. l.accept("i")
  236. // Next thing mustn't be alphanumeric.
  237. if r := l.peek(); isAlphaNumeric(r) {
  238. l.next()
  239. return false
  240. }
  241. return true
  242. }
  243. // isSpace reports whether r is a space character.
  244. func isSpace(r rune) bool {
  245. return r == ' ' || r == '\t'
  246. }
  247. // isEndOfLine reports whether r is an end-of-line character.
  248. func isEndOfLine(r rune) bool {
  249. return r == '\r' || r == '\n'
  250. }
  251. // isAlphaNumeric reports whether r is an alphabetic, digit, or underscore.
  252. func isAlphaNumeric(r rune) bool {
  253. return r == '-' || r == ':' || r == '/' || unicode.IsLetter(r) || unicode.IsDigit(r)
  254. }
  255. func debug(msg string) {
  256. fmt.Println(msg)
  257. }