123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458 |
- /*
- Copyright 2012 Google Inc. All Rights Reserved.
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- http://www.apache.org/licenses/LICENSE-2.0
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- */
- package shlex
/*
Package shlex implements a simple lexer which splits input into tokens
using shell-style rules for quoting and commenting.
*/
- import (
- "bufio"
- "errors"
- "fmt"
- "io"
- "strings"
- )
// A TokenType classifies a whole token: word, space, comment, or unknown.
type TokenType int

// A RuneTokenType classifies a single UTF-8 rune: ordinary character,
// quote, space, escape, comment marker, and so on.
type RuneTokenType int

// lexerState identifies a state of the tokenizer's internal state machine.
type lexerState int

// A Token is a (type, value) pair produced by the tokenizer.
type Token struct {
	tokenType TokenType
	value     string
}

// Equal reports whether two tokens have the same type and the same value.
// A nil token never equals another token, even another nil one.
func (a *Token) Equal(b *Token) bool {
	if a == nil || b == nil {
		return false
	}
	return a.tokenType == b.tokenType && a.value == b.value
}
const (
	// Rune classes consumed by the default classifier. RUNE_CHAR lists
	// every rune treated as an ordinary word character; runes appearing
	// in none of these strings classify as RUNETOKEN_UNKNOWN.
	RUNE_CHAR              string = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789._-,/@$*()+=><:;&^%~|!?[]{}"
	RUNE_SPACE             string = " \t\r\n"
	RUNE_ESCAPING_QUOTE    string = "\""
	RUNE_NONESCAPING_QUOTE string = "'"
	RUNE_ESCAPE                   = "\\"
	RUNE_COMMENT                  = "#"

	// Classes assigned to individual runes by a TokenClassifier.
	// RUNETOKEN_UNKNOWN is the zero value, so it is what a map lookup
	// yields for any rune the classifier was not told about.
	RUNETOKEN_UNKNOWN           RuneTokenType = 0
	RUNETOKEN_CHAR              RuneTokenType = 1
	RUNETOKEN_SPACE             RuneTokenType = 2
	RUNETOKEN_ESCAPING_QUOTE    RuneTokenType = 3
	RUNETOKEN_NONESCAPING_QUOTE RuneTokenType = 4
	RUNETOKEN_ESCAPE            RuneTokenType = 5
	RUNETOKEN_COMMENT           RuneTokenType = 6
	RUNETOKEN_EOF               RuneTokenType = 7

	// Top-level token types produced by the tokenizer.
	TOKEN_UNKNOWN TokenType = 0
	TOKEN_WORD    TokenType = 1
	TOKEN_SPACE   TokenType = 2
	TOKEN_COMMENT TokenType = 3

	// States of the tokenizer's internal state machine (see scanStream).
	STATE_START           lexerState = 0 // no runes of the token read yet
	STATE_INWORD          lexerState = 1 // in a regular word
	STATE_ESCAPING        lexerState = 2 // just saw an escape, outside quotes
	STATE_ESCAPING_QUOTED lexerState = 3 // just saw an escape, inside double quotes
	STATE_QUOTED_ESCAPING lexerState = 4 // inside escaping (double) quotes
	STATE_QUOTED          lexerState = 5 // inside non-escaping (single) quotes
	STATE_COMMENT         lexerState = 6 // inside a comment, runs to end of line

	// Initial capacity of the rune buffer used while building a token.
	INITIAL_TOKEN_CAPACITY int = 100
)
- /*
- A type for classifying characters. This allows for different sorts of
- classifiers - those accepting extended non-ascii chars, or strict posix
- compatibility, for example.
- */
- type TokenClassifier struct {
- typeMap map[int32]RuneTokenType
- }
- func addRuneClass(typeMap *map[int32]RuneTokenType, runes string, tokenType RuneTokenType) {
- for _, rune := range runes {
- (*typeMap)[int32(rune)] = tokenType
- }
- }
- /*
- Create a new classifier for basic ASCII characters.
- */
- func NewDefaultClassifier() *TokenClassifier {
- typeMap := map[int32]RuneTokenType{}
- addRuneClass(&typeMap, RUNE_CHAR, RUNETOKEN_CHAR)
- addRuneClass(&typeMap, RUNE_SPACE, RUNETOKEN_SPACE)
- addRuneClass(&typeMap, RUNE_ESCAPING_QUOTE, RUNETOKEN_ESCAPING_QUOTE)
- addRuneClass(&typeMap, RUNE_NONESCAPING_QUOTE, RUNETOKEN_NONESCAPING_QUOTE)
- addRuneClass(&typeMap, RUNE_ESCAPE, RUNETOKEN_ESCAPE)
- addRuneClass(&typeMap, RUNE_COMMENT, RUNETOKEN_COMMENT)
- return &TokenClassifier{
- typeMap: typeMap}
- }
- func (classifier *TokenClassifier) ClassifyRune(rune int32) RuneTokenType {
- return classifier.typeMap[rune]
- }
- /*
- A type for turning an input stream in to a sequence of strings. Whitespace and
- comments are skipped.
- */
- type Lexer struct {
- tokenizer *Tokenizer
- }
- /*
- Create a new lexer.
- */
- func NewLexer(r io.Reader) (*Lexer, error) {
- tokenizer, err := NewTokenizer(r)
- if err != nil {
- return nil, err
- }
- lexer := &Lexer{tokenizer: tokenizer}
- return lexer, nil
- }
- /*
- Return the next word, and an error value. If there are no more words, the error
- will be io.EOF.
- */
- func (l *Lexer) NextWord() (string, error) {
- var token *Token
- var err error
- for {
- token, err = l.tokenizer.NextToken()
- if err != nil {
- return "", err
- }
- switch token.tokenType {
- case TOKEN_WORD:
- {
- return token.value, nil
- }
- case TOKEN_COMMENT:
- {
- // skip comments
- }
- default:
- {
- panic(fmt.Sprintf("Unknown token type: %v", token.tokenType))
- }
- }
- }
- return "", io.EOF
- }
- /*
- A type for turning an input stream in to a sequence of typed tokens.
- */
- type Tokenizer struct {
- input *bufio.Reader
- classifier *TokenClassifier
- }
- /*
- Create a new tokenizer.
- */
- func NewTokenizer(r io.Reader) (*Tokenizer, error) {
- input := bufio.NewReader(r)
- classifier := NewDefaultClassifier()
- tokenizer := &Tokenizer{
- input: input,
- classifier: classifier}
- return tokenizer, nil
- }
- /*
- Scan the stream for the next token.
- This uses an internal state machine. It will panic if it encounters a character
- which it does not know how to handle.
- */
- func (t *Tokenizer) scanStream() (*Token, error) {
- state := STATE_START
- var tokenType TokenType
- value := make([]int32, 0, INITIAL_TOKEN_CAPACITY)
- var (
- nextRune int32
- nextRuneType RuneTokenType
- err error
- )
- SCAN:
- for {
- nextRune, _, err = t.input.ReadRune()
- nextRuneType = t.classifier.ClassifyRune(nextRune)
- if err != nil {
- if err == io.EOF {
- nextRuneType = RUNETOKEN_EOF
- err = nil
- } else {
- return nil, err
- }
- }
- switch state {
- case STATE_START: // no runes read yet
- {
- switch nextRuneType {
- case RUNETOKEN_EOF:
- {
- return nil, io.EOF
- }
- case RUNETOKEN_CHAR:
- {
- tokenType = TOKEN_WORD
- value = append(value, nextRune)
- state = STATE_INWORD
- }
- case RUNETOKEN_SPACE:
- {
- }
- case RUNETOKEN_ESCAPING_QUOTE:
- {
- tokenType = TOKEN_WORD
- state = STATE_QUOTED_ESCAPING
- }
- case RUNETOKEN_NONESCAPING_QUOTE:
- {
- tokenType = TOKEN_WORD
- state = STATE_QUOTED
- }
- case RUNETOKEN_ESCAPE:
- {
- tokenType = TOKEN_WORD
- state = STATE_ESCAPING
- }
- case RUNETOKEN_COMMENT:
- {
- tokenType = TOKEN_COMMENT
- state = STATE_COMMENT
- }
- default:
- {
- return nil, errors.New(fmt.Sprintf("Unknown rune: %v", nextRune))
- }
- }
- }
- case STATE_INWORD: // in a regular word
- {
- switch nextRuneType {
- case RUNETOKEN_EOF:
- {
- break SCAN
- }
- case RUNETOKEN_CHAR, RUNETOKEN_COMMENT:
- {
- value = append(value, nextRune)
- }
- case RUNETOKEN_SPACE:
- {
- t.input.UnreadRune()
- break SCAN
- }
- case RUNETOKEN_ESCAPING_QUOTE:
- {
- state = STATE_QUOTED_ESCAPING
- }
- case RUNETOKEN_NONESCAPING_QUOTE:
- {
- state = STATE_QUOTED
- }
- case RUNETOKEN_ESCAPE:
- {
- state = STATE_ESCAPING
- }
- default:
- {
- return nil, errors.New(fmt.Sprintf("Uknown rune: %v", nextRune))
- }
- }
- }
- case STATE_ESCAPING: // the next rune after an escape character
- {
- switch nextRuneType {
- case RUNETOKEN_EOF:
- {
- err = errors.New("EOF found after escape character")
- break SCAN
- }
- case RUNETOKEN_CHAR, RUNETOKEN_SPACE, RUNETOKEN_ESCAPING_QUOTE, RUNETOKEN_NONESCAPING_QUOTE, RUNETOKEN_ESCAPE, RUNETOKEN_COMMENT:
- {
- state = STATE_INWORD
- value = append(value, nextRune)
- }
- default:
- {
- return nil, errors.New(fmt.Sprintf("Uknown rune: %v", nextRune))
- }
- }
- }
- case STATE_ESCAPING_QUOTED: // the next rune after an escape character, in double quotes
- {
- switch nextRuneType {
- case RUNETOKEN_EOF:
- {
- err = errors.New("EOF found after escape character")
- break SCAN
- }
- case RUNETOKEN_CHAR, RUNETOKEN_SPACE, RUNETOKEN_ESCAPING_QUOTE, RUNETOKEN_NONESCAPING_QUOTE, RUNETOKEN_ESCAPE, RUNETOKEN_COMMENT:
- {
- state = STATE_QUOTED_ESCAPING
- value = append(value, nextRune)
- }
- default:
- {
- return nil, errors.New(fmt.Sprintf("Uknown rune: %v", nextRune))
- }
- }
- }
- case STATE_QUOTED_ESCAPING: // in escaping double quotes
- {
- switch nextRuneType {
- case RUNETOKEN_EOF:
- {
- err = errors.New("EOF found when expecting closing quote.")
- break SCAN
- }
- case RUNETOKEN_CHAR, RUNETOKEN_UNKNOWN, RUNETOKEN_SPACE, RUNETOKEN_NONESCAPING_QUOTE, RUNETOKEN_COMMENT:
- {
- value = append(value, nextRune)
- }
- case RUNETOKEN_ESCAPING_QUOTE:
- {
- state = STATE_INWORD
- }
- case RUNETOKEN_ESCAPE:
- {
- state = STATE_ESCAPING_QUOTED
- }
- default:
- {
- return nil, errors.New(fmt.Sprintf("Uknown rune: %v", nextRune))
- }
- }
- }
- case STATE_QUOTED: // in non-escaping single quotes
- {
- switch nextRuneType {
- case RUNETOKEN_EOF:
- {
- err = errors.New("EOF found when expecting closing quote.")
- break SCAN
- }
- case RUNETOKEN_CHAR, RUNETOKEN_UNKNOWN, RUNETOKEN_SPACE, RUNETOKEN_ESCAPING_QUOTE, RUNETOKEN_ESCAPE, RUNETOKEN_COMMENT:
- {
- value = append(value, nextRune)
- }
- case RUNETOKEN_NONESCAPING_QUOTE:
- {
- state = STATE_INWORD
- }
- default:
- {
- return nil, errors.New(fmt.Sprintf("Uknown rune: %v", nextRune))
- }
- }
- }
- case STATE_COMMENT:
- {
- switch nextRuneType {
- case RUNETOKEN_EOF:
- {
- break SCAN
- }
- case RUNETOKEN_CHAR, RUNETOKEN_UNKNOWN, RUNETOKEN_ESCAPING_QUOTE, RUNETOKEN_ESCAPE, RUNETOKEN_COMMENT, RUNETOKEN_NONESCAPING_QUOTE:
- {
- value = append(value, nextRune)
- }
- case RUNETOKEN_SPACE:
- {
- if nextRune == '\n' {
- state = STATE_START
- break SCAN
- } else {
- value = append(value, nextRune)
- }
- }
- default:
- {
- return nil, errors.New(fmt.Sprintf("Uknown rune: %v", nextRune))
- }
- }
- }
- default:
- {
- panic(fmt.Sprintf("Unexpected state: %v", state))
- }
- }
- }
- token := &Token{
- tokenType: tokenType,
- value: string(value)}
- return token, err
- }
- /*
- Return the next token in the stream, and an error value. If there are no more
- tokens available, the error value will be io.EOF.
- */
- func (t *Tokenizer) NextToken() (*Token, error) {
- return t.scanStream()
- }
- /*
- Split a string in to a slice of strings, based upon shell-style rules for
- quoting, escaping, and spaces.
- */
- func Split(s string) ([]string, error) {
- l, err := NewLexer(strings.NewReader(s))
- if err != nil {
- return nil, err
- }
- subStrings := []string{}
- for {
- word, err := l.NextWord()
- if err != nil {
- if err == io.EOF {
- return subStrings, nil
- }
- return subStrings, err
- }
- subStrings = append(subStrings, word)
- }
- return subStrings, nil
- }
|