Source file src/go/scanner/scanner.go

     1  // Copyright 2009 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // Package scanner implements a scanner for Go source text.
     6  // It takes a []byte as source which can then be tokenized
     7  // through repeated calls to the Scan method.
     8  package scanner
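
// A minimal sketch of the intended use, written from a client package; the
// source text and the file name "example.go" are arbitrary choices:
//
//	src := []byte("cos(x) + 1i*sin(x) // Euler")
//	fset := token.NewFileSet()
//	file := fset.AddFile("example.go", fset.Base(), len(src))
//
//	var s scanner.Scanner
//	s.Init(file, src, nil /* no error handler */, scanner.ScanComments)
//	for {
//		pos, tok, lit := s.Scan()
//		if tok == token.EOF {
//			break
//		}
//		fmt.Printf("%s\t%s\t%q\n", fset.Position(pos), tok, lit)
//	}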
     9  
    10  import (
    11  	"bytes"
    12  	"fmt"
    13  	"go/internal/scannerhooks"
    14  	"go/token"
    15  	"path/filepath"
    16  	"strconv"
    17  	"unicode"
    18  	"unicode/utf8"
    19  )
    20  
    21  // An ErrorHandler may be provided to [Scanner.Init]. If a syntax error is
    22  // encountered and a handler was installed, the handler is called with a
    23  // position and an error message. The position points to the beginning of
    24  // the offending token.
    25  type ErrorHandler func(pos token.Position, msg string)
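
// A sketch of a handler that records errors for later inspection; the slice
// and variable names are illustrative only:
//
//	var errs []string
//	eh := func(pos token.Position, msg string) {
//		errs = append(errs, fmt.Sprintf("%s: %s", pos, msg))
//	}
//	// pass eh as the err argument to Init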
    26  
    27  // A Scanner holds the scanner's internal state while processing
    28  // a given text. It can be allocated as part of another data
    29  // structure but must be initialized via [Scanner.Init] before use.
    30  type Scanner struct {
    31  	// immutable state
    32  	file *token.File  // source file handle
    33  	dir  string       // directory portion of file.Name()
    34  	src  []byte       // source
    35  	err  ErrorHandler // error reporting; or nil
    36  	mode Mode         // scanning mode
    37  
    38  	// scanning state
    39  	ch         rune      // current character
    40  	offset     int       // character offset
    41  	rdOffset   int       // reading offset (position after current character)
    42  	lineOffset int       // current line offset
    43  	insertSemi bool      // insert a semicolon before next newline
    44  	nlPos      token.Pos // position of newline in preceding comment
    45  	stringEnd  token.Pos // end position; defined only for STRING tokens
    46  
    47  	// public state - ok to modify
    48  	ErrorCount int // number of errors encountered
    49  }
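
// For example, a Scanner can be allocated as a field of a larger, hypothetical
// type and initialized later:
//
//	type lexer struct {
//		fset *token.FileSet
//		scan Scanner // zero value; not usable until scan.Init is called
//	}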
    50  
    51  // Provide go/parser with backdoor access to the StringEnd information.
    52  func init() {
    53  	scannerhooks.StringEnd = func(scanner any) token.Pos {
    54  		return scanner.(*Scanner).stringEnd
    55  	}
    56  }
    57  
    58  const (
    59  	bom = 0xFEFF // byte order mark, only permitted as very first character
    60  	eof = -1     // end of file
    61  )
    62  
    63  // Read the next Unicode char into s.ch.
    64  // s.ch < 0 means end-of-file.
    65  //
    66  // For optimization, there is some overlap between this method and
    67  // s.scanIdentifier.
    68  func (s *Scanner) next() {
    69  	if s.rdOffset < len(s.src) {
    70  		s.offset = s.rdOffset
    71  		if s.ch == '\n' {
    72  			s.lineOffset = s.offset
    73  			s.file.AddLine(s.offset)
    74  		}
    75  		r, w := rune(s.src[s.rdOffset]), 1
    76  		switch {
    77  		case r == 0:
    78  			s.error(s.offset, "illegal character NUL")
    79  		case r >= utf8.RuneSelf:
    80  			// not ASCII
    81  			r, w = utf8.DecodeRune(s.src[s.rdOffset:])
    82  			if r == utf8.RuneError && w == 1 {
    83  				in := s.src[s.rdOffset:]
    84  				if s.offset == 0 &&
    85  					len(in) >= 2 &&
    86  					(in[0] == 0xFF && in[1] == 0xFE || in[0] == 0xFE && in[1] == 0xFF) {
    87  					// U+FEFF BOM at start of file, encoded as big- or little-endian
    88  					// UCS-2 (i.e. 2-byte UTF-16). Give specific error (go.dev/issue/71950).
    89  					s.error(s.offset, "illegal UTF-8 encoding (got UTF-16)")
    90  					s.rdOffset += len(in) // consume all input to avoid error cascade
    91  				} else {
    92  					s.error(s.offset, "illegal UTF-8 encoding")
    93  				}
    94  			} else if r == bom && s.offset > 0 {
    95  				s.error(s.offset, "illegal byte order mark")
    96  			}
    97  		}
    98  		s.rdOffset += w
    99  		s.ch = r
   100  	} else {
   101  		s.offset = len(s.src)
   102  		if s.ch == '\n' {
   103  			s.lineOffset = s.offset
   104  			s.file.AddLine(s.offset)
   105  		}
   106  		s.ch = eof
   107  	}
   108  }
   109  
   110  // peek returns the byte following the most recently read character without
   111  // advancing the scanner. If the scanner is at EOF, peek returns 0.
   112  func (s *Scanner) peek() byte {
   113  	if s.rdOffset < len(s.src) {
   114  		return s.src[s.rdOffset]
   115  	}
   116  	return 0
   117  }
   118  
   119  // A Mode value is a set of flags (or 0)
   120  // that control scanner behavior.
   121  type Mode uint
   122  
   123  const (
   124  	ScanComments    Mode = 1 << iota // return comments as COMMENT tokens
   125  	dontInsertSemis                  // do not automatically insert semicolons - for testing only
   126  )
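
// For example, assuming file and src as described for Init below:
//
//	s.Init(file, src, nil, scanner.ScanComments) // comments are returned as COMMENT tokens
//	s.Init(file, src, nil, 0)                    // comments are not returned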
   127  
   128  // Init prepares the scanner s to tokenize the text src by setting the
   129  // scanner at the beginning of src. The scanner uses the file set file
   130  // for position information and adds line information for each line.
   131  // It is ok to re-use the same file when re-scanning the same source,
   132  // as line information that is already present is ignored. Init causes a
   133  // panic if the file size does not match the src size.
   134  //
   135  // Calls to [Scanner.Scan] will invoke the error handler err if they encounter a
   136  // syntax error and err is not nil. Also, for each error encountered,
   137  // the [Scanner] field ErrorCount is incremented by one. The mode parameter
   138  // determines how comments are handled.
   139  //
   140  // Note that Init may call err if there is an error in the first character
   141  // of the file.
   142  func (s *Scanner) Init(file *token.File, src []byte, err ErrorHandler, mode Mode) {
   143  	// Explicitly initialize all fields since a scanner may be reused.
   144  	if file.Size() != len(src) {
   145  		panic(fmt.Sprintf("file size (%d) does not match src len (%d)", file.Size(), len(src)))
   146  	}
   147  	s.file = file
   148  	s.dir, _ = filepath.Split(file.Name())
   149  	s.src = src
   150  	s.err = err
   151  	s.mode = mode
   152  
   153  	s.ch = ' '
   154  	s.offset = 0
   155  	s.rdOffset = 0
   156  	s.lineOffset = 0
   157  	s.insertSemi = false
   158  	s.ErrorCount = 0
   159  
   160  	s.next()
   161  	if s.ch == bom {
   162  		s.next() // ignore BOM at file beginning
   163  	}
   164  }
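
// A sketch of a well-formed Init call: the token.File registered with the
// FileSet must be created with exactly len(src) bytes, or Init panics. The
// file name is arbitrary and eh is an ErrorHandler as sketched above (or nil):
//
//	fset := token.NewFileSet()
//	file := fset.AddFile("input.go", fset.Base(), len(src))
//	var s scanner.Scanner
//	s.Init(file, src, eh, scanner.ScanComments)
//	// s.ErrorCount is now 0 and is incremented for each error encountered.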
   165  
   166  func (s *Scanner) error(offs int, msg string) {
   167  	if s.err != nil {
   168  		s.err(s.file.Position(s.file.Pos(offs)), msg)
   169  	}
   170  	s.ErrorCount++
   171  }
   172  
   173  func (s *Scanner) errorf(offs int, format string, args ...any) {
   174  	s.error(offs, fmt.Sprintf(format, args...))
   175  }
   176  
   177  // scanComment returns the text of the comment and (if nonzero)
   178  // the offset of the first newline within it, which implies a
   179  // /*...*/ comment.
   180  func (s *Scanner) scanComment() (string, int) {
   181  	// initial '/' already consumed; s.ch == '/' || s.ch == '*'
   182  	offs := s.offset - 1 // position of initial '/'
   183  	next := -1           // position immediately following the comment; < 0 means invalid comment
   184  	numCR := 0
   185  	nlOffset := 0 // offset of first newline within /*...*/ comment
   186  
   187  	if s.ch == '/' {
   188  		//-style comment
   189  		// (the final '\n' is not considered part of the comment)
   190  		s.next()
   191  		for s.ch != '\n' && s.ch >= 0 {
   192  			if s.ch == '\r' {
   193  				numCR++
   194  			}
   195  			s.next()
   196  		}
   197  		// if we are at '\n', the position following the comment is afterwards
   198  		next = s.offset
   199  		if s.ch == '\n' {
   200  			next++
   201  		}
   202  		goto exit
   203  	}
   204  
   205  	/*-style comment */
   206  	s.next()
   207  	for s.ch >= 0 {
   208  		ch := s.ch
   209  		if ch == '\r' {
   210  			numCR++
   211  		} else if ch == '\n' && nlOffset == 0 {
   212  			nlOffset = s.offset
   213  		}
   214  		s.next()
   215  		if ch == '*' && s.ch == '/' {
   216  			s.next()
   217  			next = s.offset
   218  			goto exit
   219  		}
   220  	}
   221  
   222  	s.error(offs, "comment not terminated")
   223  
   224  exit:
   225  	lit := s.src[offs:s.offset]
   226  
   227  	// On Windows, a (//-comment) line may end in "\r\n".
   228  	// Remove the final '\r' before analyzing the text for
   229  	// line directives (matching the compiler). Remove any
   230  	// other '\r' afterwards (matching the pre-existing
   231  	// behavior of the scanner).
   232  	if numCR > 0 && len(lit) >= 2 && lit[1] == '/' && lit[len(lit)-1] == '\r' {
   233  		lit = lit[:len(lit)-1]
   234  		numCR--
   235  	}
   236  
   237  	// interpret line directives
   238  	// (//line directives must start at the beginning of the current line)
   239  	if next >= 0 /* implies valid comment */ && (lit[1] == '*' || offs == s.lineOffset) && bytes.HasPrefix(lit[2:], prefix) {
   240  		s.updateLineInfo(next, offs, lit)
   241  	}
   242  
   243  	if numCR > 0 {
   244  		lit = stripCR(lit, lit[1] == '*')
   245  	}
   246  
   247  	return string(lit), nlOffset
   248  }
   249  
   250  var prefix = []byte("line ")
   251  
   252  // updateLineInfo parses the incoming comment text at offset offs
   253  // as a line directive. If successful, it updates the line info table
   254  // for the position next per the line directive.
   255  func (s *Scanner) updateLineInfo(next, offs int, text []byte) {
   256  	// extract comment text
   257  	if text[1] == '*' {
   258  		text = text[:len(text)-2] // lop off trailing "*/"
   259  	}
   260  	text = text[7:] // lop off leading "//line " or "/*line "
   261  	offs += 7
   262  
   263  	i, n, ok := trailingDigits(text)
   264  	if i == 0 {
   265  		return // ignore (not a line directive)
   266  	}
   267  	// i > 0
   268  
   269  	if !ok {
   270  		// text has a suffix :xxx but xxx is not a number
   271  		s.error(offs+i, "invalid line number: "+string(text[i:]))
   272  		return
   273  	}
   274  
   275  	// Put a cap on the maximum size of line and column numbers.
   276  	// 30 bits allows for some additional space before wrapping an int32.
   277  	// Keep this consistent with cmd/compile/internal/syntax.PosMax.
   278  	const maxLineCol = 1 << 30
   279  	var line, col int
   280  	i2, n2, ok2 := trailingDigits(text[:i-1])
   281  	if ok2 {
   282  		//line filename:line:col
   283  		i, i2 = i2, i
   284  		line, col = n2, n
   285  		if col == 0 || col > maxLineCol {
   286  			s.error(offs+i2, "invalid column number: "+string(text[i2:]))
   287  			return
   288  		}
   289  		text = text[:i2-1] // lop off ":col"
   290  	} else {
   291  		//line filename:line
   292  		line = n
   293  	}
   294  
   295  	if line == 0 || line > maxLineCol {
   296  		s.error(offs+i, "invalid line number: "+string(text[i:]))
   297  		return
   298  	}
   299  
   300  	// If we have a column (//line filename:line:col form),
   301  	// an empty filename means to use the previous filename.
   302  	filename := string(text[:i-1]) // lop off ":line", and trim white space
   303  	if filename == "" && ok2 {
   304  		filename = s.file.Position(s.file.Pos(offs)).Filename
   305  	} else if filename != "" {
   306  		// Put a relative filename in the current directory.
   307  		// This is for compatibility with earlier releases.
   308  		// See issue 26671.
   309  		filename = filepath.Clean(filename)
   310  		if !filepath.IsAbs(filename) {
   311  			filename = filepath.Join(s.dir, filename)
   312  		}
   313  	}
   314  
   315  	s.file.AddLineColumnInfo(next, filename, line, col)
   316  }
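
// A sketch of how a line directive changes reported positions; file names and
// numbers are arbitrary, and the directive must start at the beginning of a line:
//
//	src := []byte("package p\n//line gen.go:100:1\nvar x int\n")
//	fset := token.NewFileSet()
//	file := fset.AddFile("p.go", fset.Base(), len(src))
//	var s scanner.Scanner
//	s.Init(file, src, nil, 0)
//	// Tokens before the directive report positions in p.go; the tokens of
//	// "var x int" on the following line report gen.go:100, with column 1 at
//	// the start of that line.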
   317  
   318  func trailingDigits(text []byte) (int, int, bool) {
   319  	i := bytes.LastIndexByte(text, ':') // look from right (Windows filenames may contain ':')
   320  	if i < 0 {
   321  		return 0, 0, false // no ":"
   322  	}
   323  	// i >= 0
   324  	n, err := strconv.ParseUint(string(text[i+1:]), 10, 0)
   325  	return i + 1, int(n), err == nil
   326  }
   327  
   328  func isLetter(ch rune) bool {
   329  	return 'a' <= lower(ch) && lower(ch) <= 'z' || ch == '_' || ch >= utf8.RuneSelf && unicode.IsLetter(ch)
   330  }
   331  
   332  func isDigit(ch rune) bool {
   333  	return isDecimal(ch) || ch >= utf8.RuneSelf && unicode.IsDigit(ch)
   334  }
   335  
   336  // scanIdentifier reads the string of valid identifier characters at s.offset.
   337  // It must only be called when s.ch is known to be a valid letter.
   338  //
   339  // Be careful when making changes to this function: it is optimized and affects
   340  // scanning performance significantly.
   341  func (s *Scanner) scanIdentifier() string {
   342  	offs := s.offset
   343  
   344  	// Optimize for the common case of an ASCII identifier.
   345  	//
   346  	// Ranging over s.src[s.rdOffset:] lets us avoid some bounds checks, and
   347  	// avoids conversions to runes.
   348  	//
   349  	// In case we encounter a non-ASCII character, fall back on the slower path
   350  	// of calling into s.next().
   351  	for rdOffset, b := range s.src[s.rdOffset:] {
   352  		if 'a' <= b && b <= 'z' || 'A' <= b && b <= 'Z' || b == '_' || '0' <= b && b <= '9' {
   353  			// Avoid assigning a rune for the common case of an ascii character.
   354  			continue
   355  		}
   356  		s.rdOffset += rdOffset
   357  		if 0 < b && b < utf8.RuneSelf {
   358  			// Optimization: we've encountered an ASCII character that's not a letter
   359  			// or number. Avoid the call into s.next() and corresponding set up.
   360  			//
   361  			// Note that s.next() does some line accounting if s.ch is '\n', so this
   362  			// shortcut is only possible because we know that the preceding character
   363  			// is not '\n'.
   364  			s.ch = rune(b)
   365  			s.offset = s.rdOffset
   366  			s.rdOffset++
   367  			goto exit
   368  		}
   369  		// We know that the preceding character is valid for an identifier because
   370  		// scanIdentifier is only called when s.ch is a letter, so calling s.next()
   371  		// at s.rdOffset resets the scanner state.
   372  		s.next()
   373  		for isLetter(s.ch) || isDigit(s.ch) {
   374  			s.next()
   375  		}
   376  		goto exit
   377  	}
   378  	s.offset = len(s.src)
   379  	s.rdOffset = len(s.src)
   380  	s.ch = eof
   381  
   382  exit:
   383  	return string(s.src[offs:s.offset])
   384  }
   385  
   386  func digitVal(ch rune) int {
   387  	switch {
   388  	case '0' <= ch && ch <= '9':
   389  		return int(ch - '0')
   390  	case 'a' <= lower(ch) && lower(ch) <= 'f':
   391  		return int(lower(ch) - 'a' + 10)
   392  	}
   393  	return 16 // larger than any legal digit val
   394  }
   395  
   396  func lower(ch rune) rune     { return ('a' - 'A') | ch } // returns lower-case ch iff ch is ASCII letter
   397  func isDecimal(ch rune) bool { return '0' <= ch && ch <= '9' }
   398  func isHex(ch rune) bool     { return '0' <= ch && ch <= '9' || 'a' <= lower(ch) && lower(ch) <= 'f' }
   399  
   400  // digits accepts the sequence { digit | '_' }.
   401  // If base <= 10, digits accepts any decimal digit but records
   402  // the offset (relative to the source start) of a digit >= base
   403  // in *invalid, if *invalid < 0.
   404  // digits returns a bitset describing whether the sequence contained
   405  // digits (bit 0 is set), or separators '_' (bit 1 is set).
   406  func (s *Scanner) digits(base int, invalid *int) (digsep int) {
   407  	if base <= 10 {
   408  		max := rune('0' + base)
   409  		for isDecimal(s.ch) || s.ch == '_' {
   410  			ds := 1
   411  			if s.ch == '_' {
   412  				ds = 2
   413  			} else if s.ch >= max && *invalid < 0 {
   414  				*invalid = s.offset // record invalid rune offset
   415  			}
   416  			digsep |= ds
   417  			s.next()
   418  		}
   419  	} else {
   420  		for isHex(s.ch) || s.ch == '_' {
   421  			ds := 1
   422  			if s.ch == '_' {
   423  				ds = 2
   424  			}
   425  			digsep |= ds
   426  			s.next()
   427  		}
   428  	}
   429  	return
   430  }
   431  
   432  func (s *Scanner) scanNumber() (token.Token, string) {
   433  	offs := s.offset
   434  	tok := token.ILLEGAL
   435  
   436  	base := 10        // number base
   437  	prefix := rune(0) // one of 0 (decimal), '0' (0-octal), 'x', 'o', or 'b'
   438  	digsep := 0       // bit 0: digit present, bit 1: '_' present
   439  	invalid := -1     // index of invalid digit in literal, or < 0
   440  
   441  	// integer part
   442  	if s.ch != '.' {
   443  		tok = token.INT
   444  		if s.ch == '0' {
   445  			s.next()
   446  			switch lower(s.ch) {
   447  			case 'x':
   448  				s.next()
   449  				base, prefix = 16, 'x'
   450  			case 'o':
   451  				s.next()
   452  				base, prefix = 8, 'o'
   453  			case 'b':
   454  				s.next()
   455  				base, prefix = 2, 'b'
   456  			default:
   457  				base, prefix = 8, '0'
   458  				digsep = 1 // leading 0
   459  			}
   460  		}
   461  		digsep |= s.digits(base, &invalid)
   462  	}
   463  
   464  	// fractional part
   465  	if s.ch == '.' {
   466  		tok = token.FLOAT
   467  		if prefix == 'o' || prefix == 'b' {
   468  			s.error(s.offset, "invalid radix point in "+litname(prefix))
   469  		}
   470  		s.next()
   471  		digsep |= s.digits(base, &invalid)
   472  	}
   473  
   474  	if digsep&1 == 0 {
   475  		s.error(s.offset, litname(prefix)+" has no digits")
   476  	}
   477  
   478  	// exponent
   479  	if e := lower(s.ch); e == 'e' || e == 'p' {
   480  		switch {
   481  		case e == 'e' && prefix != 0 && prefix != '0':
   482  			s.errorf(s.offset, "%q exponent requires decimal mantissa", s.ch)
   483  		case e == 'p' && prefix != 'x':
   484  			s.errorf(s.offset, "%q exponent requires hexadecimal mantissa", s.ch)
   485  		}
   486  		s.next()
   487  		tok = token.FLOAT
   488  		if s.ch == '+' || s.ch == '-' {
   489  			s.next()
   490  		}
   491  		ds := s.digits(10, nil)
   492  		digsep |= ds
   493  		if ds&1 == 0 {
   494  			s.error(s.offset, "exponent has no digits")
   495  		}
   496  	} else if prefix == 'x' && tok == token.FLOAT {
   497  		s.error(s.offset, "hexadecimal mantissa requires a 'p' exponent")
   498  	}
   499  
   500  	// suffix 'i'
   501  	if s.ch == 'i' {
   502  		tok = token.IMAG
   503  		s.next()
   504  	}
   505  
   506  	lit := string(s.src[offs:s.offset])
   507  	if tok == token.INT && invalid >= 0 {
   508  		s.errorf(invalid, "invalid digit %q in %s", lit[invalid-offs], litname(prefix))
   509  	}
   510  	if digsep&2 != 0 {
   511  		if i := invalidSep(lit); i >= 0 {
   512  			s.error(offs+i, "'_' must separate successive digits")
   513  		}
   514  	}
   515  
   516  	return tok, lit
   517  }
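
// A few illustrative inputs and the tokens they produce under the rules above:
//
//	"42", "0b1010_1", "0o644", "0x_1F"  ->  token.INT
//	"3.14", ".25", "1e9", "0x1p-2"      ->  token.FLOAT
//	"1i", "2.5i", "0x1p-2i"             ->  token.IMAG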
   518  
   519  func litname(prefix rune) string {
   520  	switch prefix {
   521  	case 'x':
   522  		return "hexadecimal literal"
   523  	case 'o', '0':
   524  		return "octal literal"
   525  	case 'b':
   526  		return "binary literal"
   527  	}
   528  	return "decimal literal"
   529  }
   530  
   531  // invalidSep returns the index of the first invalid separator in x, or -1.
   532  func invalidSep(x string) int {
   533  	x1 := ' ' // prefix char, we only care if it's 'x'
   534  	d := '.'  // digit, one of '_', '0' (a digit), or '.' (anything else)
   535  	i := 0
   536  
   537  	// a prefix counts as a digit
   538  	if len(x) >= 2 && x[0] == '0' {
   539  		x1 = lower(rune(x[1]))
   540  		if x1 == 'x' || x1 == 'o' || x1 == 'b' {
   541  			d = '0'
   542  			i = 2
   543  		}
   544  	}
   545  
   546  	// mantissa and exponent
   547  	for ; i < len(x); i++ {
   548  		p := d // previous digit
   549  		d = rune(x[i])
   550  		switch {
   551  		case d == '_':
   552  			if p != '0' {
   553  				return i
   554  			}
   555  		case isDecimal(d) || x1 == 'x' && isHex(d):
   556  			d = '0'
   557  		default:
   558  			if p == '_' {
   559  				return i - 1
   560  			}
   561  			d = '.'
   562  		}
   563  	}
   564  	if d == '_' {
   565  		return len(x) - 1
   566  	}
   567  
   568  	return -1
   569  }
   570  
   571  // scanEscape parses an escape sequence where quote is the accepted
   572  // escaped quote. In case of a syntax error, it stops at the offending
   573  // character (without consuming it) and returns false. Otherwise
   574  // it returns true.
   575  func (s *Scanner) scanEscape(quote rune) bool {
   576  	offs := s.offset
   577  
   578  	var n int
   579  	var base, max uint32
   580  	switch s.ch {
   581  	case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote:
   582  		s.next()
   583  		return true
   584  	case '0', '1', '2', '3', '4', '5', '6', '7':
   585  		n, base, max = 3, 8, 255
   586  	case 'x':
   587  		s.next()
   588  		n, base, max = 2, 16, 255
   589  	case 'u':
   590  		s.next()
   591  		n, base, max = 4, 16, unicode.MaxRune
   592  	case 'U':
   593  		s.next()
   594  		n, base, max = 8, 16, unicode.MaxRune
   595  	default:
   596  		msg := "unknown escape sequence"
   597  		if s.ch < 0 {
   598  			msg = "escape sequence not terminated"
   599  		}
   600  		s.error(offs, msg)
   601  		return false
   602  	}
   603  
   604  	var x uint32
   605  	for n > 0 {
   606  		d := uint32(digitVal(s.ch))
   607  		if d >= base {
   608  			msg := fmt.Sprintf("illegal character %#U in escape sequence", s.ch)
   609  			if s.ch < 0 {
   610  				msg = "escape sequence not terminated"
   611  			}
   612  			s.error(s.offset, msg)
   613  			return false
   614  		}
   615  		x = x*base + d
   616  		s.next()
   617  		n--
   618  	}
   619  
   620  	if x > max || 0xD800 <= x && x < 0xE000 {
   621  		s.error(offs, "escape sequence is invalid Unicode code point")
   622  		return false
   623  	}
   624  
   625  	return true
   626  }
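
// Illustrative escape sequences as they appear in rune and string literals:
//
//	'\n'  '\\'  '\''           // single-character escapes
//	'\101'  '\x41'             // octal and hex values, at most 255
//	'\u00e9'  '\U0001F600'     // Unicode code points; surrogates are rejected
//	'\q'                       // error: unknown escape sequence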
   627  
   628  func (s *Scanner) scanRune() string {
   629  	// '\'' opening already consumed
   630  	offs := s.offset - 1
   631  
   632  	valid := true
   633  	n := 0
   634  	for {
   635  		ch := s.ch
   636  		if ch == '\n' || ch < 0 {
   637  			// only report error if we don't have one already
   638  			if valid {
   639  				s.error(offs, "rune literal not terminated")
   640  				valid = false
   641  			}
   642  			break
   643  		}
   644  		s.next()
   645  		if ch == '\'' {
   646  			break
   647  		}
   648  		n++
   649  		if ch == '\\' {
   650  			if !s.scanEscape('\'') {
   651  				valid = false
   652  			}
   653  			// continue to read to closing quote
   654  		}
   655  	}
   656  
   657  	if valid && n != 1 {
   658  		s.error(offs, "illegal rune literal")
   659  	}
   660  
   661  	return string(s.src[offs:s.offset])
   662  }
   663  
   664  func (s *Scanner) scanString() string {
   665  	// '"' opening already consumed
   666  	offs := s.offset - 1
   667  
   668  	for {
   669  		ch := s.ch
   670  		if ch == '\n' || ch < 0 {
   671  			s.error(offs, "string literal not terminated")
   672  			break
   673  		}
   674  		s.next()
   675  		if ch == '"' {
   676  			break
   677  		}
   678  		if ch == '\\' {
   679  			s.scanEscape('"')
   680  		}
   681  	}
   682  
   683  	return string(s.src[offs:s.offset])
   684  }
   685  
   686  func stripCR(b []byte, comment bool) []byte {
   687  	c := make([]byte, len(b))
   688  	i := 0
   689  	for j, ch := range b {
   690  		// In a /*-style comment, don't strip \r from *\r/ (including
   691  		// sequences of \r from *\r\r...\r/), since the resulting
   692  		// */ would terminate the comment too early. The exception is
   693  		// a \r immediately following the opening /*: stripping it is
   694  		// safe because /*/ does not yet close the comment (issue #11151).
   695  		if ch != '\r' || comment && i > len("/*") && c[i-1] == '*' && j+1 < len(b) && b[j+1] == '/' {
   696  			c[i] = ch
   697  			i++
   698  		}
   699  	}
   700  	return c[:i]
   701  }
   702  
   703  func (s *Scanner) scanRawString() (string, int) {
   704  	// '`' opening already consumed
   705  	offs := s.offset - 1
   706  
   707  	hasCR := false
   708  	for {
   709  		ch := s.ch
   710  		if ch < 0 {
   711  			s.error(offs, "raw string literal not terminated")
   712  			break
   713  		}
   714  		s.next()
   715  		if ch == '`' {
   716  			break
   717  		}
   718  		if ch == '\r' {
   719  			hasCR = true
   720  		}
   721  	}
   722  
   723  	lit := s.src[offs:s.offset]
   724  	rawLen := len(lit)
   725  	if hasCR {
   726  		lit = stripCR(lit, false)
   727  	}
   728  
   729  	return string(lit), rawLen
   730  }
   731  
   732  func (s *Scanner) skipWhitespace() {
   733  	for s.ch == ' ' || s.ch == '\t' || s.ch == '\n' && !s.insertSemi || s.ch == '\r' {
   734  		s.next()
   735  	}
   736  }
   737  
   738  // Helper functions for scanning multi-byte tokens such as >> += >>= .
   739  // The different routines recognize tokens of different lengths depending
   740  // on the characters that follow. If the token ends in '=', the result is
   741  // tok1 (or tok3 if the '=' follows ch2). Otherwise, the result is tok0 if
   742  // no other character matched, or tok2 if the matching character was ch2.
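// For example, after Scan has consumed a '<' (and it is not part of "<-"),
// the call
//
//	s.switch4(token.LSS, token.LEQ, '<', token.SHL, token.SHL_ASSIGN)
//
// yields LSS for "<", LEQ for "<=", SHL for "<<", and SHL_ASSIGN for "<<=".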
   743  
   744  func (s *Scanner) switch2(tok0, tok1 token.Token) token.Token {
   745  	if s.ch == '=' {
   746  		s.next()
   747  		return tok1
   748  	}
   749  	return tok0
   750  }
   751  
   752  func (s *Scanner) switch3(tok0, tok1 token.Token, ch2 rune, tok2 token.Token) token.Token {
   753  	if s.ch == '=' {
   754  		s.next()
   755  		return tok1
   756  	}
   757  	if s.ch == ch2 {
   758  		s.next()
   759  		return tok2
   760  	}
   761  	return tok0
   762  }
   763  
   764  func (s *Scanner) switch4(tok0, tok1 token.Token, ch2 rune, tok2, tok3 token.Token) token.Token {
   765  	if s.ch == '=' {
   766  		s.next()
   767  		return tok1
   768  	}
   769  	if s.ch == ch2 {
   770  		s.next()
   771  		if s.ch == '=' {
   772  			s.next()
   773  			return tok3
   774  		}
   775  		return tok2
   776  	}
   777  	return tok0
   778  }
   779  
   780  // Scan scans the next token and returns the token position, the token,
   781  // and its literal string if applicable. The source end is indicated by
   782  // [token.EOF].
   783  //
   784  // If the returned token is a literal ([token.IDENT], [token.INT], [token.FLOAT],
   785  // [token.IMAG], [token.CHAR], [token.STRING]) or [token.COMMENT], the literal string
   786  // has the corresponding value.
   787  //
   788  // If the returned token is a keyword, the literal string is the keyword.
   789  //
   790  // If the returned token is [token.SEMICOLON], the corresponding
   791  // literal string is ";" if the semicolon was present in the source,
   792  // and "\n" if the semicolon was inserted because of a newline or
   793  // at EOF.
   794  //
   795  // If the returned token is [token.ILLEGAL], the literal string is the
   796  // offending character.
   797  //
   798  // In all other cases, Scan returns an empty literal string.
   799  //
   800  // For more tolerant parsing, Scan will return a valid token if
   801  // possible even if a syntax error was encountered. Thus, even
   802  // if the resulting token sequence contains no illegal tokens,
   803  // a client may not assume that no error occurred. Instead it
   804  // must check the scanner's ErrorCount or the number of calls
   805  // of the error handler, if there was one installed.
   806  //
   807  // Scan adds line information to the file added to the file
   808  // set with Init. Token positions are relative to that file
   809  // and thus relative to the file set.
   810  func (s *Scanner) Scan() (pos token.Pos, tok token.Token, lit string) {
   811  scanAgain:
   812  	if s.nlPos.IsValid() {
   813  		// Return artificial ';' token after /*...*/ comment
   814  		// containing newline, at position of first newline.
   815  		pos, tok, lit = s.nlPos, token.SEMICOLON, "\n"
   816  		s.nlPos = token.NoPos
   817  		return
   818  	}
   819  
   820  	s.skipWhitespace()
   821  
   822  	// current token start
   823  	pos = s.file.Pos(s.offset)
   824  
   825  	// determine token value
   826  	insertSemi := false
   827  	switch ch := s.ch; {
   828  	case isLetter(ch):
   829  		lit = s.scanIdentifier()
   830  		if len(lit) > 1 {
   831  			// keywords are longer than one letter - avoid lookup otherwise
   832  			tok = token.Lookup(lit)
   833  			switch tok {
   834  			case token.IDENT, token.BREAK, token.CONTINUE, token.FALLTHROUGH, token.RETURN:
   835  				insertSemi = true
   836  			}
   837  		} else {
   838  			insertSemi = true
   839  			tok = token.IDENT
   840  		}
   841  	case isDecimal(ch) || ch == '.' && isDecimal(rune(s.peek())):
   842  		insertSemi = true
   843  		tok, lit = s.scanNumber()
   844  	default:
   845  		s.next() // always make progress
   846  		switch ch {
   847  		case eof:
   848  			if s.insertSemi {
   849  				s.insertSemi = false // EOF consumed
   850  				return pos, token.SEMICOLON, "\n"
   851  			}
   852  			tok = token.EOF
   853  		case '\n':
   854  			// we only reach here if s.insertSemi was
   855  			// set in the first place and exited early
   856  			// from s.skipWhitespace()
   857  			s.insertSemi = false // newline consumed
   858  			return pos, token.SEMICOLON, "\n"
   859  		case '"':
   860  			insertSemi = true
   861  			tok = token.STRING
   862  			lit = s.scanString()
   863  			s.stringEnd = pos + token.Pos(len(lit))
   864  		case '\'':
   865  			insertSemi = true
   866  			tok = token.CHAR
   867  			lit = s.scanRune()
   868  		case '`':
   869  			insertSemi = true
   870  			tok = token.STRING
   871  			var rawLen int
   872  			lit, rawLen = s.scanRawString()
   873  			s.stringEnd = pos + token.Pos(rawLen)
   874  		case ':':
   875  			tok = s.switch2(token.COLON, token.DEFINE)
   876  		case '.':
   877  			// fractions starting with a '.' are handled by outer switch
   878  			tok = token.PERIOD
   879  			if s.ch == '.' && s.peek() == '.' {
   880  				s.next()
   881  				s.next() // consume last '.'
   882  				tok = token.ELLIPSIS
   883  			}
   884  		case ',':
   885  			tok = token.COMMA
   886  		case ';':
   887  			tok = token.SEMICOLON
   888  			lit = ";"
   889  		case '(':
   890  			tok = token.LPAREN
   891  		case ')':
   892  			insertSemi = true
   893  			tok = token.RPAREN
   894  		case '[':
   895  			tok = token.LBRACK
   896  		case ']':
   897  			insertSemi = true
   898  			tok = token.RBRACK
   899  		case '{':
   900  			tok = token.LBRACE
   901  		case '}':
   902  			insertSemi = true
   903  			tok = token.RBRACE
   904  		case '+':
   905  			tok = s.switch3(token.ADD, token.ADD_ASSIGN, '+', token.INC)
   906  			if tok == token.INC {
   907  				insertSemi = true
   908  			}
   909  		case '-':
   910  			tok = s.switch3(token.SUB, token.SUB_ASSIGN, '-', token.DEC)
   911  			if tok == token.DEC {
   912  				insertSemi = true
   913  			}
   914  		case '*':
   915  			tok = s.switch2(token.MUL, token.MUL_ASSIGN)
   916  		case '/':
   917  			if s.ch == '/' || s.ch == '*' {
   918  				// comment
   919  				comment, nlOffset := s.scanComment()
   920  				if s.insertSemi && nlOffset != 0 {
   921  					// For /*...*/ containing \n, return
   922  					// COMMENT then artificial SEMICOLON.
   923  					s.nlPos = s.file.Pos(nlOffset)
   924  					s.insertSemi = false
   925  				} else {
   926  					insertSemi = s.insertSemi // preserve insertSemi info
   927  				}
   928  				if s.mode&ScanComments == 0 {
   929  					// skip comment
   930  					goto scanAgain
   931  				}
   932  				tok = token.COMMENT
   933  				lit = comment
   934  			} else {
   935  				// division
   936  				tok = s.switch2(token.QUO, token.QUO_ASSIGN)
   937  			}
   938  		case '%':
   939  			tok = s.switch2(token.REM, token.REM_ASSIGN)
   940  		case '^':
   941  			tok = s.switch2(token.XOR, token.XOR_ASSIGN)
   942  		case '<':
   943  			if s.ch == '-' {
   944  				s.next()
   945  				tok = token.ARROW
   946  			} else {
   947  				tok = s.switch4(token.LSS, token.LEQ, '<', token.SHL, token.SHL_ASSIGN)
   948  			}
   949  		case '>':
   950  			tok = s.switch4(token.GTR, token.GEQ, '>', token.SHR, token.SHR_ASSIGN)
   951  		case '=':
   952  			tok = s.switch2(token.ASSIGN, token.EQL)
   953  		case '!':
   954  			tok = s.switch2(token.NOT, token.NEQ)
   955  		case '&':
   956  			if s.ch == '^' {
   957  				s.next()
   958  				tok = s.switch2(token.AND_NOT, token.AND_NOT_ASSIGN)
   959  			} else {
   960  				tok = s.switch3(token.AND, token.AND_ASSIGN, '&', token.LAND)
   961  			}
   962  		case '|':
   963  			tok = s.switch3(token.OR, token.OR_ASSIGN, '|', token.LOR)
   964  		case '~':
   965  			tok = token.TILDE
   966  		default:
   967  			// next reports unexpected BOMs - don't repeat
   968  			if ch != bom {
   969  				// Report an informative error for U+201[CD] quotation
   970  				// marks, which are easily introduced via copy and paste.
   971  				if ch == '“' || ch == '”' {
   972  					s.errorf(s.file.Offset(pos), "curly quotation mark %q (use neutral %q)", ch, '"')
   973  				} else {
   974  					s.errorf(s.file.Offset(pos), "illegal character %#U", ch)
   975  				}
   976  			}
   977  			insertSemi = s.insertSemi // preserve insertSemi info
   978  			tok = token.ILLEGAL
   979  			lit = string(ch)
   980  		}
   981  	}
   982  	if s.mode&dontInsertSemis == 0 {
   983  		s.insertSemi = insertSemi
   984  	}
   985  
   986  	return
   987  }
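
// A sketch of the token stream produced for a short input, with mode
// ScanComments and file setup as for Init; note the inserted SEMICOLON:
//
//	input:  "x := 1 // set x\n"
//	tokens: IDENT "x", DEFINE, INT "1", COMMENT "// set x",
//	        SEMICOLON "\n", EOF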
   988  
