Source file src/go/scanner/scanner.go

     1  // Copyright 2009 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // Package scanner implements a scanner for Go source text.
     6  // It takes a []byte as source which can then be tokenized
     7  // through repeated calls to the Scan method.
     8  package scanner
     9  
    10  import (
    11  	"bytes"
    12  	"fmt"
    13  	"go/token"
    14  	"path/filepath"
    15  	"strconv"
    16  	"unicode"
    17  	"unicode/utf8"
    18  )
    19  
// An ErrorHandler may be provided to [Scanner.Init]. If a syntax error is
// encountered and a handler was installed, the handler is called with a
// position and an error message. The position points to the beginning of
// the offending token.
type ErrorHandler func(pos token.Position, msg string)

// A Scanner holds the scanner's internal state while processing
// a given text. It can be allocated as part of another data
// structure but must be initialized via [Scanner.Init] before use.
type Scanner struct {
	// immutable state
	file *token.File  // source file handle
	dir  string       // directory portion of file.Name(); used to absolutize relative filenames in line directives
	src  []byte       // source
	err  ErrorHandler // error reporting; or nil
	mode Mode         // scanning mode

	// scanning state
	ch         rune      // current character
	offset     int       // character offset
	rdOffset   int       // reading offset (position after current character)
	lineOffset int       // current line offset
	insertSemi bool      // insert a semicolon before next newline
	nlPos      token.Pos // position of newline in preceding comment

	endPosValid bool      // if set, End returns endPos rather than the offset-based position
	endPos      token.Pos // overrides the offset as the default end position

	// public state - ok to modify
	ErrorCount int // number of errors encountered
}

const (
	bom = 0xFEFF // byte order mark, only permitted as very first character
	eof = -1     // end of file
)
    56  
// Read the next Unicode char into s.ch.
// s.ch < 0 means end-of-file.
//
// For optimization, there is some overlap between this method and
// s.scanIdentifier.
func (s *Scanner) next() {
	if s.rdOffset < len(s.src) {
		s.offset = s.rdOffset
		if s.ch == '\n' {
			// The previous character was a newline: record the start
			// of the new line in the file's line table.
			s.lineOffset = s.offset
			s.file.AddLine(s.offset)
		}
		r, w := rune(s.src[s.rdOffset]), 1
		switch {
		case r == 0:
			s.error(s.offset, "illegal character NUL")
		case r >= utf8.RuneSelf:
			// not ASCII
			r, w = utf8.DecodeRune(s.src[s.rdOffset:])
			if r == utf8.RuneError && w == 1 {
				in := s.src[s.rdOffset:]
				if s.offset == 0 &&
					len(in) >= 2 &&
					(in[0] == 0xFF && in[1] == 0xFE || in[0] == 0xFE && in[1] == 0xFF) {
					// U+FEFF BOM at start of file, encoded as big- or little-endian
					// UCS-2 (i.e. 2-byte UTF-16). Give specific error (go.dev/issue/71950).
					s.error(s.offset, "illegal UTF-8 encoding (got UTF-16)")
					s.rdOffset += len(in) // consume all input to avoid error cascade
				} else {
					s.error(s.offset, "illegal UTF-8 encoding")
				}
			} else if r == bom && s.offset > 0 {
				// A BOM is only permitted as the very first character.
				s.error(s.offset, "illegal byte order mark")
			}
		}
		s.rdOffset += w
		s.ch = r
	} else {
		// End of source: still do line accounting for a trailing newline.
		s.offset = len(s.src)
		if s.ch == '\n' {
			s.lineOffset = s.offset
			s.file.AddLine(s.offset)
		}
		s.ch = eof
	}
}
   103  
   104  // peek returns the byte following the most recently read character without
   105  // advancing the scanner. If the scanner is at EOF, peek returns 0.
   106  func (s *Scanner) peek() byte {
   107  	if s.rdOffset < len(s.src) {
   108  		return s.src[s.rdOffset]
   109  	}
   110  	return 0
   111  }
   112  
// A Mode value is a set of flags (or 0).
// They control scanner behavior and are passed to [Scanner.Init].
type Mode uint

const (
	ScanComments    Mode = 1 << iota // return comments as COMMENT tokens
	dontInsertSemis                  // do not automatically insert semicolons - for testing only
)
   121  
   122  // Init prepares the scanner s to tokenize the text src by setting the
   123  // scanner at the beginning of src. The scanner uses the file set file
   124  // for position information and it adds line information for each line.
   125  // It is ok to re-use the same file when re-scanning the same file as
   126  // line information which is already present is ignored. Init causes a
   127  // panic if the file size does not match the src size.
   128  //
   129  // Calls to [Scanner.Scan] will invoke the error handler err if they encounter a
   130  // syntax error and err is not nil. Also, for each error encountered,
   131  // the [Scanner] field ErrorCount is incremented by one. The mode parameter
   132  // determines how comments are handled.
   133  //
   134  // Note that Init may call err if there is an error in the first character
   135  // of the file.
   136  func (s *Scanner) Init(file *token.File, src []byte, err ErrorHandler, mode Mode) {
   137  	// Explicitly initialize all fields since a scanner may be reused.
   138  	if file.Size() != len(src) {
   139  		panic(fmt.Sprintf("file size (%d) does not match src len (%d)", file.Size(), len(src)))
   140  	}
   141  
   142  	dir, _ := filepath.Split(file.Name())
   143  
   144  	*s = Scanner{
   145  		file: file,
   146  		dir:  dir,
   147  		src:  src,
   148  		err:  err,
   149  		mode: mode,
   150  
   151  		ch:          ' ',
   152  		endPosValid: true,
   153  		endPos:      token.NoPos,
   154  	}
   155  
   156  	s.next()
   157  	if s.ch == bom {
   158  		s.next() // ignore BOM at file beginning
   159  	}
   160  }
   161  
   162  func (s *Scanner) error(offs int, msg string) {
   163  	if s.err != nil {
   164  		s.err(s.file.Position(s.file.Pos(offs)), msg)
   165  	}
   166  	s.ErrorCount++
   167  }
   168  
// errorf is like error but formats the message with fmt.Sprintf.
func (s *Scanner) errorf(offs int, format string, args ...any) {
	s.error(offs, fmt.Sprintf(format, args...))
}
   172  
// scanComment returns the text of the comment and (if nonzero)
// the offset of the first newline within it, which implies a
// /*...*/ comment.
func (s *Scanner) scanComment() (string, int) {
	// initial '/' already consumed; s.ch == '/' || s.ch == '*'
	offs := s.offset - 1 // position of initial '/'
	next := -1           // position immediately following the comment; < 0 means invalid comment
	numCR := 0           // number of carriage returns seen; they are stripped from the returned text
	nlOffset := 0 // offset of first newline within /*...*/ comment

	if s.ch == '/' {
		//-style comment
		// (the final '\n' is not considered part of the comment)
		s.next()
		for s.ch != '\n' && s.ch >= 0 {
			if s.ch == '\r' {
				numCR++
			}
			s.next()
		}
		// if we are at '\n', the position following the comment is afterwards
		next = s.offset
		if s.ch == '\n' {
			next++
		}
		goto exit
	}

	/*-style comment */
	s.next()
	for s.ch >= 0 {
		ch := s.ch
		if ch == '\r' {
			numCR++
		} else if ch == '\n' && nlOffset == 0 {
			// remember only the first newline; it determines where a
			// synthetic semicolon would be inserted (see Scan).
			nlOffset = s.offset
		}
		s.next()
		if ch == '*' && s.ch == '/' {
			s.next()
			next = s.offset
			goto exit
		}
	}

	// reached EOF without seeing the closing "*/"
	s.error(offs, "comment not terminated")

exit:
	lit := s.src[offs:s.offset]

	// On Windows, a (//-comment) line may end in "\r\n".
	// Remove the final '\r' before analyzing the text for
	// line directives (matching the compiler). Remove any
	// other '\r' afterwards (matching the pre-existing be-
	// havior of the scanner).
	if numCR > 0 && len(lit) >= 2 && lit[1] == '/' && lit[len(lit)-1] == '\r' {
		lit = lit[:len(lit)-1]
		numCR--
	}

	// interpret line directives
	// (//line directives must start at the beginning of the current line)
	if next >= 0 /* implies valid comment */ && (lit[1] == '*' || offs == s.lineOffset) && bytes.HasPrefix(lit[2:], prefix) {
		s.updateLineInfo(next, offs, lit)
	}

	if numCR > 0 {
		lit = stripCR(lit, lit[1] == '*')
	}

	return string(lit), nlOffset
}
   245  
// prefix is the directive name that must follow "//" or "/*" for a
// comment to be considered a line directive (see scanComment).
var prefix = []byte("line ")
   247  
// updateLineInfo parses the incoming comment text at offset offs
// as a line directive. If successful, it updates the line info table
// for the position next per the line directive.
// The directive forms handled are "//line filename:line" and
// "//line filename:line:col" (or the /*line ...*/ equivalents).
func (s *Scanner) updateLineInfo(next, offs int, text []byte) {
	// extract comment text
	if text[1] == '*' {
		text = text[:len(text)-2] // lop off trailing "*/"
	}
	text = text[7:] // lop off leading "//line " or "/*line "
	offs += 7

	i, n, ok := trailingDigits(text)
	if i == 0 {
		return // ignore (not a line directive)
	}
	// i > 0

	if !ok {
		// text has a suffix :xxx but xxx is not a number
		s.error(offs+i, "invalid line number: "+string(text[i:]))
		return
	}

	// Put a cap on the maximum size of line and column numbers.
	// 30 bits allows for some additional space before wrapping an int32.
	// Keep this consistent with cmd/compile/internal/syntax.PosMax.
	const maxLineCol = 1 << 30
	var line, col int
	// Check for a second trailing ":number" to distinguish the
	// filename:line:col form from filename:line.
	i2, n2, ok2 := trailingDigits(text[:i-1])
	if ok2 {
		//line filename:line:col
		i, i2 = i2, i
		line, col = n2, n
		if col == 0 || col > maxLineCol {
			s.error(offs+i2, "invalid column number: "+string(text[i2:]))
			return
		}
		text = text[:i2-1] // lop off ":col"
	} else {
		//line filename:line
		line = n
	}

	if line == 0 || line > maxLineCol {
		s.error(offs+i, "invalid line number: "+string(text[i:]))
		return
	}

	// If we have a column (//line filename:line:col form),
	// an empty filename means to use the previous filename.
	filename := string(text[:i-1]) // lop off ":line", and trim white space
	if filename == "" && ok2 {
		filename = s.file.Position(s.file.Pos(offs)).Filename
	} else if filename != "" {
		// Put a relative filename in the current directory.
		// This is for compatibility with earlier releases.
		// See issue 26671.
		filename = filepath.Clean(filename)
		if !filepath.IsAbs(filename) {
			filename = filepath.Join(s.dir, filename)
		}
	}

	s.file.AddLineColumnInfo(next, filename, line, col)
}
   313  
   314  func trailingDigits(text []byte) (int, int, bool) {
   315  	i := bytes.LastIndexByte(text, ':') // look from right (Windows filenames may contain ':')
   316  	if i < 0 {
   317  		return 0, 0, false // no ":"
   318  	}
   319  	// i >= 0
   320  	n, err := strconv.ParseUint(string(text[i+1:]), 10, 0)
   321  	return i + 1, int(n), err == nil
   322  }
   323  
// isLetter reports whether ch may start or continue an identifier:
// an ASCII letter, '_', or any non-ASCII Unicode letter.
func isLetter(ch rune) bool {
	return 'a' <= lower(ch) && lower(ch) <= 'z' || ch == '_' || ch >= utf8.RuneSelf && unicode.IsLetter(ch)
}

// isDigit reports whether ch is a decimal digit, including non-ASCII
// Unicode digits (valid in identifiers; number literals use isDecimal/isHex).
func isDigit(ch rune) bool {
	return isDecimal(ch) || ch >= utf8.RuneSelf && unicode.IsDigit(ch)
}
   331  
// scanIdentifier reads the string of valid identifier characters at s.offset.
// It must only be called when s.ch is known to be a valid letter.
//
// Be careful when making changes to this function: it is optimized and affects
// scanning performance significantly.
func (s *Scanner) scanIdentifier() string {
	offs := s.offset

	// Optimize for the common case of an ASCII identifier.
	//
	// Ranging over s.src[s.rdOffset:] lets us avoid some bounds checks, and
	// avoids conversions to runes.
	//
	// In case we encounter a non-ASCII character, fall back on the slower path
	// of calling into s.next().
	for rdOffset, b := range s.src[s.rdOffset:] {
		if 'a' <= b && b <= 'z' || 'A' <= b && b <= 'Z' || b == '_' || '0' <= b && b <= '9' {
			// Avoid assigning a rune for the common case of an ascii character.
			continue
		}
		// b ends the ASCII run; rdOffset is relative to the slice start.
		s.rdOffset += rdOffset
		if 0 < b && b < utf8.RuneSelf {
			// Optimization: we've encountered an ASCII character that's not a letter
			// or number. Avoid the call into s.next() and corresponding set up.
			//
			// Note that s.next() does some line accounting if s.ch is '\n', so this
			// shortcut is only possible because we know that the preceding character
			// is not '\n'.
			s.ch = rune(b)
			s.offset = s.rdOffset
			s.rdOffset++
			goto exit
		}
		// We know that the preceding character is valid for an identifier because
		// scanIdentifier is only called when s.ch is a letter, so calling s.next()
		// at s.rdOffset resets the scanner state.
		s.next()
		for isLetter(s.ch) || isDigit(s.ch) {
			s.next()
		}
		goto exit
	}
	// The identifier runs to the end of the source.
	s.offset = len(s.src)
	s.rdOffset = len(s.src)
	s.ch = eof

exit:
	return string(s.src[offs:s.offset])
}
   381  
   382  func digitVal(ch rune) int {
   383  	switch {
   384  	case '0' <= ch && ch <= '9':
   385  		return int(ch - '0')
   386  	case 'a' <= lower(ch) && lower(ch) <= 'f':
   387  		return int(lower(ch) - 'a' + 10)
   388  	}
   389  	return 16 // larger than any legal digit val
   390  }
   391  
   392  func lower(ch rune) rune     { return ('a' - 'A') | ch } // returns lower-case ch iff ch is ASCII letter
   393  func isDecimal(ch rune) bool { return '0' <= ch && ch <= '9' }
   394  func isHex(ch rune) bool     { return '0' <= ch && ch <= '9' || 'a' <= lower(ch) && lower(ch) <= 'f' }
   395  
   396  // digits accepts the sequence { digit | '_' }.
   397  // If base <= 10, digits accepts any decimal digit but records
   398  // the offset (relative to the source start) of a digit >= base
   399  // in *invalid, if *invalid < 0.
   400  // digits returns a bitset describing whether the sequence contained
   401  // digits (bit 0 is set), or separators '_' (bit 1 is set).
   402  func (s *Scanner) digits(base int, invalid *int) (digsep int) {
   403  	if base <= 10 {
   404  		max := rune('0' + base)
   405  		for isDecimal(s.ch) || s.ch == '_' {
   406  			ds := 1
   407  			if s.ch == '_' {
   408  				ds = 2
   409  			} else if s.ch >= max && *invalid < 0 {
   410  				*invalid = s.offset // record invalid rune offset
   411  			}
   412  			digsep |= ds
   413  			s.next()
   414  		}
   415  	} else {
   416  		for isHex(s.ch) || s.ch == '_' {
   417  			ds := 1
   418  			if s.ch == '_' {
   419  				ds = 2
   420  			}
   421  			digsep |= ds
   422  			s.next()
   423  		}
   424  	}
   425  	return
   426  }
   427  
// scanNumber scans an INT, FLOAT, or IMAG literal beginning at s.offset
// (s.ch is its first character, or the '.' of a leading-dot float) and
// returns the token kind and the literal text. Malformed literals are
// reported via s.error/s.errorf but still returned as tokens.
func (s *Scanner) scanNumber() (token.Token, string) {
	offs := s.offset
	tok := token.ILLEGAL

	base := 10        // number base
	prefix := rune(0) // one of 0 (decimal), '0' (0-octal), 'x', 'o', or 'b'
	digsep := 0       // bit 0: digit present, bit 1: '_' present
	invalid := -1     // index of invalid digit in literal, or < 0

	// integer part
	if s.ch != '.' {
		tok = token.INT
		if s.ch == '0' {
			s.next()
			switch lower(s.ch) {
			case 'x':
				s.next()
				base, prefix = 16, 'x'
			case 'o':
				s.next()
				base, prefix = 8, 'o'
			case 'b':
				s.next()
				base, prefix = 2, 'b'
			default:
				// no prefix letter: a leading 0 means legacy octal
				base, prefix = 8, '0'
				digsep = 1 // leading 0
			}
		}
		digsep |= s.digits(base, &invalid)
	}

	// fractional part
	if s.ch == '.' {
		tok = token.FLOAT
		if prefix == 'o' || prefix == 'b' {
			s.error(s.offset, "invalid radix point in "+litname(prefix))
		}
		s.next()
		digsep |= s.digits(base, &invalid)
	}

	if digsep&1 == 0 {
		s.error(s.offset, litname(prefix)+" has no digits")
	}

	// exponent
	if e := lower(s.ch); e == 'e' || e == 'p' {
		switch {
		case e == 'e' && prefix != 0 && prefix != '0':
			s.errorf(s.offset, "%q exponent requires decimal mantissa", s.ch)
		case e == 'p' && prefix != 'x':
			s.errorf(s.offset, "%q exponent requires hexadecimal mantissa", s.ch)
		}
		s.next()
		tok = token.FLOAT
		if s.ch == '+' || s.ch == '-' {
			s.next()
		}
		ds := s.digits(10, nil) // exponent is always decimal
		digsep |= ds
		if ds&1 == 0 {
			s.error(s.offset, "exponent has no digits")
		}
	} else if prefix == 'x' && tok == token.FLOAT {
		s.error(s.offset, "hexadecimal mantissa requires a 'p' exponent")
	}

	// suffix 'i'
	if s.ch == 'i' {
		tok = token.IMAG
		s.next()
	}

	lit := string(s.src[offs:s.offset])
	if tok == token.INT && invalid >= 0 {
		// report the out-of-range digit recorded by s.digits
		s.errorf(invalid, "invalid digit %q in %s", lit[invalid-offs], litname(prefix))
	}
	if digsep&2 != 0 {
		if i := invalidSep(lit); i >= 0 {
			s.error(offs+i, "'_' must separate successive digits")
		}
	}

	return tok, lit
}
   514  
   515  func litname(prefix rune) string {
   516  	switch prefix {
   517  	case 'x':
   518  		return "hexadecimal literal"
   519  	case 'o', '0':
   520  		return "octal literal"
   521  	case 'b':
   522  		return "binary literal"
   523  	}
   524  	return "decimal literal"
   525  }
   526  
// invalidSep returns the index of the first invalid separator in x, or -1.
// A separator '_' is valid only between two digits; the state machine below
// tracks the previous character class to detect '_' next to a non-digit.
func invalidSep(x string) int {
	x1 := ' ' // prefix char, we only care if it's 'x'
	d := '.'  // digit, one of '_', '0' (a digit), or '.' (anything else)
	i := 0

	// a prefix counts as a digit
	if len(x) >= 2 && x[0] == '0' {
		x1 = lower(rune(x[1]))
		if x1 == 'x' || x1 == 'o' || x1 == 'b' {
			d = '0'
			i = 2
		}
	}

	// mantissa and exponent
	for ; i < len(x); i++ {
		p := d // previous digit
		d = rune(x[i])
		switch {
		case d == '_':
			// a '_' must follow a digit
			if p != '0' {
				return i
			}
		case isDecimal(d) || x1 == 'x' && isHex(d):
			d = '0' // classify as digit
		default:
			// a non-digit must not follow a '_'
			if p == '_' {
				return i - 1
			}
			d = '.' // classify as other
		}
	}
	// a trailing '_' is invalid too
	if d == '_' {
		return len(x) - 1
	}

	return -1
}
   566  
// scanEscape parses an escape sequence where rune is the accepted
// escaped quote. In case of a syntax error, it stops at the offending
// character (without consuming it) and returns false. Otherwise
// it returns true.
func (s *Scanner) scanEscape(quote rune) bool {
	offs := s.offset

	var n int            // number of digits expected after the escape char
	var base, max uint32 // digit base and maximum allowed value
	switch s.ch {
	case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote:
		// single-character escape: nothing further to parse
		s.next()
		return true
	case '0', '1', '2', '3', '4', '5', '6', '7':
		n, base, max = 3, 8, 255
	case 'x':
		s.next()
		n, base, max = 2, 16, 255
	case 'u':
		s.next()
		n, base, max = 4, 16, unicode.MaxRune
	case 'U':
		s.next()
		n, base, max = 8, 16, unicode.MaxRune
	default:
		msg := "unknown escape sequence"
		if s.ch < 0 {
			msg = "escape sequence not terminated"
		}
		s.error(offs, msg)
		return false
	}

	// accumulate exactly n digits in the given base
	var x uint32
	for n > 0 {
		d := uint32(digitVal(s.ch))
		if d >= base {
			msg := fmt.Sprintf("illegal character %#U in escape sequence", s.ch)
			if s.ch < 0 {
				msg = "escape sequence not terminated"
			}
			s.error(s.offset, msg)
			return false
		}
		x = x*base + d
		s.next()
		n--
	}

	// reject values above the allowed maximum and surrogate halves
	if x > max || 0xD800 <= x && x < 0xE000 {
		s.error(offs, "escape sequence is invalid Unicode code point")
		return false
	}

	return true
}
   623  
// scanRune scans a rune literal and returns its text including the
// surrounding quotes. The opening '\'' has already been consumed.
// Errors (unterminated literal, wrong number of characters, bad escape)
// are reported; at most one error is emitted per literal.
func (s *Scanner) scanRune() string {
	// '\'' opening already consumed
	offs := s.offset - 1

	valid := true
	n := 0 // number of characters between the quotes; must end up as 1
	for {
		ch := s.ch
		if ch == '\n' || ch < 0 {
			// only report error if we don't have one already
			if valid {
				s.error(offs, "rune literal not terminated")
				valid = false
			}
			break
		}
		s.next()
		if ch == '\'' {
			break
		}
		n++
		if ch == '\\' {
			if !s.scanEscape('\'') {
				valid = false
			}
			// continue to read to closing quote
		}
	}

	if valid && n != 1 {
		s.error(offs, "illegal rune literal")
	}

	return string(s.src[offs:s.offset])
}
   659  
   660  func (s *Scanner) scanString() string {
   661  	// '"' opening already consumed
   662  	offs := s.offset - 1
   663  
   664  	for {
   665  		ch := s.ch
   666  		if ch == '\n' || ch < 0 {
   667  			s.error(offs, "string literal not terminated")
   668  			break
   669  		}
   670  		s.next()
   671  		if ch == '"' {
   672  			break
   673  		}
   674  		if ch == '\\' {
   675  			s.scanEscape('"')
   676  		}
   677  	}
   678  
   679  	return string(s.src[offs:s.offset])
   680  }
   681  
   682  func stripCR(b []byte, comment bool) []byte {
   683  	c := make([]byte, len(b))
   684  	i := 0
   685  	for j, ch := range b {
   686  		// In a /*-style comment, don't strip \r from *\r/ (incl.
   687  		// sequences of \r from *\r\r...\r/) since the resulting
   688  		// */ would terminate the comment too early unless the \r
   689  		// is immediately following the opening /* in which case
   690  		// it's ok because /*/ is not closed yet (issue #11151).
   691  		if ch != '\r' || comment && i > len("/*") && c[i-1] == '*' && j+1 < len(b) && b[j+1] == '/' {
   692  			c[i] = ch
   693  			i++
   694  		}
   695  	}
   696  	return c[:i]
   697  }
   698  
   699  func (s *Scanner) scanRawString() string {
   700  	// '`' opening already consumed
   701  	offs := s.offset - 1
   702  
   703  	hasCR := false
   704  	for {
   705  		ch := s.ch
   706  		if ch < 0 {
   707  			s.error(offs, "raw string literal not terminated")
   708  			break
   709  		}
   710  		s.next()
   711  		if ch == '`' {
   712  			break
   713  		}
   714  		if ch == '\r' {
   715  			hasCR = true
   716  		}
   717  	}
   718  
   719  	lit := s.src[offs:s.offset]
   720  	if hasCR {
   721  		lit = stripCR(lit, false)
   722  	}
   723  
   724  	return string(lit)
   725  }
   726  
   727  func (s *Scanner) skipWhitespace() {
   728  	for s.ch == ' ' || s.ch == '\t' || s.ch == '\n' && !s.insertSemi || s.ch == '\r' {
   729  		s.next()
   730  	}
   731  }
   732  
   733  // Helper functions for scanning multi-byte tokens such as >> += >>= .
   734  // Different routines recognize different length tok_i based on matches
   735  // of ch_i. If a token ends in '=', the result is tok1 or tok3
   736  // respectively. Otherwise, the result is tok0 if there was no other
   737  // matching character, or tok2 if the matching character was ch2.
   738  
   739  func (s *Scanner) switch2(tok0, tok1 token.Token) token.Token {
   740  	if s.ch == '=' {
   741  		s.next()
   742  		return tok1
   743  	}
   744  	return tok0
   745  }
   746  
   747  func (s *Scanner) switch3(tok0, tok1 token.Token, ch2 rune, tok2 token.Token) token.Token {
   748  	if s.ch == '=' {
   749  		s.next()
   750  		return tok1
   751  	}
   752  	if s.ch == ch2 {
   753  		s.next()
   754  		return tok2
   755  	}
   756  	return tok0
   757  }
   758  
   759  func (s *Scanner) switch4(tok0, tok1 token.Token, ch2 rune, tok2, tok3 token.Token) token.Token {
   760  	if s.ch == '=' {
   761  		s.next()
   762  		return tok1
   763  	}
   764  	if s.ch == ch2 {
   765  		s.next()
   766  		if s.ch == '=' {
   767  			s.next()
   768  			return tok3
   769  		}
   770  		return tok2
   771  	}
   772  	return tok0
   773  }
   774  
   775  // End returns the position immediately after the last scanned token.
   776  // If [Scanner.Scan] has not been called yet, End returns [token.NoPos].
   777  func (s *Scanner) End() token.Pos {
   778  	// Handles special case:
   779  	// - Makes sure we return [token.NoPos], even when [Scanner.Init] has consumed a BOM.
   780  	// - When the previous token was a synthetic [token.SEMICOLON] inside a multi-line
   781  	//   comment, we make sure End returns its ending position (i.e. prevPos+len("\n")).
   782  	if s.endPosValid {
   783  		return s.endPos
   784  	}
   785  
   786  	// Normal case: s.file.Pos(s.offset) represents the end of the token
   787  	return s.file.Pos(s.offset)
   788  }
   789  
   790  // Scan scans the next token and returns the token position, the token,
   791  // and its literal string if applicable. The source end is indicated by
   792  // [token.EOF].
   793  //
   794  // If the returned token is a literal ([token.IDENT], [token.INT], [token.FLOAT],
   795  // [token.IMAG], [token.CHAR], [token.STRING]) or [token.COMMENT], the literal string
   796  // has the corresponding value.
   797  //
   798  // If the returned token is a keyword, the literal string is the keyword.
   799  //
   800  // If the returned token is [token.SEMICOLON], the corresponding
   801  // literal string is ";" if the semicolon was present in the source,
   802  // and "\n" if the semicolon was inserted because of a newline or
   803  // at EOF. If the newline is within a /*...*/ comment, the SEMICOLON token
   804  // is synthesized immediately after the COMMENT token; its position is that
   805  // of the actual newline within the comment.
   806  //
   807  // If the returned token is [token.ILLEGAL], the literal string is the
   808  // offending character.
   809  //
   810  // In all other cases, Scan returns an empty literal string.
   811  //
   812  // For more tolerant parsing, Scan will return a valid token if
   813  // possible even if a syntax error was encountered. Thus, even
   814  // if the resulting token sequence contains no illegal tokens,
   815  // a client may not assume that no error occurred. Instead it
   816  // must check the scanner's ErrorCount or the number of calls
   817  // of the error handler, if there was one installed.
   818  //
   819  // Scan adds line information to the file added to the file
   820  // set with Init. Token positions are relative to that file
   821  // and thus relative to the file set.
func (s *Scanner) Scan() (pos token.Pos, tok token.Token, lit string) {
scanAgain:
	s.endPosValid = false
	if s.nlPos.IsValid() {
		// Return artificial ';' token after /*...*/ comment
		// containing newline, at position of first newline.
		pos, tok, lit = s.nlPos, token.SEMICOLON, "\n"
		s.endPos = pos + 1
		s.endPosValid = true
		s.nlPos = token.NoPos
		return
	}

	s.skipWhitespace()

	// current token start
	pos = s.file.Pos(s.offset)

	// determine token value
	insertSemi := false // whether a newline after this token should yield a SEMICOLON
	switch ch := s.ch; {
	case isLetter(ch):
		lit = s.scanIdentifier()
		if len(lit) > 1 {
			// keywords are longer than one letter - avoid lookup otherwise
			tok = token.Lookup(lit)
			switch tok {
			case token.IDENT, token.BREAK, token.CONTINUE, token.FALLTHROUGH, token.RETURN:
				insertSemi = true
			}
		} else {
			insertSemi = true
			tok = token.IDENT
		}
	case isDecimal(ch) || ch == '.' && isDecimal(rune(s.peek())):
		insertSemi = true
		tok, lit = s.scanNumber()
	default:
		s.next() // always make progress
		switch ch {
		case eof:
			if s.insertSemi {
				s.insertSemi = false // EOF consumed
				return pos, token.SEMICOLON, "\n"
			}
			tok = token.EOF
		case '\n':
			// we only reach here if s.insertSemi was
			// set in the first place and exited early
			// from s.skipWhitespace()
			s.insertSemi = false // newline consumed
			return pos, token.SEMICOLON, "\n"
		case '"':
			insertSemi = true
			tok = token.STRING
			lit = s.scanString()
		case '\'':
			insertSemi = true
			tok = token.CHAR
			lit = s.scanRune()
		case '`':
			insertSemi = true
			tok = token.STRING
			lit = s.scanRawString()
		case ':':
			tok = s.switch2(token.COLON, token.DEFINE)
		case '.':
			// fractions starting with a '.' are handled by outer switch
			tok = token.PERIOD
			if s.ch == '.' && s.peek() == '.' {
				s.next()
				s.next() // consume last '.'
				tok = token.ELLIPSIS
			}
		case ',':
			tok = token.COMMA
		case ';':
			tok = token.SEMICOLON
			lit = ";"
		case '(':
			tok = token.LPAREN
		case ')':
			insertSemi = true
			tok = token.RPAREN
		case '[':
			tok = token.LBRACK
		case ']':
			insertSemi = true
			tok = token.RBRACK
		case '{':
			tok = token.LBRACE
		case '}':
			insertSemi = true
			tok = token.RBRACE
		case '+':
			tok = s.switch3(token.ADD, token.ADD_ASSIGN, '+', token.INC)
			if tok == token.INC {
				insertSemi = true
			}
		case '-':
			tok = s.switch3(token.SUB, token.SUB_ASSIGN, '-', token.DEC)
			if tok == token.DEC {
				insertSemi = true
			}
		case '*':
			tok = s.switch2(token.MUL, token.MUL_ASSIGN)
		case '/':
			if s.ch == '/' || s.ch == '*' {
				// comment
				comment, nlOffset := s.scanComment()
				if s.insertSemi && nlOffset != 0 {
					// For /*...*/ containing \n, return
					// COMMENT then artificial SEMICOLON.
					s.nlPos = s.file.Pos(nlOffset)
					s.insertSemi = false
				} else {
					insertSemi = s.insertSemi // preserve insertSemi info
				}
				if s.mode&ScanComments == 0 {
					// skip comment
					goto scanAgain
				}
				tok = token.COMMENT
				lit = comment
			} else {
				// division
				tok = s.switch2(token.QUO, token.QUO_ASSIGN)
			}
		case '%':
			tok = s.switch2(token.REM, token.REM_ASSIGN)
		case '^':
			tok = s.switch2(token.XOR, token.XOR_ASSIGN)
		case '<':
			if s.ch == '-' {
				s.next()
				tok = token.ARROW
			} else {
				tok = s.switch4(token.LSS, token.LEQ, '<', token.SHL, token.SHL_ASSIGN)
			}
		case '>':
			tok = s.switch4(token.GTR, token.GEQ, '>', token.SHR, token.SHR_ASSIGN)
		case '=':
			tok = s.switch2(token.ASSIGN, token.EQL)
		case '!':
			tok = s.switch2(token.NOT, token.NEQ)
		case '&':
			if s.ch == '^' {
				s.next()
				tok = s.switch2(token.AND_NOT, token.AND_NOT_ASSIGN)
			} else {
				tok = s.switch3(token.AND, token.AND_ASSIGN, '&', token.LAND)
			}
		case '|':
			tok = s.switch3(token.OR, token.OR_ASSIGN, '|', token.LOR)
		case '~':
			tok = token.TILDE
		default:
			// next reports unexpected BOMs - don't repeat
			if ch != bom {
				// Report an informative error for U+201[CD] quotation
				// marks, which are easily introduced via copy and paste.
				if ch == '“' || ch == '”' {
					s.errorf(s.file.Offset(pos), "curly quotation mark %q (use neutral %q)", ch, '"')
				} else {
					s.errorf(s.file.Offset(pos), "illegal character %#U", ch)
				}
			}
			insertSemi = s.insertSemi // preserve insertSemi info
			tok = token.ILLEGAL
			lit = string(ch)
		}
	}
	// Remember whether this token triggers semicolon insertion at the next
	// newline (unless semicolon insertion is disabled for testing).
	if s.mode&dontInsertSemis == 0 {
		s.insertSemi = insertSemi
	}

	return
}
  1000  

View as plain text