Source file
src/go/scanner/scanner.go
1
2
3
4
5
6
7
8 package scanner
9
10 import (
11 "bytes"
12 "fmt"
13 "go/internal/scannerhooks"
14 "go/token"
15 "path/filepath"
16 "strconv"
17 "unicode"
18 "unicode/utf8"
19 )
20
21
22
23
24
// An ErrorHandler is the callback type through which a Scanner reports
// errors. The handler passed to Scanner.Init is stored in the scanner and,
// if it is non-nil, Scanner.error calls it with the position computed from
// the offending source offset and the error message.
25 type ErrorHandler func(pos token.Position, msg string)
26
27
28
29
// A Scanner holds the state needed to tokenize one source buffer.
// It is set up with Init and then driven by repeated calls to Scan.
30 type Scanner struct {
31 // configuration, assigned by Init
32 file *token.File // source file handle; receives line info and maps offsets to positions
33 dir string // directory part of file.Name(), as returned by filepath.Split
34 src []byte // source bytes being scanned
35 err ErrorHandler // error callback; may be nil, in which case errors are only counted
36 mode Mode // scanning mode flags
37
38 // scanning state
39 ch rune // current character; eof (< 0) once the end of src is reached
40 offset int // byte offset of ch within src
41 rdOffset int // byte offset of the next character to decode (position after ch)
42 lineOffset int // offset at which the current line starts
43 insertSemi bool // if set, the next newline or EOF is returned as a SEMICOLON token
44 nlPos token.Pos // if valid, position of the first newline inside the preceding /*...*/ comment; Scan then returns a SEMICOLON there
45 stringEnd token.Pos // position just past the raw source text of the most recently scanned string literal
46
47 // public state
48 ErrorCount int // number of errors encountered; incremented by error
49 }
50
51
52 func init() {
53 scannerhooks.StringEnd = func(scanner any) token.Pos {
54 return scanner.(*Scanner).stringEnd
55 }
56 }
57
// Special character values used by the scanner.
58 const (
59 bom = 0xFEFF // byte order mark; Init skips one at the very start of src, next reports it anywhere else
60 eof = -1 // value next stores in Scanner.ch once the end of src is reached
61 )
62
63
64
65
66
67
68 func (s *Scanner) next() {
69 if s.rdOffset < len(s.src) {
70 s.offset = s.rdOffset
71 if s.ch == '\n' {
72 s.lineOffset = s.offset
73 s.file.AddLine(s.offset)
74 }
75 r, w := rune(s.src[s.rdOffset]), 1
76 switch {
77 case r == 0:
78 s.error(s.offset, "illegal character NUL")
79 case r >= utf8.RuneSelf:
80
81 r, w = utf8.DecodeRune(s.src[s.rdOffset:])
82 if r == utf8.RuneError && w == 1 {
83 in := s.src[s.rdOffset:]
84 if s.offset == 0 &&
85 len(in) >= 2 &&
86 (in[0] == 0xFF && in[1] == 0xFE || in[0] == 0xFE && in[1] == 0xFF) {
87
88
89 s.error(s.offset, "illegal UTF-8 encoding (got UTF-16)")
90 s.rdOffset += len(in)
91 } else {
92 s.error(s.offset, "illegal UTF-8 encoding")
93 }
94 } else if r == bom && s.offset > 0 {
95 s.error(s.offset, "illegal byte order mark")
96 }
97 }
98 s.rdOffset += w
99 s.ch = r
100 } else {
101 s.offset = len(s.src)
102 if s.ch == '\n' {
103 s.lineOffset = s.offset
104 s.file.AddLine(s.offset)
105 }
106 s.ch = eof
107 }
108 }
109
110
111
112 func (s *Scanner) peek() byte {
113 if s.rdOffset < len(s.src) {
114 return s.src[s.rdOffset]
115 }
116 return 0
117 }
118
119
120
// A Mode value is a set of bit flags (or 0) that is passed to Init and
// controls how Scan behaves.
121 type Mode uint
122
123 const (
124 ScanComments Mode = 1 << iota // Scan returns comments as COMMENT tokens instead of skipping them
125 dontInsertSemis // Scan never records that a semicolon should be inserted after the token just scanned
126 )
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142 func (s *Scanner) Init(file *token.File, src []byte, err ErrorHandler, mode Mode) {
143
144 if file.Size() != len(src) {
145 panic(fmt.Sprintf("file size (%d) does not match src len (%d)", file.Size(), len(src)))
146 }
147 s.file = file
148 s.dir, _ = filepath.Split(file.Name())
149 s.src = src
150 s.err = err
151 s.mode = mode
152
153 s.ch = ' '
154 s.offset = 0
155 s.rdOffset = 0
156 s.lineOffset = 0
157 s.insertSemi = false
158 s.ErrorCount = 0
159
160 s.next()
161 if s.ch == bom {
162 s.next()
163 }
164 }
165
166 func (s *Scanner) error(offs int, msg string) {
167 if s.err != nil {
168 s.err(s.file.Position(s.file.Pos(offs)), msg)
169 }
170 s.ErrorCount++
171 }
172
173 func (s *Scanner) errorf(offs int, format string, args ...any) {
174 s.error(offs, fmt.Sprintf(format, args...))
175 }
176
177
178
179
180 func (s *Scanner) scanComment() (string, int) {
181
182 offs := s.offset - 1
183 next := -1
184 numCR := 0
185 nlOffset := 0
186
187 if s.ch == '/' {
188
189
190 s.next()
191 for s.ch != '\n' && s.ch >= 0 {
192 if s.ch == '\r' {
193 numCR++
194 }
195 s.next()
196 }
197
198 next = s.offset
199 if s.ch == '\n' {
200 next++
201 }
202 goto exit
203 }
204
205
206 s.next()
207 for s.ch >= 0 {
208 ch := s.ch
209 if ch == '\r' {
210 numCR++
211 } else if ch == '\n' && nlOffset == 0 {
212 nlOffset = s.offset
213 }
214 s.next()
215 if ch == '*' && s.ch == '/' {
216 s.next()
217 next = s.offset
218 goto exit
219 }
220 }
221
222 s.error(offs, "comment not terminated")
223
224 exit:
225 lit := s.src[offs:s.offset]
226
227
228
229
230
231
232 if numCR > 0 && len(lit) >= 2 && lit[1] == '/' && lit[len(lit)-1] == '\r' {
233 lit = lit[:len(lit)-1]
234 numCR--
235 }
236
237
238
239 if next >= 0 && (lit[1] == '*' || offs == s.lineOffset) && bytes.HasPrefix(lit[2:], prefix) {
240 s.updateLineInfo(next, offs, lit)
241 }
242
243 if numCR > 0 {
244 lit = stripCR(lit, lit[1] == '*')
245 }
246
247 return string(lit), nlOffset
248 }
249
// prefix is the text that must immediately follow a comment opener ("//" or
// "/*") for scanComment to treat the comment as a line directive.
250 var prefix = []byte("line ")
251
252
253
254
255 func (s *Scanner) updateLineInfo(next, offs int, text []byte) {
256
257 if text[1] == '*' {
258 text = text[:len(text)-2]
259 }
260 text = text[7:]
261 offs += 7
262
263 i, n, ok := trailingDigits(text)
264 if i == 0 {
265 return
266 }
267
268
269 if !ok {
270
271 s.error(offs+i, "invalid line number: "+string(text[i:]))
272 return
273 }
274
275
276
277
278 const maxLineCol = 1 << 30
279 var line, col int
280 i2, n2, ok2 := trailingDigits(text[:i-1])
281 if ok2 {
282
283 i, i2 = i2, i
284 line, col = n2, n
285 if col == 0 || col > maxLineCol {
286 s.error(offs+i2, "invalid column number: "+string(text[i2:]))
287 return
288 }
289 text = text[:i2-1]
290 } else {
291
292 line = n
293 }
294
295 if line == 0 || line > maxLineCol {
296 s.error(offs+i, "invalid line number: "+string(text[i:]))
297 return
298 }
299
300
301
302 filename := string(text[:i-1])
303 if filename == "" && ok2 {
304 filename = s.file.Position(s.file.Pos(offs)).Filename
305 } else if filename != "" {
306
307
308
309 filename = filepath.Clean(filename)
310 if !filepath.IsAbs(filename) {
311 filename = filepath.Join(s.dir, filename)
312 }
313 }
314
315 s.file.AddLineColumnInfo(next, filename, line, col)
316 }
317
// trailingDigits looks for a suffix of the form ":ddd" in text, searching for
// the ':' from the right.
//
// It returns the index of the first character after the last ':', the
// decimal value of the text following it, and whether that text is a valid
// unsigned decimal number that fits into an int. If text contains no ':',
// the result is (0, 0, false). If the suffix is not a number, or its value
// does not fit into an int, the result is (index, 0, false); in particular
// a huge value is never returned as a wrapped, negative int.
func trailingDigits(text []byte) (int, int, bool) {
	i := bytes.LastIndexByte(text, ':') // look from the right: the text before may itself contain ':'
	if i < 0 {
		return 0, 0, false // no ':'
	}

	const maxInt = int(^uint(0) >> 1)

	n, err := strconv.ParseUint(string(text[i+1:]), 10, 0)
	if err != nil || n > uint64(maxInt) {
		// ParseUint accepts values up to the maximum uint; converting such
		// a value to int would silently yield a negative number.
		return i + 1, 0, false
	}
	return i + 1, int(n), true
}
327
328 func isLetter(ch rune) bool {
329 return 'a' <= lower(ch) && lower(ch) <= 'z' || ch == '_' || ch >= utf8.RuneSelf && unicode.IsLetter(ch)
330 }
331
332 func isDigit(ch rune) bool {
333 return isDecimal(ch) || ch >= utf8.RuneSelf && unicode.IsDigit(ch)
334 }
335
336
337
338
339
340
341 func (s *Scanner) scanIdentifier() string {
342 offs := s.offset
343
344
345
346
347
348
349
350
351 for rdOffset, b := range s.src[s.rdOffset:] {
352 if 'a' <= b && b <= 'z' || 'A' <= b && b <= 'Z' || b == '_' || '0' <= b && b <= '9' {
353
354 continue
355 }
356 s.rdOffset += rdOffset
357 if 0 < b && b < utf8.RuneSelf {
358
359
360
361
362
363
364 s.ch = rune(b)
365 s.offset = s.rdOffset
366 s.rdOffset++
367 goto exit
368 }
369
370
371
372 s.next()
373 for isLetter(s.ch) || isDigit(s.ch) {
374 s.next()
375 }
376 goto exit
377 }
378 s.offset = len(s.src)
379 s.rdOffset = len(s.src)
380 s.ch = eof
381
382 exit:
383 return string(s.src[offs:s.offset])
384 }
385
386 func digitVal(ch rune) int {
387 switch {
388 case '0' <= ch && ch <= '9':
389 return int(ch - '0')
390 case 'a' <= lower(ch) && lower(ch) <= 'f':
391 return int(lower(ch) - 'a' + 10)
392 }
393 return 16
394 }
395
// lower returns ch with the ASCII case bit set. For an ASCII letter this is
// its lower-case form; other characters may be changed as well, so the
// result is only meaningful when compared against lower-case letters.
func lower(ch rune) rune {
	const caseBit = 'a' - 'A'
	return ch | caseBit
}

// isDecimal reports whether ch is an ASCII decimal digit.
func isDecimal(ch rune) bool {
	return ch >= '0' && ch <= '9'
}

// isHex reports whether ch is an ASCII hexadecimal digit of either case.
func isHex(ch rune) bool {
	if isDecimal(ch) {
		return true
	}
	c := lower(ch)
	return c >= 'a' && c <= 'f'
}
399
400
401
402
403
404
405
406 func (s *Scanner) digits(base int, invalid *int) (digsep int) {
407 if base <= 10 {
408 max := rune('0' + base)
409 for isDecimal(s.ch) || s.ch == '_' {
410 ds := 1
411 if s.ch == '_' {
412 ds = 2
413 } else if s.ch >= max && *invalid < 0 {
414 *invalid = s.offset
415 }
416 digsep |= ds
417 s.next()
418 }
419 } else {
420 for isHex(s.ch) || s.ch == '_' {
421 ds := 1
422 if s.ch == '_' {
423 ds = 2
424 }
425 digsep |= ds
426 s.next()
427 }
428 }
429 return
430 }
431
432 func (s *Scanner) scanNumber() (token.Token, string) {
433 offs := s.offset
434 tok := token.ILLEGAL
435
436 base := 10
437 prefix := rune(0)
438 digsep := 0
439 invalid := -1
440
441
442 if s.ch != '.' {
443 tok = token.INT
444 if s.ch == '0' {
445 s.next()
446 switch lower(s.ch) {
447 case 'x':
448 s.next()
449 base, prefix = 16, 'x'
450 case 'o':
451 s.next()
452 base, prefix = 8, 'o'
453 case 'b':
454 s.next()
455 base, prefix = 2, 'b'
456 default:
457 base, prefix = 8, '0'
458 digsep = 1
459 }
460 }
461 digsep |= s.digits(base, &invalid)
462 }
463
464
465 if s.ch == '.' {
466 tok = token.FLOAT
467 if prefix == 'o' || prefix == 'b' {
468 s.error(s.offset, "invalid radix point in "+litname(prefix))
469 }
470 s.next()
471 digsep |= s.digits(base, &invalid)
472 }
473
474 if digsep&1 == 0 {
475 s.error(s.offset, litname(prefix)+" has no digits")
476 }
477
478
479 if e := lower(s.ch); e == 'e' || e == 'p' {
480 switch {
481 case e == 'e' && prefix != 0 && prefix != '0':
482 s.errorf(s.offset, "%q exponent requires decimal mantissa", s.ch)
483 case e == 'p' && prefix != 'x':
484 s.errorf(s.offset, "%q exponent requires hexadecimal mantissa", s.ch)
485 }
486 s.next()
487 tok = token.FLOAT
488 if s.ch == '+' || s.ch == '-' {
489 s.next()
490 }
491 ds := s.digits(10, nil)
492 digsep |= ds
493 if ds&1 == 0 {
494 s.error(s.offset, "exponent has no digits")
495 }
496 } else if prefix == 'x' && tok == token.FLOAT {
497 s.error(s.offset, "hexadecimal mantissa requires a 'p' exponent")
498 }
499
500
501 if s.ch == 'i' {
502 tok = token.IMAG
503 s.next()
504 }
505
506 lit := string(s.src[offs:s.offset])
507 if tok == token.INT && invalid >= 0 {
508 s.errorf(invalid, "invalid digit %q in %s", lit[invalid-offs], litname(prefix))
509 }
510 if digsep&2 != 0 {
511 if i := invalidSep(lit); i >= 0 {
512 s.error(offs+i, "'_' must separate successive digits")
513 }
514 }
515
516 return tok, lit
517 }
518
// litname returns the name used in error messages for a numeric literal with
// the given prefix: 'x' (hexadecimal), 'o' or '0' (octal), 'b' (binary).
// Any other prefix value denotes a decimal literal.
func litname(prefix rune) string {
	name := "decimal literal"
	switch prefix {
	case 'b':
		name = "binary literal"
	case '0', 'o':
		name = "octal literal"
	case 'x':
		name = "hexadecimal literal"
	}
	return name
}
530
531
532 func invalidSep(x string) int {
533 x1 := ' '
534 d := '.'
535 i := 0
536
537
538 if len(x) >= 2 && x[0] == '0' {
539 x1 = lower(rune(x[1]))
540 if x1 == 'x' || x1 == 'o' || x1 == 'b' {
541 d = '0'
542 i = 2
543 }
544 }
545
546
547 for ; i < len(x); i++ {
548 p := d
549 d = rune(x[i])
550 switch {
551 case d == '_':
552 if p != '0' {
553 return i
554 }
555 case isDecimal(d) || x1 == 'x' && isHex(d):
556 d = '0'
557 default:
558 if p == '_' {
559 return i - 1
560 }
561 d = '.'
562 }
563 }
564 if d == '_' {
565 return len(x) - 1
566 }
567
568 return -1
569 }
570
571
572
573
574
575 func (s *Scanner) scanEscape(quote rune) bool {
576 offs := s.offset
577
578 var n int
579 var base, max uint32
580 switch s.ch {
581 case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote:
582 s.next()
583 return true
584 case '0', '1', '2', '3', '4', '5', '6', '7':
585 n, base, max = 3, 8, 255
586 case 'x':
587 s.next()
588 n, base, max = 2, 16, 255
589 case 'u':
590 s.next()
591 n, base, max = 4, 16, unicode.MaxRune
592 case 'U':
593 s.next()
594 n, base, max = 8, 16, unicode.MaxRune
595 default:
596 msg := "unknown escape sequence"
597 if s.ch < 0 {
598 msg = "escape sequence not terminated"
599 }
600 s.error(offs, msg)
601 return false
602 }
603
604 var x uint32
605 for n > 0 {
606 d := uint32(digitVal(s.ch))
607 if d >= base {
608 msg := fmt.Sprintf("illegal character %#U in escape sequence", s.ch)
609 if s.ch < 0 {
610 msg = "escape sequence not terminated"
611 }
612 s.error(s.offset, msg)
613 return false
614 }
615 x = x*base + d
616 s.next()
617 n--
618 }
619
620 if x > max || 0xD800 <= x && x < 0xE000 {
621 s.error(offs, "escape sequence is invalid Unicode code point")
622 return false
623 }
624
625 return true
626 }
627
628 func (s *Scanner) scanRune() string {
629
630 offs := s.offset - 1
631
632 valid := true
633 n := 0
634 for {
635 ch := s.ch
636 if ch == '\n' || ch < 0 {
637
638 if valid {
639 s.error(offs, "rune literal not terminated")
640 valid = false
641 }
642 break
643 }
644 s.next()
645 if ch == '\'' {
646 break
647 }
648 n++
649 if ch == '\\' {
650 if !s.scanEscape('\'') {
651 valid = false
652 }
653
654 }
655 }
656
657 if valid && n != 1 {
658 s.error(offs, "illegal rune literal")
659 }
660
661 return string(s.src[offs:s.offset])
662 }
663
664 func (s *Scanner) scanString() string {
665
666 offs := s.offset - 1
667
668 for {
669 ch := s.ch
670 if ch == '\n' || ch < 0 {
671 s.error(offs, "string literal not terminated")
672 break
673 }
674 s.next()
675 if ch == '"' {
676 break
677 }
678 if ch == '\\' {
679 s.scanEscape('"')
680 }
681 }
682
683 return string(s.src[offs:s.offset])
684 }
685
// stripCR returns a copy of b with carriage returns removed.
//
// If comment is set, b is a /*-style comment and a '\r' sitting directly
// between a '*' and a '/' is kept: removing it would create a "*/" that
// closes the comment too early. The exception is a '\r' right after the
// opening "/*", which is removed since "/*/" does not close a comment.
func stripCR(b []byte, comment bool) []byte {
	out := make([]byte, 0, len(b))
	for idx, c := range b {
		if c == '\r' {
			keep := comment &&
				len(out) > len("/*") &&
				out[len(out)-1] == '*' &&
				idx+1 < len(b) &&
				b[idx+1] == '/'
			if !keep {
				continue
			}
		}
		out = append(out, c)
	}
	return out
}
702
703 func (s *Scanner) scanRawString() (string, int) {
704
705 offs := s.offset - 1
706
707 hasCR := false
708 for {
709 ch := s.ch
710 if ch < 0 {
711 s.error(offs, "raw string literal not terminated")
712 break
713 }
714 s.next()
715 if ch == '`' {
716 break
717 }
718 if ch == '\r' {
719 hasCR = true
720 }
721 }
722
723 lit := s.src[offs:s.offset]
724 rawLen := len(lit)
725 if hasCR {
726 lit = stripCR(lit, false)
727 }
728
729 return string(lit), rawLen
730 }
731
732 func (s *Scanner) skipWhitespace() {
733 for s.ch == ' ' || s.ch == '\t' || s.ch == '\n' && !s.insertSemi || s.ch == '\r' {
734 s.next()
735 }
736 }
737
738
739
740
741
742
743
744 func (s *Scanner) switch2(tok0, tok1 token.Token) token.Token {
745 if s.ch == '=' {
746 s.next()
747 return tok1
748 }
749 return tok0
750 }
751
752 func (s *Scanner) switch3(tok0, tok1 token.Token, ch2 rune, tok2 token.Token) token.Token {
753 if s.ch == '=' {
754 s.next()
755 return tok1
756 }
757 if s.ch == ch2 {
758 s.next()
759 return tok2
760 }
761 return tok0
762 }
763
764 func (s *Scanner) switch4(tok0, tok1 token.Token, ch2 rune, tok2, tok3 token.Token) token.Token {
765 if s.ch == '=' {
766 s.next()
767 return tok1
768 }
769 if s.ch == ch2 {
770 s.next()
771 if s.ch == '=' {
772 s.next()
773 return tok3
774 }
775 return tok2
776 }
777 return tok0
778 }
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
// Scan scans the next token and returns the token position, the token, and
// its literal string if applicable. The source end is indicated by token.EOF.
//
// lit is set for identifiers and keywords (the scanned text), for numbers,
// character and string literals, and for comments (their text as returned by
// the respective scan functions). For an explicit semicolon lit is ";", for
// an automatically inserted one it is "\n". For an ILLEGAL token produced by
// an unexpected character lit is that character. For all other tokens
// (operators and delimiters) lit is left empty.
//
// A newline or the end of input is returned as a SEMICOLON token if the
// previous token was one after which insertSemi was set (identifiers, basic
// literals, the keywords break/continue/fallthrough/return, "++", "--", and
// closing ")", "]", "}"). If the dontInsertSemis mode bit is set, insertSemi
// is never set by Scan.
//
// Comments are returned as COMMENT tokens only if the ScanComments mode bit
// is set; otherwise they are skipped. A /*...*/ comment containing a newline
// acts like a newline for semicolon insertion: the SEMICOLON is returned
// after the comment, positioned at the comment's first newline.
//
// Errors are reported through Scanner.error, so ErrorCount (or the error
// handler) tells whether scanning succeeded; Scan itself always returns a
// token and keeps making progress.
810 func (s *Scanner) Scan() (pos token.Pos, tok token.Token, lit string) {
811 scanAgain:
812 if s.nlPos.IsValid() {
813 // Return the artificial ';' token owed after a /*...*/ comment
814 // containing a newline, at the position of its first newline.
815 pos, tok, lit = s.nlPos, token.SEMICOLON, "\n"
816 s.nlPos = token.NoPos
817 return
818 }
819
820 s.skipWhitespace()
821
822 // current token start
823 pos = s.file.Pos(s.offset)
824
825 // determine token value
826 insertSemi := false
827 switch ch := s.ch; {
828 case isLetter(ch):
829 lit = s.scanIdentifier()
830 if len(lit) > 1 {
831 // only multi-character text is looked up as a keyword; a single letter is always an IDENT
832 tok = token.Lookup(lit)
833 switch tok {
834 case token.IDENT, token.BREAK, token.CONTINUE, token.FALLTHROUGH, token.RETURN:
835 insertSemi = true
836 }
837 } else {
838 insertSemi = true
839 tok = token.IDENT
840 }
841 case isDecimal(ch) || ch == '.' && isDecimal(rune(s.peek())):
842 insertSemi = true
843 tok, lit = s.scanNumber()
844 default:
845 s.next() // consume ch; from here on s.ch is the character following it
846 switch ch {
847 case eof:
848 if s.insertSemi {
849 s.insertSemi = false // the pending semicolon is delivered now; EOF follows on the next call
850 return pos, token.SEMICOLON, "\n"
851 }
852 tok = token.EOF
853 case '\n':
854 // we only reach here if s.insertSemi was
855 // set in the first place and skipWhitespace
856 // therefore stopped at the newline
857 s.insertSemi = false // newline consumed
858 return pos, token.SEMICOLON, "\n"
859 case '"':
860 insertSemi = true
861 tok = token.STRING
862 lit = s.scanString()
863 s.stringEnd = pos + token.Pos(len(lit))
864 case '\'':
865 insertSemi = true
866 tok = token.CHAR
867 lit = s.scanRune()
868 case '`':
869 insertSemi = true
870 tok = token.STRING
871 var rawLen int
872 lit, rawLen = s.scanRawString()
873 s.stringEnd = pos + token.Pos(rawLen) // rawLen counts source bytes, including any '\r' stripped from lit
874 case ':':
875 tok = s.switch2(token.COLON, token.DEFINE)
876 case '.':
877 // a '.' followed by a decimal digit was already handled as a number above
878 tok = token.PERIOD
879 if s.ch == '.' && s.peek() == '.' {
880 s.next()
881 s.next() // consume last '.'
882 tok = token.ELLIPSIS
883 }
884 case ',':
885 tok = token.COMMA
886 case ';':
887 tok = token.SEMICOLON
888 lit = ";"
889 case '(':
890 tok = token.LPAREN
891 case ')':
892 insertSemi = true
893 tok = token.RPAREN
894 case '[':
895 tok = token.LBRACK
896 case ']':
897 insertSemi = true
898 tok = token.RBRACK
899 case '{':
900 tok = token.LBRACE
901 case '}':
902 insertSemi = true
903 tok = token.RBRACE
904 case '+':
905 tok = s.switch3(token.ADD, token.ADD_ASSIGN, '+', token.INC)
906 if tok == token.INC {
907 insertSemi = true
908 }
909 case '-':
910 tok = s.switch3(token.SUB, token.SUB_ASSIGN, '-', token.DEC)
911 if tok == token.DEC {
912 insertSemi = true
913 }
914 case '*':
915 tok = s.switch2(token.MUL, token.MUL_ASSIGN)
916 case '/':
917 if s.ch == '/' || s.ch == '*' {
918 // comment
919 comment, nlOffset := s.scanComment()
920 if s.insertSemi && nlOffset != 0 {
921 // For a /*...*/ comment containing a newline, return the
922 // COMMENT (if requested) and then an artificial SEMICOLON.
923 s.nlPos = s.file.Pos(nlOffset)
924 s.insertSemi = false
925 } else {
926 insertSemi = s.insertSemi // a comment does not change whether a semicolon is pending
927 }
928 if s.mode&ScanComments == 0 {
929 // skip comment
930 goto scanAgain
931 }
932 tok = token.COMMENT
933 lit = comment
934 } else {
935 // division
936 tok = s.switch2(token.QUO, token.QUO_ASSIGN)
937 }
938 case '%':
939 tok = s.switch2(token.REM, token.REM_ASSIGN)
940 case '^':
941 tok = s.switch2(token.XOR, token.XOR_ASSIGN)
942 case '<':
943 if s.ch == '-' {
944 s.next()
945 tok = token.ARROW
946 } else {
947 tok = s.switch4(token.LSS, token.LEQ, '<', token.SHL, token.SHL_ASSIGN)
948 }
949 case '>':
950 tok = s.switch4(token.GTR, token.GEQ, '>', token.SHR, token.SHR_ASSIGN)
951 case '=':
952 tok = s.switch2(token.ASSIGN, token.EQL)
953 case '!':
954 tok = s.switch2(token.NOT, token.NEQ)
955 case '&':
956 if s.ch == '^' {
957 s.next()
958 tok = s.switch2(token.AND_NOT, token.AND_NOT_ASSIGN)
959 } else {
960 tok = s.switch3(token.AND, token.AND_ASSIGN, '&', token.LAND)
961 }
962 case '|':
963 tok = s.switch3(token.OR, token.OR_ASSIGN, '|', token.LOR)
964 case '~':
965 tok = token.TILDE
966 default:
967 // next already reports a misplaced byte order mark - don't report it again
968 if ch != bom {
969 // Give a more helpful message for the curly quotation marks
970 // U+201C and U+201D than the generic illegal-character error.
971 if ch == '“' || ch == '”' {
972 s.errorf(s.file.Offset(pos), "curly quotation mark %q (use neutral %q)", ch, '"')
973 } else {
974 s.errorf(s.file.Offset(pos), "illegal character %#U", ch)
975 }
976 }
977 insertSemi = s.insertSemi // an illegal character does not change whether a semicolon is pending
978 tok = token.ILLEGAL
979 lit = string(ch)
980 }
981 }
982 if s.mode&dontInsertSemis == 0 {
983 s.insertSemi = insertSemi
984 }
985
986 return
987 }
988
View as plain text