Source file
src/go/scanner/scanner.go
1
2
3
4
5
6
7
8 package scanner
9
10 import (
11 "bytes"
12 "fmt"
13 "go/token"
14 "path/filepath"
15 "strconv"
16 "unicode"
17 "unicode/utf8"
18 )
19
20
21
22
23
// An ErrorHandler is a callback that receives the position of an error in
// the source and a message describing it. A handler may be passed to
// Scanner.Init; if it is non-nil, it is invoked for every error the scanner
// reports.
type ErrorHandler func(pos token.Position, msg string)
25
26
27
28
29 type Scanner struct {
30
31 file *token.File
32 dir string
33 src []byte
34 err ErrorHandler
35 mode Mode
36
37
38 ch rune
39 offset int
40 rdOffset int
41 lineOffset int
42 insertSemi bool
43 nlPos token.Pos
44
45 endPosValid bool
46 endPos token.Pos
47
48
49 ErrorCount int
50 }
51
const (
	bom = 0xFEFF // byte order mark; Init skips it at the start of the file, next reports it anywhere else
	eof = -1     // value stored in Scanner.ch once the end of the source is reached
)
56
57
58
59
60
61
62 func (s *Scanner) next() {
63 if s.rdOffset < len(s.src) {
64 s.offset = s.rdOffset
65 if s.ch == '\n' {
66 s.lineOffset = s.offset
67 s.file.AddLine(s.offset)
68 }
69 r, w := rune(s.src[s.rdOffset]), 1
70 switch {
71 case r == 0:
72 s.error(s.offset, "illegal character NUL")
73 case r >= utf8.RuneSelf:
74
75 r, w = utf8.DecodeRune(s.src[s.rdOffset:])
76 if r == utf8.RuneError && w == 1 {
77 in := s.src[s.rdOffset:]
78 if s.offset == 0 &&
79 len(in) >= 2 &&
80 (in[0] == 0xFF && in[1] == 0xFE || in[0] == 0xFE && in[1] == 0xFF) {
81
82
83 s.error(s.offset, "illegal UTF-8 encoding (got UTF-16)")
84 s.rdOffset += len(in)
85 } else {
86 s.error(s.offset, "illegal UTF-8 encoding")
87 }
88 } else if r == bom && s.offset > 0 {
89 s.error(s.offset, "illegal byte order mark")
90 }
91 }
92 s.rdOffset += w
93 s.ch = r
94 } else {
95 s.offset = len(s.src)
96 if s.ch == '\n' {
97 s.lineOffset = s.offset
98 s.file.AddLine(s.offset)
99 }
100 s.ch = eof
101 }
102 }
103
104
105
106 func (s *Scanner) peek() byte {
107 if s.rdOffset < len(s.src) {
108 return s.src[s.rdOffset]
109 }
110 return 0
111 }
112
113
114
// A Mode value is a set of flags (bits) that control scanner behavior.
type Mode uint

const (
	ScanComments    Mode = 1 << iota // return comments as COMMENT tokens; without it Scan skips them
	dontInsertSemis                  // leave the automatic semicolon insertion state untouched after each token
)
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136 func (s *Scanner) Init(file *token.File, src []byte, err ErrorHandler, mode Mode) {
137
138 if file.Size() != len(src) {
139 panic(fmt.Sprintf("file size (%d) does not match src len (%d)", file.Size(), len(src)))
140 }
141
142 dir, _ := filepath.Split(file.Name())
143
144 *s = Scanner{
145 file: file,
146 dir: dir,
147 src: src,
148 err: err,
149 mode: mode,
150
151 ch: ' ',
152 endPosValid: true,
153 endPos: token.NoPos,
154 }
155
156 s.next()
157 if s.ch == bom {
158 s.next()
159 }
160 }
161
162 func (s *Scanner) error(offs int, msg string) {
163 if s.err != nil {
164 s.err(s.file.Position(s.file.Pos(offs)), msg)
165 }
166 s.ErrorCount++
167 }
168
169 func (s *Scanner) errorf(offs int, format string, args ...any) {
170 s.error(offs, fmt.Sprintf(format, args...))
171 }
172
173
174
175
176 func (s *Scanner) scanComment() (string, int) {
177
178 offs := s.offset - 1
179 next := -1
180 numCR := 0
181 nlOffset := 0
182
183 if s.ch == '/' {
184
185
186 s.next()
187 for s.ch != '\n' && s.ch >= 0 {
188 if s.ch == '\r' {
189 numCR++
190 }
191 s.next()
192 }
193
194 next = s.offset
195 if s.ch == '\n' {
196 next++
197 }
198 goto exit
199 }
200
201
202 s.next()
203 for s.ch >= 0 {
204 ch := s.ch
205 if ch == '\r' {
206 numCR++
207 } else if ch == '\n' && nlOffset == 0 {
208 nlOffset = s.offset
209 }
210 s.next()
211 if ch == '*' && s.ch == '/' {
212 s.next()
213 next = s.offset
214 goto exit
215 }
216 }
217
218 s.error(offs, "comment not terminated")
219
220 exit:
221 lit := s.src[offs:s.offset]
222
223
224
225
226
227
228 if numCR > 0 && len(lit) >= 2 && lit[1] == '/' && lit[len(lit)-1] == '\r' {
229 lit = lit[:len(lit)-1]
230 numCR--
231 }
232
233
234
235 if next >= 0 && (lit[1] == '*' || offs == s.lineOffset) && bytes.HasPrefix(lit[2:], prefix) {
236 s.updateLineInfo(next, offs, lit)
237 }
238
239 if numCR > 0 {
240 lit = stripCR(lit, lit[1] == '*')
241 }
242
243 return string(lit), nlOffset
244 }
245
// prefix is the text that must directly follow the opening "//" or "/*" of
// a comment for scanComment to treat the comment as a line directive.
var prefix = []byte("line ")
247
248
249
250
251 func (s *Scanner) updateLineInfo(next, offs int, text []byte) {
252
253 if text[1] == '*' {
254 text = text[:len(text)-2]
255 }
256 text = text[7:]
257 offs += 7
258
259 i, n, ok := trailingDigits(text)
260 if i == 0 {
261 return
262 }
263
264
265 if !ok {
266
267 s.error(offs+i, "invalid line number: "+string(text[i:]))
268 return
269 }
270
271
272
273
274 const maxLineCol = 1 << 30
275 var line, col int
276 i2, n2, ok2 := trailingDigits(text[:i-1])
277 if ok2 {
278
279 i, i2 = i2, i
280 line, col = n2, n
281 if col == 0 || col > maxLineCol {
282 s.error(offs+i2, "invalid column number: "+string(text[i2:]))
283 return
284 }
285 text = text[:i2-1]
286 } else {
287
288 line = n
289 }
290
291 if line == 0 || line > maxLineCol {
292 s.error(offs+i, "invalid line number: "+string(text[i:]))
293 return
294 }
295
296
297
298 filename := string(text[:i-1])
299 if filename == "" && ok2 {
300 filename = s.file.Position(s.file.Pos(offs)).Filename
301 } else if filename != "" {
302
303
304
305 filename = filepath.Clean(filename)
306 if !filepath.IsAbs(filename) {
307 filename = filepath.Join(s.dir, filename)
308 }
309 }
310
311 s.file.AddLineColumnInfo(next, filename, line, col)
312 }
313
// trailingDigits looks for the last ':' in text and parses what follows it
// as an unsigned decimal number.
//
// It returns the index just past that ':' (0 if text contains no ':'), the
// parsed number, and whether parsing succeeded. The number is meaningful
// only if the final result is true.
//
// A value that fits in a uint but not in an int is saturated to the largest
// int rather than converted directly: a plain conversion would wrap around
// to a negative number, which callers checking "n == 0 || n > limit" would
// wrongly accept.
func trailingDigits(text []byte) (int, int, bool) {
	// Search from the right; a filename may itself contain ':'.
	i := bytes.LastIndexByte(text, ':')
	if i < 0 {
		return 0, 0, false // no ':'
	}

	const maxInt = int(^uint(0) >> 1)
	n, err := strconv.ParseUint(string(text[i+1:]), 10, 0)
	if n > uint64(maxInt) {
		n = uint64(maxInt)
	}
	return i + 1, int(n), err == nil
}
323
// isLetter reports whether ch may appear in an identifier as a letter:
// an ASCII letter, an underscore, or a non-ASCII Unicode letter.
func isLetter(ch rune) bool {
	switch {
	case ch == '_':
		return true
	case ch >= utf8.RuneSelf:
		return unicode.IsLetter(ch)
	}
	// Setting the case bit folds 'A'-'Z' onto 'a'-'z'.
	folded := ch | ('a' - 'A')
	return 'a' <= folded && folded <= 'z'
}
327
// isDigit reports whether ch is a decimal digit: '0' through '9', or a
// non-ASCII character that unicode.IsDigit accepts.
func isDigit(ch rune) bool {
	if ch < utf8.RuneSelf {
		return '0' <= ch && ch <= '9'
	}
	return unicode.IsDigit(ch)
}
331
332
333
334
335
336
337 func (s *Scanner) scanIdentifier() string {
338 offs := s.offset
339
340
341
342
343
344
345
346
347 for rdOffset, b := range s.src[s.rdOffset:] {
348 if 'a' <= b && b <= 'z' || 'A' <= b && b <= 'Z' || b == '_' || '0' <= b && b <= '9' {
349
350 continue
351 }
352 s.rdOffset += rdOffset
353 if 0 < b && b < utf8.RuneSelf {
354
355
356
357
358
359
360 s.ch = rune(b)
361 s.offset = s.rdOffset
362 s.rdOffset++
363 goto exit
364 }
365
366
367
368 s.next()
369 for isLetter(s.ch) || isDigit(s.ch) {
370 s.next()
371 }
372 goto exit
373 }
374 s.offset = len(s.src)
375 s.rdOffset = len(s.src)
376 s.ch = eof
377
378 exit:
379 return string(s.src[offs:s.offset])
380 }
381
// digitVal returns the numeric value of the hexadecimal digit ch (0-15).
// For any other character it returns 16, which is larger than every valid
// digit value.
func digitVal(ch rune) int {
	if '0' <= ch && ch <= '9' {
		return int(ch - '0')
	}
	// Setting the case bit folds 'A'-'F' onto 'a'-'f'.
	if folded := ch | ('a' - 'A'); 'a' <= folded && folded <= 'f' {
		return int(folded - 'a' + 10)
	}
	return 16
}
391
// lower returns ch with the ASCII case bit set. For the letters 'A'-'Z'
// (and 'a'-'z') the result is the lower-case letter; for any other
// character the result is only useful for comparisons against lower-case
// letters.
func lower(ch rune) rune {
	const caseBit = 'a' - 'A'
	return ch | caseBit
}
// isDecimal reports whether ch is one of the ASCII digits '0' through '9'.
func isDecimal(ch rune) bool {
	return ch >= '0' && ch <= '9'
}
// isHex reports whether ch is an ASCII hexadecimal digit: '0'-'9', 'a'-'f'
// or 'A'-'F'.
func isHex(ch rune) bool {
	if ch >= '0' && ch <= '9' {
		return true
	}
	// Setting the case bit folds 'A'-'F' onto 'a'-'f'.
	folded := ch | ('a' - 'A')
	return folded >= 'a' && folded <= 'f'
}
395
396
397
398
399
400
401
402 func (s *Scanner) digits(base int, invalid *int) (digsep int) {
403 if base <= 10 {
404 max := rune('0' + base)
405 for isDecimal(s.ch) || s.ch == '_' {
406 ds := 1
407 if s.ch == '_' {
408 ds = 2
409 } else if s.ch >= max && *invalid < 0 {
410 *invalid = s.offset
411 }
412 digsep |= ds
413 s.next()
414 }
415 } else {
416 for isHex(s.ch) || s.ch == '_' {
417 ds := 1
418 if s.ch == '_' {
419 ds = 2
420 }
421 digsep |= ds
422 s.next()
423 }
424 }
425 return
426 }
427
428 func (s *Scanner) scanNumber() (token.Token, string) {
429 offs := s.offset
430 tok := token.ILLEGAL
431
432 base := 10
433 prefix := rune(0)
434 digsep := 0
435 invalid := -1
436
437
438 if s.ch != '.' {
439 tok = token.INT
440 if s.ch == '0' {
441 s.next()
442 switch lower(s.ch) {
443 case 'x':
444 s.next()
445 base, prefix = 16, 'x'
446 case 'o':
447 s.next()
448 base, prefix = 8, 'o'
449 case 'b':
450 s.next()
451 base, prefix = 2, 'b'
452 default:
453 base, prefix = 8, '0'
454 digsep = 1
455 }
456 }
457 digsep |= s.digits(base, &invalid)
458 }
459
460
461 if s.ch == '.' {
462 tok = token.FLOAT
463 if prefix == 'o' || prefix == 'b' {
464 s.error(s.offset, "invalid radix point in "+litname(prefix))
465 }
466 s.next()
467 digsep |= s.digits(base, &invalid)
468 }
469
470 if digsep&1 == 0 {
471 s.error(s.offset, litname(prefix)+" has no digits")
472 }
473
474
475 if e := lower(s.ch); e == 'e' || e == 'p' {
476 switch {
477 case e == 'e' && prefix != 0 && prefix != '0':
478 s.errorf(s.offset, "%q exponent requires decimal mantissa", s.ch)
479 case e == 'p' && prefix != 'x':
480 s.errorf(s.offset, "%q exponent requires hexadecimal mantissa", s.ch)
481 }
482 s.next()
483 tok = token.FLOAT
484 if s.ch == '+' || s.ch == '-' {
485 s.next()
486 }
487 ds := s.digits(10, nil)
488 digsep |= ds
489 if ds&1 == 0 {
490 s.error(s.offset, "exponent has no digits")
491 }
492 } else if prefix == 'x' && tok == token.FLOAT {
493 s.error(s.offset, "hexadecimal mantissa requires a 'p' exponent")
494 }
495
496
497 if s.ch == 'i' {
498 tok = token.IMAG
499 s.next()
500 }
501
502 lit := string(s.src[offs:s.offset])
503 if tok == token.INT && invalid >= 0 {
504 s.errorf(invalid, "invalid digit %q in %s", lit[invalid-offs], litname(prefix))
505 }
506 if digsep&2 != 0 {
507 if i := invalidSep(lit); i >= 0 {
508 s.error(offs+i, "'_' must separate successive digits")
509 }
510 }
511
512 return tok, lit
513 }
514
// litname returns the name, as used in error messages, of the kind of
// number literal that the given prefix character introduces. Any prefix
// other than 'x', 'o', '0' and 'b' denotes a decimal literal.
func litname(prefix rune) string {
	name := "decimal literal"
	switch prefix {
	case 'b':
		name = "binary literal"
	case 'o', '0':
		name = "octal literal"
	case 'x':
		name = "hexadecimal literal"
	}
	return name
}
526
527
528 func invalidSep(x string) int {
529 x1 := ' '
530 d := '.'
531 i := 0
532
533
534 if len(x) >= 2 && x[0] == '0' {
535 x1 = lower(rune(x[1]))
536 if x1 == 'x' || x1 == 'o' || x1 == 'b' {
537 d = '0'
538 i = 2
539 }
540 }
541
542
543 for ; i < len(x); i++ {
544 p := d
545 d = rune(x[i])
546 switch {
547 case d == '_':
548 if p != '0' {
549 return i
550 }
551 case isDecimal(d) || x1 == 'x' && isHex(d):
552 d = '0'
553 default:
554 if p == '_' {
555 return i - 1
556 }
557 d = '.'
558 }
559 }
560 if d == '_' {
561 return len(x) - 1
562 }
563
564 return -1
565 }
566
567
568
569
570
571 func (s *Scanner) scanEscape(quote rune) bool {
572 offs := s.offset
573
574 var n int
575 var base, max uint32
576 switch s.ch {
577 case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote:
578 s.next()
579 return true
580 case '0', '1', '2', '3', '4', '5', '6', '7':
581 n, base, max = 3, 8, 255
582 case 'x':
583 s.next()
584 n, base, max = 2, 16, 255
585 case 'u':
586 s.next()
587 n, base, max = 4, 16, unicode.MaxRune
588 case 'U':
589 s.next()
590 n, base, max = 8, 16, unicode.MaxRune
591 default:
592 msg := "unknown escape sequence"
593 if s.ch < 0 {
594 msg = "escape sequence not terminated"
595 }
596 s.error(offs, msg)
597 return false
598 }
599
600 var x uint32
601 for n > 0 {
602 d := uint32(digitVal(s.ch))
603 if d >= base {
604 msg := fmt.Sprintf("illegal character %#U in escape sequence", s.ch)
605 if s.ch < 0 {
606 msg = "escape sequence not terminated"
607 }
608 s.error(s.offset, msg)
609 return false
610 }
611 x = x*base + d
612 s.next()
613 n--
614 }
615
616 if x > max || 0xD800 <= x && x < 0xE000 {
617 s.error(offs, "escape sequence is invalid Unicode code point")
618 return false
619 }
620
621 return true
622 }
623
624 func (s *Scanner) scanRune() string {
625
626 offs := s.offset - 1
627
628 valid := true
629 n := 0
630 for {
631 ch := s.ch
632 if ch == '\n' || ch < 0 {
633
634 if valid {
635 s.error(offs, "rune literal not terminated")
636 valid = false
637 }
638 break
639 }
640 s.next()
641 if ch == '\'' {
642 break
643 }
644 n++
645 if ch == '\\' {
646 if !s.scanEscape('\'') {
647 valid = false
648 }
649
650 }
651 }
652
653 if valid && n != 1 {
654 s.error(offs, "illegal rune literal")
655 }
656
657 return string(s.src[offs:s.offset])
658 }
659
660 func (s *Scanner) scanString() string {
661
662 offs := s.offset - 1
663
664 for {
665 ch := s.ch
666 if ch == '\n' || ch < 0 {
667 s.error(offs, "string literal not terminated")
668 break
669 }
670 s.next()
671 if ch == '"' {
672 break
673 }
674 if ch == '\\' {
675 s.scanEscape('"')
676 }
677 }
678
679 return string(s.src[offs:s.offset])
680 }
681
// stripCR returns a copy of b with carriage returns removed. If comment is
// set, b is taken to be a /*-style comment, and a '\r' that sits between a
// '*' and a '/' beyond the opening "/*" is kept, because removing it would
// join the two characters into a "*/".
func stripCR(b []byte, comment bool) []byte {
	out := make([]byte, 0, len(b))
	for j, ch := range b {
		if ch == '\r' {
			keep := comment &&
				len(out) > len("/*") && out[len(out)-1] == '*' &&
				j+1 < len(b) && b[j+1] == '/'
			if !keep {
				continue
			}
		}
		out = append(out, ch)
	}
	return out
}
698
699 func (s *Scanner) scanRawString() string {
700
701 offs := s.offset - 1
702
703 hasCR := false
704 for {
705 ch := s.ch
706 if ch < 0 {
707 s.error(offs, "raw string literal not terminated")
708 break
709 }
710 s.next()
711 if ch == '`' {
712 break
713 }
714 if ch == '\r' {
715 hasCR = true
716 }
717 }
718
719 lit := s.src[offs:s.offset]
720 if hasCR {
721 lit = stripCR(lit, false)
722 }
723
724 return string(lit)
725 }
726
727 func (s *Scanner) skipWhitespace() {
728 for s.ch == ' ' || s.ch == '\t' || s.ch == '\n' && !s.insertSemi || s.ch == '\r' {
729 s.next()
730 }
731 }
732
733
734
735
736
737
738
739 func (s *Scanner) switch2(tok0, tok1 token.Token) token.Token {
740 if s.ch == '=' {
741 s.next()
742 return tok1
743 }
744 return tok0
745 }
746
747 func (s *Scanner) switch3(tok0, tok1 token.Token, ch2 rune, tok2 token.Token) token.Token {
748 if s.ch == '=' {
749 s.next()
750 return tok1
751 }
752 if s.ch == ch2 {
753 s.next()
754 return tok2
755 }
756 return tok0
757 }
758
759 func (s *Scanner) switch4(tok0, tok1 token.Token, ch2 rune, tok2, tok3 token.Token) token.Token {
760 if s.ch == '=' {
761 s.next()
762 return tok1
763 }
764 if s.ch == ch2 {
765 s.next()
766 if s.ch == '=' {
767 s.next()
768 return tok3
769 }
770 return tok2
771 }
772 return tok0
773 }
774
775
776
777 func (s *Scanner) End() token.Pos {
778
779
780
781
782 if s.endPosValid {
783 return s.endPos
784 }
785
786
787 return s.file.Pos(s.offset)
788 }
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
// Scan scans the next token and returns its position pos, the token tok and
// its literal text lit. The end of the source is indicated by token.EOF.
//
// lit is set for identifiers, keywords, number, character and string
// literals, and comments. For token.SEMICOLON, lit is ";" if the semicolon
// is present in the source and "\n" if it was inserted because of a newline,
// a comment containing a newline, or the end of the source. For
// token.ILLEGAL, lit is the offending character. For all other tokens lit is
// empty.
//
// Comments are returned as token.COMMENT only if the ScanComments mode flag
// is set; otherwise they are skipped.
//
// Errors are reported through the error handler and counted in ErrorCount;
// Scan itself always returns a token.
func (s *Scanner) Scan() (pos token.Pos, tok token.Token, lit string) {
scanAgain:
	// From here on End derives its result from s.offset, unless the
	// artificial semicolon below says otherwise.
	s.endPosValid = false
	if s.nlPos.IsValid() {
		// A preceding /*...*/ comment contained a newline while a semicolon
		// was due: return that semicolon now, at the position of the
		// comment's first newline.
		pos, tok, lit = s.nlPos, token.SEMICOLON, "\n"
		s.endPos = pos + 1
		s.endPosValid = true
		s.nlPos = token.NoPos
		return
	}

	s.skipWhitespace()

	// start of the current token
	pos = s.file.Pos(s.offset)

	// Determine the token. The local insertSemi records whether a newline
	// after this token has to be turned into a semicolon; it is copied to
	// s.insertSemi at the very end.
	insertSemi := false
	switch ch := s.ch; {
	case isLetter(ch):
		lit = s.scanIdentifier()
		if len(lit) > 1 {
			// Only text longer than one character is looked up as a keyword.
			tok = token.Lookup(lit)
			switch tok {
			case token.IDENT, token.BREAK, token.CONTINUE, token.FALLTHROUGH, token.RETURN:
				insertSemi = true
			}
		} else {
			insertSemi = true
			tok = token.IDENT
		}
	case isDecimal(ch) || ch == '.' && isDecimal(rune(s.peek())):
		insertSemi = true
		tok, lit = s.scanNumber()
	default:
		s.next() // consume ch first, so that every path makes progress
		switch ch {
		case eof:
			if s.insertSemi {
				s.insertSemi = false // the end of the source has been accounted for
				return pos, token.SEMICOLON, "\n"
			}
			tok = token.EOF
		case '\n':
			// skipWhitespace stops at a newline only while s.insertSemi is
			// set, so this newline stands for a semicolon.
			s.insertSemi = false // the newline has been accounted for
			return pos, token.SEMICOLON, "\n"
		case '"':
			insertSemi = true
			tok = token.STRING
			lit = s.scanString()
		case '\'':
			insertSemi = true
			tok = token.CHAR
			lit = s.scanRune()
		case '`':
			insertSemi = true
			tok = token.STRING
			lit = s.scanRawString()
		case ':':
			tok = s.switch2(token.COLON, token.DEFINE)
		case '.':
			// A '.' that starts a number was handled above.
			tok = token.PERIOD
			if s.ch == '.' && s.peek() == '.' {
				s.next()
				s.next() // consume the last '.'
				tok = token.ELLIPSIS
			}
		case ',':
			tok = token.COMMA
		case ';':
			tok = token.SEMICOLON
			lit = ";"
		case '(':
			tok = token.LPAREN
		case ')':
			insertSemi = true
			tok = token.RPAREN
		case '[':
			tok = token.LBRACK
		case ']':
			insertSemi = true
			tok = token.RBRACK
		case '{':
			tok = token.LBRACE
		case '}':
			insertSemi = true
			tok = token.RBRACE
		case '+':
			tok = s.switch3(token.ADD, token.ADD_ASSIGN, '+', token.INC)
			if tok == token.INC {
				insertSemi = true
			}
		case '-':
			tok = s.switch3(token.SUB, token.SUB_ASSIGN, '-', token.DEC)
			if tok == token.DEC {
				insertSemi = true
			}
		case '*':
			tok = s.switch2(token.MUL, token.MUL_ASSIGN)
		case '/':
			if s.ch == '/' || s.ch == '*' {
				// comment
				comment, nlOffset := s.scanComment()
				if s.insertSemi && nlOffset != 0 {
					// A /*...*/ comment containing a newline ends the line:
					// schedule an artificial SEMICOLON at the newline, to be
					// returned by the next pass through scanAgain.
					s.nlPos = s.file.Pos(nlOffset)
					s.insertSemi = false
				} else {
					insertSemi = s.insertSemi // a comment does not change the semicolon state
				}
				if s.mode&ScanComments == 0 {
					// Comments are not wanted: skip this one. s.insertSemi
					// was not overwritten, so the state carries over.
					goto scanAgain
				}
				tok = token.COMMENT
				lit = comment
			} else {
				// division
				tok = s.switch2(token.QUO, token.QUO_ASSIGN)
			}
		case '%':
			tok = s.switch2(token.REM, token.REM_ASSIGN)
		case '^':
			tok = s.switch2(token.XOR, token.XOR_ASSIGN)
		case '<':
			if s.ch == '-' {
				s.next()
				tok = token.ARROW
			} else {
				tok = s.switch4(token.LSS, token.LEQ, '<', token.SHL, token.SHL_ASSIGN)
			}
		case '>':
			tok = s.switch4(token.GTR, token.GEQ, '>', token.SHR, token.SHR_ASSIGN)
		case '=':
			tok = s.switch2(token.ASSIGN, token.EQL)
		case '!':
			tok = s.switch2(token.NOT, token.NEQ)
		case '&':
			if s.ch == '^' {
				s.next()
				tok = s.switch2(token.AND_NOT, token.AND_NOT_ASSIGN)
			} else {
				tok = s.switch3(token.AND, token.AND_ASSIGN, '&', token.LAND)
			}
		case '|':
			tok = s.switch3(token.OR, token.OR_ASSIGN, '|', token.LOR)
		case '~':
			tok = token.TILDE
		default:
			// A misplaced byte order mark has already been reported by next.
			if ch != bom {
				// Curly quotation marks get a more helpful message than
				// other illegal characters.
				if ch == '“' || ch == '”' {
					s.errorf(s.file.Offset(pos), "curly quotation mark %q (use neutral %q)", ch, '"')
				} else {
					s.errorf(s.file.Offset(pos), "illegal character %#U", ch)
				}
			}
			insertSemi = s.insertSemi // an illegal character does not change the semicolon state
			tok = token.ILLEGAL
			lit = string(ch)
		}
	}
	if s.mode&dontInsertSemis == 0 {
		s.insertSemi = insertSemi
	}

	return
}
1000
View as plain text