Source file
src/go/scanner/scanner.go
1
2
3
4
5
6
7
8 package scanner
9
10 import (
11 "bytes"
12 "fmt"
13 "go/token"
14 "path/filepath"
15 "strconv"
16 "unicode"
17 "unicode/utf8"
18 )
19
20
21
22
23
// ErrorHandler is the callback through which a Scanner reports problems.
// Scanner.error invokes it with the resolved position of the offending
// offset and a message. A nil handler is allowed; errors are then only
// counted in Scanner.ErrorCount.
type ErrorHandler func(pos token.Position, msg string)
25
26
27
28
// Scanner holds the state needed to tokenize one source buffer.
// It must be set up with Init before Scan is called.
type Scanner struct {
	// Configuration, assigned by Init.
	file *token.File  // file handle used to resolve positions and record line starts
	dir  string       // directory part of file.Name(); base for relative line-directive filenames
	src  []byte       // source text being scanned
	err  ErrorHandler // error callback; may be nil
	mode Mode         // flags controlling scanning behavior

	// Scanning state.
	ch         rune      // current character; eof once the input is exhausted
	offset     int       // byte offset of ch in src
	rdOffset   int       // byte offset of the first byte after ch
	lineOffset int       // byte offset at which the current line starts
	insertSemi bool      // whether the next newline or EOF is reported as a semicolon
	nlPos      token.Pos // position of a newline inside a preceding comment that Scan still has to report as a semicolon

	// ErrorCount is incremented by one for every error encountered,
	// whether or not an ErrorHandler is installed.
	ErrorCount int
}
48
const (
	bom = 0xFEFF // byte order mark; accepted only as the very first character (see next and Init)
	eof = -1     // sentinel stored in Scanner.ch when the end of src is reached
)
53
54
55
56
57
58
59 func (s *Scanner) next() {
60 if s.rdOffset < len(s.src) {
61 s.offset = s.rdOffset
62 if s.ch == '\n' {
63 s.lineOffset = s.offset
64 s.file.AddLine(s.offset)
65 }
66 r, w := rune(s.src[s.rdOffset]), 1
67 switch {
68 case r == 0:
69 s.error(s.offset, "illegal character NUL")
70 case r >= utf8.RuneSelf:
71
72 r, w = utf8.DecodeRune(s.src[s.rdOffset:])
73 if r == utf8.RuneError && w == 1 {
74 in := s.src[s.rdOffset:]
75 if s.offset == 0 &&
76 len(in) >= 2 &&
77 (in[0] == 0xFF && in[1] == 0xFE || in[0] == 0xFE && in[1] == 0xFF) {
78
79
80 s.error(s.offset, "illegal UTF-8 encoding (got UTF-16)")
81 s.rdOffset += len(in)
82 } else {
83 s.error(s.offset, "illegal UTF-8 encoding")
84 }
85 } else if r == bom && s.offset > 0 {
86 s.error(s.offset, "illegal byte order mark")
87 }
88 }
89 s.rdOffset += w
90 s.ch = r
91 } else {
92 s.offset = len(s.src)
93 if s.ch == '\n' {
94 s.lineOffset = s.offset
95 s.file.AddLine(s.offset)
96 }
97 s.ch = eof
98 }
99 }
100
101
102
103 func (s *Scanner) peek() byte {
104 if s.rdOffset < len(s.src) {
105 return s.src[s.rdOffset]
106 }
107 return 0
108 }
109
110
111
// Mode is a set of bit flags, passed to Init, that control scanner behavior.
type Mode uint

const (
	// ScanComments makes Scan return comments as COMMENT tokens;
	// without it comments are skipped.
	ScanComments Mode = 1 << iota
	// dontInsertSemis stops Scan from updating insertSemi after a token.
	dontInsertSemis
)
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133 func (s *Scanner) Init(file *token.File, src []byte, err ErrorHandler, mode Mode) {
134
135 if file.Size() != len(src) {
136 panic(fmt.Sprintf("file size (%d) does not match src len (%d)", file.Size(), len(src)))
137 }
138 s.file = file
139 s.dir, _ = filepath.Split(file.Name())
140 s.src = src
141 s.err = err
142 s.mode = mode
143
144 s.ch = ' '
145 s.offset = 0
146 s.rdOffset = 0
147 s.lineOffset = 0
148 s.insertSemi = false
149 s.ErrorCount = 0
150
151 s.next()
152 if s.ch == bom {
153 s.next()
154 }
155 }
156
157 func (s *Scanner) error(offs int, msg string) {
158 if s.err != nil {
159 s.err(s.file.Position(s.file.Pos(offs)), msg)
160 }
161 s.ErrorCount++
162 }
163
164 func (s *Scanner) errorf(offs int, format string, args ...any) {
165 s.error(offs, fmt.Sprintf(format, args...))
166 }
167
168
169
170
171 func (s *Scanner) scanComment() (string, int) {
172
173 offs := s.offset - 1
174 next := -1
175 numCR := 0
176 nlOffset := 0
177
178 if s.ch == '/' {
179
180
181 s.next()
182 for s.ch != '\n' && s.ch >= 0 {
183 if s.ch == '\r' {
184 numCR++
185 }
186 s.next()
187 }
188
189 next = s.offset
190 if s.ch == '\n' {
191 next++
192 }
193 goto exit
194 }
195
196
197 s.next()
198 for s.ch >= 0 {
199 ch := s.ch
200 if ch == '\r' {
201 numCR++
202 } else if ch == '\n' && nlOffset == 0 {
203 nlOffset = s.offset
204 }
205 s.next()
206 if ch == '*' && s.ch == '/' {
207 s.next()
208 next = s.offset
209 goto exit
210 }
211 }
212
213 s.error(offs, "comment not terminated")
214
215 exit:
216 lit := s.src[offs:s.offset]
217
218
219
220
221
222
223 if numCR > 0 && len(lit) >= 2 && lit[1] == '/' && lit[len(lit)-1] == '\r' {
224 lit = lit[:len(lit)-1]
225 numCR--
226 }
227
228
229
230 if next >= 0 && (lit[1] == '*' || offs == s.lineOffset) && bytes.HasPrefix(lit[2:], prefix) {
231 s.updateLineInfo(next, offs, lit)
232 }
233
234 if numCR > 0 {
235 lit = stripCR(lit, lit[1] == '*')
236 }
237
238 return string(lit), nlOffset
239 }
240
// prefix is the text that must directly follow a comment opener ("//" or
// "/*") for scanComment to treat the comment as a line directive.
var prefix = []byte("line ")
242
243
244
245
246 func (s *Scanner) updateLineInfo(next, offs int, text []byte) {
247
248 if text[1] == '*' {
249 text = text[:len(text)-2]
250 }
251 text = text[7:]
252 offs += 7
253
254 i, n, ok := trailingDigits(text)
255 if i == 0 {
256 return
257 }
258
259
260 if !ok {
261
262 s.error(offs+i, "invalid line number: "+string(text[i:]))
263 return
264 }
265
266
267
268
269 const maxLineCol = 1 << 30
270 var line, col int
271 i2, n2, ok2 := trailingDigits(text[:i-1])
272 if ok2 {
273
274 i, i2 = i2, i
275 line, col = n2, n
276 if col == 0 || col > maxLineCol {
277 s.error(offs+i2, "invalid column number: "+string(text[i2:]))
278 return
279 }
280 text = text[:i2-1]
281 } else {
282
283 line = n
284 }
285
286 if line == 0 || line > maxLineCol {
287 s.error(offs+i, "invalid line number: "+string(text[i:]))
288 return
289 }
290
291
292
293 filename := string(text[:i-1])
294 if filename == "" && ok2 {
295 filename = s.file.Position(s.file.Pos(offs)).Filename
296 } else if filename != "" {
297
298
299
300 filename = filepath.Clean(filename)
301 if !filepath.IsAbs(filename) {
302 filename = filepath.Join(s.dir, filename)
303 }
304 }
305
306 s.file.AddLineColumnInfo(next, filename, line, col)
307 }
308
// trailingDigits looks for a ":N" suffix in text, where N is an unsigned
// decimal number. It returns the index at which the digits start (0 if text
// contains no ':'), the parsed value, and whether the suffix is a valid
// number that fits in a non-negative int. The value is meaningful only when
// the boolean is true.
func trailingDigits(text []byte) (int, int, bool) {
	// Search from the right: filenames may themselves contain ':'.
	i := bytes.LastIndexByte(text, ':')
	if i < 0 {
		return 0, 0, false // no ':'
	}
	// Parse with one bit less than the width of int. Values above the
	// largest int are then rejected as out of range instead of wrapping
	// around to a negative number when converted below.
	n, err := strconv.ParseUint(string(text[i+1:]), 10, strconv.IntSize-1)
	return i + 1, int(n), err == nil
}
318
319 func isLetter(ch rune) bool {
320 return 'a' <= lower(ch) && lower(ch) <= 'z' || ch == '_' || ch >= utf8.RuneSelf && unicode.IsLetter(ch)
321 }
322
323 func isDigit(ch rune) bool {
324 return isDecimal(ch) || ch >= utf8.RuneSelf && unicode.IsDigit(ch)
325 }
326
327
328
329
330
331
332 func (s *Scanner) scanIdentifier() string {
333 offs := s.offset
334
335
336
337
338
339
340
341
342 for rdOffset, b := range s.src[s.rdOffset:] {
343 if 'a' <= b && b <= 'z' || 'A' <= b && b <= 'Z' || b == '_' || '0' <= b && b <= '9' {
344
345 continue
346 }
347 s.rdOffset += rdOffset
348 if 0 < b && b < utf8.RuneSelf {
349
350
351
352
353
354
355 s.ch = rune(b)
356 s.offset = s.rdOffset
357 s.rdOffset++
358 goto exit
359 }
360
361
362
363 s.next()
364 for isLetter(s.ch) || isDigit(s.ch) {
365 s.next()
366 }
367 goto exit
368 }
369 s.offset = len(s.src)
370 s.rdOffset = len(s.src)
371 s.ch = eof
372
373 exit:
374 return string(s.src[offs:s.offset])
375 }
376
377 func digitVal(ch rune) int {
378 switch {
379 case '0' <= ch && ch <= '9':
380 return int(ch - '0')
381 case 'a' <= lower(ch) && lower(ch) <= 'f':
382 return int(lower(ch) - 'a' + 10)
383 }
384 return 16
385 }
386
// lower returns ch with the ASCII case bit set. For ASCII letters this is
// the lower-case form; other characters may be changed as well, so the
// result is only meaningful when compared against lower-case letters.
func lower(ch rune) rune {
	const caseBit = 'a' - 'A'
	return ch | caseBit
}

// isDecimal reports whether ch is an ASCII decimal digit.
func isDecimal(ch rune) bool { return ch >= '0' && ch <= '9' }

// isHex reports whether ch is an ASCII hexadecimal digit of either case.
func isHex(ch rune) bool {
	if isDecimal(ch) {
		return true
	}
	lc := lower(ch)
	return lc >= 'a' && lc <= 'f'
}
390
391
392
393
394
395
396
397 func (s *Scanner) digits(base int, invalid *int) (digsep int) {
398 if base <= 10 {
399 max := rune('0' + base)
400 for isDecimal(s.ch) || s.ch == '_' {
401 ds := 1
402 if s.ch == '_' {
403 ds = 2
404 } else if s.ch >= max && *invalid < 0 {
405 *invalid = s.offset
406 }
407 digsep |= ds
408 s.next()
409 }
410 } else {
411 for isHex(s.ch) || s.ch == '_' {
412 ds := 1
413 if s.ch == '_' {
414 ds = 2
415 }
416 digsep |= ds
417 s.next()
418 }
419 }
420 return
421 }
422
423 func (s *Scanner) scanNumber() (token.Token, string) {
424 offs := s.offset
425 tok := token.ILLEGAL
426
427 base := 10
428 prefix := rune(0)
429 digsep := 0
430 invalid := -1
431
432
433 if s.ch != '.' {
434 tok = token.INT
435 if s.ch == '0' {
436 s.next()
437 switch lower(s.ch) {
438 case 'x':
439 s.next()
440 base, prefix = 16, 'x'
441 case 'o':
442 s.next()
443 base, prefix = 8, 'o'
444 case 'b':
445 s.next()
446 base, prefix = 2, 'b'
447 default:
448 base, prefix = 8, '0'
449 digsep = 1
450 }
451 }
452 digsep |= s.digits(base, &invalid)
453 }
454
455
456 if s.ch == '.' {
457 tok = token.FLOAT
458 if prefix == 'o' || prefix == 'b' {
459 s.error(s.offset, "invalid radix point in "+litname(prefix))
460 }
461 s.next()
462 digsep |= s.digits(base, &invalid)
463 }
464
465 if digsep&1 == 0 {
466 s.error(s.offset, litname(prefix)+" has no digits")
467 }
468
469
470 if e := lower(s.ch); e == 'e' || e == 'p' {
471 switch {
472 case e == 'e' && prefix != 0 && prefix != '0':
473 s.errorf(s.offset, "%q exponent requires decimal mantissa", s.ch)
474 case e == 'p' && prefix != 'x':
475 s.errorf(s.offset, "%q exponent requires hexadecimal mantissa", s.ch)
476 }
477 s.next()
478 tok = token.FLOAT
479 if s.ch == '+' || s.ch == '-' {
480 s.next()
481 }
482 ds := s.digits(10, nil)
483 digsep |= ds
484 if ds&1 == 0 {
485 s.error(s.offset, "exponent has no digits")
486 }
487 } else if prefix == 'x' && tok == token.FLOAT {
488 s.error(s.offset, "hexadecimal mantissa requires a 'p' exponent")
489 }
490
491
492 if s.ch == 'i' {
493 tok = token.IMAG
494 s.next()
495 }
496
497 lit := string(s.src[offs:s.offset])
498 if tok == token.INT && invalid >= 0 {
499 s.errorf(invalid, "invalid digit %q in %s", lit[invalid-offs], litname(prefix))
500 }
501 if digsep&2 != 0 {
502 if i := invalidSep(lit); i >= 0 {
503 s.error(offs+i, "'_' must separate successive digits")
504 }
505 }
506
507 return tok, lit
508 }
509
// litname returns the descriptive name, used in error messages, of the
// kind of number literal identified by prefix. Any prefix other than 'x',
// 'o', '0' or 'b' denotes a decimal literal.
func litname(prefix rune) string {
	if prefix == 'x' {
		return "hexadecimal literal"
	}
	if prefix == 'o' || prefix == '0' {
		return "octal literal"
	}
	if prefix == 'b' {
		return "binary literal"
	}
	return "decimal literal"
}
521
522
523 func invalidSep(x string) int {
524 x1 := ' '
525 d := '.'
526 i := 0
527
528
529 if len(x) >= 2 && x[0] == '0' {
530 x1 = lower(rune(x[1]))
531 if x1 == 'x' || x1 == 'o' || x1 == 'b' {
532 d = '0'
533 i = 2
534 }
535 }
536
537
538 for ; i < len(x); i++ {
539 p := d
540 d = rune(x[i])
541 switch {
542 case d == '_':
543 if p != '0' {
544 return i
545 }
546 case isDecimal(d) || x1 == 'x' && isHex(d):
547 d = '0'
548 default:
549 if p == '_' {
550 return i - 1
551 }
552 d = '.'
553 }
554 }
555 if d == '_' {
556 return len(x) - 1
557 }
558
559 return -1
560 }
561
562
563
564
565
566 func (s *Scanner) scanEscape(quote rune) bool {
567 offs := s.offset
568
569 var n int
570 var base, max uint32
571 switch s.ch {
572 case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote:
573 s.next()
574 return true
575 case '0', '1', '2', '3', '4', '5', '6', '7':
576 n, base, max = 3, 8, 255
577 case 'x':
578 s.next()
579 n, base, max = 2, 16, 255
580 case 'u':
581 s.next()
582 n, base, max = 4, 16, unicode.MaxRune
583 case 'U':
584 s.next()
585 n, base, max = 8, 16, unicode.MaxRune
586 default:
587 msg := "unknown escape sequence"
588 if s.ch < 0 {
589 msg = "escape sequence not terminated"
590 }
591 s.error(offs, msg)
592 return false
593 }
594
595 var x uint32
596 for n > 0 {
597 d := uint32(digitVal(s.ch))
598 if d >= base {
599 msg := fmt.Sprintf("illegal character %#U in escape sequence", s.ch)
600 if s.ch < 0 {
601 msg = "escape sequence not terminated"
602 }
603 s.error(s.offset, msg)
604 return false
605 }
606 x = x*base + d
607 s.next()
608 n--
609 }
610
611 if x > max || 0xD800 <= x && x < 0xE000 {
612 s.error(offs, "escape sequence is invalid Unicode code point")
613 return false
614 }
615
616 return true
617 }
618
619 func (s *Scanner) scanRune() string {
620
621 offs := s.offset - 1
622
623 valid := true
624 n := 0
625 for {
626 ch := s.ch
627 if ch == '\n' || ch < 0 {
628
629 if valid {
630 s.error(offs, "rune literal not terminated")
631 valid = false
632 }
633 break
634 }
635 s.next()
636 if ch == '\'' {
637 break
638 }
639 n++
640 if ch == '\\' {
641 if !s.scanEscape('\'') {
642 valid = false
643 }
644
645 }
646 }
647
648 if valid && n != 1 {
649 s.error(offs, "illegal rune literal")
650 }
651
652 return string(s.src[offs:s.offset])
653 }
654
655 func (s *Scanner) scanString() string {
656
657 offs := s.offset - 1
658
659 for {
660 ch := s.ch
661 if ch == '\n' || ch < 0 {
662 s.error(offs, "string literal not terminated")
663 break
664 }
665 s.next()
666 if ch == '"' {
667 break
668 }
669 if ch == '\\' {
670 s.scanEscape('"')
671 }
672 }
673
674 return string(s.src[offs:s.offset])
675 }
676
// stripCR returns a copy of b with carriage returns removed. When comment
// is set, a '\r' that sits between a '*' and a '/' past the opening "/*"
// is kept, so that removing it cannot create a "*/" inside the comment.
func stripCR(b []byte, comment bool) []byte {
	out := make([]byte, 0, len(b))
	for idx, c := range b {
		if c == '\r' {
			keep := comment &&
				len(out) > len("/*") &&
				out[len(out)-1] == '*' &&
				idx+1 < len(b) && b[idx+1] == '/'
			if !keep {
				continue
			}
		}
		out = append(out, c)
	}
	return out
}
693
694 func (s *Scanner) scanRawString() string {
695
696 offs := s.offset - 1
697
698 hasCR := false
699 for {
700 ch := s.ch
701 if ch < 0 {
702 s.error(offs, "raw string literal not terminated")
703 break
704 }
705 s.next()
706 if ch == '`' {
707 break
708 }
709 if ch == '\r' {
710 hasCR = true
711 }
712 }
713
714 lit := s.src[offs:s.offset]
715 if hasCR {
716 lit = stripCR(lit, false)
717 }
718
719 return string(lit)
720 }
721
722 func (s *Scanner) skipWhitespace() {
723 for s.ch == ' ' || s.ch == '\t' || s.ch == '\n' && !s.insertSemi || s.ch == '\r' {
724 s.next()
725 }
726 }
727
728
729
730
731
732
733
734 func (s *Scanner) switch2(tok0, tok1 token.Token) token.Token {
735 if s.ch == '=' {
736 s.next()
737 return tok1
738 }
739 return tok0
740 }
741
742 func (s *Scanner) switch3(tok0, tok1 token.Token, ch2 rune, tok2 token.Token) token.Token {
743 if s.ch == '=' {
744 s.next()
745 return tok1
746 }
747 if s.ch == ch2 {
748 s.next()
749 return tok2
750 }
751 return tok0
752 }
753
754 func (s *Scanner) switch4(tok0, tok1 token.Token, ch2 rune, tok2, tok3 token.Token) token.Token {
755 if s.ch == '=' {
756 s.next()
757 return tok1
758 }
759 if s.ch == ch2 {
760 s.next()
761 if s.ch == '=' {
762 s.next()
763 return tok3
764 }
765 return tok2
766 }
767 return tok0
768 }
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800 func (s *Scanner) Scan() (pos token.Pos, tok token.Token, lit string) {
801 scanAgain:
802 if s.nlPos.IsValid() {
803
804
805 pos, tok, lit = s.nlPos, token.SEMICOLON, "\n"
806 s.nlPos = token.NoPos
807 return
808 }
809
810 s.skipWhitespace()
811
812
813 pos = s.file.Pos(s.offset)
814
815
816 insertSemi := false
817 switch ch := s.ch; {
818 case isLetter(ch):
819 lit = s.scanIdentifier()
820 if len(lit) > 1 {
821
822 tok = token.Lookup(lit)
823 switch tok {
824 case token.IDENT, token.BREAK, token.CONTINUE, token.FALLTHROUGH, token.RETURN:
825 insertSemi = true
826 }
827 } else {
828 insertSemi = true
829 tok = token.IDENT
830 }
831 case isDecimal(ch) || ch == '.' && isDecimal(rune(s.peek())):
832 insertSemi = true
833 tok, lit = s.scanNumber()
834 default:
835 s.next()
836 switch ch {
837 case eof:
838 if s.insertSemi {
839 s.insertSemi = false
840 return pos, token.SEMICOLON, "\n"
841 }
842 tok = token.EOF
843 case '\n':
844
845
846
847 s.insertSemi = false
848 return pos, token.SEMICOLON, "\n"
849 case '"':
850 insertSemi = true
851 tok = token.STRING
852 lit = s.scanString()
853 case '\'':
854 insertSemi = true
855 tok = token.CHAR
856 lit = s.scanRune()
857 case '`':
858 insertSemi = true
859 tok = token.STRING
860 lit = s.scanRawString()
861 case ':':
862 tok = s.switch2(token.COLON, token.DEFINE)
863 case '.':
864
865 tok = token.PERIOD
866 if s.ch == '.' && s.peek() == '.' {
867 s.next()
868 s.next()
869 tok = token.ELLIPSIS
870 }
871 case ',':
872 tok = token.COMMA
873 case ';':
874 tok = token.SEMICOLON
875 lit = ";"
876 case '(':
877 tok = token.LPAREN
878 case ')':
879 insertSemi = true
880 tok = token.RPAREN
881 case '[':
882 tok = token.LBRACK
883 case ']':
884 insertSemi = true
885 tok = token.RBRACK
886 case '{':
887 tok = token.LBRACE
888 case '}':
889 insertSemi = true
890 tok = token.RBRACE
891 case '+':
892 tok = s.switch3(token.ADD, token.ADD_ASSIGN, '+', token.INC)
893 if tok == token.INC {
894 insertSemi = true
895 }
896 case '-':
897 tok = s.switch3(token.SUB, token.SUB_ASSIGN, '-', token.DEC)
898 if tok == token.DEC {
899 insertSemi = true
900 }
901 case '*':
902 tok = s.switch2(token.MUL, token.MUL_ASSIGN)
903 case '/':
904 if s.ch == '/' || s.ch == '*' {
905
906 comment, nlOffset := s.scanComment()
907 if s.insertSemi && nlOffset != 0 {
908
909
910 s.nlPos = s.file.Pos(nlOffset)
911 s.insertSemi = false
912 } else {
913 insertSemi = s.insertSemi
914 }
915 if s.mode&ScanComments == 0 {
916
917 goto scanAgain
918 }
919 tok = token.COMMENT
920 lit = comment
921 } else {
922
923 tok = s.switch2(token.QUO, token.QUO_ASSIGN)
924 }
925 case '%':
926 tok = s.switch2(token.REM, token.REM_ASSIGN)
927 case '^':
928 tok = s.switch2(token.XOR, token.XOR_ASSIGN)
929 case '<':
930 if s.ch == '-' {
931 s.next()
932 tok = token.ARROW
933 } else {
934 tok = s.switch4(token.LSS, token.LEQ, '<', token.SHL, token.SHL_ASSIGN)
935 }
936 case '>':
937 tok = s.switch4(token.GTR, token.GEQ, '>', token.SHR, token.SHR_ASSIGN)
938 case '=':
939 tok = s.switch2(token.ASSIGN, token.EQL)
940 case '!':
941 tok = s.switch2(token.NOT, token.NEQ)
942 case '&':
943 if s.ch == '^' {
944 s.next()
945 tok = s.switch2(token.AND_NOT, token.AND_NOT_ASSIGN)
946 } else {
947 tok = s.switch3(token.AND, token.AND_ASSIGN, '&', token.LAND)
948 }
949 case '|':
950 tok = s.switch3(token.OR, token.OR_ASSIGN, '|', token.LOR)
951 case '~':
952 tok = token.TILDE
953 default:
954
955 if ch != bom {
956
957
958 if ch == '“' || ch == '”' {
959 s.errorf(s.file.Offset(pos), "curly quotation mark %q (use neutral %q)", ch, '"')
960 } else {
961 s.errorf(s.file.Offset(pos), "illegal character %#U", ch)
962 }
963 }
964 insertSemi = s.insertSemi
965 tok = token.ILLEGAL
966 lit = string(ch)
967 }
968 }
969 if s.mode&dontInsertSemis == 0 {
970 s.insertSemi = insertSemi
971 }
972
973 return
974 }
975
View as plain text