Source file src/net/url/gen_encoding_table.go

     1  // Copyright 2025 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:build ignore
     6  
     7  package main
     8  
     9  import (
    10  	"bytes"
    11  	_ "embed"
    12  	"fmt"
    13  	"go/format"
    14  	"io"
    15  	"log"
    16  	"maps"
    17  	"os"
    18  	"slices"
    19  	"strconv"
    20  	"strings"
    21  )
    22  
// We embed this source file in the resulting code-generation program in order
// to extract the definitions of the encoding type and constants from it and
// include them in the generated file.
//
//go:embed gen_encoding_table.go
var genSource string

// filename is the name of the file that main writes the generated code to.
// It is a relative path, so the file is created in the current working
// directory.
const filename = "encoding_table.go"
    31  
    32  func main() {
    33  	var out bytes.Buffer
    34  	fmt.Fprintln(&out, "// Code generated from gen_encoding_table.go using 'go generate'; DO NOT EDIT.")
    35  	fmt.Fprintln(&out)
    36  	fmt.Fprintln(&out, "// Copyright 2025 The Go Authors. All rights reserved.")
    37  	fmt.Fprintln(&out, "// Use of this source code is governed by a BSD-style")
    38  	fmt.Fprintln(&out, "// license that can be found in the LICENSE file.")
    39  	fmt.Fprintln(&out)
    40  	fmt.Fprintln(&out, "package url")
    41  	fmt.Fprintln(&out)
    42  	generateEnc(&out, genSource)
    43  	generateTable(&out)
    44  
    45  	formatted, err := format.Source(out.Bytes())
    46  	if err != nil {
    47  		log.Fatal("format:", err)
    48  	}
    49  
    50  	err = os.WriteFile(filename, formatted, 0644)
    51  	if err != nil {
    52  		log.Fatal("WriteFile:", err)
    53  	}
    54  }
    55  
    56  func generateEnc(w io.Writer, src string) {
    57  	var writeLine bool
    58  	for line := range strings.Lines(src) {
    59  		if strings.HasPrefix(line, "// START encoding") {
    60  			writeLine = true
    61  			continue
    62  		}
    63  		if strings.HasPrefix(line, "// END encoding") {
    64  			return
    65  		}
    66  		if writeLine {
    67  			fmt.Fprint(w, line)
    68  		}
    69  	}
    70  }
    71  
    72  func generateTable(w io.Writer) {
    73  	fmt.Fprintln(w, "var table = [256]encoding{")
    74  
    75  	// Sort the encodings (in decreasing order) to guarantee a stable output.
    76  	sortedEncs := slices.Sorted(maps.Keys(encNames))
    77  	slices.Reverse(sortedEncs)
    78  
    79  	for i := range 256 {
    80  		c := byte(i)
    81  		var lineBuf bytes.Buffer
    82  
    83  		// Write key to line buffer.
    84  		lineBuf.WriteString(strconv.QuoteRune(rune(c)))
    85  
    86  		lineBuf.WriteByte(':')
    87  
    88  		// Write value to line buffer.
    89  		blankVal := true
    90  		if ishex(c) {
    91  			// Set the hexChar bit if this char is hexadecimal.
    92  			lineBuf.WriteString("hexChar")
    93  			blankVal = false
    94  		}
    95  		for _, enc := range sortedEncs {
    96  			if !shouldEscape(c, enc) {
    97  				if !blankVal {
    98  					lineBuf.WriteByte('|')
    99  				}
   100  				// Set this encoding mode's bit if this char should NOT be
   101  				// escaped.
   102  				name := encNames[enc]
   103  				lineBuf.WriteString(name)
   104  				blankVal = false
   105  			}
   106  		}
   107  
   108  		if !blankVal {
   109  			lineBuf.WriteString(",\n")
   110  			w.Write(lineBuf.Bytes())
   111  		}
   112  	}
   113  	fmt.Fprintln(w, "}")
   114  }
   115  
// Everything between the two marker comments below is copied verbatim,
// comments included, into the generated file by generateEnc. Any edit made
// inside that region therefore also changes the generated output.

// START encoding (keep this marker comment in sync with generateEnc)
type encoding uint8

const (
	encodePath encoding = 1 << iota
	encodePathSegment
	encodeHost
	encodeZone
	encodeUserPassword
	encodeQueryComponent
	encodeFragment

	// hexChar is actually NOT an encoding mode, but there are only seven
	// encoding modes. We might as well abuse the otherwise unused most
	// significant bit in uint8 to indicate whether a character is
	// hexadecimal.
	hexChar
)

// END encoding (keep this marker comment in sync with generateEnc)
   136  
// encNames maps each encoding mode constant to the identifier that
// generateTable writes into the generated table for that mode. hexChar has no
// entry here because generateTable emits it separately, based on ishex.
//
// Keep this in sync with the definitions of encoding mode constants.
var encNames = map[encoding]string{
	encodePath:           "encodePath",
	encodePathSegment:    "encodePathSegment",
	encodeHost:           "encodeHost",
	encodeZone:           "encodeZone",
	encodeUserPassword:   "encodeUserPassword",
	encodeQueryComponent: "encodeQueryComponent",
	encodeFragment:       "encodeFragment",
}
   147  
// shouldEscape reports whether the specified character should be escaped when
// appearing in a URL string, according to RFC 3986. The answer depends on
// mode, which selects the part of the URL the character appears in.
//
// The checks below are order-dependent: each one returns as soon as it can
// decide, and anything that reaches the end of the function is escaped.
//
// Please be informed that for now shouldEscape does not check all
// reserved characters correctly. See golang.org/issue/5684.
func shouldEscape(c byte, mode encoding) bool {
	// §2.3 Unreserved characters (alphanum)
	if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9' {
		return false
	}

	if mode == encodeHost || mode == encodeZone {
		// §3.2.2 Host allows
		//	sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
		// as part of reg-name.
		// We add : because we include :port as part of host.
		// We add [ ] because we include [ipv6]:port as part of host.
		// We add < > because they're the only characters left that
		// we could possibly allow, and Parse will reject them if we
		// escape them (because hosts can't use %-encoding for
		// ASCII bytes).
		//
		// NOTE: the case list below also contains '"', which the
		// explanation above does not mention.
		switch c {
		case '!', '$', '&', '\'', '(', ')', '*', '+', ',', ';', '=', ':', '[', ']', '<', '>', '"':
			return false
		}
	}

	switch c {
	case '-', '_', '.', '~': // §2.3 Unreserved characters (mark)
		return false

	case '$', '&', '+', ',', '/', ':', ';', '=', '?', '@': // §2.2 Reserved characters (reserved)
		// Different sections of the URL allow a few of
		// the reserved characters to appear unescaped.
		switch mode {
		case encodePath: // §3.3
			// The RFC allows : @ & = + $ but saves / ; , for assigning
			// meaning to individual path segments. This package
			// only manipulates the path as a whole, so we allow those
			// last three as well. That leaves only ? to escape.
			return c == '?'

		case encodePathSegment: // §3.3
			// The RFC allows : @ & = + $ but saves / ; , for assigning
			// meaning to individual path segments.
			return c == '/' || c == ';' || c == ',' || c == '?'

		case encodeUserPassword: // §3.2.1
			// The RFC allows ';', ':', '&', '=', '+', '$', and ',' in
			// userinfo, so we must escape only '@', '/', and '?'.
			// The parsing of userinfo treats ':' as special so we must escape
			// that too.
			return c == '@' || c == '/' || c == '?' || c == ':'

		case encodeQueryComponent: // §3.4
			// The RFC reserves (so we must escape) everything.
			return true

		case encodeFragment: // §4.1
			// The RFC text is silent but the grammar allows
			// everything, so escape nothing.
			return false
		}
		// encodeHost and encodeZone have no case above. A reserved
		// character that the host check did not already accept (that is,
		// '/', '?' or '@') leaves both switches and is escaped by the
		// final return.
	}

	if mode == encodeFragment {
		// RFC 3986 §2.2 allows not escaping sub-delims. A subset of sub-delims are
		// included in reserved from RFC 2396 §2.2. The remaining sub-delims do not
		// need to be escaped. To minimize potential breakage, we apply two restrictions:
		// (1) we always escape sub-delims outside of the fragment, and (2) we always
		// escape single quote to avoid breaking callers that had previously assumed that
		// single quotes would be escaped. See issue #19917.
		switch c {
		case '!', '(', ')', '*':
			return false
		}
	}

	// Everything else must be escaped.
	return true
}
   229  
   230  func ishex(c byte) bool {
   231  	return '0' <= c && c <= '9' ||
   232  		'a' <= c && c <= 'f' ||
   233  		'A' <= c && c <= 'F'
   234  }
   235  

View as plain text