// Copyright 2025 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. //go:build ignore package main import ( "bytes" _ "embed" "fmt" "go/format" "io" "log" "maps" "os" "slices" "strconv" "strings" ) // We embed this source file in the resulting code-generation program in order // to extract the definitions of the encoding type and constants from it and // include them in the generated file. // //go:embed gen_encoding_table.go var genSource string const filename = "encoding_table.go" func main() { var out bytes.Buffer fmt.Fprintln(&out, "// Code generated from gen_encoding_table.go using 'go generate'; DO NOT EDIT.") fmt.Fprintln(&out) fmt.Fprintln(&out, "// Copyright 2025 The Go Authors. All rights reserved.") fmt.Fprintln(&out, "// Use of this source code is governed by a BSD-style") fmt.Fprintln(&out, "// license that can be found in the LICENSE file.") fmt.Fprintln(&out) fmt.Fprintln(&out, "package url") fmt.Fprintln(&out) generateEnc(&out, genSource) generateTable(&out) formatted, err := format.Source(out.Bytes()) if err != nil { log.Fatal("format:", err) } err = os.WriteFile(filename, formatted, 0644) if err != nil { log.Fatal("WriteFile:", err) } } func generateEnc(w io.Writer, src string) { var writeLine bool for line := range strings.Lines(src) { if strings.HasPrefix(line, "// START encoding") { writeLine = true continue } if strings.HasPrefix(line, "// END encoding") { return } if writeLine { fmt.Fprint(w, line) } } } func generateTable(w io.Writer) { fmt.Fprintln(w, "var table = [256]encoding{") // Sort the encodings (in decreasing order) to guarantee a stable output. sortedEncs := slices.Sorted(maps.Keys(encNames)) slices.Reverse(sortedEncs) for i := range 256 { c := byte(i) var lineBuf bytes.Buffer // Write key to line buffer. lineBuf.WriteString(strconv.QuoteRune(rune(c))) lineBuf.WriteByte(':') // Write value to line buffer. blankVal := true if ishex(c) { // Set the hexChar bit if this char is hexadecimal. lineBuf.WriteString("hexChar") blankVal = false } for _, enc := range sortedEncs { if !shouldEscape(c, enc) { if !blankVal { lineBuf.WriteByte('|') } // Set this encoding mode's bit if this char should NOT be // escaped. name := encNames[enc] lineBuf.WriteString(name) blankVal = false } } if !blankVal { lineBuf.WriteString(",\n") w.Write(lineBuf.Bytes()) } } fmt.Fprintln(w, "}") } // START encoding (keep this marker comment in sync with genEnc) type encoding uint8 const ( encodePath encoding = 1 << iota encodePathSegment encodeHost encodeZone encodeUserPassword encodeQueryComponent encodeFragment // hexChar is actually NOT an encoding mode, but there are only seven // encoding modes. We might as well abuse the otherwise unused most // significant bit in uint8 to indicate whether a character is // hexadecimal. hexChar ) // END encoding (keep this marker comment in sync with genEnc) // Keep this in sync with the definitions of encoding mode constants. var encNames = map[encoding]string{ encodePath: "encodePath", encodePathSegment: "encodePathSegment", encodeHost: "encodeHost", encodeZone: "encodeZone", encodeUserPassword: "encodeUserPassword", encodeQueryComponent: "encodeQueryComponent", encodeFragment: "encodeFragment", } // Return true if the specified character should be escaped when // appearing in a URL string, according to RFC 3986. // // Please be informed that for now shouldEscape does not check all // reserved characters correctly. See golang.org/issue/5684. func shouldEscape(c byte, mode encoding) bool { // §2.3 Unreserved characters (alphanum) if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9' { return false } if mode == encodeHost || mode == encodeZone { // §3.2.2 Host allows // sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "=" // as part of reg-name. // We add : because we include :port as part of host. // We add [ ] because we include [ipv6]:port as part of host. // We add < > because they're the only characters left that // we could possibly allow, and Parse will reject them if we // escape them (because hosts can't use %-encoding for // ASCII bytes). switch c { case '!', '$', '&', '\'', '(', ')', '*', '+', ',', ';', '=', ':', '[', ']', '<', '>', '"': return false } } switch c { case '-', '_', '.', '~': // §2.3 Unreserved characters (mark) return false case '$', '&', '+', ',', '/', ':', ';', '=', '?', '@': // §2.2 Reserved characters (reserved) // Different sections of the URL allow a few of // the reserved characters to appear unescaped. switch mode { case encodePath: // §3.3 // The RFC allows : @ & = + $ but saves / ; , for assigning // meaning to individual path segments. This package // only manipulates the path as a whole, so we allow those // last three as well. That leaves only ? to escape. return c == '?' case encodePathSegment: // §3.3 // The RFC allows : @ & = + $ but saves / ; , for assigning // meaning to individual path segments. return c == '/' || c == ';' || c == ',' || c == '?' case encodeUserPassword: // §3.2.1 // The RFC allows ';', ':', '&', '=', '+', '$', and ',' in // userinfo, so we must escape only '@', '/', and '?'. // The parsing of userinfo treats ':' as special so we must escape // that too. return c == '@' || c == '/' || c == '?' || c == ':' case encodeQueryComponent: // §3.4 // The RFC reserves (so we must escape) everything. return true case encodeFragment: // §4.1 // The RFC text is silent but the grammar allows // everything, so escape nothing. return false } } if mode == encodeFragment { // RFC 3986 §2.2 allows not escaping sub-delims. A subset of sub-delims are // included in reserved from RFC 2396 §2.2. The remaining sub-delims do not // need to be escaped. To minimize potential breakage, we apply two restrictions: // (1) we always escape sub-delims outside of the fragment, and (2) we always // escape single quote to avoid breaking callers that had previously assumed that // single quotes would be escaped. See issue #19917. switch c { case '!', '(', ')', '*': return false } } // Everything else must be escaped. return true } func ishex(c byte) bool { return '0' <= c && c <= '9' || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' }