|
|
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build ignore
package main
// This program generates table.go and table_test.go based on the authoritative
// public suffix list at https://publicsuffix.org/list/effective_tld_names.dat
//
// The version is derived from
// https://api.github.com/repos/publicsuffix/list/commits?path=public_suffix_list.dat
// and a human-readable form is at
// https://github.com/publicsuffix/list/commits/master/public_suffix_list.dat
//
// To fetch a particular git revision, such as 5c70ccd250, pass
// -url "https://raw.githubusercontent.com/publicsuffix/list/5c70ccd250/public_suffix_list.dat"
// and -version "an explicit version string".
import ( "bufio" "bytes" "flag" "fmt" "go/format" "io" "io/ioutil" "net/http" "os" "regexp" "sort" "strings"
"golang.org/x/net/idna" )
const ( // These sum of these four values must be no greater than 32.
nodesBitsChildren = 10 nodesBitsICANN = 1 nodesBitsTextOffset = 15 nodesBitsTextLength = 6
// These sum of these four values must be no greater than 32.
childrenBitsWildcard = 1 childrenBitsNodeType = 2 childrenBitsHi = 14 childrenBitsLo = 14 )
var ( maxChildren int maxTextOffset int maxTextLength int maxHi uint32 maxLo uint32 )
func max(a, b int) int { if a < b { return b } return a }
func u32max(a, b uint32) uint32 { if a < b { return b } return a }
const ( nodeTypeNormal = 0 nodeTypeException = 1 nodeTypeParentOnly = 2 numNodeType = 3 )
func nodeTypeStr(n int) string { switch n { case nodeTypeNormal: return "+" case nodeTypeException: return "!" case nodeTypeParentOnly: return "o" } panic("unreachable") }
const ( defaultURL = "https://publicsuffix.org/list/effective_tld_names.dat" gitCommitURL = "https://api.github.com/repos/publicsuffix/list/commits?path=public_suffix_list.dat" )
var ( labelEncoding = map[string]uint32{} labelsList = []string{} labelsMap = map[string]bool{} rules = []string{}
// validSuffixRE is used to check that the entries in the public suffix
// list are in canonical form (after Punycode encoding). Specifically,
// capital letters are not allowed.
validSuffixRE = regexp.MustCompile(`^[a-z0-9_\!\*\-\.]+$`)
shaRE = regexp.MustCompile(`"sha":"([^"]+)"`) dateRE = regexp.MustCompile(`"committer":{[^{]+"date":"([^"]+)"`)
comments = flag.Bool("comments", false, "generate table.go comments, for debugging") subset = flag.Bool("subset", false, "generate only a subset of the full table, for debugging") url = flag.String("url", defaultURL, "URL of the publicsuffix.org list. If empty, stdin is read instead") v = flag.Bool("v", false, "verbose output (to stderr)") version = flag.String("version", "", "the effective_tld_names.dat version") )
func main() { if err := main1(); err != nil { fmt.Fprintln(os.Stderr, err) os.Exit(1) } }
func main1() error { flag.Parse() if nodesBitsTextLength+nodesBitsTextOffset+nodesBitsICANN+nodesBitsChildren > 32 { return fmt.Errorf("not enough bits to encode the nodes table") } if childrenBitsLo+childrenBitsHi+childrenBitsNodeType+childrenBitsWildcard > 32 { return fmt.Errorf("not enough bits to encode the children table") } if *version == "" { if *url != defaultURL { return fmt.Errorf("-version was not specified, and the -url is not the default one") } sha, date, err := gitCommit() if err != nil { return err } *version = fmt.Sprintf("publicsuffix.org's public_suffix_list.dat, git revision %s (%s)", sha, date) } var r io.Reader = os.Stdin if *url != "" { res, err := http.Get(*url) if err != nil { return err } if res.StatusCode != http.StatusOK { return fmt.Errorf("bad GET status for %s: %d", *url, res.Status) } r = res.Body defer res.Body.Close() }
var root node icann := false br := bufio.NewReader(r) for { s, err := br.ReadString('\n') if err != nil { if err == io.EOF { break } return err } s = strings.TrimSpace(s) if strings.Contains(s, "BEGIN ICANN DOMAINS") { icann = true continue } if strings.Contains(s, "END ICANN DOMAINS") { icann = false continue } if s == "" || strings.HasPrefix(s, "//") { continue } s, err = idna.ToASCII(s) if err != nil { return err } if !validSuffixRE.MatchString(s) { return fmt.Errorf("bad publicsuffix.org list data: %q", s) }
if *subset { switch { case s == "ac.jp" || strings.HasSuffix(s, ".ac.jp"): case s == "ak.us" || strings.HasSuffix(s, ".ak.us"): case s == "ao" || strings.HasSuffix(s, ".ao"): case s == "ar" || strings.HasSuffix(s, ".ar"): case s == "arpa" || strings.HasSuffix(s, ".arpa"): case s == "cy" || strings.HasSuffix(s, ".cy"): case s == "dyndns.org" || strings.HasSuffix(s, ".dyndns.org"): case s == "jp": case s == "kobe.jp" || strings.HasSuffix(s, ".kobe.jp"): case s == "kyoto.jp" || strings.HasSuffix(s, ".kyoto.jp"): case s == "om" || strings.HasSuffix(s, ".om"): case s == "uk" || strings.HasSuffix(s, ".uk"): case s == "uk.com" || strings.HasSuffix(s, ".uk.com"): case s == "tw" || strings.HasSuffix(s, ".tw"): case s == "zw" || strings.HasSuffix(s, ".zw"): case s == "xn--p1ai" || strings.HasSuffix(s, ".xn--p1ai"): // xn--p1ai is Russian-Cyrillic "рф".
default: continue } }
rules = append(rules, s)
nt, wildcard := nodeTypeNormal, false switch { case strings.HasPrefix(s, "*."): s, nt = s[2:], nodeTypeParentOnly wildcard = true case strings.HasPrefix(s, "!"): s, nt = s[1:], nodeTypeException } labels := strings.Split(s, ".") for n, i := &root, len(labels)-1; i >= 0; i-- { label := labels[i] n = n.child(label) if i == 0 { if nt != nodeTypeParentOnly && n.nodeType == nodeTypeParentOnly { n.nodeType = nt } n.icann = n.icann && icann n.wildcard = n.wildcard || wildcard } labelsMap[label] = true } } labelsList = make([]string, 0, len(labelsMap)) for label := range labelsMap { labelsList = append(labelsList, label) } sort.Strings(labelsList)
if err := generate(printReal, &root, "table.go"); err != nil { return err } if err := generate(printTest, &root, "table_test.go"); err != nil { return err } return nil }
func generate(p func(io.Writer, *node) error, root *node, filename string) error { buf := new(bytes.Buffer) if err := p(buf, root); err != nil { return err } b, err := format.Source(buf.Bytes()) if err != nil { return err } return ioutil.WriteFile(filename, b, 0644) }
func gitCommit() (sha, date string, retErr error) { res, err := http.Get(gitCommitURL) if err != nil { return "", "", err } if res.StatusCode != http.StatusOK { return "", "", fmt.Errorf("bad GET status for %s: %d", gitCommitURL, res.Status) } defer res.Body.Close() b, err := ioutil.ReadAll(res.Body) if err != nil { return "", "", err } if m := shaRE.FindSubmatch(b); m != nil { sha = string(m[1]) } if m := dateRE.FindSubmatch(b); m != nil { date = string(m[1]) } if sha == "" || date == "" { retErr = fmt.Errorf("could not find commit SHA and date in %s", gitCommitURL) } return sha, date, retErr }
func printTest(w io.Writer, n *node) error { fmt.Fprintf(w, "// generated by go run gen.go; DO NOT EDIT\n\n") fmt.Fprintf(w, "package publicsuffix\n\nvar rules = [...]string{\n") for _, rule := range rules { fmt.Fprintf(w, "%q,\n", rule) } fmt.Fprintf(w, "}\n\nvar nodeLabels = [...]string{\n") if err := n.walk(w, printNodeLabel); err != nil { return err } fmt.Fprintf(w, "}\n") return nil }
func printReal(w io.Writer, n *node) error { const header = `// generated by go run gen.go; DO NOT EDIT
package publicsuffix
const version = %q
const ( nodesBitsChildren = %d nodesBitsICANN = %d nodesBitsTextOffset = %d nodesBitsTextLength = %d
childrenBitsWildcard = %d childrenBitsNodeType = %d childrenBitsHi = %d childrenBitsLo = %d )
const ( nodeTypeNormal = %d nodeTypeException = %d nodeTypeParentOnly = %d )
// numTLD is the number of top level domains.
const numTLD = %d
` fmt.Fprintf(w, header, *version, nodesBitsChildren, nodesBitsICANN, nodesBitsTextOffset, nodesBitsTextLength, childrenBitsWildcard, childrenBitsNodeType, childrenBitsHi, childrenBitsLo, nodeTypeNormal, nodeTypeException, nodeTypeParentOnly, len(n.children))
text := combineText(labelsList) if text == "" { return fmt.Errorf("internal error: makeText returned no text") } for _, label := range labelsList { offset, length := strings.Index(text, label), len(label) if offset < 0 { return fmt.Errorf("internal error: could not find %q in text %q", label, text) } maxTextOffset, maxTextLength = max(maxTextOffset, offset), max(maxTextLength, length) if offset >= 1<<nodesBitsTextOffset { return fmt.Errorf("text offset %d is too large, or nodeBitsTextOffset is too small", offset) } if length >= 1<<nodesBitsTextLength { return fmt.Errorf("text length %d is too large, or nodeBitsTextLength is too small", length) } labelEncoding[label] = uint32(offset)<<nodesBitsTextLength | uint32(length) } fmt.Fprintf(w, "// Text is the combined text of all labels.\nconst text = ") for len(text) > 0 { n, plus := len(text), "" if n > 64 { n, plus = 64, " +" } fmt.Fprintf(w, "%q%s\n", text[:n], plus) text = text[n:] }
if err := n.walk(w, assignIndexes); err != nil { return err }
fmt.Fprintf(w, `
// nodes is the list of nodes. Each node is represented as a uint32, which
// encodes the node's children, wildcard bit and node type (as an index into
// the children array), ICANN bit and text.
//
// If the table was generated with the -comments flag, there is a //-comment
// after each node's data. In it is the nodes-array indexes of the children,
// formatted as (n0x1234-n0x1256), with * denoting the wildcard bit. The
// nodeType is printed as + for normal, ! for exception, and o for parent-only
// nodes that have children but don't match a domain label in their own right.
// An I denotes an ICANN domain.
//
// The layout within the uint32, from MSB to LSB, is:
// [%2d bits] unused
// [%2d bits] children index
// [%2d bits] ICANN bit
// [%2d bits] text index
// [%2d bits] text length
var nodes = [...]uint32{ `, 32-nodesBitsChildren-nodesBitsICANN-nodesBitsTextOffset-nodesBitsTextLength, nodesBitsChildren, nodesBitsICANN, nodesBitsTextOffset, nodesBitsTextLength) if err := n.walk(w, printNode); err != nil { return err } fmt.Fprintf(w, `}
// children is the list of nodes' children, the parent's wildcard bit and the
// parent's node type. If a node has no children then their children index
// will be in the range [0, 6), depending on the wildcard bit and node type.
//
// The layout within the uint32, from MSB to LSB, is:
// [%2d bits] unused
// [%2d bits] wildcard bit
// [%2d bits] node type
// [%2d bits] high nodes index (exclusive) of children
// [%2d bits] low nodes index (inclusive) of children
var children=[...]uint32{ `, 32-childrenBitsWildcard-childrenBitsNodeType-childrenBitsHi-childrenBitsLo, childrenBitsWildcard, childrenBitsNodeType, childrenBitsHi, childrenBitsLo) for i, c := range childrenEncoding { s := "---------------" lo := c & (1<<childrenBitsLo - 1) hi := (c >> childrenBitsLo) & (1<<childrenBitsHi - 1) if lo != hi { s = fmt.Sprintf("n0x%04x-n0x%04x", lo, hi) } nodeType := int(c>>(childrenBitsLo+childrenBitsHi)) & (1<<childrenBitsNodeType - 1) wildcard := c>>(childrenBitsLo+childrenBitsHi+childrenBitsNodeType) != 0 if *comments { fmt.Fprintf(w, "0x%08x, // c0x%04x (%s)%s %s\n", c, i, s, wildcardStr(wildcard), nodeTypeStr(nodeType)) } else { fmt.Fprintf(w, "0x%x,\n", c) } } fmt.Fprintf(w, "}\n\n") fmt.Fprintf(w, "// max children %d (capacity %d)\n", maxChildren, 1<<nodesBitsChildren-1) fmt.Fprintf(w, "// max text offset %d (capacity %d)\n", maxTextOffset, 1<<nodesBitsTextOffset-1) fmt.Fprintf(w, "// max text length %d (capacity %d)\n", maxTextLength, 1<<nodesBitsTextLength-1) fmt.Fprintf(w, "// max hi %d (capacity %d)\n", maxHi, 1<<childrenBitsHi-1) fmt.Fprintf(w, "// max lo %d (capacity %d)\n", maxLo, 1<<childrenBitsLo-1) return nil }
type node struct { label string nodeType int icann bool wildcard bool // nodesIndex and childrenIndex are the index of this node in the nodes
// and the index of its children offset/length in the children arrays.
nodesIndex, childrenIndex int // firstChild is the index of this node's first child, or zero if this
// node has no children.
firstChild int // children are the node's children, in strictly increasing node label order.
children []*node }
func (n *node) walk(w io.Writer, f func(w1 io.Writer, n1 *node) error) error { if err := f(w, n); err != nil { return err } for _, c := range n.children { if err := c.walk(w, f); err != nil { return err } } return nil }
// child returns the child of n with the given label. The child is created if
// it did not exist beforehand.
func (n *node) child(label string) *node { for _, c := range n.children { if c.label == label { return c } } c := &node{ label: label, nodeType: nodeTypeParentOnly, icann: true, } n.children = append(n.children, c) sort.Sort(byLabel(n.children)) return c }
type byLabel []*node
func (b byLabel) Len() int { return len(b) } func (b byLabel) Swap(i, j int) { b[i], b[j] = b[j], b[i] } func (b byLabel) Less(i, j int) bool { return b[i].label < b[j].label }
var nextNodesIndex int
// childrenEncoding are the encoded entries in the generated children array.
// All these pre-defined entries have no children.
var childrenEncoding = []uint32{ 0 << (childrenBitsLo + childrenBitsHi), // Without wildcard bit, nodeTypeNormal.
1 << (childrenBitsLo + childrenBitsHi), // Without wildcard bit, nodeTypeException.
2 << (childrenBitsLo + childrenBitsHi), // Without wildcard bit, nodeTypeParentOnly.
4 << (childrenBitsLo + childrenBitsHi), // With wildcard bit, nodeTypeNormal.
5 << (childrenBitsLo + childrenBitsHi), // With wildcard bit, nodeTypeException.
6 << (childrenBitsLo + childrenBitsHi), // With wildcard bit, nodeTypeParentOnly.
}
var firstCallToAssignIndexes = true
func assignIndexes(w io.Writer, n *node) error { if len(n.children) != 0 { // Assign nodesIndex.
n.firstChild = nextNodesIndex for _, c := range n.children { c.nodesIndex = nextNodesIndex nextNodesIndex++ }
// The root node's children is implicit.
if firstCallToAssignIndexes { firstCallToAssignIndexes = false return nil }
// Assign childrenIndex.
maxChildren = max(maxChildren, len(childrenEncoding)) if len(childrenEncoding) >= 1<<nodesBitsChildren { return fmt.Errorf("children table size %d is too large, or nodeBitsChildren is too small", len(childrenEncoding)) } n.childrenIndex = len(childrenEncoding) lo := uint32(n.firstChild) hi := lo + uint32(len(n.children)) maxLo, maxHi = u32max(maxLo, lo), u32max(maxHi, hi) if lo >= 1<<childrenBitsLo { return fmt.Errorf("children lo %d is too large, or childrenBitsLo is too small", lo) } if hi >= 1<<childrenBitsHi { return fmt.Errorf("children hi %d is too large, or childrenBitsHi is too small", hi) } enc := hi<<childrenBitsLo | lo enc |= uint32(n.nodeType) << (childrenBitsLo + childrenBitsHi) if n.wildcard { enc |= 1 << (childrenBitsLo + childrenBitsHi + childrenBitsNodeType) } childrenEncoding = append(childrenEncoding, enc) } else { n.childrenIndex = n.nodeType if n.wildcard { n.childrenIndex += numNodeType } } return nil }
func printNode(w io.Writer, n *node) error { for _, c := range n.children { s := "---------------" if len(c.children) != 0 { s = fmt.Sprintf("n0x%04x-n0x%04x", c.firstChild, c.firstChild+len(c.children)) } encoding := labelEncoding[c.label] if c.icann { encoding |= 1 << (nodesBitsTextLength + nodesBitsTextOffset) } encoding |= uint32(c.childrenIndex) << (nodesBitsTextLength + nodesBitsTextOffset + nodesBitsICANN) if *comments { fmt.Fprintf(w, "0x%08x, // n0x%04x c0x%04x (%s)%s %s %s %s\n", encoding, c.nodesIndex, c.childrenIndex, s, wildcardStr(c.wildcard), nodeTypeStr(c.nodeType), icannStr(c.icann), c.label, ) } else { fmt.Fprintf(w, "0x%x,\n", encoding) } } return nil }
func printNodeLabel(w io.Writer, n *node) error { for _, c := range n.children { fmt.Fprintf(w, "%q,\n", c.label) } return nil }
func icannStr(icann bool) string { if icann { return "I" } return " " }
func wildcardStr(wildcard bool) string { if wildcard { return "*" } return " " }
// combineText combines all the strings in labelsList to form one giant string.
// Overlapping strings will be merged: "arpa" and "parliament" could yield
// "arparliament".
func combineText(labelsList []string) string { beforeLength := 0 for _, s := range labelsList { beforeLength += len(s) }
text := crush(removeSubstrings(labelsList)) if *v { fmt.Fprintf(os.Stderr, "crushed %d bytes to become %d bytes\n", beforeLength, len(text)) } return text }
type byLength []string
func (s byLength) Len() int { return len(s) } func (s byLength) Swap(i, j int) { s[i], s[j] = s[j], s[i] } func (s byLength) Less(i, j int) bool { return len(s[i]) < len(s[j]) }
// removeSubstrings returns a copy of its input with any strings removed
// that are substrings of other provided strings.
func removeSubstrings(input []string) []string { // Make a copy of input.
ss := append(make([]string, 0, len(input)), input...) sort.Sort(byLength(ss))
for i, shortString := range ss { // For each string, only consider strings higher than it in sort order, i.e.
// of equal length or greater.
for _, longString := range ss[i+1:] { if strings.Contains(longString, shortString) { ss[i] = "" break } } }
// Remove the empty strings.
sort.Strings(ss) for len(ss) > 0 && ss[0] == "" { ss = ss[1:] } return ss }
// crush combines a list of strings, taking advantage of overlaps. It returns a
// single string that contains each input string as a substring.
func crush(ss []string) string { maxLabelLen := 0 for _, s := range ss { if maxLabelLen < len(s) { maxLabelLen = len(s) } }
for prefixLen := maxLabelLen; prefixLen > 0; prefixLen-- { prefixes := makePrefixMap(ss, prefixLen) for i, s := range ss { if len(s) <= prefixLen { continue } mergeLabel(ss, i, prefixLen, prefixes) } }
return strings.Join(ss, "") }
// mergeLabel merges the label at ss[i] with the first available matching label
// in prefixMap, where the last "prefixLen" characters in ss[i] match the first
// "prefixLen" characters in the matching label.
// It will merge ss[i] repeatedly until no more matches are available.
// All matching labels merged into ss[i] are replaced by "".
func mergeLabel(ss []string, i, prefixLen int, prefixes prefixMap) { s := ss[i] suffix := s[len(s)-prefixLen:] for _, j := range prefixes[suffix] { // Empty strings mean "already used." Also avoid merging with self.
if ss[j] == "" || i == j { continue } if *v { fmt.Fprintf(os.Stderr, "%d-length overlap at (%4d,%4d): %q and %q share %q\n", prefixLen, i, j, ss[i], ss[j], suffix) } ss[i] += ss[j][prefixLen:] ss[j] = "" // ss[i] has a new suffix, so merge again if possible.
// Note: we only have to merge again at the same prefix length. Shorter
// prefix lengths will be handled in the next iteration of crush's for loop.
// Can there be matches for longer prefix lengths, introduced by the merge?
// I believe that any such matches would by necessity have been eliminated
// during substring removal or merged at a higher prefix length. For
// instance, in crush("abc", "cde", "bcdef"), combining "abc" and "cde"
// would yield "abcde", which could be merged with "bcdef." However, in
// practice "cde" would already have been elimintated by removeSubstrings.
mergeLabel(ss, i, prefixLen, prefixes) return } }
// prefixMap maps from a prefix to a list of strings containing that prefix. The
// list of strings is represented as indexes into a slice of strings stored
// elsewhere.
type prefixMap map[string][]int
// makePrefixMap constructs a prefixMap from a slice of strings.
func makePrefixMap(ss []string, prefixLen int) prefixMap { prefixes := make(prefixMap) for i, s := range ss { // We use < rather than <= because if a label matches on a prefix equal to
// its full length, that's actually a substring match handled by
// removeSubstrings.
if prefixLen < len(s) { prefix := s[:prefixLen] prefixes[prefix] = append(prefixes[prefix], i) } }
return prefixes }
|