diff --git a/.golangci.yml b/.golangci.yml new file mode 100644 index 0000000..69e69b6 --- /dev/null +++ b/.golangci.yml @@ -0,0 +1,17 @@ +version: "2" + +linters: + enable: + - gocritic + - gocognit + - gocyclo + - maintidx + - dupl + - mnd + - unparam + - ireturn + - goconst + - errcheck + settings: + goconst: + ignore-tests: true diff --git a/README.md b/README.md index bee9e45..d52639b 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ The `licensecheck` package scans source texts for known licenses. The design aims never to give a false positive. It also reports matches of known license URLs. -See the [package documentation](https://pkg.go.dev/github.com/google/licensecheck) +See the [package documentation](https://pkg.go.dev/github.com/git-pkgs/licensecheck) for API details. The license scanner recognizes nearly all the licenses gathered by the SPDX project, diff --git a/builtin.dfa b/builtin.dfa new file mode 100644 index 0000000..1502042 Binary files /dev/null and b/builtin.dfa differ diff --git a/builtin.dfa.triv b/builtin.dfa.triv new file mode 100644 index 0000000..50f5df1 Binary files /dev/null and b/builtin.dfa.triv differ diff --git a/data.gen.go b/data.gen.go index c472559..f3f75ce 100644 --- a/data.gen.go +++ b/data.gen.go @@ -9186,7 +9186,7 @@ parties intellectual property rights. ` const license_BSD_4_Clause_UC_lre = ` //** -BSD 4-Clause (Univeristy of California-Specific) +BSD 4-Clause (University of California-Specific) https://spdx.org/licenses/BSD-4-Clause-UC.json http://www.freebsd.org/copyright/license.html **// diff --git a/gen_data.go b/gen_data.go index cfa7a4d..816ed1d 100644 --- a/gen_data.go +++ b/gen_data.go @@ -2,9 +2,9 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -// +build ignore +//go:build ignore -// This file generates data.gen.go. +// This file generates data.gen.go and builtin.dfa. 
// It embeds the text of all the licenses in the subdirectory "licenses" // and constructs the data structures to represent them. // Run by a "go:generate" comment in license.go. @@ -16,14 +16,15 @@ import ( "flag" "fmt" "go/format" - "io/ioutil" "log" + "os" "path/filepath" "sort" "strings" "text/template" - "github.com/google/licensecheck" + "github.com/git-pkgs/licensecheck" + "github.com/git-pkgs/licensecheck/internal/match" ) var outFile = flag.String("o", "data.gen.go", "`file` to write") @@ -59,7 +60,7 @@ func main() { src, err := format.Source([]byte(code)) if err != nil { - fd, err1 := ioutil.TempFile("", "license-data") + fd, err1 := os.CreateTemp("", "license-data") if err1 == nil { _, err1 = fd.Write([]byte(code)) if err1 == nil { @@ -69,10 +70,32 @@ func main() { } log.Fatal("parsing output:", err) } - err = ioutil.WriteFile(*outFile, src, 0644) + err = os.WriteFile(*outFile, src, 0644) if err != nil { log.Fatal(err) } + + // Build and write the precomputed DFA. + d := new(match.Dict) + d.Insert("copyright") + d.Insert("http") + var lres []*match.LRE + for _, file := range builtLRE { + re, err := match.ParseLRE(d, file.Name, string(file.Data)) + if err != nil { + log.Fatalf("parsing LRE %s for DFA: %v", file.Name, err) + } + lres = append(lres, re) + } + multi, err := match.NewMultiLRE(lres) + if err != nil { + log.Fatal("building MultiLRE:", err) + } + dfaData := match.MarshalMultiLRE(multi) + if err := os.WriteFile("builtin.dfa", dfaData, 0644); err != nil { + log.Fatal("writing builtin.dfa:", err) + } + log.Printf("builtin.dfa: %d bytes", len(dfaData)) } // varName returns the basename of the file, sanitized for use as a variable name, diff --git a/go.mod b/go.mod index ae7ca5f..b12d6e9 100644 --- a/go.mod +++ b/go.mod @@ -1,3 +1,3 @@ -module github.com/google/licensecheck +module github.com/git-pkgs/licensecheck -go 1.12 +go 1.25 diff --git a/internal/match/dict.go b/internal/match/dict.go index fa38505..982372a 100644 --- a/internal/match/dict.go 
+++ b/internal/match/dict.go @@ -383,11 +383,11 @@ func markdownAnchorSize(t string) int { return 0 } i := 2 - for ; i < len(t); i++ { + for ; i < len(t) && i < 256; i++ { switch t[i] { case '}': return i + 1 - case ' ', '\r', '\n': + case ' ', '\r', '\n', '{': return 0 } } @@ -419,9 +419,9 @@ func markdownLinkSize(t string) int { return 0 } - for i := 2; i < len(t); i++ { + for i := 2; i < len(t) && i < 2048; i++ { c := t[i] - if c == ' ' || c == '\t' || c == '\r' || c == '\n' { + if c == ' ' || c == '\t' || c == '\r' || c == '\n' || c == ']' { return 0 } if c == ')' { diff --git a/internal/match/dict_test.go b/internal/match/dict_test.go index ffa97b6..76826a9 100644 --- a/internal/match/dict_test.go +++ b/internal/match/dict_test.go @@ -5,11 +5,13 @@ package match import ( - "io/ioutil" + "os" "path/filepath" "reflect" "regexp" + "strings" "testing" + "time" ) func TestDict(t *testing.T) { @@ -18,7 +20,7 @@ func TestDict(t *testing.T) { indexes := []WordID{0, 1, 2, 1, 3, 0} var d Dict - for j := 0; j < 2; j++ { + for range 2 { for i, w := range words { id := d.Insert(w) if id != indexes[i] { @@ -100,6 +102,8 @@ var markdownAnchorSizeTests = []struct { {"{#abc def}", 0}, {"{#abc\ndef}", 0}, {"{#abc\rdef}", 0}, + {"{#abc{#def}", 0}, + {"{#" + strings.Repeat("a", 300) + "}", 0}, } func TestMarkdownAnchorSize(t *testing.T) { @@ -111,6 +115,25 @@ func TestMarkdownAnchorSize(t *testing.T) { } } +var markdownLinkSizeTests = []struct { + in string + out int +}{ + {"](http://abc)", 13}, + {"](#abc)", 7}, + {"](http://abc](http://def)", 0}, + {"](http://" + strings.Repeat("a", 3000) + ")", 0}, +} + +func TestMarkdownLinkSize(t *testing.T) { + for _, tt := range markdownLinkSizeTests { + out := markdownLinkSize(tt.in) + if out != tt.out { + t.Errorf("markdownLinkSize(%q) = %d want %d", tt.in, out, tt.out) + } + } +} + var insertSplitTests = []struct { in string out string @@ -245,6 +268,24 @@ func rot13(s string) string { return string(b) } +func 
TestSplitMarkdownQuadratic(t *testing.T) { + // Repeated trigger sequences without closers used to make + // markdownAnchorSize and markdownLinkSize scan to the end of the + // input on every call, giving O(n^2) total work. + inputs := []string{ + strings.Repeat("{#x", 100000), + strings.Repeat("](http://x", 50000), + } + var d Dict + for _, in := range inputs { + start := time.Now() + d.Split(in) + if elapsed := time.Since(start); elapsed > time.Second { + t.Errorf("Split(%d bytes %q...) took %v, want < 1s", len(in), in[:10], elapsed) + } + } +} + var bench struct { data []byte str string @@ -260,7 +301,7 @@ func benchSetup(b *testing.B) { b.Fatal(err) } for _, file := range files { - data, err := ioutil.ReadFile(file) + data, err := os.ReadFile(file) if err != nil { b.Fatal(err) } diff --git a/internal/match/regexp_test.go b/internal/match/regexp_test.go index b2e9bed..f6dfe72 100644 --- a/internal/match/regexp_test.go +++ b/internal/match/regexp_test.go @@ -27,7 +27,7 @@ func TestMultiLREMatch(t *testing.T) { for id, tt := range multiMatchTests { t.Run(fmt.Sprint(id), func(t *testing.T) { var list []*LRE - for _, expr := range strings.Split(tt.re, "/") { + for expr := range strings.SplitSeq(tt.re, "/") { re, err := ParseLRE(&d, "x", expr) if err != nil { t.Fatalf("Parse(%q): %v", expr, err) diff --git a/internal/match/rematch.go b/internal/match/rematch.go index 3dca29c..0719caa 100644 --- a/internal/match/rematch.go +++ b/internal/match/rematch.go @@ -200,6 +200,7 @@ package match import ( "encoding/binary" "fmt" + "slices" "sort" "strings" ) @@ -436,12 +437,7 @@ func (c *reCompile) mergeCut(cut1, cut2 []reCut) []reCut { func canMatchEmpty(re *reSyntax) bool { switch re.op { case opAlternate: - for _, sub := range re.sub { - if canMatchEmpty(sub) { - return true - } - } - return false + return slices.ContainsFunc(re.sub, canMatchEmpty) case opConcat: for _, sub := range re.sub { @@ -512,10 +508,8 @@ func (s *nfaState) add(prog reProg, pc int32) { // where we 
are in the list. If this ever showed up as expensive // on a profile, we could switch to a sparse set instead; // see https://research.swtch.com/sparse. - for _, old := range *s { - if old == pc { - return - } + if slices.Contains(*s, pc) { + return } *s = append(*s, pc) @@ -660,18 +654,17 @@ func (s nfaState) appendEncoding(enc []byte) []byte { // // The encoding of this state information is: // -// - a one-word header M | N<<1, where M is 0 for a non-match, 1 for a match, -// and N is the number of words in the table. -// This header is conveniently also the number of words that follow in the encoding. -// -// - if M == 1, a one-word value V that is the match value to report, -// identifying which of a set of regexps has been matched. +// - a one-word header M | N<<1, where M is 0 for a non-match, 1 for a match, +// and N is the number of words in the table. +// This header is conveniently also the number of words that follow in the encoding. // -// - N two-word pairs W:NEXT indicating that if word W is seen, the DFA should -// move to the state at offset NEXT. The pairs are sorted by W. An entry for W == AnyWord -// is treated as matching any input word; an exact match later in the list takes priority. -// The list is sorted by W, so AnyWord is always first if present. +// - if M == 1, a one-word value V that is the match value to report, +// identifying which of a set of regexps has been matched. // +// - N two-word pairs W:NEXT indicating that if word W is seen, the DFA should +// move to the state at offset NEXT. The pairs are sorted by W. An entry for W == AnyWord +// is treated as matching any input word; an exact match later in the list takes priority. +// The list is sorted by W, so AnyWord is always first if present. type reDFA []int32 // A dfaBuilder holds state for building a DFA from a reProg. 
@@ -784,7 +777,6 @@ func (dfa reDFA) string(d *Dict) string { // off = dnext // } // } -// func (dfa reDFA) stateAt(off int32) (match int32, delta []int32) { hdr := dfa[off] off++ @@ -908,10 +900,7 @@ Words: // the last time we saw a matching state), // print information about it. if TraceDFA > 0 && i-end >= TraceDFA { - start := i - 10 - if start < 0 { - start = 0 - } + start := max(i-10, 0) print("DFA mismatch at «", text[words[start].Lo:words[i].Lo], "|", text[words[i].Lo:words[i].Hi], "»\n") @@ -932,25 +921,18 @@ Words: end = len(words) } if i := len(words); TraceDFA > 0 && i-end >= TraceDFA { - start := i - 10 - if start < 0 { - start = 0 - } + start := max(i-10, 0) println("DFA ran out of input at «", text[words[start].Lo:], "|", "EOF", "»\n") } return match, end } func sortInt32s(x []int32) { - sort.Slice(x, func(i, j int) bool { - return x[i] < x[j] - }) + slices.Sort(x) } func sortWordIDs(x []WordID) { - sort.Slice(x, func(i, j int) bool { - return x[i] < x[j] - }) + slices.Sort(x) } // canMisspell reports whether want can be misspelled as have. diff --git a/internal/match/rematch_test.go b/internal/match/rematch_test.go index 6112593..2950ba8 100644 --- a/internal/match/rematch_test.go +++ b/internal/match/rematch_test.go @@ -337,10 +337,10 @@ The name __10__ may not be used. 
func TestCompile(t *testing.T) { var d Dict - for _, tt := range strings.Split(compileTests, "\n\n") { + for tt := range strings.SplitSeq(compileTests, "\n\n") { tt = strings.TrimSpace(tt) + "\n" - i := strings.Index(tt, "\n") - in, want := tt[:i], tt[i+1:] + before, after, _ := strings.Cut(tt, "\n") + in, want := before, after prog := testProg(t, &d, in) if prog == nil { @@ -356,7 +356,7 @@ func TestCompile(t *testing.T) { func testProg(t *testing.T, dict *Dict, expr string) reProg { if strings.Contains(expr, "/") { var list []reProg - for _, str := range strings.Split(expr, "/") { + for str := range strings.SplitSeq(expr, "/") { re, err := reParse(dict, str, false) if err != nil { t.Errorf("Parse(%q): %v", expr, err) @@ -567,10 +567,10 @@ The name __10__ may not be used func TestCompileDFA(t *testing.T) { var d Dict - for _, tt := range strings.Split(compileDFATests, "\n\n") { + for tt := range strings.SplitSeq(compileDFATests, "\n\n") { tt = strings.TrimSpace(tt) + "\n" - i := strings.Index(tt, "\n") - in, want := tt[:i], tt[i+1:] + before, after, _ := strings.Cut(tt, "\n") + in, want := before, after prog := testProg(t, &d, in) if prog == nil {
diff --git a/internal/match/serialize.go b/internal/match/serialize.go
new file mode 100644
index 0000000..0c795a3
--- /dev/null
+++ b/internal/match/serialize.go
@@ -0,0 +1,192 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package match
+
+import (
+	"bytes"
+	"cmp"
+	"compress/flate"
+	"encoding/binary"
+	"errors"
+	"fmt"
+	"io"
+	"math"
+	"slices"
+)
+
+// dfaMagic identifies the serialized format and its version.
+const dfaMagic = "LREDFA02"
+
+// dfaHeaderLen is the size of the uncompressed header: the magic string
+// followed by three little-endian uint32 counts.
+const dfaHeaderLen = len(dfaMagic) + 3*4
+
+// MarshalMultiLRE serializes a MultiLRE into a compressed binary format
+// that can be loaded back with UnmarshalMultiLRE.
+//
+// The output is the magic string, three little-endian uint32 counts
+// (dict words, DFA words, start phrases), and a DEFLATE stream holding
+// the length-prefixed dict words, the DFA words, and the start phrases.
+//
+// MarshalMultiLRE panics if a dictionary word is too long for its 16-bit
+// length prefix, because the result could not be read back correctly.
+func MarshalMultiLRE(re *MultiLRE) []byte {
+	words := re.dict.Words()
+
+	// Collect start phrases into a sorted slice for deterministic output.
+	starts := make([]phrase, 0, len(re.start))
+	for p := range re.start {
+		starts = append(starts, p)
+	}
+	sortPhrases(starts)
+
+	// Build the uncompressed payload (everything after the header).
+	var payload bytes.Buffer
+
+	// Dict words in insertion order.
+	for _, w := range words {
+		if len(w) > math.MaxUint16 {
+			panic(fmt.Sprintf("match: dictionary word too long to serialize (%d bytes)", len(w)))
+		}
+		writeUint16(&payload, uint16(len(w)))
+		payload.WriteString(w)
+	}
+
+	// DFA as raw int32 values.
+	for _, v := range re.dfa {
+		writeInt32(&payload, v)
+	}
+
+	// Start phrases.
+	for _, p := range starts {
+		writeInt32(&payload, int32(p[0]))
+		writeInt32(&payload, int32(p[1]))
+	}
+
+	// Write the header, then the compressed payload.
+	var out bytes.Buffer
+	out.WriteString(dfaMagic)
+	writeUint32Buf(&out, uint32(len(words)))
+	writeUint32Buf(&out, uint32(len(re.dfa)))
+	writeUint32Buf(&out, uint32(len(starts)))
+
+	// None of these calls should fail: the level is a valid constant and
+	// the destination is an in-memory buffer. Check anyway so a broken
+	// invariant is reported instead of yielding a corrupt file.
+	w, err := flate.NewWriter(&out, flate.BestCompression)
+	if err != nil {
+		panic("match: creating DFA compressor: " + err.Error())
+	}
+	if _, err := w.Write(payload.Bytes()); err != nil {
+		panic("match: compressing DFA: " + err.Error())
+	}
+	if err := w.Close(); err != nil {
+		panic("match: compressing DFA: " + err.Error())
+	}
+
+	return out.Bytes()
+}
+
+// UnmarshalMultiLRE deserializes a MultiLRE from the compressed binary format
+// produced by MarshalMultiLRE.
+//
+// It returns an error if data is shorter than the header, has the wrong
+// magic string, fails to decompress, or holds less payload than the header
+// counts require. A header whose three counts are all zero denotes an
+// empty MultiLRE and needs no payload.
+func UnmarshalMultiLRE(data []byte) (*MultiLRE, error) {
+	if len(data) < dfaHeaderLen {
+		return nil, errors.New("match: DFA data too short")
+	}
+	if string(data[:len(dfaMagic)]) != dfaMagic {
+		return nil, fmt.Errorf("match: bad DFA magic %q", data[:len(dfaMagic)])
+	}
+
+	counts := data[len(dfaMagic):dfaHeaderLen]
+	dictLen := binary.LittleEndian.Uint32(counts[0:4])
+	dfaLen := binary.LittleEndian.Uint32(counts[4:8])
+	startLen := binary.LittleEndian.Uint32(counts[8:12])
+
+	// Empty DFA (bootstrap/triv file).
+	if dictLen == 0 && dfaLen == 0 && startLen == 0 {
+		return &MultiLRE{dict: new(Dict), start: make(map[phrase]struct{})}, nil
+	}
+
+	// Decompress the payload.
+	r := flate.NewReader(bytes.NewReader(data[dfaHeaderLen:]))
+	payload, err := io.ReadAll(r)
+	if cerr := r.Close(); err == nil {
+		err = cerr
+	}
+	if err != nil {
+		return nil, fmt.Errorf("match: decompressing DFA: %w", err)
+	}
+
+	off := 0
+
+	// Reconstruct Dict by inserting words in order.
+	d := new(Dict)
+	for range dictLen {
+		if len(payload)-off < 2 {
+			return nil, errors.New("match: DFA data truncated in dict")
+		}
+		wlen := int(binary.LittleEndian.Uint16(payload[off:]))
+		off += 2
+		if len(payload)-off < wlen {
+			return nil, errors.New("match: DFA data truncated in dict word")
+		}
+		d.Insert(string(payload[off : off+wlen]))
+		off += wlen
+	}
+
+	// Read DFA. Sizes are compared as uint64 so a huge count in a corrupt
+	// header cannot overflow int (notably on 32-bit platforms) and slip
+	// past the bounds check.
+	if uint64(dfaLen)*4 > uint64(len(payload)-off) {
+		return nil, errors.New("match: DFA data truncated in DFA")
+	}
+	dfa := make(reDFA, dfaLen)
+	for i := range dfa {
+		dfa[i] = int32(binary.LittleEndian.Uint32(payload[off:]))
+		off += 4
+	}
+
+	// Read start phrases.
+	if uint64(startLen)*8 > uint64(len(payload)-off) {
+		return nil, errors.New("match: DFA data truncated in start phrases")
+	}
+	start := make(map[phrase]struct{}, startLen)
+	for range startLen {
+		var p phrase
+		p[0] = WordID(int32(binary.LittleEndian.Uint32(payload[off:])))
+		p[1] = WordID(int32(binary.LittleEndian.Uint32(payload[off+4:])))
+		start[p] = struct{}{}
+		off += 8
+	}
+
+	return &MultiLRE{dict: d, dfa: dfa, start: start}, nil
+}
+
+// writeUint16 appends v to buf in little-endian byte order.
+func writeUint16(buf *bytes.Buffer, v uint16) {
+	buf.Write([]byte{byte(v), byte(v >> 8)})
+}
+
+// writeUint32Buf appends v to buf in little-endian byte order.
+func writeUint32Buf(buf *bytes.Buffer, v uint32) {
+	buf.Write([]byte{byte(v), byte(v >> 8), byte(v >> 16), byte(v >> 24)})
+}
+
+// writeInt32 appends the bits of v to buf in little-endian byte order.
+func writeInt32(buf *bytes.Buffer, v int32) {
+	writeUint32Buf(buf, uint32(v))
+}
+
+// sortPhrases sorts ps in place by first word, then by second word.
+func sortPhrases(ps []phrase) {
+	slices.SortFunc(ps, func(a, b phrase) int {
+		return cmp.Or(cmp.Compare(a[0], b[0]), cmp.Compare(a[1], b[1]))
+	})
+}
diff --git a/internal/match/serialize_test.go b/internal/match/serialize_test.go new file mode 100644 index 0000000..c95aed6 --- /dev/null +++ b/internal/match/serialize_test.go @@ -0,0 +1,101 
@@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package match
+
+import "testing"
+
+func TestMarshalUnmarshalRoundTrip(t *testing.T) {
+	d := new(Dict)
+	d.Insert("copyright")
+	d.Insert("http")
+
+	lres := []struct {
+		name string
+		pat  string
+	}{
+		{"MIT", "permission is hereby granted free of charge"},
+		{"BSD", "redistribution and use in source and binary forms"},
+	}
+
+	var list []*LRE
+	for _, l := range lres {
+		re, err := ParseLRE(d, l.name, l.pat)
+		if err != nil {
+			t.Fatalf("ParseLRE(%s): %v", l.name, err)
+		}
+		list = append(list, re)
+	}
+
+	orig, err := NewMultiLRE(list)
+	if err != nil {
+		t.Fatal("NewMultiLRE:", err)
+	}
+
+	data := MarshalMultiLRE(orig)
+	if len(data) < 20 {
+		t.Fatalf("MarshalMultiLRE returned %d bytes, want at least 20", len(data))
+	}
+	if string(data[:8]) != dfaMagic {
+		t.Fatalf("bad magic: got %q, want %q", data[:8], dfaMagic)
+	}
+
+	restored, err := UnmarshalMultiLRE(data)
+	if err != nil {
+		t.Fatal("UnmarshalMultiLRE:", err)
+	}
+
+	// Verify dict words match.
+	origWords := orig.Dict().Words()
+	restoredWords := restored.Dict().Words()
+	if len(origWords) != len(restoredWords) {
+		t.Fatalf("dict length: got %d, want %d", len(restoredWords), len(origWords))
+	}
+	for i := range origWords {
+		if origWords[i] != restoredWords[i] {
+			t.Errorf("dict[%d]: got %q, want %q", i, restoredWords[i], origWords[i])
+		}
+	}
+
+	// Verify match results are identical on test texts.
+	texts := []string{
+		"Permission is hereby granted, free of charge, to any person",
+		"Redistribution and use in source and binary forms, with or without modification",
+		"This is not a license at all",
+		"",
+	}
+	for _, text := range texts {
+		om := orig.Match(text)
+		rm := restored.Match(text)
+
+		if len(om.List) != len(rm.List) {
+			t.Errorf("Match(%q): got %d matches, want %d", text, len(rm.List), len(om.List))
+			continue
+		}
+		for i := range om.List {
+			if om.List[i] != rm.List[i] {
+				t.Errorf("Match(%q)[%d]: got %+v, want %+v", text, i, rm.List[i], om.List[i])
+			}
+		}
+	}
+}
+
+func TestUnmarshalErrors(t *testing.T) {
+	tests := []struct {
+		name string
+		data []byte
+	}{
+		{"too short", []byte("LREDFA0")},
+		{"bad magic", []byte("BADMAGIC\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00")},
+		{"missing payload", append([]byte(dfaMagic), 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)}, // valid magic; counts promise one dict word but no payload follows
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			_, err := UnmarshalMultiLRE(tt.data)
+			if err == nil {
+				t.Error("expected error, got nil")
+			}
+		})
+	}
+}
diff --git a/license.go b/license.go index bc810e1..a5fbcc7 100644 --- a/license.go +++ b/license.go @@ -5,7 +5,7 @@ // Package licensecheck classifies license files and heuristically determines // how well they correspond to known open source licenses. // -// Scanning +// # Scanning // // A text (a slice of bytes) can be scanned for known licenses by calling Scan. // The resulting Coverage structure describes all the matches found as well @@ -28,7 +28,7 @@ // expressions (LREs). // BuiltinLicenses returns the set of license patterns used by Scan. // -// License Regular Expressions +// # License Regular Expressions // // Each license to be recognized is specified by writing a license regular // expression (LRE) for it. 
The pattern syntax and the matching are word-based and @@ -36,13 +36,13 @@ // // The valid LRE patterns are: // -// - word, a single case-insensitive word -// - __N__, any sequence of up to N words -// - expr1 expr2, concatenation of two expressions -// - expr1 || expr2, alternation of two expressions -// - (( expr )), grouping -// - (( expr ))??, zero or one instances of the grouped expression -// - //** text **//, a comment ignored by the parser +// - word, a single case-insensitive word +// - __N__, any sequence of up to N words +// - expr1 expr2, concatenation of two expressions +// - expr1 || expr2, alternation of two expressions +// - (( expr )), grouping +// - (( expr ))??, zero or one instances of the grouped expression +// - //** text **//, a comment ignored by the parser // // To make patterns harder to misread in large texts: // (( must only appear at the start of a line (possibly indented); @@ -51,21 +51,20 @@ // // For example: // -// //** https://en.wikipedia.org/wiki/Filler_text **// -// Now is -// ((not))?? -// the time for all good -// ((men || women || people)) -// to come to the aid of their __1__. +// //** https://en.wikipedia.org/wiki/Filler_text **// +// Now is +// ((not))?? +// the time for all good +// ((men || women || people)) +// to come to the aid of their __1__. // -// The old Cover and Checker API +// # The old Cover and Checker API // // An older, less precise matcher using the names Cover, New, and Checker // was removed from this package. // Use v0.1.0 for the final version of that API, // or use the copy in the package "old" underneath this one // for easier comparison with this API. -// package licensecheck import ( @@ -78,6 +77,7 @@ import ( // gen_data.go imports licensecheck for Type, so we copy over // a trivial data.gen.go in order to build gen_data.go during "go run". 
//go:generate cp data.gen.go.triv data.gen.go +//go:generate cp builtin.dfa.triv builtin.dfa //go:generate go run gen_data.go // A License describes a single license that can be recognized. @@ -210,17 +210,17 @@ func (t Type) String() string { if t == 0 { return "Unknown" } - s := "" + var s strings.Builder for _, b := range typeBits { if b.t != 0 && t&b.t == b.t { t &^= b.t - s += "|" + b.s + s.WriteString("|" + b.s) } } if t != 0 { - s += fmt.Sprintf("|Type(%#x)", uint(t)) + s.WriteString(fmt.Sprintf("|Type(%#x)", uint(t))) } - return s[1:] + return s.String()[1:] } // ParseType parses s into a Type. @@ -228,7 +228,7 @@ func (t Type) String() string { func ParseType(s string) (Type, error) { var t Type Fields: - for _, f := range strings.Split(s, "|") { + for f := range strings.SplitSeq(s, "|") { for _, b := range typeBits { if b.s == f { t |= b.t diff --git a/license_test.go b/license_test.go index 20f549e..f815cdb 100644 --- a/license_test.go +++ b/license_test.go @@ -8,7 +8,6 @@ import ( "bytes" "flag" "fmt" - "io/ioutil" "math" "os" "path/filepath" @@ -16,7 +15,7 @@ import ( "strings" "testing" - "github.com/google/licensecheck/internal/match" + "github.com/git-pkgs/licensecheck/internal/match" ) func init() { @@ -43,22 +42,21 @@ func TestTestdata(t *testing.T) { if !strings.Contains(file, ".t") { t.Errorf("unexpected file: %v", file) } - file := file t.Run(name, func(t *testing.T) { t.Parallel() // faster and tests for races in parallel usage - data, err := ioutil.ReadFile(file) + data, err := os.ReadFile(file) if err != nil { t.Fatal(err) } // See testdata/README for definition of test data file. // Header ends at blank line. 
- i := bytes.Index(data, []byte("\n\n")) - if i < 0 { + before, after, ok := bytes.Cut(data, []byte("\n\n")) + if !ok { t.Fatalf("%s: invalid test data file: no blank line terminating header", file) } - hdr, data := strings.Split(string(data[:i]), "\n"), data[i+2:] + hdr, data := strings.Split(string(before), "\n"), after lineno := 1 // Skip leading comment lines. @@ -181,19 +179,19 @@ func parsePercent(s string) (float64, error) { // parseRange parses a start,end range (two decimals separated by a comma). // As a special case, the second decimal can be $ meaning end-of-file. func parseRange(s string, end int) (int, int, error) { - i := strings.Index(s, ",") - if i < 0 { + before, after, ok := strings.Cut(s, ",") + if !ok { return 0, 0, fmt.Errorf("malformed range") } - lo, err := strconv.Atoi(s[:i]) + lo, err := strconv.Atoi(before) if err != nil { return 0, 0, err } var hi int - if s[i+1:] == "$" { + if after == "$" { hi = end } else { - hi, err = strconv.Atoi(s[i+1:]) + hi, err = strconv.Atoi(after) if err != nil { return 0, 0, err } @@ -230,7 +228,7 @@ func BenchmarkScanTestdata(b *testing.B) { if info, err := os.Stat(file); err == nil && info.IsDir() { continue } - data, err := ioutil.ReadFile(file) + data, err := os.ReadFile(file) if err != nil { b.Fatal(err) } @@ -250,7 +248,7 @@ func TestTrace(t *testing.T) { if *trace == "" { t.Skip("-tr not given") } - data, err := ioutil.ReadFile(*trace) + data, err := os.ReadFile(*trace) if err != nil { t.Fatal(err) } diff --git a/licenses/README.md b/licenses/README.md index 29cc62e..81dfa28 100644 --- a/licenses/README.md +++ b/licenses/README.md @@ -1,6 +1,6 @@ # Licensecheck: Built-In Licenses -This directory contains the definitions of the licenses built into [github.com/google/licensecheck](../README.md). +This directory contains the definitions of the licenses built into [github.com/git-pkgs/licensecheck](../README.md). 
To add new licenses, see the “[Adding new built-in licenses](#adding-new-built-in-licenses)” section below. It is a goal to incorporate the entire [SPDX license list and IDs](https://spdx.dev/licenses/) @@ -303,5 +303,5 @@ so that common pieces can be factored out After editing files in this directory, run `go generate` in the licensecheck (parent) directory. Note that when using -[licensecheck.NewScanner](https://pkg.go.dev/github.com/google/licensecheck/#NewScanner), +[licensecheck.NewScanner](https://pkg.go.dev/github.com/git-pkgs/licensecheck/#NewScanner), the input is plain LRE, not template text. diff --git a/old/license.go b/old/license.go index c53e53e..1c6a681 100644 --- a/old/license.go +++ b/old/license.go @@ -46,7 +46,7 @@ const ( ) func licenseType(name string) Type { - for l := Type(0); l < NumTypes; l++ { + for l := range NumTypes { if strings.HasPrefix(name, l.String()) { return l } @@ -192,7 +192,6 @@ func (c *Checker) updateIndex(id int32, words []int32) { // match a particular section of the input, the best match // is chosen so the returned coverage describes at most // one match for each section of the input. -// func Cover(input []byte, opts Options) (Coverage, bool) { return builtin.Cover(input, opts) } @@ -597,10 +596,7 @@ func (c *Checker) submatches(text []int32, opts Options) []submatch { // over multiple nearby blanks, such as in licenses/ISC. 
BlankLoop: for matchLicenseStart >= 2 && l.doc.words[matchLicenseStart-1] == blankID && l.doc.words[matchLicenseStart-2] != blankID { - min := start - blankMax - if min < 0 { - min = 0 - } + min := max(start-blankMax, 0) if i := byLicense[licenseID]; i >= 0 && min < matches[i].end { min = matches[i].end } diff --git a/old/license_test.go b/old/license_test.go index 436d107..a735d46 100644 --- a/old/license_test.go +++ b/old/license_test.go @@ -8,7 +8,6 @@ import ( "bytes" "flag" "fmt" - "io/ioutil" "math" "os" "path/filepath" @@ -16,7 +15,7 @@ import ( "strings" "testing" - "github.com/google/licensecheck/internal/match" + "github.com/git-pkgs/licensecheck/internal/match" ) func init() { @@ -43,22 +42,21 @@ func TestTestdata(t *testing.T) { if !strings.Contains(file, ".t") { t.Errorf("unexpected file: %v", file) } - file := file t.Run(name, func(t *testing.T) { t.Parallel() // faster and tests for races in parallel usage - data, err := ioutil.ReadFile(file) + data, err := os.ReadFile(file) if err != nil { t.Fatal(err) } // See testdata/README for definition of test data file. // Header ends at blank line. - i := bytes.Index(data, []byte("\n\n")) - if i < 0 { + before, after, ok := bytes.Cut(data, []byte("\n\n")) + if !ok { t.Fatalf("%s: invalid test data file: no blank line terminating header", file) } - hdr, data := strings.Split(string(data[:i]), "\n"), data[i+2:] + hdr, data := strings.Split(string(before), "\n"), after lineno := 1 // Skip leading comment lines. @@ -210,19 +208,19 @@ func parsePercent(s string) (float64, error) { // parseRange parses a start,end range (two decimals separated by a comma). // As a special case, the second decimal can be $ meaning end-of-file. 
func parseRange(s string, end int) (int, int, error) { - i := strings.Index(s, ",") - if i < 0 { + before, after, ok := strings.Cut(s, ",") + if !ok { return 0, 0, fmt.Errorf("malformed range") } - lo, err := strconv.Atoi(s[:i]) + lo, err := strconv.Atoi(before) if err != nil { return 0, 0, err } var hi int - if s[i+1:] == "$" { + if after == "$" { hi = end } else { - hi, err = strconv.Atoi(s[i+1:]) + hi, err = strconv.Atoi(after) if err != nil { return 0, 0, err } @@ -260,7 +258,7 @@ func BenchmarkTestdata(b *testing.B) { if info, err := os.Stat(file); err == nil && info.IsDir() { continue } - data, err := ioutil.ReadFile(file) + data, err := os.ReadFile(file) if err != nil { b.Fatal(err) } diff --git a/old/normalize.go b/old/normalize.go index 59dc637..5f4fecd 100644 --- a/old/normalize.go +++ b/old/normalize.go @@ -130,7 +130,7 @@ const maxListMarkerLength = 4 var listMarker = func() map[string]bool { const allListMarkers = "a b c d e f g h i j k l m n o p q r ii iii iv vi vii viii ix xi xii xiii xiv xv" l := map[string]bool{} - for _, marker := range strings.Split(allListMarkers, " ") { + for marker := range strings.SplitSeq(allListMarkers, " ") { if len(marker) > maxListMarkerLength { panic("marker too long") } diff --git a/scan.go b/scan.go index 108e12a..a027f05 100644 --- a/scan.go +++ b/scan.go @@ -6,20 +6,21 @@ package licensecheck import ( "bytes" + _ "embed" "errors" "fmt" "regexp" "strings" "sync" - "github.com/google/licensecheck/internal/match" + "github.com/git-pkgs/licensecheck/internal/match" ) +//go:embed builtin.dfa +var builtinDFAData []byte + var ( - // builtinScanner is initialized lazily, - // because init is fairly expensive, - // and delaying it lets us see the init - // in test cpu profiles. + // builtinScanner is initialized lazily. builtinScanner = new(Scanner) builtinScannerOnce sync.Once ) @@ -110,7 +111,6 @@ const maxCopyrightWords = 50 // disjoint matches. 
If multiple licenses match a particular section of the input, // the earliest match is chosen so the returned coverage describes at most one // match for each section of the input. -// func Scan(text []byte) Coverage { return builtinScanner.Scan(text) } @@ -122,8 +122,19 @@ var urlScanRE = regexp.MustCompile(`^(?i)https?://[-a-z0-9_.]+\.(org|com)(/[-a-z func (s *Scanner) Scan(text []byte) Coverage { if s == builtinScanner { builtinScannerOnce.Do(func() { - if err := builtinScanner.init(BuiltinLicenses()); err != nil { - panic("licensecheck: initializing Scan: " + err.Error()) + re, err := match.UnmarshalMultiLRE(builtinDFAData) + if err != nil { + panic("licensecheck: loading builtin DFA: " + err.Error()) + } + builtinScanner.re = re + builtinScanner.urls = make(map[string]License) + for _, l := range BuiltinLicenses() { + if l.URL != "" { + builtinScanner.urls[l.URL] = l + } + if l.LRE != "" { + builtinScanner.licenses = append(builtinScanner.licenses, l) + } } }) } @@ -142,10 +153,7 @@ func (s *Scanner) Scan(text []byte) Coverage { for _, m := range matches.List { if m.Start < len(words) && lastEnd < m.Start && copyright >= 0 { - limit := m.Start - maxCopyrightWords - if limit < lastEnd { - limit = lastEnd - } + limit := max(m.Start-maxCopyrightWords, lastEnd) for i := limit; i < m.Start; i++ { if words[i].ID == copyright { m.Start = i diff --git a/type_test.go b/type_test.go index fc2e8ff..572e0dc 100644 --- a/type_test.go +++ b/type_test.go @@ -15,7 +15,7 @@ func TestTypeString(t *testing.T) { } numError := 0 - for typ := Type(0); typ < Discouraged+100; typ++ { + for typ := range Discouraged + 100 { s := typ.String() ptyp, err := ParseType(s) if err != nil {