From 0e69cc911977664e4bc01eb1370a2810d923e1a6 Mon Sep 17 00:00:00 2001 From: DmitriyLewen Date: Mon, 23 Dec 2024 13:30:06 +0600 Subject: [PATCH 1/9] refactor(crawler): add lastUpdate field and check link date --- cmd/trivy-java-db/main.go | 22 +++++++++++++++++---- pkg/crawler/crawler.go | 40 +++++++++++++++++++++++++++++++-------- 2 files changed, 50 insertions(+), 12 deletions(-) diff --git a/cmd/trivy-java-db/main.go b/cmd/trivy-java-db/main.go index 889c3ed..8c47a46 100644 --- a/cmd/trivy-java-db/main.go +++ b/cmd/trivy-java-db/main.go @@ -6,6 +6,7 @@ import ( "log/slog" "os" "path/filepath" + "time" "github.com/spf13/cobra" "golang.org/x/xerrors" @@ -26,8 +27,9 @@ func main() { var ( // Used for flags. - cacheDir string - limit int + cacheDir string + limit int + lastUpdate string rootCmd = &cobra.Command{ Use: "trivy-java-db", @@ -58,6 +60,7 @@ func init() { rootCmd.PersistentFlags().StringVar(&cacheDir, "cache-dir", filepath.Join(userCacheDir, "trivy-java-db"), "cache dir") rootCmd.PersistentFlags().IntVar(&limit, "limit", 300, "max parallelism") + rootCmd.PersistentFlags().StringVar(&lastUpdate, "last-update", "", "last update date in `YYYY-MM-DD` format") rootCmd.AddCommand(crawlCmd) rootCmd.AddCommand(buildCmd) @@ -66,10 +69,21 @@ func init() { } func crawl(ctx context.Context) error { - c := crawler.NewCrawler(crawler.Option{ + opt := crawler.Option{ Limit: int64(limit), CacheDir: cacheDir, - }) + } + + if lastUpdate != "" { + t, err := time.Parse("2006-01-02", lastUpdate) + if err != nil { + return xerrors.Errorf("incorrect last update date format: %w", err) + } + opt.LastUpdate = t + } + + c := crawler.NewCrawler(opt) + if err := c.Crawl(ctx); err != nil { return xerrors.Errorf("crawl error: %w", err) } diff --git a/pkg/crawler/crawler.go b/pkg/crawler/crawler.go index 65f2a4c..192e48e 100644 --- a/pkg/crawler/crawler.go +++ b/pkg/crawler/crawler.go @@ -28,8 +28,9 @@ import ( const mavenRepoURL = "https://repo.maven.apache.org/maven2/" type Crawler 
struct { - dir string - http *retryablehttp.Client + dir string + http *retryablehttp.Client + lastUpdate time.Time rootUrl string wg sync.WaitGroup @@ -39,9 +40,10 @@ type Crawler struct { } type Option struct { - Limit int64 - RootUrl string - CacheDir string + Limit int64 + RootUrl string + CacheDir string + LastUpdate time.Time } func NewCrawler(opt Option) Crawler { @@ -78,8 +80,9 @@ func NewCrawler(opt Option) Crawler { slog.Info("Index dir", slog.String("path", indexDir)) return Crawler{ - dir: indexDir, - http: client, + dir: indexDir, + http: client, + lastUpdate: opt.LastUpdate, rootUrl: opt.RootUrl, urlCh: make(chan string, opt.Limit*10), @@ -187,7 +190,10 @@ func (c *Crawler) Visit(ctx context.Context, url string) error { // only `../` and dirs have `/` suffix. We don't need to check other files. return } - children = append(children, link) + if !c.skipChildLink(selection) { + children = append(children, link) + } + }) if foundMetadata { @@ -221,6 +227,24 @@ func (c *Crawler) Visit(ctx context.Context, url string) error { return nil } +func (c *Crawler) skipChildLink(selection *goquery.Selection) bool { + if c.lastUpdate.IsZero() { + return false + } + + fields := strings.Fields(selection.Get(0).NextSibling.Data) + if len(fields) == 0 || fields[0] == "-" { + return false + } + linkTime, err := time.Parse("2006-01-02", fields[0]) + if err != nil { + slog.Warn("Unable to parse link time", slog.String("time", fields[0])) + return false + } + + return linkTime.Before(c.lastUpdate) +} + func (c *Crawler) crawlSHA1(ctx context.Context, baseURL string, meta *Metadata, dirs []string) error { var foundVersions []Version // Check each version dir to find links to `*.jar.sha1` files. 
From 17632150518b1d36060fca1ed05002e651db46af Mon Sep 17 00:00:00 2001 From: DmitriyLewen Date: Mon, 23 Dec 2024 13:30:14 +0600 Subject: [PATCH 2/9] test(crawler): add test case --- pkg/crawler/crawler_test.go | 25 ++++++++++++++++--- .../testdata/happy/abbot-1.4.0.json.golden | 15 +++++++++++ 2 files changed, 37 insertions(+), 3 deletions(-) create mode 100644 pkg/crawler/testdata/happy/abbot-1.4.0.json.golden diff --git a/pkg/crawler/crawler_test.go b/pkg/crawler/crawler_test.go index 69a004c..bd55ed7 100644 --- a/pkg/crawler/crawler_test.go +++ b/pkg/crawler/crawler_test.go @@ -7,6 +7,7 @@ import ( "os" "path/filepath" "testing" + "time" "github.com/stretchr/testify/assert" @@ -17,6 +18,7 @@ func TestCrawl(t *testing.T) { tests := []struct { name string limit int64 + lastUpdate time.Time fileNames map[string]string goldenPath string filePath string @@ -42,6 +44,22 @@ func TestCrawl(t *testing.T) { goldenPath: "testdata/happy/abbot.json.golden", filePath: "indexes/abbot/abbot.json", }, + { + name: "happy path with lastUpdate", + limit: 1, + lastUpdate: time.Date(2010, 01, 01, 01, 01, 01, 0, time.UTC), + fileNames: map[string]string{ + "/maven2/": "testdata/happy/index.html", + "/maven2/abbot/": "testdata/happy/abbot.html", + "/maven2/abbot/abbot/": "testdata/happy/abbot_abbot.html", + "/maven2/abbot/abbot/maven-metadata.xml": "testdata/happy/maven-metadata.xml", + "/maven2/abbot/abbot/1.4.0/": "testdata/happy/abbot_abbot_1.4.0.html", + "/maven2/abbot/abbot/1.4.0/abbot-1.4.0.jar.sha1": "testdata/happy/abbot-1.4.0.jar.sha1", + "/maven2/abbot/abbot/1.4.0/abbot-1.4.0-lite.jar.sha1": "testdata/happy/abbot-1.4.0-lite.jar.sha1", + }, + goldenPath: "testdata/happy/abbot-1.4.0.json.golden", + filePath: "indexes/abbot/abbot.json", + }, { name: "sad path", limit: 2, @@ -77,9 +95,10 @@ func TestCrawl(t *testing.T) { tmpDir := t.TempDir() cl := crawler.NewCrawler(crawler.Option{ - RootUrl: ts.URL + "/maven2/", - Limit: tt.limit, - CacheDir: tmpDir, + RootUrl: ts.URL + 
"/maven2/", + Limit: tt.limit, + CacheDir: tmpDir, + LastUpdate: tt.lastUpdate, }) err := cl.Crawl(context.Background()) diff --git a/pkg/crawler/testdata/happy/abbot-1.4.0.json.golden b/pkg/crawler/testdata/happy/abbot-1.4.0.json.golden new file mode 100644 index 0000000..30e0a85 --- /dev/null +++ b/pkg/crawler/testdata/happy/abbot-1.4.0.json.golden @@ -0,0 +1,15 @@ +{ + "GroupID": "abbot", + "ArtifactID": "abbot", + "Versions": [ + { + "Version": "1.4.0-lite", + "SHA1": "BUerA3Bor6ICaSW9lL+5/Pzsl2E=" + }, + { + "Version": "1.4.0", + "SHA1": "ojY2RqndBZVWM7RQAQtZohr4pCM=" + } + ], + "ArchiveType": "jar" +} \ No newline at end of file From dab7ba58d984039d258ddc3bbe338c8d5ebb3d7d Mon Sep 17 00:00:00 2001 From: DmitriyLewen Date: Mon, 23 Dec 2024 13:35:30 +0600 Subject: [PATCH 3/9] chore(crawler): add comment --- pkg/crawler/crawler.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pkg/crawler/crawler.go b/pkg/crawler/crawler.go index 192e48e..4401495 100644 --- a/pkg/crawler/crawler.go +++ b/pkg/crawler/crawler.go @@ -227,6 +227,9 @@ func (c *Crawler) Visit(ctx context.Context, url string) error { return nil } +// To avoid a large number of requests to the server, we should skip already saved artifacts (if the start date is specified). +// P.S. 
We do not need to check for updates, since artifacts are immutable +// see https://central.sonatype.org/publish/requirements/immutability func (c *Crawler) skipChildLink(selection *goquery.Selection) bool { if c.lastUpdate.IsZero() { return false From 2ca82ee4174617860ddf854cb350e9c0e19999c9 Mon Sep 17 00:00:00 2001 From: DmitriyLewen Date: Mon, 23 Dec 2024 14:22:21 +0600 Subject: [PATCH 4/9] refactor: use lastUpdate from metadata file --- cmd/trivy-java-db/main.go | 26 ++++++++++++-------------- pkg/db/db.go | 10 ++++++++-- pkg/db/metadata.go | 15 ++++++++++++++- 3 files changed, 34 insertions(+), 17 deletions(-) diff --git a/cmd/trivy-java-db/main.go b/cmd/trivy-java-db/main.go index 8c47a46..cf59334 100644 --- a/cmd/trivy-java-db/main.go +++ b/cmd/trivy-java-db/main.go @@ -6,7 +6,6 @@ import ( "log/slog" "os" "path/filepath" - "time" "github.com/spf13/cobra" "golang.org/x/xerrors" @@ -27,9 +26,8 @@ func main() { var ( // Used for flags. - cacheDir string - limit int - lastUpdate string + cacheDir string + limit int rootCmd = &cobra.Command{ Use: "trivy-java-db", @@ -60,7 +58,6 @@ func init() { rootCmd.PersistentFlags().StringVar(&cacheDir, "cache-dir", filepath.Join(userCacheDir, "trivy-java-db"), "cache dir") rootCmd.PersistentFlags().IntVar(&limit, "limit", 300, "max parallelism") - rootCmd.PersistentFlags().StringVar(&lastUpdate, "last-update", "", "last update date in `YYYY-MM-DD` format") rootCmd.AddCommand(crawlCmd) rootCmd.AddCommand(buildCmd) @@ -74,12 +71,13 @@ func crawl(ctx context.Context) error { CacheDir: cacheDir, } - if lastUpdate != "" { - t, err := time.Parse("2006-01-02", lastUpdate) + if db.Exists(cacheDir) { + t, err := db.GetMetadataUpdatedAt(cacheDir) if err != nil { - return xerrors.Errorf("incorrect last update date format: %w", err) + return xerrors.Errorf("unable to get metadata UpdatedAt time: %w", err) } - opt.LastUpdate = t + // Decrease the date by one day to offset the time of database creation + opt.LastUpdate = t.AddDate(0, 0, 
-1) } c := crawler.NewCrawler(opt) @@ -91,18 +89,18 @@ func crawl(ctx context.Context) error { } func build() error { - if err := db.Reset(cacheDir); err != nil { - return xerrors.Errorf("db reset error: %w", err) - } dbDir := filepath.Join(cacheDir, "db") slog.Info("Database", slog.String("path", dbDir)) dbc, err := db.New(dbDir) if err != nil { return xerrors.Errorf("db create error: %w", err) } - if err = dbc.Init(); err != nil { - return xerrors.Errorf("db init error: %w", err) + if !db.Exists(dbDir) { + if err = dbc.Init(); err != nil { + return xerrors.Errorf("db init error: %w", err) + } } + meta := db.NewMetadata(dbDir) b := builder.NewBuilder(dbc, meta) if err = b.Build(cacheDir); err != nil { diff --git a/pkg/db/db.go b/pkg/db/db.go index ea40ce2..f6ba4db 100644 --- a/pkg/db/db.go +++ b/pkg/db/db.go @@ -27,8 +27,14 @@ func path(cacheDir string) string { return filepath.Join(cacheDir, dbFileName) } -func Reset(cacheDir string) error { - return os.RemoveAll(path(cacheDir)) +func Exists(cacheDir string) bool { + if _, err := os.Stat(path(cacheDir)); os.IsNotExist(err) { + return false + } + if _, err := os.Stat(metadataPath(cacheDir)); os.IsNotExist(err) { + return false + } + return true } func New(cacheDir string) (DB, error) { diff --git a/pkg/db/metadata.go b/pkg/db/metadata.go index 6ef262c..df76d74 100644 --- a/pkg/db/metadata.go +++ b/pkg/db/metadata.go @@ -15,6 +15,10 @@ type Client struct { path string } +func metadataPath(cacheDir string) string { + return filepath.Join(cacheDir, metadataFile) +} + type Metadata struct { Version int `json:",omitempty"` NextUpdate time.Time @@ -24,7 +28,7 @@ type Metadata struct { func NewMetadata(cacheDir string) Client { return Client{ - path: filepath.Join(cacheDir, metadataFile), + path: metadataPath(cacheDir), } } @@ -67,3 +71,12 @@ func (c *Client) Delete() error { } return nil } + +func GetMetadataUpdatedAt(cacheDir string) (time.Time, error) { + c := NewMetadata(cacheDir) + metadata, err := c.Get() + if err 
!= nil { + return time.Time{}, xerrors.Errorf("unable to get metadata: %w", err) + } + return metadata.UpdatedAt, nil +} From 0a4e567d02215cc9a43e9edcd1e6b2950b01a09b Mon Sep 17 00:00:00 2001 From: DmitriyLewen Date: Mon, 23 Dec 2024 14:22:39 +0600 Subject: [PATCH 5/9] ci: pull trivy-java-db in cron workflow --- .github/workflows/cron.yml | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/.github/workflows/cron.yml b/.github/workflows/cron.yml index f9d599a..ca39048 100644 --- a/.github/workflows/cron.yml +++ b/.github/workflows/cron.yml @@ -24,6 +24,17 @@ jobs: go-version-file: go.mod id: go + - name: Install oras + run: | + curl -LO https://github.com/oras-project/oras/releases/download/v1.2.0/oras_1.2.0_linux_amd64.tar.gz + tar -xvf ./oras_1.2.0_linux_amd64.tar.gz + + - name: Pull trivy-java-db + run: | + mkdir -p ./cache/db + ./oras pull docker.io/aquasec/trivy-java-db:1 + tar -xvf javadb.tar.gz -C ./cache/db + - name: Build the binary run: make build @@ -59,11 +70,6 @@ jobs: username: ${{ secrets.ECR_ACCESS_KEY_ID }} password: ${{ secrets.ECR_SECRET_ACCESS_KEY }} - - name: Install oras - run: | - curl -LO https://github.com/oras-project/oras/releases/download/v1.2.0/oras_1.2.0_linux_amd64.tar.gz - tar -xvf ./oras_1.2.0_linux_amd64.tar.gz - - name: Upload assets to registries run: | lowercase_repo=$(echo "${{ github.repository }}" | tr '[:upper:]' '[:lower:]') From 934684405d2e20622afb0b9fc853983e8df9bb36 Mon Sep 17 00:00:00 2001 From: DmitriyLewen Date: Mon, 23 Dec 2024 15:09:38 +0600 Subject: [PATCH 6/9] fix: path to db --- cmd/trivy-java-db/main.go | 7 ++++--- pkg/db/db.go | 4 ++++ 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/cmd/trivy-java-db/main.go b/cmd/trivy-java-db/main.go index cf59334..08e2c3e 100644 --- a/cmd/trivy-java-db/main.go +++ b/cmd/trivy-java-db/main.go @@ -71,8 +71,9 @@ func crawl(ctx context.Context) error { CacheDir: cacheDir, } - if db.Exists(cacheDir) { - t, err := 
db.GetMetadataUpdatedAt(cacheDir) + dbDir := db.Dir(cacheDir) + if db.Exists(dbDir) { + t, err := db.GetMetadataUpdatedAt(dbDir) if err != nil { return xerrors.Errorf("unable to get metadata UpdatedAt time: %w", err) } @@ -89,7 +90,7 @@ func crawl(ctx context.Context) error { } func build() error { - dbDir := filepath.Join(cacheDir, "db") + dbDir := db.Dir(cacheDir) slog.Info("Database", slog.String("path", dbDir)) dbc, err := db.New(dbDir) if err != nil { diff --git a/pkg/db/db.go b/pkg/db/db.go index f6ba4db..4e3d92d 100644 --- a/pkg/db/db.go +++ b/pkg/db/db.go @@ -27,6 +27,10 @@ func path(cacheDir string) string { return filepath.Join(cacheDir, dbFileName) } +func Dir(cacheDir string) string { + return filepath.Join(cacheDir, "db") +} + func Exists(cacheDir string) bool { if _, err := os.Stat(path(cacheDir)); os.IsNotExist(err) { return false From 98e4e9d75dbff1b27e365fcbf6513b9b7c6f7e6f Mon Sep 17 00:00:00 2001 From: DmitriyLewen Date: Mon, 23 Dec 2024 15:28:43 +0600 Subject: [PATCH 7/9] fix: use skipChildLink for sha1 links --- pkg/crawler/crawler.go | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pkg/crawler/crawler.go b/pkg/crawler/crawler.go index 4401495..c9f41f0 100644 --- a/pkg/crawler/crawler.go +++ b/pkg/crawler/crawler.go @@ -336,7 +336,11 @@ func (c *Crawler) sha1Urls(ctx context.Context, url string) ([]string, error) { // Don't include sources, test, javadocs, scaladoc files if strings.HasSuffix(link, ".jar.sha1") && !strings.HasSuffix(link, "sources.jar.sha1") && !strings.HasSuffix(link, "test.jar.sha1") && !strings.HasSuffix(link, "tests.jar.sha1") && - !strings.HasSuffix(link, "javadoc.jar.sha1") && !strings.HasSuffix(link, "scaladoc.jar.sha1") { + !strings.HasSuffix(link, "javadoc.jar.sha1") && !strings.HasSuffix(link, "scaladoc.jar.sha1") && + // There are cases when version dir doesn't have date + // So we should check date of sha1 file + // e.g. 
https://repo.maven.apache.org/maven2/ant-contrib/cpptasks/1.0b3/cpptasks-1.0b3.jar.sha1 + !c.skipChildLink(selection) { sha1URLs = append(sha1URLs, url+link) } }) From 0ce386e0805919de726ab34d9cba0f94a9b52741 Mon Sep 17 00:00:00 2001 From: DmitriyLewen Date: Mon, 23 Dec 2024 15:54:54 +0600 Subject: [PATCH 8/9] chore: add log --- cmd/trivy-java-db/main.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cmd/trivy-java-db/main.go b/cmd/trivy-java-db/main.go index 08e2c3e..3c0b06a 100644 --- a/cmd/trivy-java-db/main.go +++ b/cmd/trivy-java-db/main.go @@ -79,6 +79,8 @@ func crawl(ctx context.Context) error { } // Decrease the date by one day to offset the time of database creation opt.LastUpdate = t.AddDate(0, 0, -1) + slog.Info("Using 'UpdatedAt' field to skip already added artifacts", + slog.String("date", fmt.Sprintf("%d-%d-%d", opt.LastUpdate.Year(), opt.LastUpdate.Month(), opt.LastUpdate.Day()))) } c := crawler.NewCrawler(opt) From 2f83ae720cca88cf7f2ace6b5b270b74860ca0ae Mon Sep 17 00:00:00 2001 From: DmitriyLewen Date: Mon, 23 Dec 2024 16:42:25 +0600 Subject: [PATCH 9/9] ci: use ghcr + lowercase_repo --- .github/workflows/cron.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/cron.yml b/.github/workflows/cron.yml index ca39048..38faa52 100644 --- a/.github/workflows/cron.yml +++ b/.github/workflows/cron.yml @@ -32,7 +32,8 @@ jobs: - name: Pull trivy-java-db run: | mkdir -p ./cache/db - ./oras pull docker.io/aquasec/trivy-java-db:1 + lowercase_repo=$(echo "${{ github.repository }}" | tr '[:upper:]' '[:lower:]') + ./oras pull "ghcr.io/${lowercase_repo}:${DB_VERSION}" tar -xvf javadb.tar.gz -C ./cache/db - name: Build the binary