diff --git a/.github/workflows/cron.yml b/.github/workflows/cron.yml
index f9d599a..38faa52 100644
--- a/.github/workflows/cron.yml
+++ b/.github/workflows/cron.yml
@@ -24,6 +24,18 @@ jobs:
           go-version-file: go.mod
         id: go
 
+      - name: Install oras
+        run: |
+          curl -LO https://github.com/oras-project/oras/releases/download/v1.2.0/oras_1.2.0_linux_amd64.tar.gz
+          tar -xvf ./oras_1.2.0_linux_amd64.tar.gz
+
+      - name: Pull trivy-java-db
+        run: |
+          mkdir -p ./cache/db
+          lowercase_repo=$(echo "${{ github.repository }}" | tr '[:upper:]' '[:lower:]')
+          ./oras pull "ghcr.io/${lowercase_repo}:${DB_VERSION}"
+          tar -xvf javadb.tar.gz -C ./cache/db
+
       - name: Build the binary
         run: make build
 
@@ -59,11 +71,6 @@ jobs:
           username: ${{ secrets.ECR_ACCESS_KEY_ID }}
           password: ${{ secrets.ECR_SECRET_ACCESS_KEY }}
 
-      - name: Install oras
-        run: |
-          curl -LO https://github.com/oras-project/oras/releases/download/v1.2.0/oras_1.2.0_linux_amd64.tar.gz
-          tar -xvf ./oras_1.2.0_linux_amd64.tar.gz
-
       - name: Upload assets to registries
         run: |
           lowercase_repo=$(echo "${{ github.repository }}" | tr '[:upper:]' '[:lower:]')
diff --git a/cmd/trivy-java-db/main.go b/cmd/trivy-java-db/main.go
index 889c3ed..3c0b06a 100644
--- a/cmd/trivy-java-db/main.go
+++ b/cmd/trivy-java-db/main.go
@@ -66,10 +66,25 @@ func init() {
 }
 
 func crawl(ctx context.Context) error {
-	c := crawler.NewCrawler(crawler.Option{
+	opt := crawler.Option{
 		Limit:    int64(limit),
 		CacheDir: cacheDir,
-	})
+	}
+
+	dbDir := db.Dir(cacheDir)
+	if db.Exists(dbDir) {
+		t, err := db.GetMetadataUpdatedAt(dbDir)
+		if err != nil {
+			return xerrors.Errorf("unable to get metadata UpdatedAt time: %w", err)
+		}
+		// Decrease the date by one day to offset the time of database creation
+		opt.LastUpdate = t.AddDate(0, 0, -1)
+		slog.Info("Using 'UpdatedAt' field to skip already added artifacts",
+			slog.String("date", opt.LastUpdate.Format("2006-01-02")))
+	}
+
+	c := crawler.NewCrawler(opt)
+
 	if err := c.Crawl(ctx); err != nil {
 		return xerrors.Errorf("crawl error: %w", err)
 	}
@@ -77,18 +92,18 @@ func crawl(ctx context.Context) error {
 }
 
 func build() error {
-	if err := db.Reset(cacheDir); err != nil {
-		return xerrors.Errorf("db reset error: %w", err)
-	}
-	dbDir := filepath.Join(cacheDir, "db")
+	dbDir := db.Dir(cacheDir)
 	slog.Info("Database", slog.String("path", dbDir))
 	dbc, err := db.New(dbDir)
 	if err != nil {
 		return xerrors.Errorf("db create error: %w", err)
 	}
-	if err = dbc.Init(); err != nil {
-		return xerrors.Errorf("db init error: %w", err)
+	if !db.Exists(dbDir) {
+		if err = dbc.Init(); err != nil {
+			return xerrors.Errorf("db init error: %w", err)
+		}
 	}
+
 	meta := db.NewMetadata(dbDir)
 	b := builder.NewBuilder(dbc, meta)
 	if err = b.Build(cacheDir); err != nil {
diff --git a/pkg/crawler/crawler.go b/pkg/crawler/crawler.go
index 65f2a4c..c9f41f0 100644
--- a/pkg/crawler/crawler.go
+++ b/pkg/crawler/crawler.go
@@ -28,8 +28,9 @@ import (
 const mavenRepoURL = "https://repo.maven.apache.org/maven2/"
 
 type Crawler struct {
-	dir  string
-	http *retryablehttp.Client
+	dir        string
+	http       *retryablehttp.Client
+	lastUpdate time.Time
 
 	rootUrl string
 	wg      sync.WaitGroup
@@ -39,9 +40,10 @@ type Crawler struct {
 }
 
 type Option struct {
-	Limit    int64
-	RootUrl  string
-	CacheDir string
+	Limit      int64
+	RootUrl    string
+	CacheDir   string
+	LastUpdate time.Time
 }
 
 func NewCrawler(opt Option) Crawler {
@@ -78,8 +80,9 @@ func NewCrawler(opt Option) Crawler {
 	slog.Info("Index dir", slog.String("path", indexDir))
 
 	return Crawler{
-		dir:  indexDir,
-		http: client,
+		dir:        indexDir,
+		http:       client,
+		lastUpdate: opt.LastUpdate,
 
 		rootUrl: opt.RootUrl,
 		urlCh:   make(chan string, opt.Limit*10),
@@ -187,7 +190,10 @@ func (c *Crawler) Visit(ctx context.Context, url string) error {
 			// only `../` and dirs have `/` suffix. We don't need to check other files.
 			return
 		}
-		children = append(children, link)
+		if !c.skipChildLink(selection) {
+			children = append(children, link)
+		}
+
 	})
 
 	if foundMetadata {
@@ -221,6 +227,41 @@ func (c *Crawler) Visit(ctx context.Context, url string) error {
 	return nil
 }
 
+// skipChildLink reports whether the child link held by selection can be
+// skipped because it was last modified before c.lastUpdate.
+//
+// To avoid a large number of requests to the server, already saved artifacts
+// are skipped when a start date is specified. There is no need to check them
+// for updates, since artifacts are immutable, see
+// https://central.sonatype.org/publish/requirements/immutability
+//
+// The date is taken from the first whitespace-separated field of the node that
+// follows the selected element. Links with a missing ("-") or unparsable date,
+// or with no following node at all, are never skipped.
+func (c *Crawler) skipChildLink(selection *goquery.Selection) bool {
+	if c.lastUpdate.IsZero() {
+		return false
+	}
+
+	// Guard against an empty selection and a link that is the last node of its
+	// parent; dereferencing NextSibling would otherwise panic.
+	if len(selection.Nodes) == 0 || selection.Nodes[0].NextSibling == nil {
+		return false
+	}
+
+	fields := strings.Fields(selection.Nodes[0].NextSibling.Data)
+	if len(fields) == 0 || fields[0] == "-" {
+		return false
+	}
+	linkTime, err := time.Parse("2006-01-02", fields[0])
+	if err != nil {
+		slog.Warn("Unable to parse link time", slog.String("time", fields[0]))
+		return false
+	}
+
+	return linkTime.Before(c.lastUpdate)
+}
+
 func (c *Crawler) crawlSHA1(ctx context.Context, baseURL string, meta *Metadata, dirs []string) error {
 	var foundVersions []Version
 	// Check each version dir to find links to `*.jar.sha1` files.
@@ -309,7 +350,11 @@ func (c *Crawler) sha1Urls(ctx context.Context, url string) ([]string, error) {
 		// Don't include sources, test, javadocs, scaladoc files
 		if strings.HasSuffix(link, ".jar.sha1") && !strings.HasSuffix(link, "sources.jar.sha1") &&
 			!strings.HasSuffix(link, "test.jar.sha1") && !strings.HasSuffix(link, "tests.jar.sha1") &&
-			!strings.HasSuffix(link, "javadoc.jar.sha1") && !strings.HasSuffix(link, "scaladoc.jar.sha1") {
+			!strings.HasSuffix(link, "javadoc.jar.sha1") && !strings.HasSuffix(link, "scaladoc.jar.sha1") &&
+			// There are cases when version dir doesn't have date
+			// So we should check date of sha1 file
+			// e.g. https://repo.maven.apache.org/maven2/ant-contrib/cpptasks/1.0b3/cpptasks-1.0b3.jar.sha1
+			!c.skipChildLink(selection) {
 			sha1URLs = append(sha1URLs, url+link)
 		}
 	})
diff --git a/pkg/crawler/crawler_test.go b/pkg/crawler/crawler_test.go
index 69a004c..bd55ed7 100644
--- a/pkg/crawler/crawler_test.go
+++ b/pkg/crawler/crawler_test.go
@@ -7,6 +7,7 @@ import (
 	"os"
 	"path/filepath"
 	"testing"
+	"time"
 
 	"github.com/stretchr/testify/assert"
 
@@ -17,6 +18,7 @@ func TestCrawl(t *testing.T) {
 	tests := []struct {
 		name       string
 		limit      int64
+		lastUpdate time.Time
 		fileNames  map[string]string
 		goldenPath string
 		filePath   string
@@ -42,6 +44,22 @@ func TestCrawl(t *testing.T) {
 			goldenPath: "testdata/happy/abbot.json.golden",
 			filePath:   "indexes/abbot/abbot.json",
 		},
+		{
+			name:       "happy path with lastUpdate",
+			limit:      1,
+			lastUpdate: time.Date(2010, 01, 01, 01, 01, 01, 0, time.UTC),
+			fileNames: map[string]string{
+				"/maven2/":                                            "testdata/happy/index.html",
+				"/maven2/abbot/":                                      "testdata/happy/abbot.html",
+				"/maven2/abbot/abbot/":                                "testdata/happy/abbot_abbot.html",
+				"/maven2/abbot/abbot/maven-metadata.xml":              "testdata/happy/maven-metadata.xml",
+				"/maven2/abbot/abbot/1.4.0/":                          "testdata/happy/abbot_abbot_1.4.0.html",
+				"/maven2/abbot/abbot/1.4.0/abbot-1.4.0.jar.sha1":      "testdata/happy/abbot-1.4.0.jar.sha1",
+				"/maven2/abbot/abbot/1.4.0/abbot-1.4.0-lite.jar.sha1": "testdata/happy/abbot-1.4.0-lite.jar.sha1",
+			},
+			goldenPath: "testdata/happy/abbot-1.4.0.json.golden",
+			filePath:   "indexes/abbot/abbot.json",
+		},
 		{
 			name:  "sad path",
 			limit: 2,
@@ -77,9 +95,10 @@ func TestCrawl(t *testing.T) {
 
 			tmpDir := t.TempDir()
 			cl := crawler.NewCrawler(crawler.Option{
-				RootUrl:  ts.URL + "/maven2/",
-				Limit:    tt.limit,
-				CacheDir: tmpDir,
+				RootUrl:    ts.URL + "/maven2/",
+				Limit:      tt.limit,
+				CacheDir:   tmpDir,
+				LastUpdate: tt.lastUpdate,
 			})
 
 			err := cl.Crawl(context.Background())
diff --git a/pkg/crawler/testdata/happy/abbot-1.4.0.json.golden b/pkg/crawler/testdata/happy/abbot-1.4.0.json.golden
new file mode 100644
index 0000000..30e0a85
--- /dev/null
+++ b/pkg/crawler/testdata/happy/abbot-1.4.0.json.golden
@@ -0,0 +1,15 @@
+{
+  "GroupID": "abbot",
+  "ArtifactID": "abbot",
+  "Versions": [
+    {
+      "Version": "1.4.0-lite",
+      "SHA1": "BUerA3Bor6ICaSW9lL+5/Pzsl2E="
+    },
+    {
+      "Version": "1.4.0",
+      "SHA1": "ojY2RqndBZVWM7RQAQtZohr4pCM="
+    }
+  ],
+  "ArchiveType": "jar"
+}
\ No newline at end of file
diff --git a/pkg/db/db.go b/pkg/db/db.go
index ea40ce2..4e3d92d 100644
--- a/pkg/db/db.go
+++ b/pkg/db/db.go
@@ -27,8 +27,23 @@ func path(cacheDir string) string {
 	return filepath.Join(cacheDir, dbFileName)
 }
 
-func Reset(cacheDir string) error {
-	return os.RemoveAll(path(cacheDir))
+// Dir returns the path of the database directory inside cacheDir.
+func Dir(cacheDir string) string {
+	return filepath.Join(cacheDir, "db")
+}
+
+// Exists reports whether both the database file and the metadata file are
+// present in cacheDir, which is expected to be the database directory.
+// A file counts as missing only when os.Stat reports that it does not exist;
+// any other Stat error is treated as the file being present.
+func Exists(cacheDir string) bool {
+	if _, err := os.Stat(path(cacheDir)); os.IsNotExist(err) {
+		return false
+	}
+	if _, err := os.Stat(metadataPath(cacheDir)); os.IsNotExist(err) {
+		return false
+	}
+	return true
 }
 
 func New(cacheDir string) (DB, error) {
diff --git a/pkg/db/metadata.go b/pkg/db/metadata.go
index 6ef262c..df76d74 100644
--- a/pkg/db/metadata.go
+++ b/pkg/db/metadata.go
@@ -15,6 +15,11 @@ type Client struct {
 	path string
 }
 
+// metadataPath returns the path of the metadata file inside cacheDir.
+func metadataPath(cacheDir string) string {
+	return filepath.Join(cacheDir, metadataFile)
+}
+
 type Metadata struct {
 	Version      int `json:",omitempty"`
 	NextUpdate   time.Time
@@ -24,7 +29,7 @@ type Metadata struct {
 
 func NewMetadata(cacheDir string) Client {
 	return Client{
-		path: filepath.Join(cacheDir, metadataFile),
+		path: metadataPath(cacheDir),
 	}
 }
 
@@ -67,3 +72,14 @@ func (c *Client) Delete() error {
 	}
 	return nil
 }
+
+// GetMetadataUpdatedAt returns the UpdatedAt field of the metadata obtained
+// through Client.Get for cacheDir.
+func GetMetadataUpdatedAt(cacheDir string) (time.Time, error) {
+	c := NewMetadata(cacheDir)
+	metadata, err := c.Get()
+	if err != nil {
+		return time.Time{}, xerrors.Errorf("unable to get metadata: %w", err)
+	}
+	return metadata.UpdatedAt, nil
+}