Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor: add only new artifacts #48

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 12 additions & 5 deletions .github/workflows/cron.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,18 @@ jobs:
go-version-file: go.mod
id: go

- name: Install oras
run: |
curl -LO https://github.com/oras-project/oras/releases/download/v1.2.0/oras_1.2.0_linux_amd64.tar.gz
tar -xvf ./oras_1.2.0_linux_amd64.tar.gz

- name: Pull trivy-java-db
run: |
mkdir -p ./cache/db
lowercase_repo=$(echo "${{ github.repository }}" | tr '[:upper:]' '[:lower:]')
./oras pull "ghcr.io/${lowercase_repo}:${DB_VERSION}"
tar -xvf javadb.tar.gz -C ./cache/db

- name: Build the binary
run: make build

Expand Down Expand Up @@ -59,11 +71,6 @@ jobs:
username: ${{ secrets.ECR_ACCESS_KEY_ID }}
password: ${{ secrets.ECR_SECRET_ACCESS_KEY }}

- name: Install oras
run: |
curl -LO https://github.com/oras-project/oras/releases/download/v1.2.0/oras_1.2.0_linux_amd64.tar.gz
tar -xvf ./oras_1.2.0_linux_amd64.tar.gz

- name: Upload assets to registries
run: |
lowercase_repo=$(echo "${{ github.repository }}" | tr '[:upper:]' '[:lower:]')
Expand Down
31 changes: 23 additions & 8 deletions cmd/trivy-java-db/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -66,29 +66,44 @@ func init() {
}

func crawl(ctx context.Context) error {
c := crawler.NewCrawler(crawler.Option{
opt := crawler.Option{
Limit: int64(limit),
CacheDir: cacheDir,
})
}

dbDir := db.Dir(cacheDir)
if db.Exists(dbDir) {
t, err := db.GetMetadataUpdatedAt(dbDir)
if err != nil {
return xerrors.Errorf("unable to get metadata UpdatedAt time: %w", err)
}
// Decrease the date by one day to offset the time of database creation
opt.LastUpdate = t.AddDate(0, 0, -1)
slog.Info("Using 'UpdatedAt' field to skip already added artifacts",
slog.String("date", fmt.Sprintf("%d-%d-%d", opt.LastUpdate.Year(), opt.LastUpdate.Month(), opt.LastUpdate.Day())))
}

c := crawler.NewCrawler(opt)

if err := c.Crawl(ctx); err != nil {
return xerrors.Errorf("crawl error: %w", err)
}
return nil
}

func build() error {
if err := db.Reset(cacheDir); err != nil {
return xerrors.Errorf("db reset error: %w", err)
}
dbDir := filepath.Join(cacheDir, "db")
dbDir := db.Dir(cacheDir)
slog.Info("Database", slog.String("path", dbDir))
dbc, err := db.New(dbDir)
if err != nil {
return xerrors.Errorf("db create error: %w", err)
}
if err = dbc.Init(); err != nil {
return xerrors.Errorf("db init error: %w", err)
if !db.Exists(dbDir) {
if err = dbc.Init(); err != nil {
return xerrors.Errorf("db init error: %w", err)
}
}

meta := db.NewMetadata(dbDir)
b := builder.NewBuilder(dbc, meta)
if err = b.Build(cacheDir); err != nil {
Expand Down
49 changes: 40 additions & 9 deletions pkg/crawler/crawler.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,9 @@ import (
const mavenRepoURL = "https://repo.maven.apache.org/maven2/"

type Crawler struct {
dir string
http *retryablehttp.Client
dir string
http *retryablehttp.Client
lastUpdate time.Time

rootUrl string
wg sync.WaitGroup
Expand All @@ -39,9 +40,10 @@ type Crawler struct {
}

type Option struct {
Limit int64
RootUrl string
CacheDir string
Limit int64
RootUrl string
CacheDir string
LastUpdate time.Time
}

func NewCrawler(opt Option) Crawler {
Expand Down Expand Up @@ -78,8 +80,9 @@ func NewCrawler(opt Option) Crawler {
slog.Info("Index dir", slog.String("path", indexDir))

return Crawler{
dir: indexDir,
http: client,
dir: indexDir,
http: client,
lastUpdate: opt.LastUpdate,

rootUrl: opt.RootUrl,
urlCh: make(chan string, opt.Limit*10),
Expand Down Expand Up @@ -187,7 +190,10 @@ func (c *Crawler) Visit(ctx context.Context, url string) error {
// only `../` and dirs have `/` suffix. We don't need to check other files.
return
}
children = append(children, link)
if !c.skipChildLink(selection) {
children = append(children, link)
}

})

if foundMetadata {
Expand Down Expand Up @@ -221,6 +227,27 @@ func (c *Crawler) Visit(ctx context.Context, url string) error {
return nil
}

// To avoid a large number of requests to the server, we should skip already saved artifacts (if the start date is specified).
// P.S. We do not need to check for updates, since artifacts are immutable
// see https://central.sonatype.org/publish/requirements/immutability
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The document seems relevant to the sonatype repository. Is this repository also immutable?
https://mvnrepository.com/repos/central

Copy link
Collaborator Author

@DmitriyLewen DmitriyLewen Dec 25, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I didn't find official info about that, but I think the rules should be the same.
But i saw answer about that - https://stackoverflow.com/questions/40739939/dropping-a-release-from-public-maven-central

Additional indirect evidence is this answer (they had the same SHA1 for several artifacts):
instead of changing the file, they release a new version

// skipChildLink reports whether the directory-listing entry behind selection
// was published strictly before the configured lastUpdate cutoff and can
// therefore be skipped. Maven Central artifacts are immutable (see
// https://central.sonatype.org/publish/requirements/immutability), so an
// entry older than the previous database build cannot have changed.
// It returns false (i.e. do not skip) whenever no cutoff is set or the
// publication date cannot be determined.
func (c *Crawler) skipChildLink(selection *goquery.Selection) bool {
	// No cutoff configured: crawl everything.
	if c.lastUpdate.IsZero() {
		return false
	}

	// The publication date is in the text node following the <a> element of
	// the listing row (e.g. "2024-12-25 10:00  123"). Guard against an empty
	// selection (Get would panic) and a missing sibling text node — in either
	// case we conservatively do not skip, rather than risk losing artifacts.
	if selection.Length() == 0 {
		return false
	}
	node := selection.Get(0)
	if node.NextSibling == nil {
		return false
	}

	fields := strings.Fields(node.NextSibling.Data)
	// Directory entries such as `../` carry no date (empty or "-" placeholder).
	if len(fields) == 0 || fields[0] == "-" {
		return false
	}
	linkTime, err := time.Parse("2006-01-02", fields[0])
	if err != nil {
		slog.Warn("Unable to parse link time", slog.String("time", fields[0]))
		return false
	}

	return linkTime.Before(c.lastUpdate)
}

func (c *Crawler) crawlSHA1(ctx context.Context, baseURL string, meta *Metadata, dirs []string) error {
var foundVersions []Version
// Check each version dir to find links to `*.jar.sha1` files.
Expand Down Expand Up @@ -309,7 +336,11 @@ func (c *Crawler) sha1Urls(ctx context.Context, url string) ([]string, error) {
// Don't include sources, test, javadocs, scaladoc files
if strings.HasSuffix(link, ".jar.sha1") && !strings.HasSuffix(link, "sources.jar.sha1") &&
!strings.HasSuffix(link, "test.jar.sha1") && !strings.HasSuffix(link, "tests.jar.sha1") &&
!strings.HasSuffix(link, "javadoc.jar.sha1") && !strings.HasSuffix(link, "scaladoc.jar.sha1") {
!strings.HasSuffix(link, "javadoc.jar.sha1") && !strings.HasSuffix(link, "scaladoc.jar.sha1") &&
// There are cases when version dir doesn't have date
// So we should check date of sha1 file
// e.g. https://repo.maven.apache.org/maven2/ant-contrib/cpptasks/1.0b3/cpptasks-1.0b3.jar.sha1
!c.skipChildLink(selection) {
sha1URLs = append(sha1URLs, url+link)
}
})
Expand Down
25 changes: 22 additions & 3 deletions pkg/crawler/crawler_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"os"
"path/filepath"
"testing"
"time"

"github.com/stretchr/testify/assert"

Expand All @@ -17,6 +18,7 @@ func TestCrawl(t *testing.T) {
tests := []struct {
name string
limit int64
lastUpdate time.Time
fileNames map[string]string
goldenPath string
filePath string
Expand All @@ -42,6 +44,22 @@ func TestCrawl(t *testing.T) {
goldenPath: "testdata/happy/abbot.json.golden",
filePath: "indexes/abbot/abbot.json",
},
{
name: "happy path with lastUpdate",
limit: 1,
lastUpdate: time.Date(2010, 01, 01, 01, 01, 01, 0, time.UTC),
fileNames: map[string]string{
"/maven2/": "testdata/happy/index.html",
"/maven2/abbot/": "testdata/happy/abbot.html",
"/maven2/abbot/abbot/": "testdata/happy/abbot_abbot.html",
"/maven2/abbot/abbot/maven-metadata.xml": "testdata/happy/maven-metadata.xml",
"/maven2/abbot/abbot/1.4.0/": "testdata/happy/abbot_abbot_1.4.0.html",
"/maven2/abbot/abbot/1.4.0/abbot-1.4.0.jar.sha1": "testdata/happy/abbot-1.4.0.jar.sha1",
"/maven2/abbot/abbot/1.4.0/abbot-1.4.0-lite.jar.sha1": "testdata/happy/abbot-1.4.0-lite.jar.sha1",
},
goldenPath: "testdata/happy/abbot-1.4.0.json.golden",
filePath: "indexes/abbot/abbot.json",
},
{
name: "sad path",
limit: 2,
Expand Down Expand Up @@ -77,9 +95,10 @@ func TestCrawl(t *testing.T) {

tmpDir := t.TempDir()
cl := crawler.NewCrawler(crawler.Option{
RootUrl: ts.URL + "/maven2/",
Limit: tt.limit,
CacheDir: tmpDir,
RootUrl: ts.URL + "/maven2/",
Limit: tt.limit,
CacheDir: tmpDir,
LastUpdate: tt.lastUpdate,
})

err := cl.Crawl(context.Background())
Expand Down
15 changes: 15 additions & 0 deletions pkg/crawler/testdata/happy/abbot-1.4.0.json.golden
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
{
"GroupID": "abbot",
"ArtifactID": "abbot",
"Versions": [
{
"Version": "1.4.0-lite",
"SHA1": "BUerA3Bor6ICaSW9lL+5/Pzsl2E="
},
{
"Version": "1.4.0",
"SHA1": "ojY2RqndBZVWM7RQAQtZohr4pCM="
}
],
"ArchiveType": "jar"
}
14 changes: 12 additions & 2 deletions pkg/db/db.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,18 @@ func path(cacheDir string) string {
return filepath.Join(cacheDir, dbFileName)
}

func Reset(cacheDir string) error {
return os.RemoveAll(path(cacheDir))
// Dir returns the directory that holds the database file, located in the
// "db" subdirectory of cacheDir.
func Dir(cacheDir string) string {
	return filepath.Join(cacheDir, "db")
}

// Exists reports whether a previously built database is present under the
// given directory: both the database file and its metadata file must exist.
func Exists(cacheDir string) bool {
	// Check the database file first, then the metadata file, exactly as the
	// two files are required together for a usable database.
	for _, p := range []string{path(cacheDir), metadataPath(cacheDir)} {
		if _, err := os.Stat(p); os.IsNotExist(err) {
			return false
		}
	}
	return true
}

func New(cacheDir string) (DB, error) {
Expand Down
15 changes: 14 additions & 1 deletion pkg/db/metadata.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,10 @@ type Client struct {
path string
}

// metadataPath returns the location of the metadata file inside cacheDir.
func metadataPath(cacheDir string) string {
	return filepath.Join(cacheDir, metadataFile)
}

type Metadata struct {
Version int `json:",omitempty"`
NextUpdate time.Time
Expand All @@ -24,7 +28,7 @@ type Metadata struct {

func NewMetadata(cacheDir string) Client {
return Client{
path: filepath.Join(cacheDir, metadataFile),
path: metadataPath(cacheDir),
}
}

Expand Down Expand Up @@ -67,3 +71,12 @@ func (c *Client) Delete() error {
}
return nil
}

// GetMetadataUpdatedAt reads the metadata stored under cacheDir and returns
// its UpdatedAt timestamp. It returns the zero time together with a wrapped
// error when the metadata cannot be read.
func GetMetadataUpdatedAt(cacheDir string) (time.Time, error) {
	meta, err := NewMetadata(cacheDir).Get()
	if err != nil {
		return time.Time{}, xerrors.Errorf("unable to get metadata: %w", err)
	}
	return meta.UpdatedAt, nil
}
Loading