Skip to content

Commit

Permalink
fix(crawler): Visit function should write error in errCh once (#36)
Browse files Browse the repository at this point in the history
  • Loading branch information
DmitriyLewen authored Sep 25, 2024
1 parent 349526b commit 25b1b4b
Showing 1 changed file with 11 additions and 1 deletion.
12 changes: 11 additions & 1 deletion pkg/crawler/crawler.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import (
"path/filepath"
"strings"
"sync"
"time"

"github.com/PuerkitoBio/goquery"
"github.com/hashicorp/go-retryablehttp"
Expand All @@ -32,6 +33,7 @@ type Crawler struct {
rootUrl string
wg sync.WaitGroup
urlCh chan string
errOnce sync.Once
limit *semaphore.Weighted
wrongSHA1Values []string
}
Expand All @@ -46,6 +48,7 @@ func NewCrawler(opt Option) Crawler {
client := retryablehttp.NewClient()
client.RetryMax = 10
client.Logger = nil
client.RetryWaitMin = 10 * time.Second

if opt.RootUrl == "" {
opt.RootUrl = mavenRepoURL
Expand All @@ -61,6 +64,7 @@ func NewCrawler(opt Option) Crawler {
rootUrl: opt.RootUrl,
urlCh: make(chan string, opt.Limit*10),
limit: semaphore.NewWeighted(opt.Limit),
errOnce: sync.Once{},
}
}

Expand Down Expand Up @@ -101,7 +105,13 @@ func (c *Crawler) Crawl(ctx context.Context) error {
defer c.limit.Release(1)
defer c.wg.Done()
if err := c.Visit(ctx, url); err != nil {
errCh <- xerrors.Errorf("visit error: %w", err)
// Two goroutines may report an error at nearly the same time.
// `errCh` is closed after the first error has been read,
// so sending the second error would panic on the closed channel.
// That is why the error is sent to `errCh` only once.
c.errOnce.Do(func() {
errCh <- xerrors.Errorf("visit error: %w", err)
})
}
}(url)
}
Expand Down

0 comments on commit 25b1b4b

Please sign in to comment.