From 69c1330a293f124571bb4833040160787df84dc0 Mon Sep 17 00:00:00 2001 From: knqyf263 Date: Thu, 26 Sep 2024 10:48:47 +0400 Subject: [PATCH 1/4] feat: add jitter for retries Signed-off-by: knqyf263 --- pkg/crawler/crawler.go | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/pkg/crawler/crawler.go b/pkg/crawler/crawler.go index 0e58f8e..0637b42 100644 --- a/pkg/crawler/crawler.go +++ b/pkg/crawler/crawler.go @@ -49,19 +49,21 @@ func NewCrawler(opt Option) Crawler { client.RetryMax = 10 client.Logger = slog.Default() client.RetryWaitMin = 10 * time.Second - client.Backoff = func(min, max time.Duration, attemptNum int, resp *http.Response) time.Duration { - // Maven Central returns "Retry-After: 0" for some reason, resulting in an immediate retry. - if resp.Header.Get("Retry-After") == "0" { - resp.Header.Del("Retry-After") + client.RetryWaitMax = 1 * time.Minute + client.Backoff = retryablehttp.LinearJitterBackoff + client.ResponseLogHook = func(_ retryablehttp.Logger, resp *http.Response) { + if resp.StatusCode != http.StatusOK { + slog.Warn("Unexpected http response", slog.String("url", resp.Request.URL.String()), slog.String("status", resp.Status)) } - return retryablehttp.DefaultBackoff(min, max, attemptNum, resp) } client.ErrorHandler = func(resp *http.Response, err error, numTries int) (*http.Response, error) { - if resp.StatusCode != http.StatusOK { - slog.Error("HTTP error", slog.String("url", resp.Request.URL.String()), slog.Int("num_tries", numTries), - slog.Int("status_code", resp.StatusCode)) + logger := slog.With(slog.String("url", resp.Request.URL.String()), slog.Int("status_code", resp.StatusCode), + slog.Int("num_tries", numTries)) + if err != nil { + logger = logger.With(slog.String("error", err.Error())) } - return resp, err + logger.Error("HTTP request failed after retries") + return resp, xerrors.Errorf("HTTP request failed after retries: %w", err) } if opt.RootUrl == "" { From e57eb681cffde18ad47dd5f90cd8b926961468a3 Mon Sep 17 00:00:00 2001 From: knqyf263 Date: Thu, 26 Sep 2024 11:24:53 +0400 Subject: [PATCH 2/4] fix: increase wait Signed-off-by: knqyf263 --- pkg/crawler/crawler.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/crawler/crawler.go b/pkg/crawler/crawler.go index 0637b42..a40997f 100644 --- a/pkg/crawler/crawler.go +++ b/pkg/crawler/crawler.go @@ -48,8 +48,8 @@ func NewCrawler(opt Option) Crawler { client := retryablehttp.NewClient() client.RetryMax = 10 client.Logger = slog.Default() - client.RetryWaitMin = 10 * time.Second - client.RetryWaitMax = 1 * time.Minute + client.RetryWaitMin = 1 * time.Minute + client.RetryWaitMax = 5 * time.Minute client.Backoff = retryablehttp.LinearJitterBackoff client.ResponseLogHook = func(_ retryablehttp.Logger, resp *http.Response) { if resp.StatusCode != http.StatusOK { From 43c6ef700d17f04538337aba46d355da9a31c73c Mon Sep 17 00:00:00 2001 From: knqyf263 Date: Thu, 26 Sep 2024 11:25:29 +0400 Subject: [PATCH 3/4] feat: add jitter between http requests Signed-off-by: knqyf263 --- pkg/crawler/crawler.go | 54 +++++++++++++++++++++++------------------- 1 file changed, 30 insertions(+), 24 deletions(-) diff --git a/pkg/crawler/crawler.go b/pkg/crawler/crawler.go index a40997f..cfe3fa6 100644 --- a/pkg/crawler/crawler.go +++ b/pkg/crawler/crawler.go @@ -8,6 +8,7 @@ import ( "fmt" "io" "log/slog" + "math/rand" "net/http" "path/filepath" "strings" @@ -156,13 +157,9 @@ loop: } func (c *Crawler) Visit(ctx context.Context, url string) error { - req, err := retryablehttp.NewRequestWithContext(ctx, http.MethodGet, url, nil) - if err != nil { - return xerrors.Errorf("unable to new HTTP request: %w", err) - } - resp, err := c.http.Do(req) + resp, err := c.httpGet(ctx, url) if err != nil { - return xerrors.Errorf("http get error (%s): %w", url, err) + return xerrors.Errorf("http get error: %w", err) } defer resp.Body.Close() @@ -290,13 +287,9 @@ func (c *Crawler) crawlSHA1(ctx context.Context, baseURL string, meta *Metadata, } func (c *Crawler) sha1Urls(ctx context.Context, url string) ([]string, error) { - req, err := retryablehttp.NewRequestWithContext(ctx, http.MethodGet, url, nil) - if err != nil { - return nil, xerrors.Errorf("unable to new HTTP request: %w", err) - } - resp, err := c.http.Do(req) + resp, err := c.httpGet(ctx, url) if err != nil { - return nil, xerrors.Errorf("http get error (%s): %w", url, err) + return nil, xerrors.Errorf("http get error: %w", err) } defer func() { _ = resp.Body.Close() }() @@ -328,13 +321,9 @@ func (c *Crawler) parseMetadata(ctx context.Context, url string) (*Metadata, err return nil, nil } - req, err := retryablehttp.NewRequestWithContext(ctx, http.MethodGet, url, nil) + resp, err := c.httpGet(ctx, url) if err != nil { - return nil, xerrors.Errorf("unable to new HTTP request: %w", err) - } - resp, err := c.http.Do(req) - if err != nil { - return nil, xerrors.Errorf("http get error (%s): %w", url, err) + return nil, xerrors.Errorf("http get error: %w", err) } defer resp.Body.Close() @@ -363,13 +352,9 @@ func (c *Crawler) parseMetadata(ctx context.Context, url string) (*Metadata, err } func (c *Crawler) fetchSHA1(ctx context.Context, url string) ([]byte, error) { - req, err := retryablehttp.NewRequestWithContext(ctx, http.MethodGet, url, nil) + resp, err := c.httpGet(ctx, url) if err != nil { - return nil, xerrors.Errorf("unable to new HTTP request: %w", err) - } - resp, err := c.http.Do(req) - if err != nil { - return nil, xerrors.Errorf("http get error (%s): %w", url, err) + return nil, xerrors.Errorf("http get error: %w", err) } defer func() { _ = resp.Body.Close() }() @@ -408,6 +393,27 @@ func (c *Crawler) fetchSHA1(ctx context.Context, url string) ([]byte, error) { return sha1b, nil } +func (c *Crawler) httpGet(ctx context.Context, url string) (*http.Response, error) { + // Sleep for a while to avoid 429 error + randomSleep() + + req, err := retryablehttp.NewRequestWithContext(ctx, http.MethodGet, url, nil) + if err != nil { + return nil, xerrors.Errorf("unable to create a HTTP request: %w", err) + } + resp, err := c.http.Do(req) + if err != nil { + return nil, xerrors.Errorf("http error (%s): %w", url, err) + } + return resp, nil +} + +func randomSleep() { + // Seed rand + r := rand.New(rand.NewSource(int64(time.Now().Nanosecond()))) + time.Sleep(time.Duration(r.Float64() * float64(100*time.Millisecond))) +} + func versionFromSha1URL(artifactId, sha1URL string) string { ss := strings.Split(sha1URL, "/") fileName := ss[len(ss)-1] From c127aedb9363842c54ce0cde04cf0130f0b2c8aa Mon Sep 17 00:00:00 2001 From: knqyf263 Date: Thu, 26 Sep 2024 13:58:23 +0400 Subject: [PATCH 4/4] fix: less parallelism Signed-off-by: knqyf263 --- cmd/trivy-java-db/main.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmd/trivy-java-db/main.go b/cmd/trivy-java-db/main.go index 42cfb27..889c3ed 100644 --- a/cmd/trivy-java-db/main.go +++ b/cmd/trivy-java-db/main.go @@ -57,7 +57,7 @@ func init() { rootCmd.PersistentFlags().StringVar(&cacheDir, "cache-dir", filepath.Join(userCacheDir, "trivy-java-db"), "cache dir") - rootCmd.PersistentFlags().IntVar(&limit, "limit", 1000, "max parallelism") + rootCmd.PersistentFlags().IntVar(&limit, "limit", 300, "max parallelism") rootCmd.AddCommand(crawlCmd) rootCmd.AddCommand(buildCmd)