Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Give Zeno the logging it deserves #66

Merged
merged 17 commits into from
Jul 1, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
17 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,5 @@ jobs/*
jobs/
Zeno
*.txt
*.sh
*.sh
zeno.log
27 changes: 25 additions & 2 deletions cmd/cmd.go
Original file line number Diff line number Diff line change
Expand Up @@ -298,10 +298,33 @@ var GlobalFlags = []cli.Flag{
Usage: "If turned on, the crawler will send back URLs that hit a rate limit to crawl HQ.",
Destination: &config.App.Flags.HQRateLimitingSendBack,
},
// Logging flags
&cli.StringFlag{
Name: "log-file-output-dir",
Usage: "Directory to write log files to.",
Value: "jobs",
Destination: &config.App.Flags.LogFileOutputDir,
},
&cli.StringFlag{
Name: "es-url",
Usage: "ElasticSearch URL to use for indexing crawl logs.",
Destination: &config.App.Flags.ElasticSearchURL,
Usage: "comma-separated ElasticSearch URL to use for indexing crawl logs.",
Destination: &config.App.Flags.ElasticSearchURLs,
},
&cli.StringFlag{
Name: "es-user",
Usage: "ElasticSearch username to use for indexing crawl logs.",
Destination: &config.App.Flags.ElasticSearchUsername,
},
&cli.StringFlag{
Name: "es-password",
Usage: "ElasticSearch password to use for indexing crawl logs.",
Destination: &config.App.Flags.ElasticSearchPassword,
},
&cli.StringFlag{
Name: "es-index-prefix",
Usage: "ElasticSearch index prefix to use for indexing crawl logs. Default is : `zeno`, without `-`",
Value: "zeno",
Destination: &config.App.Flags.ElasticSearchIndexPrefix,
},
&cli.StringSliceFlag{
Name: "exclude-string",
Expand Down
43 changes: 40 additions & 3 deletions cmd/utils.go
Original file line number Diff line number Diff line change
@@ -1,34 +1,71 @@
package cmd

import (
"fmt"
"log/slog"
"os"
"path"
"strings"
"time"

"github.com/google/uuid"
"github.com/internetarchive/Zeno/config"
"github.com/internetarchive/Zeno/internal/pkg/crawl"
"github.com/internetarchive/Zeno/internal/pkg/frontier"
"github.com/internetarchive/Zeno/internal/pkg/log"
"github.com/internetarchive/Zeno/internal/pkg/utils"
"github.com/paulbellamy/ratecounter"
"github.com/sirupsen/logrus"
)

// InitCrawlWithCMD takes a config.Flags struct and returns a
// *crawl.Crawl initialized with it.
func InitCrawlWithCMD(flags config.Flags) *crawl.Crawl {
var c = new(crawl.Crawl)

// Logger
var elasticSearchConfig *log.ElasticsearchConfig
elasticSearchURLs := strings.Split(flags.ElasticSearchURLs, ",")
if elasticSearchURLs[0] == "" {
elasticSearchConfig = nil
} else {
elasticSearchConfig = &log.ElasticsearchConfig{
Addresses: elasticSearchURLs,
Username: flags.ElasticSearchUsername,
Password: flags.ElasticSearchPassword,
IndexPrefix: flags.ElasticSearchIndexPrefix,
Level: slog.LevelDebug,
}
}

logFileOutput := &log.Logfile{
Dir: strings.TrimRight(flags.LogFileOutputDir, "/"),
Prefix: "zeno",
}
customLogger, err := log.New(log.Config{
FileOutput: logFileOutput,
FileLevel: slog.LevelDebug,
StdoutLevel: slog.LevelInfo,
RotateLogFile: true,
RotateElasticSearchIndex: true,
ElasticsearchConfig: elasticSearchConfig,
})
if err != nil {
fmt.Println(err)
os.Exit(1)
}
c.Log = customLogger

// Statistics counters
c.CrawledSeeds = new(ratecounter.Counter)
c.CrawledAssets = new(ratecounter.Counter)
c.ActiveWorkers = new(ratecounter.Counter)
c.URIsPerSecond = ratecounter.NewRateCounter(1 * time.Second)

c.LiveStats = flags.LiveStats
c.ElasticSearchURL = flags.ElasticSearchURL

// Frontier
c.Frontier = new(frontier.Frontier)
c.Frontier.Log = c.Log

// If the job name isn't specified, we generate a random name
if flags.Job == "" {
Expand All @@ -37,7 +74,7 @@ func InitCrawlWithCMD(flags config.Flags) *crawl.Crawl {
} else {
UUID, err := uuid.NewUUID()
if err != nil {
logrus.Fatal(err)
c.Log.Fatal("cmd/utils.go:InitCrawlWithCMD():uuid.NewUUID()", "error", err)
}

c.Job = UUID.String()
Expand Down
9 changes: 6 additions & 3 deletions config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,9 +62,12 @@ type Flags struct {
DisableAssetsCapture bool
CertValidation bool

CloudflareStream bool
ElasticSearchURL string
ExcludedStrings cli.StringSlice
ElasticSearchURLs string
ElasticSearchUsername string
ElasticSearchPassword string
ElasticSearchIndexPrefix string
ExcludedStrings cli.StringSlice
LogFileOutputDir string
}

type Application struct {
Expand Down
10 changes: 9 additions & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ require (
github.com/beeker1121/goque v2.1.0+incompatible
github.com/clbanning/mxj/v2 v2.7.0
github.com/dustin/go-humanize v1.0.1
github.com/elastic/go-elasticsearch v0.0.0
github.com/elastic/go-elasticsearch/v8 v8.14.0
github.com/gin-contrib/pprof v1.4.0
github.com/gin-gonic/gin v1.9.1
github.com/google/uuid v1.6.0
Expand Down Expand Up @@ -45,9 +47,12 @@ require (
github.com/cloudflare/circl v1.3.7 // indirect
github.com/cpuguy83/go-md2man/v2 v2.0.3 // indirect
github.com/davecgh/go-spew v1.1.1 // indirect
github.com/elastic/elastic-transport-go/v8 v8.6.0 // indirect
github.com/fatih/color v1.16.0 // indirect
github.com/gabriel-vasile/mimetype v1.4.3 // indirect
github.com/gin-contrib/sse v0.1.0 // indirect
github.com/go-logr/logr v1.4.1 // indirect
github.com/go-logr/stdr v1.2.2 // indirect
github.com/go-playground/locales v0.14.1 // indirect
github.com/go-playground/universal-translator v0.18.1 // indirect
github.com/go-playground/validator/v10 v10.19.0 // indirect
Expand Down Expand Up @@ -88,10 +93,13 @@ require (
github.com/twitchyliquid64/golang-asm v0.15.1 // indirect
github.com/ugorji/go/codec v1.2.12 // indirect
github.com/xrash/smetrics v0.0.0-20231213231151-1d8dd44e695e // indirect
go.opentelemetry.io/otel v1.24.0 // indirect
go.opentelemetry.io/otel/metric v1.24.0 // indirect
go.opentelemetry.io/otel/trace v1.24.0 // indirect
golang.org/x/arch v0.7.0 // indirect
golang.org/x/crypto v0.21.0 // indirect
golang.org/x/sync v0.6.0 // indirect
golang.org/x/sys v0.18.0 // indirect
golang.org/x/sys v0.20.0 // indirect
golang.org/x/text v0.14.0 // indirect
google.golang.org/protobuf v1.33.0 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
Expand Down
25 changes: 21 additions & 4 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,12 @@ github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+m
github.com/eapache/go-resiliency v1.1.0/go.mod h1:kFI+JgMyC7bLPUVY133qvEBtVayf5mFgVsvEsIPBvNs=
github.com/eapache/go-xerial-snappy v0.0.0-20180814174437-776d5712da21/go.mod h1:+020luEh2TKB4/GOp8oxxtq0Daoen/Cii55CzbTV6DU=
github.com/eapache/queue v1.1.0/go.mod h1:6eCeP0CKFpHLu8blIFXhExK/dRa7WDZfr6jVFPTqq+I=
github.com/elastic/elastic-transport-go/v8 v8.6.0 h1:Y2S/FBjx1LlCv5m6pWAF2kDJAHoSjSRSJCApolgfthA=
github.com/elastic/elastic-transport-go/v8 v8.6.0/go.mod h1:YLHer5cj0csTzNFXoNQ8qhtGY1GTvSqPnKWKaqQE3Hk=
github.com/elastic/go-elasticsearch v0.0.0 h1:Pd5fqOuBxKxv83b0+xOAJDAkziWYwFinWnBO0y+TZaA=
github.com/elastic/go-elasticsearch v0.0.0/go.mod h1:TkBSJBuTyFdBnrNqoPc54FN0vKf5c04IdM4zuStJ7xg=
github.com/elastic/go-elasticsearch/v8 v8.14.0 h1:1ywU8WFReLLcxE1WJqii3hTtbPUE2hc38ZK/j4mMFow=
github.com/elastic/go-elasticsearch/v8 v8.14.0/go.mod h1:WRvnlGkSuZyp83M2U8El/LGXpCjYLrvlkSgkAH4O5I4=
github.com/fatih/color v1.16.0 h1:zmkK9Ngbjj+K0yRhTVONQh1p/HknKYSlNT+vZCzyokM=
github.com/fatih/color v1.16.0/go.mod h1:fL2Sau1YI5c0pdGEVCbKQbLXB6edEj1ZgiY4NijnWvE=
github.com/fortytw2/leaktest v1.3.0 h1:u8491cBMTQ8ft8aeV+adlcytMZylmA5nnwwkRZjI8vw=
Expand All @@ -71,8 +77,11 @@ github.com/gin-gonic/gin v1.9.1 h1:4idEAncQnU5cB7BeOkPtxjfCSye0AAm1R0RVIqJ+Jmg=
github.com/gin-gonic/gin v1.9.1/go.mod h1:hPrL7YrpYKXt5YId3A/Tnip5kqbEAP+KLuI3SUcPTeU=
github.com/go-kit/kit v0.8.0/go.mod h1:xBxKIO96dXMWWy0MnWVtmwkA9/13aqxPnvrjFYMA2as=
github.com/go-logfmt/logfmt v0.3.0/go.mod h1:Qt1PoO58o5twSAckw1HlFXLmHsOX5/0LbT9GBnD5lWE=
github.com/go-logr/logr v1.2.4 h1:g01GSCwiDw2xSZfjJ2/T9M+S6pFdcNtFYsp+Y43HYDQ=
github.com/go-logr/logr v1.2.4/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A=
github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A=
github.com/go-logr/logr v1.4.1 h1:pKouT5E8xu9zeFC39JXRDukb6JFQPXM5p5I91188VAQ=
github.com/go-logr/logr v1.4.1/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY=
github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag=
github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE=
github.com/go-playground/assert/v2 v2.0.1/go.mod h1:VDjEfimB/XKnb+ZQfWdccd7VUvScMdVu0Titje2rxJ4=
github.com/go-playground/assert/v2 v2.2.0 h1:JvknZsQTYeFEAhQwI4qEt9cyV5ONwRHC+lYKSsYSR8s=
github.com/go-playground/assert/v2 v2.2.0/go.mod h1:VDjEfimB/XKnb+ZQfWdccd7VUvScMdVu0Titje2rxJ4=
Expand Down Expand Up @@ -300,6 +309,14 @@ github.com/zeebo/assert v1.3.0/go.mod h1:Pq9JiuJQpG8JLJdtkwrJESF0Foym2/D9XMU5ciN
github.com/zeebo/xxh3 v1.0.2 h1:xZmwmqxHZA8AI603jOQ0tMqmBr9lPeFwGg6d+xy9DC0=
github.com/zeebo/xxh3 v1.0.2/go.mod h1:5NWz9Sef7zIDm2JHfFlcQvNekmcEl9ekUZQQKCYaDcA=
go.opencensus.io v0.20.1/go.mod h1:6WKK9ahsWS3RSO+PY9ZHZUfv2irvY6gN279GOPZjmmk=
go.opentelemetry.io/otel v1.24.0 h1:0LAOdjNmQeSTzGBzduGe/rU4tZhMwL5rWgtp9Ku5Jfo=
go.opentelemetry.io/otel v1.24.0/go.mod h1:W7b9Ozg4nkF5tWI5zsXkaKKDjdVjpD4oAt9Qi/MArHo=
go.opentelemetry.io/otel/metric v1.24.0 h1:6EhoGWWK28x1fbpA4tYTOWBkPefTDQnb8WSGXlc88kI=
go.opentelemetry.io/otel/metric v1.24.0/go.mod h1:VYhLe1rFfxuTXLgj4CBiyz+9WYBA8pNGJgDcSFRKBco=
go.opentelemetry.io/otel/sdk v1.21.0 h1:FTt8qirL1EysG6sTQRZ5TokkU8d0ugCj8htOgThZXQ8=
go.opentelemetry.io/otel/sdk v1.21.0/go.mod h1:Nna6Yv7PWTdgJHVRD9hIYywQBRx7pbox6nwBnZIxl/E=
go.opentelemetry.io/otel/trace v1.24.0 h1:CsKnnL4dUAr/0llH9FKuc698G04IrpWV0MQA/Y1YELI=
go.opentelemetry.io/otel/trace v1.24.0/go.mod h1:HPc3Xr/cOApsBI154IU0OI0HJexz+aw5uPdbs3UCjNU=
go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto=
go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE=
golang.org/x/arch v0.0.0-20210923205945-b76863e36670/go.mod h1:5om86z9Hs0C8fWVUuoMHwpExlXzs5Tkyp9hOrfG7pp8=
Expand Down Expand Up @@ -361,8 +378,8 @@ golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBc
golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.7.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.18.0 h1:DBdB3niSjOA/O0blCZBqDefyWNYveAYMNF1Wum0DYQ4=
golang.org/x/sys v0.18.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/sys v0.20.0 h1:Od9JTbYCk261bKm4M/mw7AklTlFYIa0bIp9BgSm1S8Y=
golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k=
Expand Down
18 changes: 12 additions & 6 deletions internal/pkg/crawl/api.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
package crawl

import (
"fmt"
"log/slog"
"os"
"strconv"
"time"
Expand All @@ -12,10 +14,12 @@ import (
"github.com/prometheus/client_golang/prometheus/promhttp"
)

// APIWorkersState represents the state of all API workers.
// It wraps the individual worker states in a single object whose JSON
// representation exposes them under the "workers" key.
type APIWorkersState struct {
// Workers is the list of individual worker states (pointers to APIWorkerState).
Workers []*APIWorkerState `json:"workers"`
}

// APIWorkerState represents the state of an API worker.
type APIWorkerState struct {
WorkerID uint `json:"worker_id"`
Status string `json:"status"`
Expand All @@ -24,15 +28,17 @@ type APIWorkerState struct {
Locked bool `json:"locked"`
}

// startAPI starts the API server for the crawl.
func (crawl *Crawl) startAPI() {
gin.SetMode(gin.ReleaseMode)
gin.DefaultWriter = logInfo.Out
gin.DefaultWriter = crawl.Log.Writer(slog.LevelInfo)
gin.DefaultErrorWriter = crawl.Log.Writer(slog.LevelError)

r := gin.Default()

pprof.Register(r)

logInfo.Info("Starting API")
crawl.Log.Info("Starting API")
r.GET("/", func(c *gin.Context) {
crawledSeeds := crawl.CrawledSeeds.Value()
crawledAssets := crawl.CrawledAssets.Value()
Expand All @@ -54,7 +60,7 @@ func (crawl *Crawl) startAPI() {
labels["crawljob"] = crawl.Job
hostname, err := os.Hostname()
if err != nil {
logWarning.Warn("Unable to retrieve hostname of machine")
crawl.Log.Warn("Unable to retrieve hostname of machine")
hostname = "unknown"
}
labels["host"] = hostname + ":" + crawl.APIPort
Expand All @@ -65,7 +71,7 @@ func (crawl *Crawl) startAPI() {
Help: "The total number of crawled URI",
})

logInfo.Info("Starting Prometheus export")
crawl.Log.Info("Starting Prometheus export")
r.GET("/metrics", gin.WrapH(promhttp.Handler()))
}

Expand Down Expand Up @@ -95,8 +101,8 @@ func (crawl *Crawl) startAPI() {
c.JSON(200, workersState)
})

err := r.Run(":" + crawl.APIPort)
err := r.Run(fmt.Sprintf(":%s", crawl.APIPort))
if err != nil {
logError.Fatalf("unable to start API: %s", err.Error())
crawl.Log.Fatal("unable to start API", "error", err.Error())
}
}
4 changes: 2 additions & 2 deletions internal/pkg/crawl/assets.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ func (c *Crawl) extractAssets(base *url.URL, item *frontier.Item, doc *goquery.D
if strings.Contains(base.Host, "cloudflarestream.com") {
cloudflarestreamURLs, err := cloudflarestream.GetSegments(base, *c.Client)
if err != nil {
logWarning.WithFields(c.genLogFields(err, item.URL, nil)).Warnln("error getting cloudflarestream segments")
c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Warn("error getting cloudflarestream segments")
}

if len(cloudflarestreamURLs) > 0 {
Expand Down Expand Up @@ -144,7 +144,7 @@ func (c *Crawl) extractAssets(base *url.URL, item *frontier.Item, doc *goquery.D
// Apply regex on the script's HTML to extract potential assets
outerHTML, err := goquery.OuterHtml(item)
if err != nil {
logWarning.Warning(err)
c.Log.Warn("crawl/assets.go:extractAssets():goquery.OuterHtml():", "error", err)
} else {
scriptLinks := utils.DedupeStrings(regexOutlinks.FindAllString(outerHTML, -1))
for _, scriptLink := range scriptLinks {
Expand Down
Loading
Loading