diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index a4d3e5a..29cb8e8 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -23,16 +23,5 @@ A clear and concise description of what you expected to happen. **Screenshots** If applicable, add screenshots to help explain your problem. -**Desktop (please complete the following information):** - - OS: [e.g. iOS] - - Browser [e.g. chrome, safari] - - Version [e.g. 22] - -**Smartphone (please complete the following information):** - - Device: [e.g. iPhone6] - - OS: [e.g. iOS8.1] - - Browser [e.g. stock browser, safari] - - Version [e.g. 22] - **Additional context** Add any other context about the problem here. diff --git a/Dockerfile b/Dockerfile index 83208eb..4a9d5fd 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM golang:1.13 as build +FROM golang:1.15 as build WORKDIR /build diff --git a/README.md b/README.md index fe3bc4c..d61fa34 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ The gowarc server module. This tool can be used to index and serve warc files # Requirements -go 1.13 or newer +go 1.15 or newer # Build @@ -19,9 +19,9 @@ You can configure certain aspect of gowarcserver with a config file. Here are al | Name | Type | Description | Default | | ------------- | ------------- | ----------- | ------- | -| warcdir | List of paths | The path to directories where warcs that should be auto indexed | ["."] | -| indexdir | path | The root directory for index files | "." | -| autoindex | bool | Whether gowarc should index from the warcdir(s) when serving automatically or not | true | -| warcport | int | The port that the serve command will use if not overridden as argument to serve | 9999 | -| loglevel | string | Change the application log level manually | "info" | -| compression | string | Change the db table compression. Legal values are: 'none', 'snappy', 'zstd' | "none" | +| warcDir | List of paths | The path to directories where warcs that should be auto indexed | ["."] | +| indexDir | path | The root directory for index files | "." | +| autoIndex | bool | Whether gowarc should index from the warcdir(s) when serving automatically or not | true | +| warcPort | int | The port that the serve command will use if not overridden as argument to serve | 9999 | +| logLevel | string | Change the application log level manually | "info" | +| compression | string | Change the db table compression. Legal values are: 'none', 'snappy', 'zstd' | "none" | \ No newline at end of file diff --git a/cmd/warcserver/cmd/index/index.go b/cmd/warcserver/cmd/index/index.go index 0235580..a4f8402 100644 --- a/cmd/warcserver/cmd/index/index.go +++ b/cmd/warcserver/cmd/index/index.go @@ -18,14 +18,10 @@ package index import ( "errors" "fmt" - "io" - "os" - "strconv" - "github.com/nlnwa/gowarc/warcoptions" - "github.com/nlnwa/gowarc/warcreader" "github.com/nlnwa/gowarcserver/pkg/index" "github.com/spf13/cobra" + "github.com/spf13/viper" ) func parseFormat(format string) (index.CdxWriter, error) { @@ -57,8 +53,6 @@ func NewCommand() *cobra.Command { if len(args) == 0 { return errors.New("missing file name") } - // TODO: maybe try to open file/directory here? - // default return should be an error case return nil }, RunE: func(cmd *cobra.Command, args []string) error { @@ -69,11 +63,7 @@ func NewCommand() *cobra.Command { return err } - writer.Init() - defer writer.Close() - fmt.Printf("Format: %v\n", c.writerFormat) - - return readFile(c.fileName, writer) + return runE(c, writer) }, } @@ -82,33 +72,17 @@ func NewCommand() *cobra.Command { return cmd } -func readFile(fileName string, writer index.CdxWriter) error { - opts := &warcoptions.WarcOptions{Strict: false} - wf, err := warcreader.NewWarcFilename(fileName, 0, opts) +func runE(c *conf, writer index.CdxWriter) error { + fmt.Printf("Format: %v\n", c.writerFormat) + compression := viper.GetString("compression") + dir := viper.GetString("indexdir") + dbConfig := index.NewDbConfig(compression, dir) + err := writer.Init(dbConfig) if err != nil { return err } - defer wf.Close() - - count := 0 - - // avoid defer copy value by using a anonymous function - // At the end, print count even if an error occurs - defer func() { - fmt.Fprintln(os.Stdout, "Count: ", count) - }() + defer writer.Close() - for { - wr, currentOffset, err := wf.Next() - if err == io.EOF { - break - } - if err != nil { - return fmt.Errorf("Error: %v, rec num: %v, Offset %v\n", err.Error(), strconv.Itoa(count), currentOffset) - } - count++ - - writer.Write(wr, fileName, currentOffset) - } + ReadFile(c, writer) return nil } diff --git a/cmd/warcserver/cmd/index/io.go b/cmd/warcserver/cmd/index/io.go new file mode 100644 index 0000000..07b673f --- /dev/null +++ b/cmd/warcserver/cmd/index/io.go @@ -0,0 +1,57 @@ +package index + +import ( + "fmt" + "io" + "strconv" + + "github.com/nlnwa/gowarc/warcoptions" + "github.com/nlnwa/gowarc/warcreader" + "github.com/nlnwa/gowarcserver/pkg/index" + logrus "github.com/sirupsen/logrus" +) + +func ParseFormat(format string) (index.CdxWriter, error) { + switch format { + case "cdx": + return &index.CdxLegacy{}, nil + case "cdxj": + return &index.CdxJ{}, nil + case "cdxpb": + return &index.CdxPb{}, nil + case "db": + return &index.CdxDb{}, nil + } + return nil, fmt.Errorf("unknwon format %v, valid formats are: 'cdx', 'cdxj', 'cdxpb', 'db'", format) +} + +func ReadFile(c *conf, writer index.CdxWriter) error { + opts := &warcoptions.WarcOptions{Strict: false} + wf, err := warcreader.NewWarcFilename(c.fileName, 0, opts) + if err != nil { + return err + } + defer wf.Close() + + count := 0 + + // avoid defer copy value by using a anonymous function + // At the end, print count even if an error occurs + defer func() { + logrus.Printf("Count: %d", count) + }() + + for { + wr, currentOffset, err := wf.Next() + if err == io.EOF { + break + } + if err != nil { + return fmt.Errorf("Error: %v, rec num: %v, Offset %v\n", err.Error(), strconv.Itoa(count), currentOffset) + } + count++ + + writer.Write(wr, c.fileName, currentOffset) + } + return nil +} diff --git a/cmd/warcserver/cmd/index/io_test.go b/cmd/warcserver/cmd/index/io_test.go new file mode 100644 index 0000000..481acd9 --- /dev/null +++ b/cmd/warcserver/cmd/index/io_test.go @@ -0,0 +1,139 @@ +package index + +import ( + "fmt" + "os" + "path" + "reflect" + "testing" + + "github.com/nlnwa/gowarcserver/pkg/index" + log "github.com/sirupsen/logrus" +) + +func TestParseFormat(t *testing.T) { + tests := []struct { + name string + format string + expected reflect.Type + errorState bool + }{ + { + "'cdx' results in CdxLegacy writer", + "cdx", + reflect.TypeOf((*index.CdxLegacy)(nil)), + false, + }, + { + "'cdxj' results in CdxJ writer", + "cdxj", + reflect.TypeOf((*index.CdxJ)(nil)), + false, + }, + { + "'db' results in CdxDb writer", + "db", + reflect.TypeOf((*index.CdxDb)(nil)), + false, + }, + { + "'cdxpb' results in CdxPd writer", + "cdxpb", + reflect.TypeOf((*index.CdxPb)(nil)), + false, + }, + { + "'cd' results in error", + "cd", + nil, + true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got, err := ParseFormat(tt.format) + if err != nil && !tt.errorState { + t.Errorf("Unexpected failure: %v", err) + } else if err == nil && tt.errorState { + t.Errorf("Expected error parsing '%v', got type %T", tt.format, got) + } + + if reflect.TypeOf(got) != tt.expected { + t.Errorf("Expected %v, got %v", tt.expected, got) + } + }) + } +} + +// TODO: this was hard to write tests for and therefore ReadFile +// should probably be refactored +func TestReadFile(t *testing.T) { + log.SetLevel(log.WarnLevel) + // same as testdata/example.warc except removed gzip content because of illegal go str characters + testFileContent := []byte(`WARC/1.0 +WARC-Date: 2017-03-06T04:03:53Z +WARC-Record-ID: +WARC-Type: warcinfo +Content-Length: 0`) + + filepath := path.Join(t.TempDir(), "test.warc") + file, err := os.Create(filepath) + if err != nil { + t.Fatalf("Failed to create testfile at '%s'", filepath) + } + // This is not strictly needed because of tmp, but to be platform agnostic it might be a good idea + defer file.Close() + + _, err = file.Write(testFileContent) + if err != nil { + t.Fatalf("Failed to write to testfile at '%s'", filepath) + } + + err = file.Sync() + if err != nil { + t.Fatalf("Failed to sync testfile at '%s'", filepath) + } + + tests := []struct { + writerFormat string + writer index.CdxWriter + }{ + { + "cdx", + &index.CdxLegacy{}, + }, + { + "cdxj", + &index.CdxJ{}, + }, + { + + "cdxpd", + &index.CdxPb{}, + }, + { + "db", + &index.CdxDb{}, + }, + } + + for _, tt := range tests { + testName := fmt.Sprintf("Readfile: %T successfully indexes", tt.writer) + t.Run(testName, func(t *testing.T) { + c := &conf{ + filepath, + tt.writerFormat, + } + dbConfig := index.NewDbConfig("none", t.TempDir()) + tt.writer.Init(dbConfig) + defer tt.writer.Close() + + err := ReadFile(c, tt.writer) + if err != nil { + t.Errorf("Unexpected failure: %v", err) + } + + }) + } +} diff --git a/cmd/warcserver/cmd/root.go b/cmd/warcserver/cmd/root.go index 90ab96c..272b93f 100644 --- a/cmd/warcserver/cmd/root.go +++ b/cmd/warcserver/cmd/root.go @@ -27,17 +27,10 @@ import ( "github.com/spf13/viper" ) -type conf struct { - cfgFile string - logLevel string - - // DB settings - compression string -} - // NewCommand returns a new cobra.Command implementing the root command for warc func NewCommand() *cobra.Command { - c := &conf{} + cobra.OnInitialize(func() { initConfig() }) + cmd := &cobra.Command{ Use: "warcserver", Short: "Server capable of indexing and serving warc files", @@ -48,27 +41,32 @@ func NewCommand() *cobra.Command { // https://github.com/dgraph-io/badger#are-there-any-go-specific-settings-that-i-should-use runtime.GOMAXPROCS(128) - level, err := log.ParseLevel(c.logLevel) + logLevel := viper.GetString("logLevel") + level, err := log.ParseLevel(logLevel) if err != nil { - return fmt.Errorf("'%s' is not part of the valid levels: 'panic', 'fatal', 'error', 'warn', 'warning', 'info', 'debug', 'trace'", c.logLevel) + return fmt.Errorf("'%s' is not part of the valid levels: 'panic', 'fatal', 'error', 'warn', 'warning', 'info', 'debug', 'trace'", logLevel) } - log.SetLevel(level) + return nil }, } - cobra.OnInitialize(func() { c.initConfig() }) + // Stub to store cobra variables + c := &struct { + cfgFile string + logLevel string + compression string + }{} // Flags cmd.PersistentFlags().StringVarP(&c.compression, "compression", "c", "none", "DB compression type: 'none', 'snappy', 'zstd'") - cmd.PersistentFlags().StringVarP(&c.logLevel, "loglevel", "l", "info", "set the log level of gowarc, it will take precedence over config 'loglevel'") + cmd.PersistentFlags().StringVarP(&c.logLevel, "logLevel", "l", "info", "set the log level of gowarc, it will take precedence over config 'loglevel'") cmd.PersistentFlags().StringVar(&c.cfgFile, "config", "", "config file. If not set, /etc/warc/, $HOME/.warc/ and current working dir will be searched for file config.yaml") - // bind flags and config - viper.BindPFlag("compression", cmd.PersistentFlags().Lookup("compression")) - viper.BindPFlag("loglevel", cmd.PersistentFlags().Lookup("loglevel")) - viper.BindPFlag("config", cmd.PersistentFlags().Lookup("config")) + if err := viper.BindPFlags(cmd.PersistentFlags()); err != nil { + log.Fatalf("Failed to bind root flags, err: %v", err) + } // Subcommands cmd.AddCommand(serve.NewCommand()) @@ -78,14 +76,10 @@ func NewCommand() *cobra.Command { } // initConfig reads in config file and ENV variables if set. -func (c *conf) initConfig() { +func initConfig() { viper.SetTypeByDefaultValue(true) - viper.SetDefault("warcdir", []string{"."}) - viper.SetDefault("indexdir", ".") - viper.SetDefault("autoindex", true) - viper.SetDefault("warcport", 9999) - viper.SetDefault("loglevel", "info") - viper.SetDefault("compression", "none") + viper.SetDefault("warcDir", []string{"."}) + viper.SetDefault("indexDir", ".") viper.AutomaticEnv() // read in environment variables that match diff --git a/cmd/warcserver/cmd/serve/serve.go b/cmd/warcserver/cmd/serve/serve.go index 33a12db..9b62bbf 100644 --- a/cmd/warcserver/cmd/serve/serve.go +++ b/cmd/warcserver/cmd/serve/serve.go @@ -23,52 +23,60 @@ import ( "github.com/spf13/viper" ) -type conf struct { - port int - warcDirs []string - watchDepth int -} - func NewCommand() *cobra.Command { - c := &conf{} var cmd = &cobra.Command{ Use: "serve", Short: "Start the warc server to serve warc records", Long: ``, RunE: func(cmd *cobra.Command, args []string) error { + var warcDirs []string if len(args) > 0 { - c.warcDirs = args + warcDirs = args } else { - c.warcDirs = viper.GetStringSlice("warcdir") + warcDirs = viper.GetStringSlice("warcDir") } - return runE(c) + return runE(warcDirs) }, } - cmd.Flags().IntVarP(&c.port, "port", "p", -1, "the port that should be used to serve, will use config value otherwise") - cmd.Flags().IntVarP(&c.watchDepth, "watch-depth", "w", 4, "The maximum depth when indexing warc") + // Stub to hold flags + c := &struct { + warcPort int + watchDepth int + autoIndex bool + }{} + cmd.Flags().IntVarP(&c.warcPort, "warcPort", "p", 9999, "Port that should be used to serve, will use config value otherwise") + cmd.Flags().IntVarP(&c.watchDepth, "watchDepth", "w", 4, "Maximum depth when indexing warc") + cmd.Flags().BoolVarP(&c.autoIndex, "autoIndex", "a", true, "Whether the server should index warc files automatically") + if err := viper.BindPFlags(cmd.Flags()); err != nil { + log.Fatalf("Failed to bind serve flags, err: %v", err) + } return cmd } -func runE(c *conf) error { - if c.port < 0 { - c.port = viper.GetInt("warcport") - } - - db, err := index.DbFromViper() +func runE(warcDirs []string) error { + compression := viper.GetString("compression") + dir := viper.GetString("indexdir") + dbConfig := index.NewDbConfig(compression, dir) + db, err := index.DbFromConfig(dbConfig) if err != nil { return err } defer db.Close() - if viper.GetBool("autoindex") { - log.Infof("Starting autoindexer") - autoindexer := index.NewAutoIndexer(db, c.warcDirs, c.watchDepth) + if viper.GetBool("autoIndex") { + log.Infof("Starting auto indexer") + watchDepth := viper.GetInt("watchDepth") + autoindexer := index.NewAutoIndexer(db, warcDirs, watchDepth) defer autoindexer.Shutdown() } - log.Infof("Starting web server at http://localhost:%v", c.port) - server.Serve(db, c.port) + port := viper.GetInt("warcPort") + log.Infof("Starting web server at http://localhost:%v", port) + err = server.Serve(db, port) + if err != nil { + log.Warnf("%v", err) + } return nil } diff --git a/exampleconfig.yaml b/exampleconfig.yaml index e69de29..932ba60 100644 --- a/exampleconfig.yaml +++ b/exampleconfig.yaml @@ -0,0 +1 @@ +logLevel: warn \ No newline at end of file diff --git a/go.mod b/go.mod index e364661..f6517c1 100644 --- a/go.mod +++ b/go.mod @@ -1,6 +1,6 @@ module github.com/nlnwa/gowarcserver -go 1.13 +go 1.15 require ( github.com/dgraph-io/badger/v2 v2.2007.2 diff --git a/pkg/index/dbfromviper.go b/pkg/index/dbfromconfig.go similarity index 54% rename from pkg/index/dbfromviper.go rename to pkg/index/dbfromconfig.go index d191752..2286958 100644 --- a/pkg/index/dbfromviper.go +++ b/pkg/index/dbfromconfig.go @@ -8,20 +8,26 @@ package index import ( "github.com/dgraph-io/badger/v2/options" "github.com/nlnwa/gowarcserver/pkg/compressiontype" - "github.com/spf13/viper" ) +type DbConfig struct { + compression string + dir string +} + +func NewDbConfig(compresion string, dir string) *DbConfig { + return &DbConfig{compression: compresion, dir: dir} +} + // TODO: test somehow? // Create a database based on the viper settings set by the user -func DbFromViper() (*Db, error) { - compressionString := viper.GetString("compression") - compression, cErr := compressiontype.FromString(compressionString) +func DbFromConfig(config *DbConfig) (*Db, error) { + compression, cErr := compressiontype.FromString(config.compression) if cErr != nil { return nil, cErr } - dbDir := viper.GetString("indexdir") - db, dbErr := NewIndexDb(dbDir, options.CompressionType(compression)) + db, dbErr := NewIndexDb(config.dir, options.CompressionType(compression)) if dbErr != nil { return nil, dbErr } diff --git a/pkg/index/indexwriter.go b/pkg/index/indexwriter.go index 245764c..5c75cf2 100644 --- a/pkg/index/indexwriter.go +++ b/pkg/index/indexwriter.go @@ -25,30 +25,53 @@ import ( ) type CdxWriter interface { - Init() error + Init(config *DbConfig) error Close() Write(wr warcrecord.WarcRecord, fileName string, offset int64) error } type CdxLegacy struct { } +type CdxJ struct { + jsonMarshaler *jsonpb.Marshaler +} +type CdxPb struct { + jsonMarshaler *jsonpb.Marshaler +} +type CdxDb struct { + db *Db +} -func (c *CdxLegacy) Init() error { +func (c *CdxDb) Init(config *DbConfig) (err error) { + db, err := DbFromConfig(config) + if err != nil { + return err + } + c.db = db return nil } -func (c *CdxLegacy) Close() { +func (c *CdxDb) Close() { + c.db.Flush() + c.db.Close() } -func (c *CdxLegacy) Write(wr warcrecord.WarcRecord, fileName string, offset int64) error { +func (c *CdxDb) Write(wr warcrecord.WarcRecord, fileName string, offset int64) error { + return c.db.Add(wr, fileName, offset) +} + +func (c *CdxLegacy) Init(config *DbConfig) (err error) { return nil } -type CdxJ struct { - jsonMarshaler *jsonpb.Marshaler +func (c *CdxLegacy) Close() { } -func (c *CdxJ) Init() (err error) { +func (c *CdxLegacy) Write(wr warcrecord.WarcRecord, fileName string, offset int64) error { + return nil +} + +func (c *CdxJ) Init(config *DbConfig) (err error) { c.jsonMarshaler = &jsonpb.Marshaler{} return nil } @@ -68,11 +91,7 @@ func (c *CdxJ) Write(wr warcrecord.WarcRecord, fileName string, offset int64) er return nil } -type CdxPb struct { - jsonMarshaler *jsonpb.Marshaler -} - -func (c *CdxPb) Init() (err error) { +func (c *CdxPb) Init(config *DbConfig) (err error) { c.jsonMarshaler = &jsonpb.Marshaler{} return nil } @@ -91,25 +110,3 @@ func (c *CdxPb) Write(wr warcrecord.WarcRecord, fileName string, offset int64) e } return nil } - -type CdxDb struct { - db *Db -} - -func (cdxdb *CdxDb) Init() error { - db, err := DbFromViper() - if err != nil { - return err - } - cdxdb.db = db - return nil -} - -func (c *CdxDb) Close() { - c.db.Flush() - c.db.Close() -} - -func (c *CdxDb) Write(wr warcrecord.WarcRecord, fileName string, offset int64) error { - return c.db.Add(wr, fileName, offset) -} diff --git a/pkg/server/server.go b/pkg/server/server.go index 36a742b..75aa3ec 100644 --- a/pkg/server/server.go +++ b/pkg/server/server.go @@ -31,10 +31,9 @@ import ( "github.com/nlnwa/gowarcserver/pkg/index" "github.com/nlnwa/gowarcserver/pkg/loader" "github.com/nlnwa/gowarcserver/pkg/server/warcserver" - log "github.com/sirupsen/logrus" ) -func Serve(db *index.Db, port int) { +func Serve(db *index.Db, port int) error { l := &loader.Loader{ Resolver: &storageRefResolver{db: db}, Loader: &loader.FileStorageLoader{FilePathResolver: func(fileName string) (filePath string, err error) { @@ -71,7 +70,7 @@ func Serve(db *index.Db, port int) { httpServer.Shutdown(ctx) }() - log.Info(httpServer.ListenAndServe()) + return httpServer.ListenAndServe() } type storageRefResolver struct { diff --git a/testdata/example-trunc.warc b/testdata/example-trunc.warc index 6a1e735..43ca9c2 100644 Binary files a/testdata/example-trunc.warc and b/testdata/example-trunc.warc differ diff --git a/testdata/example.warc b/testdata/example.warc index 4bc3089..41edf11 100644 Binary files a/testdata/example.warc and b/testdata/example.warc differ