Skip to content

Commit

Permalink
nlnwa#3: merge and integrate previous commit
Browse files Browse the repository at this point in the history
The code was outdated and had to be modified to make sense with master
  • Loading branch information
Avokadoen committed Apr 20, 2021
2 parents c9a3be4 + 3d83710 commit 6a23714
Show file tree
Hide file tree
Showing 15 changed files with 311 additions and 147 deletions.
11 changes: 0 additions & 11 deletions .github/ISSUE_TEMPLATE/bug_report.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,16 +23,5 @@ A clear and concise description of what you expected to happen.
**Screenshots**
If applicable, add screenshots to help explain your problem.

**Desktop (please complete the following information):**
- OS: [e.g. iOS]
- Browser [e.g. chrome, safari]
- Version [e.g. 22]

**Smartphone (please complete the following information):**
- Device: [e.g. iPhone6]
- OS: [e.g. iOS8.1]
- Browser [e.g. stock browser, safari]
- Version [e.g. 22]

**Additional context**
Add any other context about the problem here.
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM golang:1.13 as build
FROM golang:1.15 as build

WORKDIR /build

Expand Down
14 changes: 7 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ The gowarc server module. This tool can be used to index and serve warc files

# Requirements

go 1.13 or newer
go 1.15 or newer

# Build

Expand All @@ -19,9 +19,9 @@ You can configure certain aspect of gowarcserver with a config file. Here are al

| Name | Type | Description | Default |
| ------------- | ------------- | ----------- | ------- |
| warcdir | List of paths | The path to directories where warcs that should be auto indexed | ["."] |
| indexdir | path | The root directory for index files | "." |
| autoindex | bool | Whether gowarc should index from the warcdir(s) when serving automatically or not | true |
| warcport | int | The port that the serve command will use if not overridden as argument to serve | 9999 |
| loglevel | string | Change the application log level manually | "info" |
| compression | string | Change the db table compression. Legal values are: 'none', 'snappy', 'zstd' | "none" |
| warcDir | List of paths | The paths to directories containing warcs that should be auto indexed | ["."] |
| indexDir | path | The root directory for index files | "." |
| autoIndex | bool | Whether gowarc should automatically index from the warcDir(s) when serving | true |
| warcPort | int | The port that the serve command will use if not overridden as argument to serve | 9999 |
| logLevel | string | Change the application log level manually | "info" |
| compression | string | Change the db table compression. Legal values are: 'none', 'snappy', 'zstd' | "none" |
46 changes: 10 additions & 36 deletions cmd/warcserver/cmd/index/index.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,14 +18,10 @@ package index
import (
"errors"
"fmt"
"io"
"os"
"strconv"

"github.com/nlnwa/gowarc/warcoptions"
"github.com/nlnwa/gowarc/warcreader"
"github.com/nlnwa/gowarcserver/pkg/index"
"github.com/spf13/cobra"
"github.com/spf13/viper"
)

func parseFormat(format string) (index.CdxWriter, error) {
Expand Down Expand Up @@ -57,8 +53,6 @@ func NewCommand() *cobra.Command {
if len(args) == 0 {
return errors.New("missing file name")
}
// TODO: maybe try to open file/directory here?
// default return should be an error case
return nil
},
RunE: func(cmd *cobra.Command, args []string) error {
Expand All @@ -69,11 +63,7 @@ func NewCommand() *cobra.Command {
return err
}

writer.Init()
defer writer.Close()
fmt.Printf("Format: %v\n", c.writerFormat)

return readFile(c.fileName, writer)
return runE(c, writer)
},
}

Expand All @@ -82,33 +72,17 @@ func NewCommand() *cobra.Command {
return cmd
}

func readFile(fileName string, writer index.CdxWriter) error {
opts := &warcoptions.WarcOptions{Strict: false}
wf, err := warcreader.NewWarcFilename(fileName, 0, opts)
func runE(c *conf, writer index.CdxWriter) error {
fmt.Printf("Format: %v\n", c.writerFormat)
compression := viper.GetString("compression")
dir := viper.GetString("indexdir")
dbConfig := index.NewDbConfig(compression, dir)
err := writer.Init(dbConfig)
if err != nil {
return err
}
defer wf.Close()

count := 0

// avoid defer copy value by using a anonymous function
// At the end, print count even if an error occurs
defer func() {
fmt.Fprintln(os.Stdout, "Count: ", count)
}()
defer writer.Close()

for {
wr, currentOffset, err := wf.Next()
if err == io.EOF {
break
}
if err != nil {
return fmt.Errorf("Error: %v, rec num: %v, Offset %v\n", err.Error(), strconv.Itoa(count), currentOffset)
}
count++

writer.Write(wr, fileName, currentOffset)
}
ReadFile(c, writer)
return nil
}
57 changes: 57 additions & 0 deletions cmd/warcserver/cmd/index/io.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
package index

import (
"fmt"
"io"
"strconv"

"github.com/nlnwa/gowarc/warcoptions"
"github.com/nlnwa/gowarc/warcreader"
"github.com/nlnwa/gowarcserver/pkg/index"
logrus "github.com/sirupsen/logrus"
)

// ParseFormat maps an index format name to a newly constructed CdxWriter.
//
// The recognized names are "cdx", "cdxj", "cdxpb" and "db". Any other name
// yields a nil writer and an error that lists the valid formats.
func ParseFormat(format string) (index.CdxWriter, error) {
	switch format {
	case "cdx":
		return &index.CdxLegacy{}, nil
	case "cdxj":
		return &index.CdxJ{}, nil
	case "cdxpb":
		return &index.CdxPb{}, nil
	case "db":
		return &index.CdxDb{}, nil
	}
	// %q quotes the rejected value so that an empty or whitespace-only
	// format is still visible in the message.
	return nil, fmt.Errorf("unknown format %q, valid formats are: 'cdx', 'cdxj', 'cdxpb', 'db'", format)
}

// ReadFile opens the WARC file named by c.fileName and passes every record it
// yields to writer, together with the file name and the offset value returned
// alongside the record.
//
// It returns nil once the reader reports io.EOF. An error from opening the
// file is returned unchanged. An error from reading a record is wrapped and
// annotated with the number of records read so far and the current offset.
// The record count is logged when the function returns, whether or not an
// error occurred.
func ReadFile(c *conf, writer index.CdxWriter) error {
	opts := &warcoptions.WarcOptions{Strict: false}
	wf, err := warcreader.NewWarcFilename(c.fileName, 0, opts)
	if err != nil {
		return err
	}
	defer wf.Close()

	count := 0

	// The closure reads count when it runs, so the final value is logged.
	// Deferring logrus.Printf directly would evaluate count immediately,
	// while it is still 0.
	defer func() {
		logrus.Printf("Count: %d", count)
	}()

	for {
		wr, currentOffset, err := wf.Next()
		if err == io.EOF {
			break
		}
		if err != nil {
			// %w keeps the underlying error reachable through
			// errors.Is and errors.As.
			return fmt.Errorf("Error: %w, rec num: %v, Offset %v", err, strconv.Itoa(count), currentOffset)
		}
		count++

		// NOTE(review): any result of writer.Write is discarded here;
		// confirm whether it can report a failure that should abort indexing.
		writer.Write(wr, c.fileName, currentOffset)
	}
	return nil
}
139 changes: 139 additions & 0 deletions cmd/warcserver/cmd/index/io_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
package index

import (
"fmt"
"os"
"path"
"reflect"
"testing"

"github.com/nlnwa/gowarcserver/pkg/index"
log "github.com/sirupsen/logrus"
)

// TestParseFormat checks that every supported format name yields the matching
// CdxWriter implementation and that an unsupported name yields an error.
func TestParseFormat(t *testing.T) {
	tests := []struct {
		name       string
		format     string
		expected   reflect.Type
		errorState bool
	}{
		{
			name:       "'cdx' results in CdxLegacy writer",
			format:     "cdx",
			expected:   reflect.TypeOf((*index.CdxLegacy)(nil)),
			errorState: false,
		},
		{
			name:       "'cdxj' results in CdxJ writer",
			format:     "cdxj",
			expected:   reflect.TypeOf((*index.CdxJ)(nil)),
			errorState: false,
		},
		{
			name:       "'db' results in CdxDb writer",
			format:     "db",
			expected:   reflect.TypeOf((*index.CdxDb)(nil)),
			errorState: false,
		},
		{
			name:       "'cdxpb' results in CdxPb writer",
			format:     "cdxpb",
			expected:   reflect.TypeOf((*index.CdxPb)(nil)),
			errorState: false,
		},
		{
			// On error ParseFormat returns a nil writer, and
			// reflect.TypeOf of a nil interface is nil, hence expected is nil.
			name:       "'cd' results in error",
			format:     "cd",
			expected:   nil,
			errorState: true,
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			got, err := ParseFormat(tt.format)
			if err != nil && !tt.errorState {
				t.Errorf("Unexpected failure: %v", err)
			} else if err == nil && tt.errorState {
				t.Errorf("Expected error parsing '%v', got type %T", tt.format, got)
			}

			// Report the dynamic type rather than the value, since the
			// type is what is being compared.
			if gotType := reflect.TypeOf(got); gotType != tt.expected {
				t.Errorf("Expected %v, got %v", tt.expected, gotType)
			}
		})
	}
}

// TestReadFile writes a minimal WARC file into a temporary directory and
// checks that ReadFile processes it without error for each CdxWriter
// implementation.
//
// TODO: this was hard to write tests for and therefore ReadFile
// should probably be refactored
func TestReadFile(t *testing.T) {
	log.SetLevel(log.WarnLevel)
	// same as testdata/example.warc except removed gzip content because of illegal go str characters
	testFileContent := []byte(`WARC/1.0
WARC-Date: 2017-03-06T04:03:53Z
WARC-Record-ID: <urn:uuid:e9a0cecc-0221-11e7-adb1-0242ac120008>
WARC-Type: warcinfo
Content-Length: 0`)

	warcPath := path.Join(t.TempDir(), "test.warc")
	file, err := os.Create(warcPath)
	if err != nil {
		t.Fatalf("Failed to create testfile at '%s': %v", warcPath, err)
	}
	// This is not strictly needed because of tmp, but to be platform agnostic it might be a good idea
	defer file.Close()

	_, err = file.Write(testFileContent)
	if err != nil {
		t.Fatalf("Failed to write to testfile at '%s': %v", warcPath, err)
	}

	err = file.Sync()
	if err != nil {
		t.Fatalf("Failed to sync testfile at '%s': %v", warcPath, err)
	}

	tests := []struct {
		writerFormat string
		writer       index.CdxWriter
	}{
		{
			writerFormat: "cdx",
			writer:       &index.CdxLegacy{},
		},
		{
			writerFormat: "cdxj",
			writer:       &index.CdxJ{},
		},
		{
			writerFormat: "cdxpb",
			writer:       &index.CdxPb{},
		},
		{
			writerFormat: "db",
			writer:       &index.CdxDb{},
		},
	}

	for _, tt := range tests {
		testName := fmt.Sprintf("Readfile: %T successfully indexes", tt.writer)
		t.Run(testName, func(t *testing.T) {
			c := &conf{
				fileName:     warcPath,
				writerFormat: tt.writerFormat,
			}
			dbConfig := index.NewDbConfig("none", t.TempDir())
			// A writer that failed to initialize cannot give a meaningful
			// ReadFile result, so stop this subtest right away.
			if err := tt.writer.Init(dbConfig); err != nil {
				t.Fatalf("Failed to init writer %T: %v", tt.writer, err)
			}
			defer tt.writer.Close()

			err := ReadFile(c, tt.writer)
			if err != nil {
				t.Errorf("Unexpected failure: %v", err)
			}
		})
	}
}
Loading

0 comments on commit 6a23714

Please sign in to comment.