diff --git a/cmd/warcserver/cmd/index/index.go b/cmd/warcserver/cmd/index/index.go index 9064dd6..2a244cc 100644 --- a/cmd/warcserver/cmd/index/index.go +++ b/cmd/warcserver/cmd/index/index.go @@ -18,12 +18,7 @@ package index import ( "errors" "fmt" - "io" - "os" - "strconv" - "github.com/nlnwa/gowarc/warcoptions" - "github.com/nlnwa/gowarc/warcreader" "github.com/nlnwa/gowarcserver/pkg/index" "github.com/spf13/cobra" ) @@ -84,33 +79,6 @@ func runE(c *conf) error { } defer c.writer.Close() - readFile(c) + ReadFile(c) return nil } - -// TODO: return error -func readFile(c *conf) { - opts := &warcoptions.WarcOptions{Strict: false} - wf, err := warcreader.NewWarcFilename(c.fileName, 0, opts) - if err != nil { - return - } - defer wf.Close() - - count := 0 - - for { - wr, currentOffset, err := wf.Next() - if err == io.EOF { - break - } - if err != nil { - _, _ = fmt.Fprintf(os.Stderr, "Error: %v, rec num: %v, Offset %v\n", err.Error(), strconv.Itoa(count), currentOffset) - break - } - count++ - - c.writer.Write(wr, c.fileName, currentOffset) - } - fmt.Fprintln(os.Stderr, "Count: ", count) -} diff --git a/cmd/warcserver/cmd/index/io.go b/cmd/warcserver/cmd/index/io.go new file mode 100644 index 0000000..e5cdc23 --- /dev/null +++ b/cmd/warcserver/cmd/index/io.go @@ -0,0 +1,57 @@ +package index + +import ( + "fmt" + "io" + "strconv" + + "github.com/nlnwa/gowarc/warcoptions" + "github.com/nlnwa/gowarc/warcreader" + "github.com/nlnwa/gowarcserver/pkg/index" + logrus "github.com/sirupsen/logrus" +) + +func ParseFormat(format string) (index.CdxWriter, error) { + switch format { + case "cdx": + return &index.CdxLegacy{}, nil + case "cdxj": + return &index.CdxJ{}, nil + case "cdxpb": + return &index.CdxPb{}, nil + case "db": + return &index.CdxDb{}, nil + } + return nil, fmt.Errorf("unknwon format %v, valid formats are: 'cdx', 'cdxj', 'cdxpb', 'db'", format) +} + +func ReadFile(c *conf) error { + opts := &warcoptions.WarcOptions{Strict: false} + wf, err := warcreader.NewWarcFilename(c.fileName, 0, opts) + if err != nil { + return err + } + defer wf.Close() + + count := 0 + + // avoid defer copy value by using a anonymous function + // At the end, print count even if an error occurs + defer func() { + logrus.Printf("Count: %d", count) + }() + + for { + wr, currentOffset, err := wf.Next() + if err == io.EOF { + break + } + if err != nil { + return fmt.Errorf("Error: %v, rec num: %v, Offset %v\n", err.Error(), strconv.Itoa(count), currentOffset) + } + count++ + + c.writer.Write(wr, c.fileName, currentOffset) + } + return nil +} diff --git a/cmd/warcserver/cmd/index/io_test.go b/cmd/warcserver/cmd/index/io_test.go new file mode 100644 index 0000000..ba21a06 --- /dev/null +++ b/cmd/warcserver/cmd/index/io_test.go @@ -0,0 +1,234 @@ +package index + +import ( + "fmt" + "os" + "reflect" + "testing" + + "github.com/nlnwa/gowarcserver/pkg/index" + log "github.com/sirupsen/logrus" +) + +func TestParseFormat(t *testing.T) { + tests := []struct { + name string + format string + expected reflect.Type + errorState bool + }{ + { + "'cdx' results in CdxLegacy writer", + "cdx", + reflect.TypeOf((*index.CdxLegacy)(nil)), + false, + }, + { + "'cdxj' results in CdxJ writer", + "cdxj", + reflect.TypeOf((*index.CdxJ)(nil)), + false, + }, + { + "'db' results in CdxDb writer", + "db", + reflect.TypeOf((*index.CdxDb)(nil)), + false, + }, + { + "'cdxpb' results in CdxPd writer", + "cdxpb", + reflect.TypeOf((*index.CdxPb)(nil)), + false, + }, + { + "'cd' results in error", + "cd", + nil, + true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got, err := ParseFormat(tt.format) + if err != nil && !tt.errorState { + t.Errorf("Unexpected failure: %v", err) + } else if err == nil && tt.errorState { + t.Errorf("Expected error parsing '%v', got type %T", tt.format, got) + } + + if reflect.TypeOf(got) != tt.expected { + t.Errorf("Expected %v, got %v", tt.expected, got) + } + }) + } +} + +// TODO: this was hard to write tests for and therefore ReadFile +// should probably be refactored +func TestReadFile(t *testing.T) { + log.SetLevel(log.WarnLevel) + // same as testdata/example.warc except removed gzip content because of illegal go str characters + testFileContent := []byte(`WARC/1.0 +WARC-Date: 2017-03-06T04:03:53Z +WARC-Record-ID: +WARC-Filename: temp-20170306040353.warc.gz +WARC-Type: warcinfo +Content-Type: application/warc-fields +Content-Length: 249 + +software: Webrecorder Platform v3.7 +format: WARC File Format 1.0 +creator: temp-MJFXHZ4S +isPartOf: Temporary%20Collection +json-metadata: {"title": "Temporary Collection", "size": 2865, "created_at": 1488772924, "type": "collection", "desc": ""} + + +WARC/1.0 +WARC-Date: 2017-03-06T04:03:53Z +WARC-Record-ID: +WARC-Filename: temp-20170306040353.warc.gz +WARC-Type: warcinfo +Content-Type: application/warc-fields +Content-Length: 470 + +software: Webrecorder Platform v3.7 +format: WARC File Format 1.0 +creator: temp-MJFXHZ4S +isPartOf: Temporary%20Collection/Recording%20Session +json-metadata: {"created_at": 1488772924, "type": "recording", "updated_at": 1488773028, "title": "Recording Session", "size": 2865, "pages": [{"url": "http://example.com/", "title": "Example Domain", "timestamp": "20170306040348"}, {"url": "http://example.com/", "title": "Example Domain", "timestamp": "20170306040206"}]} + + +WARC/1.0 +WARC-Target-URI: http://example.com/ +WARC-Date: 2017-03-06T04:02:06Z +WARC-Type: response +WARC-Record-ID: +WARC-IP-Address: 93.184.216.34 +WARC-Block-Digest: sha1:DR5MBP7OD3OPA7RFKWJUD4CTNUQUGFC5 +WARC-Payload-Digest: sha1:G7HRM7BGOKSKMSXZAHMUQTTV53QOFSMK +Content-Type: application/http; msgtype=response +Content-Length: 975 + +HTTP/1.1 200 OK +Content-Encoding: gzip +Accept-Ranges: bytes +Cache-Control: max-age=604800 +Content-Type: text/html +Date: Mon, 06 Mar 2017 04:02:06 GMT +Etag: "359670651+gzip" +Expires: Mon, 13 Mar 2017 04:02:06 GMT +Last-Modified: Fri, 09 Aug 2013 23:54:35 GMT +Server: ECS (iad/182A) +Vary: Accept-Encoding +X-Cache: HIT +Content-Length: 606 +Connection: close + + + +WARC/1.0 +WARC-Type: request +WARC-Record-ID: +WARC-Target-URI: http://example.com/ +WARC-Date: 2017-03-06T04:02:06Z +WARC-Concurrent-To: +Content-Type: application/http; msgtype=request +Content-Length: 493 + +GET / HTTP/1.0 +Host: example.com +Accept-Language: en-US,en;q=0.8,ru;q=0.6 +Referer: https://webrecorder.io/temp-MJFXHZ4S/temp/recording-session/record/http://example.com/ +Upgrade-Insecure-Requests: 1 +Connection: close +User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36 +Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8 +Dnt: 1 +Accept-Encoding: gzip, deflate, sdch, br + + + +WARC/1.0 +WARC-Type: request +WARC-Record-ID: +WARC-Target-URI: http://example.com/ +WARC-Date: 2017-03-06T04:03:48Z +WARC-Concurrent-To: +Content-Type: application/http; msgtype=request +Content-Length: 493 + +GET / HTTP/1.0 +Host: example.com +Accept-Language: en-US,en;q=0.8,ru;q=0.6 +Referer: https://webrecorder.io/temp-MJFXHZ4S/temp/recording-session/record/http://example.com/ +Upgrade-Insecure-Requests: 1 +Connection: close +User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36 +Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8 +Dnt: 1 +Accept-Encoding: gzip, deflate, sdch, br + +`) + + filepath := fmt.Sprintf("%s/test.warc", t.TempDir()) + file, err := os.Create(filepath) + if err != nil { + t.Fatalf("Failed to create testfile at '%s'", filepath) + } + // This is not strictly needed because of tmp, but to be platform agnostic it might be a good idea + defer file.Close() + + _, err = file.Write(testFileContent) + if err != nil { + t.Fatalf("Failed to write to testfile at '%s'", filepath) + } + + err = file.Sync() + if err != nil { + t.Fatalf("Failed to sync testfile at '%s'", filepath) + } + + tests := []struct { + writerFormat string + writer index.CdxWriter + }{ + { + "cdx", + &index.CdxLegacy{}, + }, + { + "cdxj", + &index.CdxJ{}, + }, + { + + "cdxpd", + &index.CdxPb{}, + }, + { + "db", + &index.CdxDb{}, + }, + } + + for _, tt := range tests { + testName := fmt.Sprintf("%T successfully indexes", tt.writer) + t.Run(testName, func(t *testing.T) { + c := &conf{ + filepath, + tt.writerFormat, + tt.writer, + } + c.writer.Init() + defer c.writer.Close() + + err := ReadFile(c) + if err != nil { + t.Errorf("Unexpected failure: %v", err) + } + + }) + } +} diff --git a/cmd/warcserver/cmd/index/warcdb/cdx-index/000000.vlog b/cmd/warcserver/cmd/index/warcdb/cdx-index/000000.vlog new file mode 100644 index 0000000..da67233 Binary files /dev/null and b/cmd/warcserver/cmd/index/warcdb/cdx-index/000000.vlog differ diff --git a/cmd/warcserver/cmd/index/warcdb/cdx-index/000004.sst b/cmd/warcserver/cmd/index/warcdb/cdx-index/000004.sst new file mode 100644 index 0000000..26c0124 Binary files /dev/null and b/cmd/warcserver/cmd/index/warcdb/cdx-index/000004.sst differ diff --git a/cmd/warcserver/cmd/index/warcdb/cdx-index/KEYREGISTRY b/cmd/warcserver/cmd/index/warcdb/cdx-index/KEYREGISTRY new file mode 100644 index 0000000..4355c2c --- /dev/null +++ b/cmd/warcserver/cmd/index/warcdb/cdx-index/KEYREGISTRY @@ -0,0 +1 @@ +]1IoH% bHello Badger \ No newline at end of file diff --git a/cmd/warcserver/cmd/index/warcdb/cdx-index/MANIFEST b/cmd/warcserver/cmd/index/warcdb/cdx-index/MANIFEST new file mode 100644 index 0000000..d9c092b Binary files /dev/null and b/cmd/warcserver/cmd/index/warcdb/cdx-index/MANIFEST differ diff --git a/cmd/warcserver/cmd/index/warcdb/file-index/000000.vlog b/cmd/warcserver/cmd/index/warcdb/file-index/000000.vlog new file mode 100644 index 0000000..ce8443b Binary files /dev/null and b/cmd/warcserver/cmd/index/warcdb/file-index/000000.vlog differ diff --git a/cmd/warcserver/cmd/index/warcdb/file-index/KEYREGISTRY b/cmd/warcserver/cmd/index/warcdb/file-index/KEYREGISTRY new file mode 100644 index 0000000..68dcdc3 --- /dev/null +++ b/cmd/warcserver/cmd/index/warcdb/file-index/KEYREGISTRY @@ -0,0 +1 @@ +3`W^=Hello Badger \ No newline at end of file diff --git a/cmd/warcserver/cmd/index/warcdb/file-index/MANIFEST b/cmd/warcserver/cmd/index/warcdb/file-index/MANIFEST new file mode 100644 index 0000000..0b55969 Binary files /dev/null and b/cmd/warcserver/cmd/index/warcdb/file-index/MANIFEST differ diff --git a/cmd/warcserver/cmd/index/warcdb/id-index/000000.vlog b/cmd/warcserver/cmd/index/warcdb/id-index/000000.vlog new file mode 100644 index 0000000..1a091b1 Binary files /dev/null and b/cmd/warcserver/cmd/index/warcdb/id-index/000000.vlog differ diff --git a/cmd/warcserver/cmd/index/warcdb/id-index/KEYREGISTRY b/cmd/warcserver/cmd/index/warcdb/id-index/KEYREGISTRY new file mode 100644 index 0000000..37c0ce6 --- /dev/null +++ b/cmd/warcserver/cmd/index/warcdb/id-index/KEYREGISTRY @@ -0,0 +1 @@ +/CěO+5nqHello Badger \ No newline at end of file diff --git a/cmd/warcserver/cmd/index/warcdb/id-index/MANIFEST b/cmd/warcserver/cmd/index/warcdb/id-index/MANIFEST new file mode 100644 index 0000000..0b55969 Binary files /dev/null and b/cmd/warcserver/cmd/index/warcdb/id-index/MANIFEST differ diff --git a/testdata/example-trunc.warc b/testdata/example-trunc.warc index 6a1e735..43ca9c2 100644 Binary files a/testdata/example-trunc.warc and b/testdata/example-trunc.warc differ diff --git a/testdata/example.warc b/testdata/example.warc index 4bc3089..41edf11 100644 Binary files a/testdata/example.warc and b/testdata/example.warc differ