diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index ef51b22..73b0e3b 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -11,7 +11,7 @@ jobs: steps: - uses: actions/setup-go@v4 with: - go-version: 1.17 + go-version: 1.19 - name: Checkout uses: actions/checkout@v4 - name: golangci-lint diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 5f1dde0..bebb1a8 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -11,5 +11,5 @@ jobs: - uses: actions/checkout@v4 - uses: actions/setup-go@v4 with: - go-version: '^1.17.0' + go-version: '^1.19.0' - run: go test ./... diff --git a/doc.go b/doc.go index 7718c93..6914eb7 100644 --- a/doc.go +++ b/doc.go @@ -15,18 +15,32 @@ */ /* -Package gowarc allows parsing, creating and validating WARC-records. -Reading, writing and validating WARC-files is also supported. +Package gowarc provides a framework for handling WARC files, enabling their parsing, creation, and validation. -WARC +# WARC Overview The WARC format offers a standard way to structure, manage and store billions of resources collected from the web and elsewhere. It is used to build applications for harvesting, managing, accessing, mining and exchanging content. -To learn more about the WARC standard, read the specification at https://iipc.github.io/warc-specifications/specifications/warc-format/warc-1.1/ +For more details, visit the WARC specification: https://iipc.github.io/warc-specifications/specifications/warc-format/warc-1.1/ -Creating a WARC record +# WARC record creation -To create a WARC record. +The [WarcRecordBuilder], initialized via [NewRecordBuilder], is the primary tool for creating WARC records. +By default, the WarcRecordBuilder generates a record id and calculates the 'Content-Length' and 'WARC-Block-Digest'. + +Use [WarcFileWriter], initialized with [NewWarcFileWriter], to write WARC files. 
+ +# WARC record parsing + +To parse single WARC records, use the [Unmarshaler] initialized with [NewUnmarshaler]. + +To read entire WARC files, employ the [WarcFileReader] initialized through [NewWarcFileReader]. + +# Validation and repair + +The gowarc package supports validation during both the creation and parsing of WARC records. +Control over the scope of validation and the handling of validation errors can be achieved by setting the appropriate +options in the [WarcRecordBuilder], [Unmarshaler], or [WarcFileReader]. */ package gowarc diff --git a/README.md b/docs/README.md similarity index 79% rename from README.md rename to docs/README.md index b4e4159..5a01d89 100644 --- a/README.md +++ b/docs/README.md @@ -1,12 +1,11 @@ ![Lint](https://github.com/nlnwa/gowarc/workflows/golangci-lint/badge.svg) -![GoReleaser](https://github.com/nlnwa/gowarc/workflows/goreleaser/badge.svg) +[![Release](https://img.shields.io/github/release/nlnwa/gowarc.svg)](https://github.com/nlnwa/gowarc/releases/latest) +[![License](https://img.shields.io/github/license/nlnwa/gowarc)](/LICENSE) [![PkgGoDev](https://pkg.go.dev/badge/github.com/nlnwa/gowarc)](https://pkg.go.dev/github.com/nlnwa/gowarc) -> This project is currently in alpha. Expect API changes and enhanced documentation to come. - # gowarc -A library for creating, parsing and evaluating WARC-records, written in go. +A library for creating, parsing and validating WARC-files, written in Go. ### What is WARC? @@ -26,6 +25,8 @@ $ go get github.com/nlnwa/gowarc #### Create a new WARC record +To get you started, here is a simple example of how to create a new WARC record. 
+ ```go package main @@ -54,16 +55,11 @@ func main() { } ``` -#### Expected output - -``` -WARC record: version: WARC/1.1, type: response, id: <urn:uuid:e9a0cecc-0221-11e7-adb1-0242ac120008> -``` - ### godoc For complete documentation and examples consult the godoc online at: https://pkg.go.dev/github.com/nlnwa/gowarc -## Command line -https://github.com/nlnwa/warchaeology is a command line tool that use gowarc. +## Command line tools + +[warchaeology](https://github.com/nlnwa/warchaeology) is a command line tool based on gowarc. diff --git a/example_test.go b/example_test.go index e2b6880..6387f36 100644 --- a/example_test.go +++ b/example_test.go @@ -14,26 +14,104 @@ * limitations under the License. */ -package gowarc +package gowarc_test -import "fmt" +import ( + "bufio" + "bytes" + "fmt" + "github.com/nlnwa/gowarc" + "io" +) -func Example_basic() { - builder := NewRecordBuilder(Response) +func ExampleNewRecordBuilder() { + builder := gowarc.NewRecordBuilder(gowarc.Response) _, err := builder.WriteString("HTTP/1.1 200 OK\nDate: Tue, 19 Sep 2016 17:18:40 GMT\nServer: Apache/2.0.54 (Ubuntu)\n" + "Last-Modified: Mon, 16 Jun 2013 22:28:51 GMT\nETag: \"3e45-67e-2ed02ec0\"\nAccept-Ranges: bytes\n" + "Content-Length: 19\nConnection: close\nContent-Type: text/plain\n\nThis is the content") if err != nil { panic(err) } - builder.AddWarcHeader(WarcRecordID, "<urn:uuid:e9a0cecc-0221-11e7-adb1-0242ac120008>") - builder.AddWarcHeader(WarcDate, "2006-01-02T15:04:05Z") - builder.AddWarcHeader(ContentLength, "257") - builder.AddWarcHeader(ContentType, "application/http;msgtype=response") - builder.AddWarcHeader(WarcBlockDigest, "sha1:B285747AD7CC57AA74BCE2E30B453C8D1CB71BA4") + builder.AddWarcHeader(gowarc.WarcRecordID, "<urn:uuid:e9a0cecc-0221-11e7-adb1-0242ac120008>") + builder.AddWarcHeader(gowarc.WarcDate, "2006-01-02T15:04:05Z") + builder.AddWarcHeader(gowarc.ContentLength, "257") + builder.AddWarcHeader(gowarc.ContentType, "application/http;msgtype=response") + builder.AddWarcHeader(gowarc.WarcBlockDigest, "sha1:B285747AD7CC57AA74BCE2E30B453C8D1CB71BA4") if wr, v, err := builder.Build(); err == nil { 
fmt.Println(wr, v) } // Output: WARC record: version: WARC/1.1, type: response, id: urn:uuid:e9a0cecc-0221-11e7-adb1-0242ac120008 } + +func ExampleUnmarshaler() { + data := bytes.NewBufferString("  WARC/1.1\r\n" + + "WARC-Date: 2017-03-06T04:03:53Z\r\n" + + "WARC-Record-ID: <urn:uuid:e9a0cecc-0221-11e7-adb1-0242ac120008>\r\n" + + "WARC-Filename: temp-20170306040353.warc.gz\r\n" + + "WARC-Type: warcinfo\r\n" + + "Content-Type: application/warc-fields\r\n" + + "Warc-Block-Digest: sha1:AF4D582B4FFC017D07A947D841E392A821F754F3\r\n" + + "Content-Length: 34\r\n" + + "\r\n" + + "format: WARC File Format 1.1\r\n" + + "\r\n\r\n") + input := bufio.NewReader(data) + + // Create a new unmarshaler + unmarshaler := gowarc.NewUnmarshaler(gowarc.WithSpecViolationPolicy(gowarc.ErrWarn), gowarc.WithSyntaxErrorPolicy(gowarc.ErrWarn)) + wr, off, validation, err := unmarshaler.Unmarshal(input) + if err == nil { + fmt.Printf("Offset: %d, %s\n%s", off, wr, validation) + } + + // Output: Offset: 2, WARC record: version: WARC/1.1, type: warcinfo, id: urn:uuid:e9a0cecc-0221-11e7-adb1-0242ac120008 + // gowarc: Validation errors: + // 1: gowarc: record was found 2 bytes after expected offset + // 2: block: wrong digest: expected sha1:AF4D582B4FFC017D07A947D841E392A821F754F3, computed: sha1:8A936F9FD60D664CF95B1FFB40F1C4093E65BB40 +} + +func ExampleNewWarcFileWriter() { + nameGenerator := &gowarc.PatternNameGenerator{Directory: "directory-name"} + + w := gowarc.NewWarcFileWriter(gowarc.WithFileNameGenerator(nameGenerator)) + defer func() { + w.Close() + }() + + builder := gowarc.NewRecordBuilder(gowarc.Response, gowarc.WithStrictValidation()) + _, err := builder.WriteString("HTTP/1.1 200 OK\r\nDate: Tue, 19 Sep 2016 17:18:40 GMT\r\nContent-Length: 19 ....") + if err != nil { + panic(err) + } + builder.AddWarcHeader(gowarc.WarcRecordID, "") + builder.AddWarcHeader(gowarc.WarcDate, "2006-01-02T15:04:05Z") + builder.AddWarcHeader(gowarc.ContentType, "application/http;msgtype=response") + + if wr, _, err := builder.Build(); err == nil { 
w.Write(wr) + } +} + +func ExampleNewWarcFileReader() { + reader, err := gowarc.NewWarcFileReader("test.warc.gz", 0, gowarc.WithStrictValidation()) + if err != nil { + fmt.Println("Error creating warc reader:", err) + return + } + + for { + record, _, _, err := reader.Next() + if err == io.EOF { + break + } + if err != nil { + fmt.Println("Error reading record:", err) + return + } + fmt.Println("Record type:", record.Type().String()) + fmt.Println("Record version:", record.Version()) + // Do more with record as per needs + } + +} diff --git a/go.mod b/go.mod index a03c3c8..006de5b 100644 --- a/go.mod +++ b/go.mod @@ -16,8 +16,8 @@ require ( github.com/kr/pretty v0.3.1 // indirect github.com/pkg/errors v0.9.1 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect - golang.org/x/net v0.17.0 // indirect - golang.org/x/sys v0.13.0 // indirect - golang.org/x/text v0.13.0 // indirect + golang.org/x/net v0.19.0 // indirect + golang.org/x/sys v0.15.0 // indirect + golang.org/x/text v0.14.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect ) diff --git a/go.sum b/go.sum index 418ec1b..ccbf2c6 100644 --- a/go.sum +++ b/go.sum @@ -1034,7 +1034,7 @@ golang.org/x/crypto v0.0.0-20220622213112-05595931fe9d/go.mod h1:IxCIyHEi3zRg3s0 golang.org/x/crypto v0.0.0-20220722155217-630584e8d5aa/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= golang.org/x/crypto v0.0.0-20221012134737-56aed061732a/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= golang.org/x/crypto v0.1.0/go.mod h1:RecgLatLF4+eUMCP1PoPZQb+cVrJcOPbHkTkbkB9sbw= -golang.org/x/crypto v0.14.0/go.mod h1:MVFd36DqK4CsrnJYDkBA3VC4m2GkXAM0PvzMCn4JQf4= +golang.org/x/crypto v0.16.0/go.mod h1:gCAAfMLgwOJRpTjQ2zCCt2OcSfYMTeZVSRtQlPC7Nq4= golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190306152737-a1d7652674e8/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190510132918-efd6b22b2522/go.mod 
h1:ZjyILWgesfNpC6sMxTJOJm9Kp84zZh5NQWvqDGG3Qr8= @@ -1150,8 +1150,8 @@ golang.org/x/net v0.0.0-20221014081412-f15817d10f9b/go.mod h1:YDH+HFinaLZZlnHAfS golang.org/x/net v0.1.0/go.mod h1:Cx3nUiGt4eDBEyega/BKRp+/AlGL8hYe7U9odMt2Cco= golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg= -golang.org/x/net v0.17.0 h1:pVaXccu2ozPjCXewfr1S7xza/zcXTity9cCdXQYSjIM= -golang.org/x/net v0.17.0/go.mod h1:NxSsAGuq816PNPmqtQdLE42eU2Fs7NoRIZrHJAlaCOE= +golang.org/x/net v0.19.0 h1:zTwKpTd2XuCqf8huc7Fo2iSy+4RHPd10s4KzeTnVr1c= +golang.org/x/net v0.19.0/go.mod h1:CfAk/cbD4CthTvqiEl8NpboMuiuOYsAr/7NOjZJtv1U= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= @@ -1302,15 +1302,15 @@ golang.org/x/sys v0.0.0-20220919091848-fb04ddd9f9c8/go.mod h1:oPkhp1MJrh7nUepCBc golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.13.0 h1:Af8nKPmuFypiUBjVoU9V20FiaFXOcuZI21p0ycVYYGE= -golang.org/x/sys v0.13.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.15.0 h1:h48lPFYpsTvQJZF4EKyI4aLHaev3CxivZmv7yZig9pc= +golang.org/x/sys v0.15.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/term v0.0.0-20201117132131-f5c789dd3221/go.mod h1:Nr5EML6q2oocZ2LXRh80K7BxOlk5/8JxuGnuhpl+muw= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod 
h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= golang.org/x/term v0.1.0/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo= -golang.org/x/term v0.13.0/go.mod h1:LTmsnFJwVN6bCy1rVCoS+qHT1HhALEFxKncY3WNNh4U= +golang.org/x/term v0.15.0/go.mod h1:BDl952bC7+uMoWR75FIrCDx79TPU9oHkTZ9yRbYOrX0= golang.org/x/text v0.0.0-20170915032832-14c0d48ead0c/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.1-0.20180807135948-17ff2d5776d2/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= @@ -1324,8 +1324,8 @@ golang.org/x/text v0.3.8/go.mod h1:E6s5w1FMmriuDzIBO73fBruAKo1PCIq6d2Q6DHfQ8WQ= golang.org/x/text v0.4.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= -golang.org/x/text v0.13.0 h1:ablQoSUd0tRdKxZewP80B+BaqeKJuVhuRxj/dkrun3k= -golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= +golang.org/x/text v0.14.0 h1:ScX5w1eTa3QqT8oi6+ziP7dTV1S2+ALU0bI+0zXKWiQ= +golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= golang.org/x/time v0.0.0-20180412165947-fbb02b2291d2/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20181108054448-85acf8d2951c/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= diff --git a/record.go b/record.go index 9d8cc71..31cfdcf 100644 --- a/record.go +++ b/record.go @@ -34,28 +34,52 @@ const ( crlfcrlf = "\r\n\r\n" // Carriage return, Newline, Carriage return, Newline ) +// WarcRecord is the interface implemented by types that can 
represent a WARC record. +// A new instance of WarcRecord is created by a [WarcRecordBuilder]. type WarcRecord interface { + // Version returns the WARC version of the record. Version() *WarcVersion + + // Type returns the WARC record type. Type() RecordType + + // WarcHeader returns the WARC header fields. WarcHeader() *WarcFields + + // Block returns the content block of the record. Block() Block + + // RecordId returns the WARC-Record-ID header field. RecordId() string + + // ContentLength returns the Content-Length header field. ContentLength() (int64, error) + + // Date returns the WARC-Date header field. Date() (time.Time, error) + + // String returns a string representation of the record. String() string + + // Closer closes the record and releases any resources associated with it. io.Closer + // ToRevisitRecord takes RevisitRef referencing the record we want to make a revisit of and returns a revisit record. ToRevisitRecord(ref *RevisitRef) (WarcRecord, error) - // RevisitRef extracts a RevisitRef current record if it is a revisit record. + + // RevisitRef extracts a RevisitRef from the current record if it is a revisit record. RevisitRef() (*RevisitRef, error) + // CreateRevisitRef creates a RevisitRef which references the current record. // - // The RevisitRef might be used by another records ToRevisitRecord to create a revisit record referencing this record. + // The RevisitRef might be used by another record's ToRevisitRecord to create a revisit record referencing this record. CreateRevisitRef(profile string) (*RevisitRef, error) + // Merge merges this record with its referenced record(s) // // It is implemented only for revisit records, but this function will be enhanced to also support segmented records. Merge(record ...WarcRecord) (WarcRecord, error) + // ValidateDigest validates block and payload digests if present. // // If option FixDigest is set, an invalid or missing digest will be corrected in the header. 
@@ -71,6 +95,10 @@ type WarcRecord interface { ValidateDigest(validation *Validation) error } +// WarcVersion represents a WARC specification version. +// +// For record creation, only WARC 1.0 and 1.1 are supported, which are represented by the constants [V1_0] and [V1_1]. +// During parsing of a record, the WarcVersion will take on the version value found in the record itself. type WarcVersion struct { id uint8 txt string @@ -78,6 +106,7 @@ type WarcVersion struct { minor uint8 } +// String returns a string representation of the WARC version in the format used by WARC files, i.e. 'WARC/1.0' or 'WARC/1.1'. func (v *WarcVersion) String() string { return "WARC/" + v.txt } @@ -96,8 +125,10 @@ var ( V1_1 = &WarcVersion{id: 2, txt: "1.1", major: 1, minor: 1} // WARC 1.1 ) +// RecordType represents the type of a WARC record. type RecordType uint16 +// String returns a string representation of the record type. func (rt RecordType) String() string { switch rt { case 1: diff --git a/unmarshaler.go b/unmarshaler.go index fbe1a9c..b33a5ba 100644 --- a/unmarshaler.go +++ b/unmarshaler.go @@ -25,10 +25,23 @@ import ( "io" ) +// Unmarshaler is the interface implemented by types that can unmarshal a WARC record. A new instance of Unmarshaler is created by calling [NewUnmarshaler]. +// NewUnmarshaler accepts a number of options that can be used to control the unmarshalling process. See [WarcRecordOption] for details. +// +// Unmarshal parses the WARC record from the given reader and returns: +// - The parsed WARC record. If an error occurred during the parsing, the returned WARC record might be nil. +// - The offset value indicating the number of bytes that have been discarded until the start of a new record is found. +// - A pointer to a [Validation] object that stores any errors or warnings encountered during the parsing process. +// The validation object is only populated if the error policy is set to ErrWarn or ErrFail. +// - The standard error object in Go. 
If no error occurred during the parsing, this object is nil. Otherwise, it contains details about the encountered error. +// +// If the reader contains multiple records, Unmarshal parses the first record and returns. +// If the reader contains no records, Unmarshal returns an [io.EOF] error. type Unmarshaler interface { Unmarshal(b *bufio.Reader) (WarcRecord, int64, *Validation, error) } +// unmarshaler implements the Unmarshaler interface. type unmarshaler struct { opts *warcRecordOptions warcFieldsParser *warcfieldsParser @@ -45,6 +58,7 @@ func NewUnmarshaler(opts ...WarcRecordOption) Unmarshaler { return u } +// Unmarshal implements the Unmarshal method in the Unmarshaler interface. func (u *unmarshaler) Unmarshal(b *bufio.Reader) (WarcRecord, int64, *Validation, error) { var r *bufio.Reader var offset int64 diff --git a/warcfile.go b/warcfile.go index 9b8624b..fe854c7 100644 --- a/warcfile.go +++ b/warcfile.go @@ -47,13 +47,13 @@ type WarcFileNameGenerator interface { // (https://iipc.github.io/warc-specifications/specifications/warc-format/warc-1.1/#annex-c-informative-warc-file-size-and-name-recommendations). // The pattern is like golangs fmt package (https://pkg.go.dev/fmt), but allows for named fields in curly braces. // The available predefined names are: -// * prefix - content of the Prefix field -// * ext - content of the Extension field -// * ts - current time as 14-digit GMT Time-stamp -// * serial - atomically increased serial number for every generated file name. Initial value is 0 if Serial field is not set -// * ip - primary IP address of the node -// * host - host name of the node -// * hostOrIp - host name of the node, falling back to IP address if host name could not be resolved +// - prefix - content of the Prefix field +// - ext - content of the Extension field +// - ts - current time as 14-digit GMT Time-stamp +// - serial - atomically increased serial number for every generated file name. 
Initial value is 0 if Serial field is not set +// - ip - primary IP address of the node +// - host - host name of the node +// - hostOrIp - host name of the node, falling back to IP address if host name could not be resolved type PatternNameGenerator struct { Directory string // Directory to store warcfiles. Defaults to the empty string Prefix string // Prefix available to be used in pattern. Defaults to the empty string @@ -104,6 +104,13 @@ func (g *PatternNameGenerator) NewWarcfileName() (string, string) { return g.Directory, name } +// WarcFileWriter is used to write WARC files. +// Use [NewWarcFileWriter] to create a new instance. +// +// The WarcFileWriter writes to one or more files simultaneously. The number of files is controlled by the [WithMaxConcurrentWriters] option. +// The WarcFileWriter will create a new file when the current file size exceeds the value set by the [WithMaxFileSize] option. +// File names are generated by the [WarcFileNameGenerator] set by the [WithFileNameGenerator] option. +// The WarcFileWriter will add a Warcinfo record to each file if the [WithWarcInfoFunc] option is set. type WarcFileWriter struct { opts *warcFileWriterOptions writers []*singleWarcFileWriter @@ -460,6 +467,8 @@ func (w *singleWarcFileWriter) close() error { return nil } +// WarcFileReader is used to read WARC files. +// Use [NewWarcFileReader] to create a new instance. type WarcFileReader struct { file *os.File initialOffset int64 @@ -476,6 +485,9 @@ var inputBufPool = sync.Pool{ }, } +// NewWarcFileReader creates a new [WarcFileReader] from the supplied filename. +// If offset is > 0, the reader will start reading from that offset. +// The WarcFileReader can be configured with options. See [WarcRecordOption]. 
func NewWarcFileReader(filename string, offset int64, opts ...WarcRecordOption) (*WarcFileReader, error) { info, err := os.Stat(filename) if err != nil { @@ -509,23 +521,23 @@ func NewWarcFileReader(filename string, offset int64, opts ...WarcRecordOption) } // Next reads the next WarcRecord from the WarcFileReader. +// The method also provides the offset at which the record is found within the file. // -// Returned values depends on the errorPolicy options set on WarcFileReader: -// -// If set to ErrIgnore for all errors, a WarcRecord and its offset is returned without any validation. Error is only returned -// if the file is to bad to be able to parse anything meaningful. +// The validation and error values that Next produces depend on the errorPolicy options that have been set on the WarcFileReader: // -// If set to ErrWarn for all errors, the same as with ErrIgnore is returned, but record is validated and all validation -// errors are collected in a Validation object which can be examined. +// - [ErrIgnore]: This setting ignores all errors. A WarcRecord and its offset are returned without any validation. +// An error is only returned if the file is so badly formatted that nothing meaningful can be parsed. // -// If set to ErrFail for all errors, an error is returned in case of validation error and WarcRecord is nil. +// - [ErrWarn]: Similar to ErrIgnore, this setting returns a WarcRecord and its offset. +// However, the record is validated and all validation errors are collected in a Validation object which can then be examined. // -// If different errorPolicies are set for WithSyntaxErrorPolicy, WithSpecViolationPolicy and WithUnknownRecordTypePolicy, -// then a mix of the above return values are possible. +// - [ErrFail]: If this is set, the method will return an error in the case of a validation error, and WarcRecord might be nil. // -// WarcRecord will always be nil if error is returned. 
+// - Mixed Policies: It's possible to set different error policies for different types of errors with the following options: +// [WithSyntaxErrorPolicy], [WithSpecViolationPolicy] and [WithUnknownRecordTypePolicy]. +// The return values of Next would be a mix of the aforementioned scenarios based on the policies set. // -// When at end of file, returned offset is equal to length of file and err is io.EOF. +// When at end of file, returned offset is equal to length of file, WarcRecord is nil and err is [io.EOF]. func (wf *WarcFileReader) Next() (WarcRecord, int64, *Validation, error) { var validation *Validation if wf.currentRecord != nil { @@ -680,7 +692,7 @@ func WithOpenFileSuffix(suffix string) WarcFileWriterOption { // WithFileNameGenerator sets the WarcFileNameGenerator to use for generating new Warc file names. // -// defaults to defaultGenerator +// Default is to use a [PatternNameGenerator] with the default pattern. func WithFileNameGenerator(generator WarcFileNameGenerator) WarcFileWriterOption { return newFuncWarcFileOption(func(o *warcFileWriterOptions) { o.nameGenerator = generator @@ -720,13 +732,13 @@ func WithExpectedCompressionRatio(ratio float64) WarcFileWriterOption { // WithWarcInfoFunc sets a warcinfo-record generator function to be called for every new WARC-file created. // -// The function receives a WarcRecordBuilder which is prepopulated with WARC-Record-ID, WARC-Type, WARC-Date and Content-Type. +// The function receives a [WarcRecordBuilder] which is prepopulated with WARC-Record-ID, WARC-Type, WARC-Date and Content-Type. // After the submitted function returns, Content-Length and WARC-Block-Digest fields are calculated. // // When this option is set, records written to the warcfile will have the WARC-Warcinfo-ID automatically set to point // to the generated warcinfo record. // -// Use WithRecordOptions to modify the options used to create the WarcInfo record. 
+// Use [WithRecordOptions] to modify the options used to create the WarcInfo record. // // defaults nil (no generation of warcinfo record) func WithWarcInfoFunc(f func(recordBuilder WarcRecordBuilder) error) WarcFileWriterOption {