diff --git a/.gitignore b/.gitignore index f1c181e..399f990 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,43 @@ +# Created by .ignore support plugin (hsz.mobi) +### Linux template +*~ + +# temporary files which can be created if a process still has a handle open of a deleted file +.fuse_hidden* + +# KDE directory preferences +.directory + +# Linux trash folder which might appear on any partition or disk +.Trash-* + +# .nfs files are created when an open file is removed but is still being accessed +.nfs* +### Windows template +# Windows thumbnail cache files +Thumbs.db +ehthumbs.db +ehthumbs_vista.db + +# Dump file +*.stackdump + +# Folder config file +[Dd]esktop.ini + +# Recycle Bin used on file shares +$RECYCLE.BIN/ + +# Windows Installer files +*.cab +*.msi +*.msix +*.msm +*.msp + +# Windows shortcuts +*.lnk +### Go template # Binaries for programs and plugins *.exe *.exe~ @@ -7,6 +47,67 @@ # Test binary, build with `go test -c` *.test +/vendor # Output of the go coverage tool, specifically when used with LiteIDE *.out +### Java template +# Compiled class file +*.class + +# Log file +*.log + +# BlueJ files +*.ctxt + +# Mobile Tools for Java (J2ME) +.mtj.tmp/ + +# Package Files # +*.jar +*.war +*.nar +*.ear +*.zip +*.tar.gz +*.rar + +# virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml +hs_err_pid* +### JetBrains template +# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm +# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 + +# User-specific stuff +.idea/ + +# CMake +cmake-build-*/ + +# Mongo Explorer plugin +.idea/**/mongoSettings.xml + +# File-based project format +*.iws + +# IntelliJ +out/ + +# mpeltonen/sbt-idea plugin +.idea_modules/ + +# JIRA plugin +atlassian-ide-plugin.xml + +# Cursive Clojure plugin +.idea/replstate.xml + +# Crashlytics plugin (for Android Studio and IntelliJ) +com_crashlytics_export_strings.xml 
+crashlytics.properties +crashlytics-build.properties +fabric.properties + +# Editor-based Rest Client +.idea/httpRequests diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..a62b843 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,9 @@ +language: go + +go: +- 1.10.x +- 1.x + +script: +- go build +- go test -race -v ./... diff --git a/README.md b/README.md index ab876d4..857576f 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,17 @@ # zipred -Golang library to filter and download files from within an online zip file on the fly + +[![Build Status](https://travis-ci.org/gofunky/zipred.svg)](https://travis-ci.org/gofunky/zipred) +[![GoDoc](https://godoc.org/github.com/gofunky/zipred?status.svg)](https://godoc.org/github.com/gofunky/zipred) +[![Go Report Card](https://goreportcard.com/badge/github.com/gofunky/zipred)](https://goreportcard.com/report/github.com/gofunky/zipred) +[![Codacy Badge](https://api.codacy.com/project/badge/Grade/7664447e93c742219959e310a1d3f2d9)](https://www.codacy.com/app/gofunky/zipred?utm_source=github.com&utm_medium=referral&utm_content=gofunky/zipred&utm_campaign=Badge_Grade) + +ZIP file operations can get costly, especially for large files. This library allows you to filter and extract an online zip file on the fly. + +In contrast to a conventional zip parser, it has the following benefits: +* There is less latency since data is processed directly from the buffer on the fly. +* The download can be stopped once the metadata or target file has been found. Hence, less data is transferred. +* Irrelevant data is directly discarded without memory allocation. + +This library gives you an efficient and idiomatic way for indexing zip files on the web. + +For examples, check the corresponding folder. 
diff --git a/examples/gitignore/get.go b/examples/gitignore/get.go new file mode 100644 index 0000000..6506aef --- /dev/null +++ b/examples/gitignore/get.go @@ -0,0 +1,70 @@ +package gitignore + +import ( + "errors" + "github.com/gofunky/zipred" + "os" + "strings" +) + +// gitIgnoreContent is the context that implements zipred.Zipred. +type gitIgnoreContent struct { + // patterns to filter + patterns []string + // number of patterns found + count int +} + +// URL to download the archive from +func (c *gitIgnoreContent) URL() string { + return archiveURL +} + +// Predicate indicates if the given file should be read or not. +// It is to return a zero string to discard the file, otherwise the key name. +// If error is nonempty, the download is aborted and the error is passed on. +func (c *gitIgnoreContent) Predicate(fileInfo os.FileInfo) (key string, err error) { + fileName := fileInfo.Name() + if strings.HasSuffix(fileName, gitignoreSuffix) { + alias := strings.ToLower(strings.TrimSuffix(fileName, gitignoreSuffix)) + for _, pat := range c.patterns { + if strings.ToLower(pat) == alias { + c.count++ + return alias, nil + } + } + } + return +} + +// Done indicates if enough data has been read and the download can be aborted ahead of the EOF. +// isEOF is true if the end of the zip file has been reached. +// If error is nonempty, the download is aborted and the error is passed on. +func (c *gitIgnoreContent) Done(isEOF bool) (finish bool, err error) { + if c.count == len(c.patterns) { + return true, nil + } else if isEOF { + return true, errors.New("not all given gitignore patterns could be found") + } + return +} + +// Get the given gitignore patterns. +func Get(patterns []string) (files map[string][]byte, err error) { + context := &gitIgnoreContent{ + patterns: patterns, + } + return zipred.FilterZipContent(context) +} + +// GetAll available gitignore patterns. 
+func GetAll() (files map[string][]byte, err error) { + allPatterns, err := List() + if err != nil { + return nil, err + } + context := &gitIgnoreContent{ + patterns: allPatterns, + } + return zipred.FilterZipContent(context) +} diff --git a/examples/gitignore/get_test.go b/examples/gitignore/get_test.go new file mode 100644 index 0000000..7a01445 --- /dev/null +++ b/examples/gitignore/get_test.go @@ -0,0 +1,73 @@ +package gitignore + +import ( + "testing" +) + +func TestGet(t *testing.T) { + type args struct { + patterns []string + } + tests := []struct { + name string + args args + wantFiles []string + wantErr bool + }{ + { + name: "Check some patterns", + args: args{[]string{"go", "java"}}, + wantFiles: []string{"go", "java"}, + }, + { + name: "Invalid patterns", + args: args{[]string{"invalid"}}, + wantErr: true, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + gotFiles, err := Get(tt.args.patterns) + if (err != nil) != tt.wantErr { + t.Errorf("Get() error = %v, wantErr %v", err, tt.wantErr) + return + } + if !tt.wantErr { + for _, pattern := range tt.wantFiles { + if val, ok := gotFiles[pattern]; !ok || len(val) == 0 { + t.Errorf("Get() is missing key %s or it is empty", pattern) + } + } + } + }) + } +} + +func TestGetAll(t *testing.T) { + tests := []struct { + name string + wantFiles []string + wantErr bool + }{ + { + name: "Check some patterns", + wantFiles: []string{"go", "java"}, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + gotFiles, err := GetAll() + if (err != nil) != tt.wantErr { + t.Errorf("GetAll() error = %v, wantErr %v", err, tt.wantErr) + return + } + if !tt.wantErr { + for _, pattern := range tt.wantFiles { + if val, ok := gotFiles[pattern]; !ok || len(val) == 0 { + t.Errorf("GetAll() is missing key %s or it is empty", pattern) + } + } + } + }) + } +} diff --git a/examples/gitignore/gitignore.go b/examples/gitignore/gitignore.go new file mode 100644 index 0000000..3a05eee --- /dev/null 
+++ b/examples/gitignore/gitignore.go @@ -0,0 +1,6 @@ +package gitignore + +const ( + gitignoreSuffix = ".gitignore" + archiveURL = "https://github.com/dvcs/gitignore/archive/master.zip" +) diff --git a/examples/gitignore/list.go b/examples/gitignore/list.go new file mode 100644 index 0000000..577995f --- /dev/null +++ b/examples/gitignore/list.go @@ -0,0 +1,38 @@ +package gitignore + +import ( + "github.com/gofunky/zipred" + "os" + "strings" +) + +// gitIgnoreList is the context that implements zipred.Zipred. +type gitIgnoreList struct{} + +// URL to download the archive from +func (c *gitIgnoreList) URL() string { + return archiveURL +} + +// Predicate indicates if the given file should be read or not. +// It is to return a zero string to discard the file, otherwise the key name. +// If error is nonempty, the download is aborted and the error is passed on. +func (c *gitIgnoreList) Predicate(fileInfo os.FileInfo) (key string, err error) { + fileName := fileInfo.Name() + if strings.HasSuffix(fileName, gitignoreSuffix) { + return strings.ToLower(strings.TrimSuffix(fileName, gitignoreSuffix)), nil + } + return +} + +// Done indicates if enough data has been read and the download can be aborted ahead of the EOF. +// isEOF is true if the end of the zip file has been reached. +// If error is nonempty, the download is aborted and the error is passed on. +func (c *gitIgnoreList) Done(isEOF bool) (finish bool, err error) { + return +} + +// List all available gitignore patterns. 
+func List() (patterns []string, err error) { + return zipred.FilterFileInfo(&gitIgnoreList{}) +} diff --git a/examples/gitignore/list_test.go b/examples/gitignore/list_test.go new file mode 100644 index 0000000..cd23e9b --- /dev/null +++ b/examples/gitignore/list_test.go @@ -0,0 +1,41 @@ +package gitignore + +import ( + "testing" +) + +func TestList(t *testing.T) { + tests := []struct { + name string + wantPatterns []string + wantErr bool + }{ + { + name: "Check some patterns", + wantPatterns: []string{"go", "java"}, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + gotPatterns, err := List() + if (err != nil) != tt.wantErr { + t.Errorf("List() error = %v, wantErr %v", err, tt.wantErr) + return + } + for _, want := range tt.wantPatterns { + if !contains(gotPatterns, want) { + t.Errorf("List() = %v, want %v", gotPatterns, tt.wantPatterns) + } + } + }) + } +} + +func contains(s []string, e string) bool { + for _, a := range s { + if a == e { + return true + } + } + return false +} diff --git a/zipred.go b/zipred.go new file mode 100644 index 0000000..fa38b07 --- /dev/null +++ b/zipred.go @@ -0,0 +1,163 @@ +package zipred + +import ( + "errors" + "github.com/francoispqt/onelog" + "github.com/gofunky/zipstream" + "io" + "io/ioutil" + "net/http" + "os" +) + +var log *onelog.Logger + +func init() { + log = onelog.New( + os.Stdout, + onelog.WARN|onelog.ERROR|onelog.FATAL, + ) +} + +// UseLogger sets a custom Logger for the package. +func UseLogger(logger *onelog.Logger) { + log = logger +} + +const ( + errorConst = "error" +) + +// Zipred allows the implementation of a context that dynamically defines the boundaries of the zip filtering. +type Zipred interface { + // URL to download the archive from + URL() string + // Predicate indicates if the given file should be read or not. + // It is to return a zero string to discard the file, otherwise the key name. + // If error is nonempty, the download is aborted and the error is passed on. 
+ Predicate(fileInfo os.FileInfo) (key string, err error) + // Done indicates if enough data has been read and the download can be aborted ahead of the EOF. + // isEOF is true if the end of the zip file has been reached. + // If error is nonempty, the download is aborted and the error is passed on. + Done(isEOF bool) (finish bool, err error) +} + +// FilterFileInfo filters the given URL's zip file and returns all file names on which the given predicate applies. +func FilterFileInfo(z Zipred) (keys []string, err error) { + keys = make([]string, 0) + err = parseFiles(z, func(reader *zipstream.Reader, predKey string) (storeErr error) { + keys = append(keys, predKey) + return nil + }) + if err != nil { + log.ErrorWith("the archive filtering failed"). + Err(errorConst, err).Write() + return nil, err + } + log.Info("the archive's files were successfully downloaded and filtered") + return keys, nil +} + +// FilterZipContent filters the given URL's zip file and returns all file contents on which the given predicate applies. +func FilterZipContent(z Zipred) (files map[string][]byte, err error) { + files = make(map[string][]byte, 0) + err = parseFiles(z, func(reader *zipstream.Reader, predKey string) (storeErr error) { + err = readNext(predKey, reader, files) + if err != nil { + log.ErrorWith("the given zipped file could not be read"). + String("file", predKey).Err(errorConst, err).Write() + return err + } + return nil + }) + if err != nil { + log.ErrorWith("the archive filtering failed"). + Err(errorConst, err).Write() + return nil, err + } + log.Info("the archive's files were successfully downloaded and filtered") + return files, nil +} + +// parseFiles downloads the given URL, parses the zip and passes the zip reader and predicate key to the given func. 
+func parseFiles(z Zipred, store func(reader *zipstream.Reader, predKey string) (storeErr error)) (useErr error) { + return downloadFiles(z.URL(), func(data io.ReadCloser) (useErr error) { + zipReader := zipstream.NewReader(data) + for { + header, useErr := zipReader.Next() + if useErr == io.EOF { + _, useErr := z.Done(true) + if useErr != nil { + log.ErrorWith("the archive parser has reached an erroneous state"). + Bool("isEOF", true).Err(errorConst, useErr).Write() + return useErr + } + return nil + } else if useErr != nil { + log.ErrorWith("the downloading archive could not be read"). + Err(errorConst, useErr).Write() + return useErr + } + predKey, useErr := z.Predicate(header.FileInfo()) + if useErr != nil { + log.ErrorWith("the given predicate failed"). + Err(errorConst, useErr).Write() + return useErr + } + if predKey != "" { + useErr := store(zipReader, predKey) + if useErr != nil { + log.ErrorWith("the archive parser could not store the result"). + Err(errorConst, useErr).Write() + return useErr + } + } + done, useErr := z.Done(false) + if useErr != nil { + log.ErrorWith("the archive parser has reached an erroneous state"). + Err(errorConst, useErr).Write() + return useErr + } + if done { + return nil + } + } + }) +} + +// downloadFiles downloads the given URL and passes the resulting reader to the given usage func. +func downloadFiles(URL string, usage func(data io.ReadCloser) (useErr error)) (err error) { + // URL must not be empty + if URL == "" { + log.ErrorWith("the given URL is empty"). + String("url", URL).Write() + return errors.New("URL is empty") + } + + // Download file on the fly + resp, err := http.Get(URL) + if err != nil { + log.ErrorWith("the given archive archive could not be downloaded"). + String("url", URL).Err(errorConst, err).Write() + return err + } + defer resp.Body.Close() + + // Apply predicates + err = usage(resp.Body) + + return err +} + +// readNext reads the given reader to its end and stores the content under the given key in the target map.
+func readNext(key string, reader io.Reader, target map[string][]byte) (err error) { + if key == "" { + return errors.New("empty key was given") + } + content, err := ioutil.ReadAll(reader) + if err != nil { + return err + } + target[key] = content + return nil +} diff --git a/zipred_test.go b/zipred_test.go new file mode 100644 index 0000000..7790bd2 --- /dev/null +++ b/zipred_test.go @@ -0,0 +1,127 @@ +package zipred + +import ( + "io" + "reflect" + "testing" + + "bytes" + "io/ioutil" +) + +func Test_downloadFiles(t *testing.T) { + type args struct { + URL string + usage func(data io.ReadCloser) (useErr error) + } + tests := []struct { + name string + args args + wantErr bool + }{ + { + name: "Download empty", + args: args{ + URL: "invalid", + usage: func(data io.ReadCloser) error { + return nil + }, + }, + wantErr: true, + }, + { + name: "Download custom", + args: args{ + URL: "https://github.com/gofunky/zipred/archive/master.zip", + usage: func(data io.ReadCloser) error { + _, err := ioutil.ReadAll(data) + if err != nil { + return err + } + return nil + }, + }, + }, + { + name: "Check download fail", + args: args{ + URL: "invalid", + usage: func(data io.ReadCloser) error { + return nil + }, + }, + wantErr: true, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if err := downloadFiles(tt.args.URL, tt.args.usage); (err != nil) != tt.wantErr { + t.Errorf("downloadFiles() error = %v, wantErr %v", err, tt.wantErr) + } + }) + } +} + +func Test_readNext(t *testing.T) { + firstKey := "first" + firstContent := "wins\n" + secondKey := "second" + secondContent := "loses\t" + type fields struct { + URL string + SelectedTemplates map[string]bool + AdditionalRules []string + fetchedTemplates map[string][]byte + } + type args struct { + reader io.Reader + key string + } + tests := []struct { + name string + fields fields + args []args + wantErr bool + wantTarget map[string][]byte + }{ + { + name: "Single parse", + args: 
[]args{{bytes.NewBufferString(firstContent), firstKey}}, + wantTarget: map[string][]byte{firstKey: []byte(firstContent)}, + }, + { + name: "Dual parse", + args: []args{ + {bytes.NewBufferString(firstContent), firstKey}, + {bytes.NewBufferString(firstContent), firstKey}, + }, + wantTarget: map[string][]byte{firstKey: []byte(firstContent)}, + }, + { + name: "Dual parse different keys", + args: []args{ + {bytes.NewBufferString(firstContent), firstKey}, + {bytes.NewBufferString(secondContent), secondKey}, + }, + wantTarget: map[string][]byte{firstKey: []byte(firstContent), secondKey: []byte(secondContent)}, + }, + { + name: "Without a valid key", + args: []args{{bytes.NewBufferString(firstContent), ""}}, + wantErr: true, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + target := make(map[string][]byte, len(tt.args)) + for _, arg := range tt.args { + if err := readNext(arg.key, arg.reader, target); (err != nil) != tt.wantErr { + t.Errorf("readNext() error = %v, wantErr %v", err, tt.wantErr) + } + } + if !reflect.DeepEqual(target, tt.wantTarget) && !tt.wantErr { + t.Errorf("readNext() target = %v, wantTarget %v", target, tt.wantTarget) + } + }) + } +}