diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..cafda86 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +/tmp/* +/brown/* +.DS_Store \ No newline at end of file diff --git a/README.md b/README.md index b209b68..15f99d9 100644 --- a/README.md +++ b/README.md @@ -33,6 +33,10 @@ A letter (or letters) after the field tag indicates to output only those subfields. For example "907xz" means output subfield "x" and "z" in field "907". +The program supports a `format` parameter to output to formats other than MARC line delimited (MRK), such as JSON or MARC binary. Notice that not all the features are available in all the formats yet. + +You can also pass `start` and `count` parameters to output only a range of MARC records. + ## Sample data Files under `./data/` are small MARC files that I use for testing. @@ -49,7 +53,7 @@ Download the code and play with it: git clone https://github.com/hectorcorrea/marcli.git cd marcli go build -./marcli -file data/test_1a.mrc +./marcli -file data/test_1a.mrc ``` diff --git a/dirEntry.go b/dirEntry.go deleted file mode 100644 index a9d7d0f..0000000 --- a/dirEntry.go +++ /dev/null @@ -1,41 +0,0 @@ -package main - -import ( - "errors" - "fmt" - "strconv" -) - -type DirEntry struct { - Tag string - Length int - StartsAt int - raw string -} - -func NewDirEntry(entry string) (DirEntry, error) { - if len(entry) != 12 { - return DirEntry{raw: entry}, errors.New("Incomplete field definition") - } - - length, _ := strconv.Atoi(entry[3:7]) - if length == 0 { - return DirEntry{raw: entry}, errors.New("Empty directory entry detected") - } - - startsAt, _ := strconv.Atoi(entry[7:]) - dir := DirEntry{ - Tag: entry[0:3], - Length: length, - StartsAt: startsAt, - raw: entry, - } - return dir, nil -} - -func (d DirEntry) String() string { - if d.Tag == "" { - return fmt.Sprintf("raw: %s", d.raw) - } - return fmt.Sprintf("tag: %s len: %d starts at: %d", d.Tag, d.Length, d.StartsAt) -} diff --git a/export/json.go b/export/json.go 
new file mode 100644 index 0000000..263dbed --- /dev/null +++ b/export/json.go @@ -0,0 +1,64 @@ +package export + +import ( + "encoding/json" + "errors" + "fmt" + "io" + "marcli/marc" + "os" +) + +// TODO: Add support for JSONL (JSON line delimited) format that makes JSON +// easier to parse with Unix tools like grep, tail, and so on. +func ToJson(filename string, searchValue string, filters marc.FieldFilters, start int, count int) error { + if len(filters.Fields) > 0 { + return errors.New("filters not supported for this format") + } + + if count == 0 { + return nil + } + + file, err := os.Open(filename) + if err != nil { + return err + } + defer file.Close() + + var i, out int + marc := marc.NewMarcFile(file) + + fmt.Printf("[") + for marc.Scan() { + r, err := marc.Record() + if err == io.EOF { + break + } + if err != nil { + return err + } + if i++; i < start { + continue + } + if r.Contains(searchValue) { + if out > 0 { + fmt.Printf(",\r\n") + } else { + fmt.Printf("\r\n") + } + b, err := json.Marshal(r.Filter(filters)) + if err != nil { + fmt.Printf("%s\r\n", err) + } + // fmt.Printf("{ \"record\": %s}\r\n", b) + fmt.Printf("%s", b) + if out++; out == count { + break + } + } + } + fmt.Printf("\r\n]\r\n") + + return marc.Err() +} diff --git a/export/mrc.go b/export/mrc.go new file mode 100644 index 0000000..df55b25 --- /dev/null +++ b/export/mrc.go @@ -0,0 +1,49 @@ +package export + +import ( + "errors" + "fmt" + "io" + "marcli/marc" + "os" +) + +func ToMrc(filename string, searchValue string, filters marc.FieldFilters, start int, count int) error { + if len(filters.Fields) > 0 { + return errors.New("filters not supported for this format") + } + + if count == 0 { + return nil + } + + file, err := os.Open(filename) + if err != nil { + return err + } + defer file.Close() + + var i, out int + marc := marc.NewMarcFile(file) + for marc.Scan() { + r, err := marc.Record() + if err == io.EOF { + break + } + if err != nil { + return err + } + + if i++; i < start { + 
continue + } + + if r.Contains(searchValue) { + fmt.Printf("%s", r.Raw()) + if out++; out == count { + break + } + } + } + return marc.Err() +} diff --git a/export/mrk.go b/export/mrk.go new file mode 100644 index 0000000..ed7c448 --- /dev/null +++ b/export/mrk.go @@ -0,0 +1,55 @@ +package export + +import ( + "fmt" + "io" + "marcli/marc" + "os" +) + +func ToMrk(filename string, searchValue string, filters marc.FieldFilters, start int, count int) error { + if count == 0 { + return nil + } + + file, err := os.Open(filename) + if err != nil { + return err + } + defer file.Close() + + var i, out int + marc := marc.NewMarcFile(file) + for marc.Scan() { + + r, err := marc.Record() + if err == io.EOF { + break + } + if err != nil { + return err + } + + if i++; i < start { + continue + } + + if r.Contains(searchValue) { + str := "" + if filters.IncludeLeader() { + str += fmt.Sprintf("%s\r\n", r.Leader) + } + for _, field := range r.Filter(filters) { + str += fmt.Sprintf("%s\r\n", field) + } + if str != "" { + fmt.Printf("%s\r\n", str) + if out++; out == count { + break + } + } + } + } + + return marc.Err() +} diff --git a/export/solr.go b/export/solr.go new file mode 100644 index 0000000..8668899 --- /dev/null +++ b/export/solr.go @@ -0,0 +1,152 @@ +package export + +import ( + "encoding/json" + "errors" + "fmt" + "io" + "marcli/marc" + "os" + "strings" +) + +type SolrDocument struct { + Id string `json:"id"` + Author string `json:"author_txt_en,omitempty"` + AuthorDate string `json:"authorDate_s,omitempty"` + AuthorFuller string `json:"authorFuller_txt_en,omitempty"` + AuthorsOther []string `json:"authorsOther_txts_en,omitempty"` + Title string `json:"title_txt_en,omitempty"` + Responsibility string `json:"responsibility_txt_en,omitempty"` + Publisher string `json:"publisher_txt_en,omitempty"` + Urls []string `json:"urls_ss,omitempty"` + Subjects []string `json:"subjects_txts_en,omitempty"` + SubjectsForm []string `json:"subjectsForm_txts_en,omitempty"` + SubjectsGeneral 
[]string `json:"subjectsGeneral_txts_en,omitempty"` + SubjectsChrono []string `json:"subjectsChrono_txts_en,omitempty"` + SubjectsGeo []string `json:"subjectsGeo_txts_en,omitempty"` +} + +func NewSolrDocument(r marc.Record) SolrDocument { + doc := SolrDocument{} + id := r.GetValue("001", "") + if id == "" { + id = "INVALID" + } + doc.Id = strings.TrimSpace(id) + author := r.GetValue("100", "a") + if author != "" { + doc.Author = author + doc.AuthorDate = r.GetValue("100", "d") + doc.AuthorFuller = r.GetValue("100", "q") + } else { + doc.Author = r.GetValue("110", "a") + doc.AuthorDate = "" + doc.AuthorFuller = "" + } + doc.AuthorsOther = r.GetValues("700", "a") + + titleA := r.GetValue("245", "a") + titleB := r.GetValue("245", "b") + titleC := r.GetValue("245", "c") + doc.Title = concat(titleA, titleB) + doc.Responsibility = titleC + + doc.Publisher = r.GetValue("260", "a") + doc.Urls = r.GetValues("856", "u") + doc.Subjects = subjects(r, "a") + doc.SubjectsForm = subjects(r, "v") + doc.SubjectsGeneral = subjects(r, "x") + doc.SubjectsChrono = subjects(r, "y") + doc.SubjectsGeo = subjects(r, "z") + return doc +} + +func ToSolr(filename string, searchValue string, filters marc.FieldFilters, start int, count int) error { + if len(filters.Fields) > 0 { + return errors.New("filters not supported for this format") + } + + if count == 0 { + return nil + } + + file, err := os.Open(filename) + if err != nil { + return err + } + defer file.Close() + + var i, out int + marc := marc.NewMarcFile(file) + + fmt.Printf("[") + for marc.Scan() { + r, err := marc.Record() + if err == io.EOF { + break + } + if err != nil { + return err + } + if i++; i < start { + continue + } + if r.Contains(searchValue) { + if out > 0 { + fmt.Printf(",\r\n") + } else { + fmt.Printf("\r\n") + } + doc := NewSolrDocument(r) + b, err := json.Marshal(doc) + if err != nil { + fmt.Printf("%s\r\n", err) + } + fmt.Printf("%s", b) + if out++; out == count { + break + } + } + } + fmt.Printf("\r\n]\r\n") + + 
return marc.Err() +} + +func subjects(r marc.Record, subfield string) []string { + var values []string + for _, fieldValue := range r.GetValues("650", subfield) { + values = append(values, trimPeriod(fieldValue)) + } + return values +} + +func concat(a, b string) string { + return _concat(a, b, " ") +} + +func concatTab(a, b string) string { + return _concat(a, b, "\t") +} + +func _concat(a, b, sep string) string { + if a == "" && b == "" { + return "" + } else if a == "" && b != "" { + return strings.TrimSpace(b) + } else if a != "" && b == "" { + return strings.TrimSpace(a) + } + return strings.TrimSpace(a) + sep + strings.TrimSpace(b) +} + +func trimPeriod(s string) string { + if s == "" || s == "." { + return "" + } + if strings.HasSuffix(s, ".") { + return strings.TrimSpace(s[:len(s)-1]) + } + return s +} diff --git a/field.go b/field.go deleted file mode 100644 index 061e010..0000000 --- a/field.go +++ /dev/null @@ -1,191 +0,0 @@ -package main - -import ( - "fmt" - "strings" -) - -// Represents a single subfield value. -// For example in: -// =650 \0$aDiabetes$xComplications$zUnited States. -// an example of SubFieldValue will be: -// SubFieldValue{ -// SubField: "a", -// Value: "Diabetes" -// } -type SubFieldValue struct { - SubField string - Value string -} - -// Represents the entire value for a field. -// For example in: -// =650 \0$aDiabetes$xComplications$zUnited States. -// Field would be: -// Field{ -// Tag: "650", -// Ind1:" ", -// Ind2: "0", -// RawValue: "$aDiabetes$xComplications$zUnited States." 
-// SubFields (see SubFieldValue definition above) -// } -type Field struct { - Tag string - Ind1 string - Ind2 string - RawValue string // includes indicators and separator character - SubFields []SubFieldValue -} - -type Fields struct { - fields []Field -} - -func (v SubFieldValue) String() string { - return fmt.Sprintf("$%s%s", v.SubField, v.Value) -} - -func (f Fields) All() []Field { - return f.fields -} - -func (f *Fields) Add(field Field) { - f.fields = append(f.fields, field) -} - -func NewField(tag, valueStr string) Field { - value := Field{Tag: tag} - if tag <= "008" { - // Control fields (001-008) don't have indicators or subfields - // so we just get the value as-is. - value.RawValue = valueStr - return value - } - - // Process the indicators and subfields - if len(valueStr) >= 2 { - value.Ind1 = string(valueStr[0]) - value.Ind2 = string(valueStr[1]) - } - if len(valueStr) > 2 { - // notice that we skip the indicators [0] and [1] because they are handled - // above and valueStr[2] because that's a separator character - value.RawValue = valueStr[3:] - } - value.SubFields = NewSubFieldValues(valueStr) - return value -} - -func NewSubFieldValues(valueStr string) []SubFieldValue { - var values []SubFieldValue - // valueStr comes with the indicators, we skip them: - // valueStr[0] indicator 1 - // valueStr[1] indicator 2 - // valueStr[2] separator (ascii 31/0x1f) - separator := 0x1f - tokens := strings.Split(valueStr[3:], string(separator)) - for _, token := range tokens { - value := SubFieldValue{ - SubField: string(token[0]), - Value: token[1:], - } - values = append(values, value) - } - return values -} - -func (f Field) String() string { - ind1 := formatIndicator(f.Ind1) - ind2 := formatIndicator(f.Ind2) - strValue := "" - if len(f.SubFields) > 0 { - // use the subfield values - for _, s := range f.SubFields { - strValue += fmt.Sprintf("$%s%s", s.SubField, s.Value) - } - } else { - // use the raw value - strValue = f.RawValue - } - return 
fmt.Sprintf("=%s %s%s%s", f.Tag, ind1, ind2, strValue) -} - -func (f Field) SubFieldValue(subfield string) string { - for _, s := range f.SubFields { - if s.SubField == subfield { - return s.Value - } - } - return "" -} - -// For a given value, extract the subfield values in the string -// indicated. "subfields" is a plain string, like "abu", to -// indicate subfields a, b, and u. -func (f Field) SubFieldValues(subfields string) []SubFieldValue { - var values []SubFieldValue - for _, sub := range f.SubFields { - if strings.Contains(subfields, sub.SubField) { - value := SubFieldValue{ - SubField: sub.SubField, - Value: sub.Value, - } - values = append(values, value) - } - } - return values -} - -func formatIndicator(value string) string { - if value == " " { - return "\\" - } - return value -} - -func (f Fields) Get(tag string) []Field { - var fields []Field - for _, field := range f.fields { - if field.Tag == tag { - fields = append(fields, field) - } - } - return fields -} - -func (f Fields) GetOne(tag string) (bool, Field) { - fields := f.Get(tag) - if len(fields) == 0 { - return false, Field{} - } - return true, fields[0] -} - -func (f Fields) GetValue(tag string, subfield string) string { - value := "" - found, field := f.GetOne(tag) - if found { - if subfield == "" { - value = field.RawValue - } else { - value = field.SubFieldValue(subfield) - } - } - return value -} - -func (f Fields) GetValues(tag string, subfield string) []string { - var values []string - for _, field := range f.Get(tag) { - var value string - if subfield == "" { - value = field.RawValue - } else { - value = field.SubFieldValue(subfield) - } - if value != "" { - values = append(values, value) - } - } - return values -} diff --git a/leader.go b/leader.go deleted file mode 100644 index 5bc50eb..0000000 --- a/leader.go +++ /dev/null @@ -1,26 +0,0 @@ -package main - -import ( - "errors" - "fmt" - "strconv" -) - -type Leader struct { - raw string - Length int - DataOffset int -} - -func 
NewLeader(value string) (Leader, error) { - if len(value) != 24 { - return Leader{}, errors.New("Incomplete leader") - } - l, _ := strconv.Atoi(value[0:5]) - o, _ := strconv.Atoi(value[12:17]) - return Leader{raw: value, Length: l, DataOffset: o}, nil -} - -func (l Leader) String() string { - return fmt.Sprintf("=LDR %s", l.raw) -} diff --git a/main.go b/main.go index 1efdedf..30b6ba5 100644 --- a/main.go +++ b/main.go @@ -1,52 +1,48 @@ package main import ( + "errors" "flag" "fmt" + "marcli/export" + "marcli/marc" "strings" ) var fileName, search, fields, format string +var start, count int func init() { flag.StringVar(&fileName, "file", "", "MARC file to process. Required.") flag.StringVar(&search, "match", "", "Only records that match the string passed, case insensitive.") flag.StringVar(&fields, "fields", "", "Comma delimited list of fields to output.") - flag.StringVar(&format, "format", "mrk", "Output format. Accepted values: mrk, json, or solr.") + flag.StringVar(&format, "format", "mrk", "Output format. 
Accepted values: mrk, mrc, json, or solr.") + flag.IntVar(&start, "start", 1, "Number of first record to load") + flag.IntVar(&count, "count", -1, "Total number of records to load (-1 no limit)") + flag.Parse() } func main() { - if fileName == "" { fmt.Printf("marcli parameters:\r\n") flag.PrintDefaults() return } - - file, err := NewMarcFile(fileName) - if err != nil { - panic(err) - } - + var err error searchValue := strings.ToLower(search) - var processor Processor - if format == "brown" { - processor = ProcessorBrown{ - Filters: NewFieldFilters(fields), - } + filters := marc.NewFieldFilters(fields) + if format == "mrc" { + err = export.ToMrc(fileName, searchValue, filters, start, count) + } else if format == "mrk" { + err = export.ToMrk(fileName, searchValue, filters, start, count) + } else if format == "json" { + err = export.ToJson(fileName, searchValue, filters, start, count) } else if format == "solr" { - processor = ProcessorSolr{ - Filters: NewFieldFilters(fields), - } + err = export.ToSolr(fileName, searchValue, filters, start, count) } else { - processor = ConsoleProcessor{ - Filters: NewFieldFilters(fields), - Format: format, - } + err = errors.New("Invalid format") } - err = file.ReadAll(processor, searchValue) - if err != nil { panic(err) } diff --git a/marc/field.go b/marc/field.go new file mode 100644 index 0000000..b6525b9 --- /dev/null +++ b/marc/field.go @@ -0,0 +1,126 @@ +package marc + +import ( + "bytes" + "errors" + "fmt" + "strings" +) + +// Field represents a field inside a MARC record. Notice that the +// field could be a "control" field (tag 001-009) or a "data" field +// (any other tag) +// +// For example in: +// =650 \0$aDiabetes$xComplications$zUnited States. 
+// Field would be: +// Field{ +// Tag: "650", +// Value: "" +// Indicator1: " ", +// Indicator2: "0", +// SubFields (see SubField definition below) +// } +type Field struct { + Tag string // for both Control and Data fields + Value string // for Control fields + Indicator1 string // for Data fields + Indicator2 string // for Data fields + SubFields []SubField // for Data fields +} + +// SubField contains a Code and a Value. +// For example in: +// =650 \0$aDiabetes$xComplications$zUnited States. +// an example of SubField will be: +// SubField{ +// Code: "a", +// Value: "Diabetes" +// } +type SubField struct { + Code string + Value string +} + +// MakeField creates a Field object with the data received. +func MakeField(tag string, data []byte) (Field, error) { + f := Field{} + f.Tag = tag + + // It's a control field + if strings.HasPrefix(tag, "00") { + f.Value = string(data) + return f, nil + } + + if len(data) > 2 { + f.Indicator1 = string(data[0]) + f.Indicator2 = string(data[1]) + } else { + return f, errors.New("Invalid Indicators detected") + } + + for _, sf := range bytes.Split(data[3:], []byte{st}) { + if len(sf) > 0 { + f.SubFields = append(f.SubFields, SubField{string(sf[0]), string(sf[1:])}) + } else { + return f, errors.New("Extraneous field terminator") + } + } + return f, nil +} + +// IsControlField returns true if the field is a control field (tag 001-009) +func (f Field) IsControlField() bool { + return strings.HasPrefix(f.Tag, "00") +} + +// Contains returns true if the field contains the passed string. 
+func (f Field) Contains(str string) bool { + str = strings.ToLower(str) + if f.IsControlField() { + return strings.Contains(strings.ToLower(f.Value), str) + } + + for _, sub := range f.SubFields { + if strings.Contains(strings.ToLower(sub.Value), str) { + return true + } + } + return false +} + +func (f Field) String() string { + if f.IsControlField() { + return fmt.Sprintf("=%s %s", f.Tag, f.Value) + } + str := fmt.Sprintf("=%s %s%s", f.Tag, formatIndicator(f.Indicator1), formatIndicator(f.Indicator2)) + for _, sub := range f.SubFields { + str += fmt.Sprintf("$%s%s", sub.Code, sub.Value) + } + return str +} + +// GetSubFields returns an array of subfields that match the set of subfields +// indicated in the filter string. "filter" is a plain string, like "abu", to +// indicate what subfields are to be returned. +func (f Field) GetSubFields(filter string) []SubField { + values := []SubField{} + for _, sub := range f.SubFields { + if strings.Contains(filter, sub.Code) { + value := SubField{ + Code: sub.Code, + Value: sub.Value, + } + values = append(values, value) + } + } + return values +} + +func formatIndicator(value string) string { + if value == " " { + return "\\" + } + return value +} diff --git a/filters.go b/marc/filters.go similarity index 58% rename from filters.go rename to marc/filters.go index 8d95ac4..469eb23 100644 --- a/filters.go +++ b/marc/filters.go @@ -1,4 +1,4 @@ -package main +package marc import ( "errors" @@ -29,28 +29,20 @@ func NewFieldFilters(fieldsStr string) FieldFilters { } filters := FieldFilters{} for _, value := range strings.Split(fieldsStr, ",") { - filters.addFilter(value) - } - return filters -} - -func (filters FieldFilters) String() string { - s := "Filters {\r\n" - for _, field := range filters.Fields { - if field.Subfields == "" { - s += fmt.Sprintf("\tTag: %s\r\n", field.Tag) - } else { - s += fmt.Sprintf("\tTag: %s subfields: %s\r\n", field.Tag, field.Subfields) + filter, err := NewFieldFilter(value) + if err != nil { + 
// TODO: handle error + return FieldFilters{} } + filters.Fields = append(filters.Fields, filter) } - s += "}\r\n" - return s + return filters } // fieldStr is a string in the format NNNabc -func (filters *FieldFilters) addFilter(fieldStr string) error { +func NewFieldFilter(fieldStr string) (FieldFilter, error) { if len(fieldStr) < 3 { - return errors.New("Invalid field string (too short)") + return FieldFilter{}, errors.New("Invalid field string (too short)") } tag := fieldStr[0:3] subfields := "" @@ -58,36 +50,20 @@ func (filters *FieldFilters) addFilter(fieldStr string) error { subfields = fieldStr[3:] } filter := FieldFilter{Tag: tag, Subfields: subfields} - filters.Fields = append(filters.Fields, filter) - return nil + return filter, nil } -// For a given list of fields, it returns only those that -// match the filters. The filter is done by Tag and if -// available by Sub Field. -func (filters FieldFilters) Apply(fields Fields) Fields { - if len(filters.Fields) == 0 { - return fields - } - - filtered := Fields{} - for _, filter := range filters.Fields { - // Process all the fields that match the tag - // (there could be more than one) - for _, field := range fields.Get(filter.Tag) { - if len(filter.Subfields) == 0 { - // add the value as-is, no need to filter by subfield - filtered.Add(field) - } else { - //... 
filter the field by subfield - filteredField := field - filteredField.RawValue = "" - filteredField.SubFields = field.SubFieldValues(filter.Subfields) - filtered.Add(filteredField) - } +func (filters FieldFilters) String() string { + s := "Filters {\r\n" + for _, field := range filters.Fields { + if field.Subfields == "" { + s += fmt.Sprintf("\tTag: %s\r\n", field.Tag) + } else { + s += fmt.Sprintf("\tTag: %s subfields: %s\r\n", field.Tag, field.Subfields) } } - return filtered + s += "}\r\n" + return s } func (filters FieldFilters) IncludeField(name string) bool { @@ -106,11 +82,3 @@ func (filters FieldFilters) IncludeLeader() bool { } return filters.IncludeField("LDR") } - -func (filters FieldFilters) IncludeFileInfo() bool { - return filters.IncludeField("FIN") -} - -func (filters FieldFilters) IncludeRecordInfo() bool { - return filters.IncludeField("RIN") -} diff --git a/marc/leader.go b/marc/leader.go new file mode 100644 index 0000000..72f7333 --- /dev/null +++ b/marc/leader.go @@ -0,0 +1,51 @@ +package marc + +import ( + "errors" + "fmt" + "strconv" +) + +// Leader represents the leader of the MARC record. +type Leader struct { + raw []byte + dataOffset int + Status byte // 05 byte position + Type byte // 06 + BibLevel byte // 07 + Control byte // 08 + EncodingLevel byte // 17 + Form byte // 18 + Multipart byte // 19 +} + +// NewLeader creates a Leader from the data in the MARC record. 
+func NewLeader(bytes []byte) (Leader, error) { + if len(bytes) != 24 { + return Leader{}, errors.New("Incomplete leader") + } + + // length, _ := strconv.Atoi(string(bytes[0:5])) + offset, err := strconv.Atoi(string(bytes[12:17])) + if err != nil { + msg := fmt.Sprintf("Could not determine data offset from leader (%s)", string(bytes)) + return Leader{}, errors.New(msg) + } + + leader := Leader{ + raw: bytes, + dataOffset: offset, + Status: bytes[5], + Type: bytes[6], + BibLevel: bytes[7], + Control: bytes[8], + EncodingLevel: bytes[17], + Form: bytes[18], + Multipart: bytes[19], + } + return leader, nil +} + +func (l Leader) String() string { + return fmt.Sprintf("=LDR %s", string(l.raw)) +} diff --git a/marc/marcfile.go b/marc/marcfile.go new file mode 100644 index 0000000..a6b2574 --- /dev/null +++ b/marc/marcfile.go @@ -0,0 +1,104 @@ +package marc + +import ( + "bufio" + "bytes" + "errors" + "os" + "strconv" +) + +const ( + rt = 0x1d // End of record + st = 0x1f // End of subfield +) + +// MarcFile represents a MARC file. +// The public interface more or less mimic Go's native Scanner (Scan, Err, Text) +type MarcFile struct { + scanner *bufio.Scanner +} + +// NewMarcFile creates a scanner to manage reading the contents +// of the MARC file using Go's native Scanner interface. +// (stolen from https://github.com/MITLibraries/fml) +func NewMarcFile(file *os.File) MarcFile { + scanner := bufio.NewScanner(file) + + // By default Scanner.Scan() returns "bufio.Scanner: token too long" if + // the block to read is longer than 64K. Since MARC records can be up to + // 100K we use a custom value. 
See https://stackoverflow.com/a/37455465/446681 + initialBuffer := make([]byte, 0, 64*1024) + customMaxSize := 105 * 1024 + scanner.Buffer(initialBuffer, customMaxSize) + + scanner.Split(splitFunc) + return MarcFile{scanner: scanner} +} + +func splitFunc(data []byte, atEOF bool) (advance int, token []byte, err error) { + if atEOF && len(data) == 0 { + return 0, nil, nil + } + + if atEOF { + return len(data), data, nil + } + + if i := bytes.IndexByte(data, rt); i >= 0 { + return i + 1, data[0:i], nil + } + + return 0, nil, nil +} + +// Err returns the error in the scanner (if any) +func (file *MarcFile) Err() error { + return file.scanner.Err() +} + +// Scan moves the scanner to the next record. +// Returns false when no more records can be read. +func (file *MarcFile) Scan() bool { + return file.scanner.Scan() +} + +// Record returns the current Record in the MarcFile. +func (file *MarcFile) Record() (Record, error) { + bytes := file.scanner.Bytes() + rec := Record{} + rec.Data = append([]byte(nil), bytes...) 
+ + leader, err := NewLeader(bytes[0:24]) + if err != nil { + return rec, err + } + rec.Leader = leader + + start := leader.dataOffset + data := bytes[start:] + dirs := bytes[24 : start-1] + + for len(dirs) >= 12 { + tag := string(dirs[:3]) + length, err := strconv.Atoi(string(dirs[3:7])) + if err != nil { + return rec, errors.New("Could not determine length of field") + } + begin, err := strconv.Atoi(string(dirs[7:12])) + if err != nil { + return rec, errors.New("Could not determine field start") + } + if len(data) <= begin+length-1 { + return rec, errors.New("Reported field length incorrect") + } + fdata := data[begin : begin+length-1] // length includes field terminator + df, err := MakeField(tag, fdata) + if err != nil { + return rec, err + } + rec.Fields = append(rec.Fields, df) + dirs = dirs[12:] + } + return rec, nil +} diff --git a/marc/record.go b/marc/record.go new file mode 100644 index 0000000..467b2c9 --- /dev/null +++ b/marc/record.go @@ -0,0 +1,130 @@ +package marc + +import ( + "fmt" + "strings" +) + +// Record is a struct representing a MARC record. It has a Fields slice +// which contains both ControlFields and DataFields. +type Record struct { + Data []byte + Fields []Field + Leader Leader +} + +// Contains returns true if Record contains the value passed. +func (r Record) Contains(searchValue string) bool { + if searchValue == "" { + return true + } + for _, field := range r.Fields { + if field.Contains(searchValue) { + return true + } + } + return false +} + +// ControlNum returns the control number (tag 001) for the record. +func (r Record) ControlNum() string { + for _, f := range r.Fields { + if f.Tag == "001" { + return f.Value + } + } + return "" +} + +func (r Record) Raw() []byte { + // Include the record terminator. + return append(r.Data, rt) +} + +func (r Record) String() string { + return fmt.Sprintf("Leader: %s", r.Leader) +} + +// Filter returns the fields in the record that match +// the given filter. 
+func (r Record) Filter(filters FieldFilters) []Field { + if len(filters.Fields) == 0 { + return r.Fields + } + + list := []Field{} + for _, filter := range filters.Fields { + // Get all the fields in the record that match the tag + // (there could be more than one) + for _, field := range r.FieldsByTag(filter.Tag) { + if len(filter.Subfields) == 0 { + // add the value as-is, no need to filter by subfield + list = append(list, field) + } else { + // extract the indicated subfields from the field + // before adding it to the list + filteredField := Field{ + Tag: field.Tag, + Value: field.Value, + Indicator1: field.Indicator1, + Indicator2: field.Indicator2, + SubFields: field.GetSubFields(filter.Subfields), + } + list = append(list, filteredField) + } + } + } + + return list +} + +// FieldsByTag returns an array with the fields in the record for the given tag +func (r Record) FieldsByTag(tag string) []Field { + var fields []Field + for _, field := range r.Fields { + if field.Tag == tag { + fields = append(fields, field) + } + } + return fields +} + +// GetValue returns the first value for a field tag/subfield combination. +func (r Record) GetValue(tag string, subfield string) string { + for _, field := range r.FieldsByTag(tag) { + if field.IsControlField() { + return field.Value + } + if subfield == "" { + // No subfield indicated, return the string version of the field + // TODO: Return the values rather than "=NNN \\ $aAAA $bBBB" + return field.String() + } + for _, sub := range field.SubFields { + if sub.Code == subfield { + // Return the first instance of the requested subfield + return sub.Value + } + } + } + return "" +} + +// GetValues returns the values that match the field tag/subfield combination. 
+func (r Record) GetValues(tag string, subfield string) []string { + values := []string{} + for _, field := range r.FieldsByTag(tag) { + if strings.TrimSpace(subfield) == "" { + // No subfield indicated, return the string version of the field + values = append(values, field.String()) + } else { + for _, sub := range field.SubFields { + if sub.Code == subfield { + // Return the first instance of the requested subfield + values = append(values, sub.Value) + } + } + } + } + return values +} diff --git a/marcfile.go b/marcfile.go deleted file mode 100644 index ae0c463..0000000 --- a/marcfile.go +++ /dev/null @@ -1,163 +0,0 @@ -package main - -import ( - "bufio" - "errors" - "fmt" - "io" - "os" -) - -type Processor interface { - ProcessRecord(*MarcFile, Record) - Header() - Footer() - Separator() -} - -type MarcFile struct { - Name string - f *os.File - records int - outputCount int - lastGoodRecord Record -} - -func NewMarcFile(filename string) (MarcFile, error) { - f, err := os.Open(filename) - if err != nil { - return MarcFile{}, err - } - return MarcFile{Name: filename, f: f, records: 0}, nil -} - -func (file *MarcFile) Close() { - file.f.Close() -} - -func (file *MarcFile) ReadAll(processor Processor, searchValue string) error { - processor.Header() - for { - record, err := file.readRecord(processor) - if err == io.EOF { - break - } - if err != nil { - return err - } - - file.records++ - - if record.IsMatch(searchValue) { - if file.outputCount > 0 { - processor.Separator() - } - processor.ProcessRecord(file, record) - file.outputCount++ - } - } - file.f.Close() - processor.Footer() - return nil -} - -func (file *MarcFile) readRecord(processor Processor) (Record, error) { - leader, err := file.readLeader() - if err != nil { - return Record{}, err - } - - directory, err := file.readDirectory() - if err != nil { - file.stopProcessing(err) - } - - fields := file.readValues(directory) - record := Record{ - Leader: leader, - Directory: directory, - Fields: fields, - Pos: 
file.records, - } - file.lastGoodRecord = record - return record, nil -} - -func (file *MarcFile) readLeader() (Leader, error) { - bytes := make([]byte, 24) - _, err := file.f.Read(bytes) - if err != nil { - return Leader{}, err - } - return NewLeader(string(bytes)) -} - -func (file *MarcFile) readDirectory() ([]DirEntry, error) { - const RecordSeparator = 0x1e - - // Source: https://www.socketloop.com/references/golang-bufio-scanrunes-function-example - offset := file.currentOffset() - reader := bufio.NewReader(file.f) - ss, err := reader.ReadString(RecordSeparator) - if err != nil { - return nil, err - } - count := (len(ss) - 1) / 12 - directory := make([]DirEntry, count) - for i := 0; i < count; i++ { - start := i * 12 - entry := ss[start : start+12] - field, err := NewDirEntry(entry) - if err != nil { - errMsg := fmt.Sprintf("%s (raw directory: %s)", err, ss) - return nil, errors.New(errMsg) - } - directory[i] = field - } - // ReadString leaves the file pointer a bit further than we want to. - // Force it to be exactly at the end of the directory. 
- file.f.Seek(offset+int64(len(ss)), 0) - return directory, nil -} - -func (file *MarcFile) currentOffset() int64 { - offset, _ := file.f.Seek(0, 1) - return offset -} - -func (file *MarcFile) readValues(directory []DirEntry) Fields { - var fields Fields - for _, entry := range directory { - buffer := make([]byte, entry.Length) - n, err := file.f.Read(buffer) - if err != nil && err != io.EOF { - file.stopProcessing(err) - } - if n <= 0 { - file.stopProcessing(errors.New("Value of length zero detected")) - } - value := string(buffer[:n-1]) // -1 to exclude the record separator character (0x1e) - field := NewField(entry.Tag, value) - fields.Add(field) - } - - eor := make([]byte, 1) - n, err := file.f.Read(eor) - if n != 1 { - file.stopProcessing(errors.New("End of record byte not found")) - } - - if err != nil { - file.stopProcessing(err) - } - return fields -} - -func (file *MarcFile) stopProcessing(err error) { - msg := fmt.Sprintf("Records processed: %d\r\n", file.records) - if file.records > 0 { - msg += fmt.Sprintf("Last record processed: %s\r\n", file.lastGoodRecord) - } - msg += fmt.Sprintf("Error: %s", err) - panic(msg) -} diff --git a/processorBrown.go b/processorBrown.go deleted file mode 100644 index e7482cc..0000000 --- a/processorBrown.go +++ /dev/null @@ -1,174 +0,0 @@ -package main - -import ( - "fmt" - "strings" -) - -type ProcessorBrown struct { - Filters FieldFilters - SearchValue string -} - -type BrownRecord struct { - Bib string - Title string - Items []BrownItem -} - -type BrownItem struct { - Callnumber string - Barcode string -} - -func NewBrownRecord(r Record) BrownRecord { - b := BrownRecord{} - b.Bib = bib(r) - b.Title = pad(r.Fields.GetValue("245", "a")) - b.Items = items(r) - return b -} - -func (p ProcessorBrown) Header() { - header := "" - if len(p.Filters.Fields) == 0 { - header = "bib\ttitle\tcallnumber\tbarcode" - } else { - header = p.outputString("bib", "title", "callnumber", "barcode") - } - fmt.Printf("%s\r\n", header) -} - -func 
(p ProcessorBrown) Footer() { -} - -func (p ProcessorBrown) ProcessRecord(f *MarcFile, r Record) { - b := NewBrownRecord(r) - if len(b.Items) == 0 { - // fmt.Printf("%s\t%s\t%s\r\n", b.Bib, b.Title, "--") - } else { - for _, item := range b.Items { - output := p.outputString(b.Bib, b.Title, item.Callnumber, item.Barcode) - fmt.Printf("%s\r\n", output) - } - } -} - -func (p ProcessorBrown) Separator() { -} - -func notEmpty(str string) string { - if len(str) == 0 { - return "-" - } - return str -} - -func (p ProcessorBrown) outputString(bib, title, callnumber, barcode string) string { - output := "" - allFields := len(p.Filters.Fields) == 0 - - if allFields || p.Filters.IncludeField("bib") { - output = notEmpty(bib) - } - if allFields || p.Filters.IncludeField("tit") { - output = concatTab(output, pad(notEmpty(title))) - } - if allFields || p.Filters.IncludeField("cal") { - output = concatTab(output, notEmpty(callnumber)) - } - if allFields || p.Filters.IncludeField("bar") { - output = concatTab(output, notEmpty(barcode)) - } - return output -} - -func bib(r Record) string { - bib := r.Fields.GetValue("907", "a") - if bib != "" { - bib = bib[1:(len(bib) - 1)] - } - return bib -} - -func baseCallNumber(r Record) (bool, Field) { - // 090 ab LC CALL NO(c) - if found, field := r.Fields.GetOne("090"); found { - return true, field - } - - // 091 ab HARRIS CALL NO(e) - if found, field := r.Fields.GetOne("091"); found { - return true, field - } - - // 092 ab JCB CALL NO(f) - if found, field := r.Fields.GetOne("092"); found { - return true, field - } - - // 096 ab SUDOCS CALL NO(v) - if found, field := r.Fields.GetOne("096"); found { - return true, field - } - - // 099 ab OTHER BROWN CALL (l) - if found, field := r.Fields.GetOne("099"); found { - return true, field - } - - return false, Field{} -} - -func barcode(f Field) string { - barcode := f.SubFieldValue("i") - barcode = removeSpaces(barcode) - if barcode == "" { - return "N/A" - } - return barcode -} - -func items(r 
Record) []BrownItem { - var items []BrownItem - - marcItems := r.Fields.Get("945") - if len(marcItems) == 0 { - return items - } - - // Base call number from the 09X field - found, f_090 := baseCallNumber(r) - if !found { - return items - } - - f_090a := f_090.SubFieldValue("a") - f_090b := f_090.SubFieldValue("b") - f_090f := f_090.SubFieldValue("f") // 1-SIZE - // get the call numbers from the items - for _, f_945 := range marcItems { - barcode := barcode(f_945) - base := concat3(f_090f, f_090a, f_090b) - f_945a := f_945.SubFieldValue("a") - f_945b := f_945.SubFieldValue("b") - if f_945a != "" { - // use the values in the item record - base = concat3(f_090f, f_945a, f_945b) - } - volume := f_945.SubFieldValue("c") - copy := f_945.SubFieldValue("g") - if copy == "1" { - copy = "" - } else if copy > "1" { - copy = "c. " + copy - } - number := concat3(base, volume, copy) - if strings.HasSuffix(number, "\\") { - number = number[0 : len(number)-1] - } - item := BrownItem{Callnumber: number, Barcode: barcode} - items = append(items, item) - } - return items -} diff --git a/processorSolr.go b/processorSolr.go deleted file mode 100644 index 4d3dfa0..0000000 --- a/processorSolr.go +++ /dev/null @@ -1,96 +0,0 @@ -package main - -import ( - "encoding/json" - "fmt" - "strings" -) - -type ProcessorSolr struct { - Filters FieldFilters - SearchValue string -} - -type SolrDocument struct { - Id string `json:"id"` - Author string `json:"author_txt_en,omitempty"` - AuthorDate string `json:"authorDate_s,omitempty"` - AuthorFuller string `json:"authorFuller_txt_en,omitempty"` - AuthorsOther []string `json:"authorsOther_txts_en,omitempty"` - Title string `json:"title_txt_en,omitempty"` - Responsibility string `json:"responsibility_txt_en,omitempty"` - Publisher string `json:"publisher_txt_en,omitempty"` - Urls []string `json:"urls_ss,omitempty"` - Subjects []string `json:"subjects_txts_en,omitempty"` - SubjectsForm []string `json:"subjectsForm_txts_en,omitempty"` - SubjectsGeneral 
[]string `json:"subjectsGeneral_txts_en,omitempty"` - SubjectsChrono []string `json:"subjectsChrono_txts_en,omitempty"` - SubjectsGeo []string `json:"subjectsGeo_txts_en,omitempty"` -} - -func NewSolrDocument(r Record) SolrDocument { - doc := SolrDocument{} - id := r.Fields.GetValue("001", "") - if id == "" { - id = "INVALID" - } - doc.Id = strings.TrimSpace(id) - author := r.Fields.GetValue("100", "a") - if author != "" { - doc.Author = author - doc.AuthorDate = r.Fields.GetValue("100", "d") - doc.AuthorFuller = r.Fields.GetValue("100", "q") - } else { - doc.Author = r.Fields.GetValue("110", "a") - doc.AuthorDate = "" - doc.AuthorFuller = "" - } - doc.AuthorsOther = r.Fields.GetValues("700", "a") - - titleA := r.Fields.GetValue("245", "a") - titleB := r.Fields.GetValue("245", "b") - titleC := r.Fields.GetValue("245", "c") - doc.Title = concat(titleA, titleB) - doc.Responsibility = titleC - - doc.Publisher = r.Fields.GetValue("260", "a") - doc.Urls = r.Fields.GetValues("856", "u") - doc.Subjects = subjects(r, "a") - doc.SubjectsForm = subjects(r, "v") - doc.SubjectsGeneral = subjects(r, "x") - doc.SubjectsChrono = subjects(r, "y") - doc.SubjectsGeo = subjects(r, "z") - return doc -} - -func (p ProcessorSolr) Header() { - fmt.Printf("[\r\n") -} - -func (p ProcessorSolr) Footer() { - fmt.Printf("\r\n]\r\n") -} - -func (p ProcessorSolr) ProcessRecord(f *MarcFile, r Record) { - doc := NewSolrDocument(r) - str, err := json.Marshal(doc) - if err != nil { - fmt.Printf("%s\r\n", err) - } - fmt.Printf("%s", str) -} - -func (p ProcessorSolr) Separator() { - fmt.Printf(", \r\n") -} - -func subjects(r Record, subfield string) []string { - var values []string - for _, f_650 := range r.Fields.Get("650") { - value := f_650.SubFieldValue(subfield) - if value != "" { - values = append(values, trimPeriod(value)) - } - } - return values -} diff --git a/processors.go b/processors.go deleted file mode 100644 index b5316ec..0000000 --- a/processors.go +++ /dev/null @@ -1,71 +0,0 @@ 
-package main - -import ( - "encoding/json" - "fmt" -) - -type ConsoleProcessor struct { - Filters FieldFilters - SearchValue string - Format string -} - -func (p ConsoleProcessor) Header() { - if p.Format == "json" { - fmt.Printf("[\r\n") - } -} - -func (p ConsoleProcessor) Footer() { - if p.Format == "json" { - fmt.Printf("]\r\n") - } -} - -func (p ConsoleProcessor) ProcessRecord(f *MarcFile, r Record) { - if p.Format == "json" { - p.outputJson(r, f.Name) - } else { - p.outputMrk(r, f.Name) - } -} - -func (p ConsoleProcessor) Separator() { - if p.Format == "json" { - fmt.Printf(", \r\n") - } else { - fmt.Printf("\r\n") - } -} - -func (p ConsoleProcessor) outputMrk(r Record, filename string) { - str := "" - if p.Filters.IncludeLeader() { - str += fmt.Sprintf("%s\r\n", r.Leader) - } - if p.Filters.IncludeRecordInfo() { - str += fmt.Sprintf("=RIN pos=%d, length=%d, data offset=%d\r\n", r.Pos, r.Leader.Length, r.Leader.DataOffset) - } - if p.Filters.IncludeFileInfo() { - str += fmt.Sprintf("=FIN %s\r\n", filename) - } - filteredFields := p.Filters.Apply(r.Fields) - for _, field := range filteredFields.All() { - str += fmt.Sprintf("%s\r\n", field) - } - if str != "" { - fmt.Printf("%s\r\n", str) - } -} - -func (p ConsoleProcessor) outputJson(r Record, filename string) { - // TODO: Handle Leader, RecordInfo, and FileInfo fields - filteredFields := p.Filters.Apply(r.Fields) - b, err := json.Marshal(filteredFields.All()) - if err != nil { - fmt.Printf("%s\r\n", err) - } - // fmt.Printf("{ \"record\": %s}\r\n", b) - fmt.Printf("%s\r\n", b) -} diff --git a/record.go b/record.go deleted file mode 100644 index fbc9e80..0000000 --- a/record.go +++ /dev/null @@ -1,29 +0,0 @@ -package main - -import ( - "fmt" - "strings" -) - -type Record struct { - Leader Leader - Directory []DirEntry - Fields Fields - Pos int -} - -func (r Record) IsMatch(searchValue string) bool { - if searchValue == "" { - return true - } - for _, field := range r.Fields.All() { - if 
strings.Contains(strings.ToLower(field.RawValue), searchValue) { - return true - } - } - return false -} - -func (r Record) String() string { - return fmt.Sprintf("Leader: %s", r.Leader) -} diff --git a/utils.go b/utils.go deleted file mode 100644 index f63ab10..0000000 --- a/utils.go +++ /dev/null @@ -1,50 +0,0 @@ -package main - -import ( - "fmt" - "strings" -) - -func pad(str string) string { - if len(str) > 40 { - return str[0:40] - } - return fmt.Sprintf("%-40s", str) -} - -func concat(a, b string) string { - return _concat(a, b, " ") -} - -func concatTab(a, b string) string { - return _concat(a, b, "\t") -} - -func _concat(a, b, sep string) string { - if a == "" && b == "" { - return "" - } else if a == "" && b != "" { - return strings.TrimSpace(b) - } else if a != "" && b == "" { - return strings.TrimSpace(a) - } - return strings.TrimSpace(a) + sep + strings.TrimSpace(b) -} - -func concat3(a, b, c string) string { - return concat(concat(a, b), c) -} - -func removeSpaces(s string) string { - return strings.Replace(s, " ", "", -1) -} - -func trimPeriod(s string) string { - if s == "" || s == "." { - return "" - } - if strings.HasSuffix(s, ".") { - return strings.TrimSpace(s[:len(s)-1]) - } - return s -}