diff --git a/docx.go b/docx.go index 9f70dc2..79842bd 100644 --- a/docx.go +++ b/docx.go @@ -3,13 +3,24 @@ package docconv import ( "archive/zip" "bytes" + "encoding/xml" "fmt" "io" "io/ioutil" - "regexp" "time" ) +type TypeOverride struct { + XMLName xml.Name `xml:"Override"` + ContentType string `xml:"ContentType,attr"` + PartName string `xml:"PartName,attr"` +} + +type Type struct { + XMLName xml.Name `xml:"Types"` + Overrides []TypeOverride `xml:"Override"` +} + // ConvertDocx converts an MS Word docx file to text. func ConvertDocx(r io.Reader) (string, map[string]string, error) { meta := make(map[string]string) @@ -24,13 +35,18 @@ func ConvertDocx(r io.Reader) (string, map[string]string, error) { return "", nil, fmt.Errorf("error unzipping data: %v", err) } - // Regular expression for XML files to include in the text parsing - reHeaderFile, _ := regexp.Compile("^word/header[0-9]+.xml$") - reFooterFile, _ := regexp.Compile("^word/footer[0-9]+.xml$") + zipFiles := mapZipFiles(zr.File) + + types, err := getContentTypes(zipFiles["[Content_Types].xml"]) + if err != nil { + return "", nil, err + } + + for _, override := range types.Overrides { + f := zipFiles[override.PartName] - for _, f := range zr.File { switch { - case f.Name == "docProps/core.xml": + case override.ContentType == "application/vnd.openxmlformats-package.core-properties+xml": rc, err := f.Open() if err != nil { return "", nil, fmt.Errorf("error opening '%v' from archive: %v", f.Name, err) @@ -52,32 +68,59 @@ func ConvertDocx(r io.Reader) (string, map[string]string, error) { meta["CreatedDate"] = fmt.Sprintf("%d", t.Unix()) } } - - case f.Name == "word/document.xml": - textBody, err = parseDocxText(f) + case override.ContentType == "application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml": + body, err := parseDocxText(f) if err != nil { return "", nil, err } - - case reHeaderFile.MatchString(f.Name): - header, err := parseDocxText(f) + textBody += body + "\n" + case override.ContentType == "application/vnd.openxmlformats-officedocument.wordprocessingml.footer+xml": + footer, err := parseDocxText(f) if err != nil { return "", nil, err } - textHeader += header + "\n" - - case reFooterFile.MatchString(f.Name): - footer, err := parseDocxText(f) + textFooter += footer + "\n" + case override.ContentType == "application/vnd.openxmlformats-officedocument.wordprocessingml.header+xml": + header, err := parseDocxText(f) if err != nil { return "", nil, err } - textFooter += footer + "\n" + textHeader += header + "\n" } - } + } return textHeader + "\n" + textBody + "\n" + textFooter, meta, nil } +func getContentTypes(f *zip.File) (*Type, error) { + contentTypesFile, err := f.Open() + if err != nil { + return nil, err + } + defer contentTypesFile.Close() + + contentTypesFileBytes, err := ioutil.ReadAll(contentTypesFile) + if err != nil { + return nil, err + } + + var types Type + err = xml.Unmarshal(contentTypesFileBytes, &types) + if err != nil { + return nil, err + } + return &types, nil +} + +func mapZipFiles(files []*zip.File) map[string]*zip.File { + filesMap := map[string]*zip.File{} + for _, f := range files { + filesMap[f.Name] = f + filesMap["/"+f.Name] = f + } + return filesMap +} + func parseDocxText(f *zip.File) (string, error) { r, err := f.Open() if err != nil { diff --git a/docx_test/docx_test.go b/docx_test/docx_test.go new file mode 100644 index 0000000..7734d47 --- /dev/null +++ b/docx_test/docx_test.go @@ -0,0 +1,52 @@ +package docx_test + +import ( + "os" + "strings" + "testing" + + "code.sajari.com/docconv" + _ "code.sajari.com/docconv/docx_test/resources" +) + +func TestConvertDocx(t *testing.T) { + f, err := os.Open("./resources/sample.docx") + if err != nil { + t.Fatalf("got error = %v, want nil", err) + } + resp, _, err := docconv.ConvertDocx(f) + if err != nil { + t.Fatalf("got error = %v, want nil", err) + } + + if want := "Header"; !strings.Contains(resp, want) { + t.Errorf("expected %v to contains %v", resp, want) + } + if want := "Footer"; !strings.Contains(resp, want) { + t.Errorf("expected %v to contains %v", resp, want) + } + if want := "Content"; !strings.Contains(resp, want) { + t.Errorf("expected %v to contains %v", resp, want) + } +} + +func TestConvertDocxWithUncommonValidStructure(t *testing.T) { + f, err := os.Open("./resources/sample_2.docx") + if err != nil { + t.Fatalf("got error = %v, want nil", err) + } + resp, _, err := docconv.ConvertDocx(f) + if err != nil { + t.Fatalf("got error = %v, want nil", err) + } + + if want := "Header"; !strings.Contains(resp, want) { + t.Errorf("expected %v to contains %v", resp, want) + } + if want := "Footer"; !strings.Contains(resp, want) { + t.Errorf("expected %v to contains %v", resp, want) + } + if want := "Content"; !strings.Contains(resp, want) { + t.Errorf("expected %v to contains %v", resp, want) + } +} diff --git a/docx_test/resources/package.go b/docx_test/resources/package.go new file mode 100644 index 0000000..18d6395 --- /dev/null +++ b/docx_test/resources/package.go @@ -0,0 +1 @@ +package resources diff --git a/docx_test/resources/sample.docx b/docx_test/resources/sample.docx new file mode 100644 index 0000000..16b1ac2 Binary files /dev/null and b/docx_test/resources/sample.docx differ diff --git a/docx_test/resources/sample_2.docx b/docx_test/resources/sample_2.docx new file mode 100644 index 0000000..ec71782 Binary files /dev/null and b/docx_test/resources/sample_2.docx differ