Skip to content

Commit

Permalink
Get docx contents reading [Content_Types].xml to get correct file names
Browse files Browse the repository at this point in the history
  • Loading branch information
Daniel Machado Kraus authored and dhowden committed Sep 27, 2019
1 parent eedabc4 commit 83716c3
Show file tree
Hide file tree
Showing 5 changed files with 114 additions and 18 deletions.
79 changes: 61 additions & 18 deletions docx.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,24 @@ package docconv
import (
"archive/zip"
"bytes"
"encoding/xml"
"fmt"
"io"
"io/ioutil"
"regexp"
"time"
)

// TypeOverride maps a single <Override> element of a [Content_Types].xml
// document: the part it applies to and the content type declared for it.
type TypeOverride struct {
XMLName xml.Name `xml:"Override"`
// ContentType holds the value of the element's ContentType attribute.
ContentType string `xml:"ContentType,attr"`
// PartName holds the value of the element's PartName attribute.
PartName string `xml:"PartName,attr"`
}

// Type maps the root <Types> element of a [Content_Types].xml document.
// Only <Override> children are mapped; no field here captures any other
// child element.
type Type struct {
XMLName xml.Name `xml:"Types"`
// Overrides holds one entry per <Override> child element.
Overrides []TypeOverride `xml:"Override"`
}

// ConvertDocx converts an MS Word docx file to text.
func ConvertDocx(r io.Reader) (string, map[string]string, error) {
meta := make(map[string]string)
Expand All @@ -24,13 +35,18 @@ func ConvertDocx(r io.Reader) (string, map[string]string, error) {
return "", nil, fmt.Errorf("error unzipping data: %v", err)
}

// Regular expression for XML files to include in the text parsing
reHeaderFile, _ := regexp.Compile("^word/header[0-9]+.xml$")
reFooterFile, _ := regexp.Compile("^word/footer[0-9]+.xml$")
zipFiles := mapZipFiles(zr.File)

types, err := getContentTypes(zipFiles["[Content_Types].xml"])
if err != nil {
return "", nil, err
}

for _, override := range types.Overrides {
f := zipFiles[override.PartName]

for _, f := range zr.File {
switch {
case f.Name == "docProps/core.xml":
case override.ContentType == "application/vnd.openxmlformats-package.core-properties+xml":
rc, err := f.Open()
if err != nil {
return "", nil, fmt.Errorf("error opening '%v' from archive: %v", f.Name, err)
Expand All @@ -52,32 +68,59 @@ func ConvertDocx(r io.Reader) (string, map[string]string, error) {
meta["CreatedDate"] = fmt.Sprintf("%d", t.Unix())
}
}

case f.Name == "word/document.xml":
textBody, err = parseDocxText(f)
case override.ContentType == "application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml":
body, err := parseDocxText(f)
if err != nil {
return "", nil, err
}

case reHeaderFile.MatchString(f.Name):
header, err := parseDocxText(f)
textBody += body + "\n"
case override.ContentType == "application/vnd.openxmlformats-officedocument.wordprocessingml.footer+xml":
footer, err := parseDocxText(f)
if err != nil {
return "", nil, err
}
textHeader += header + "\n"

case reFooterFile.MatchString(f.Name):
footer, err := parseDocxText(f)
textFooter += footer + "\n"
case override.ContentType == "application/vnd.openxmlformats-officedocument.wordprocessingml.header+xml":
header, err := parseDocxText(f)
if err != nil {
return "", nil, err
}
textFooter += footer + "\n"
textHeader += header + "\n"
}
}

}
return textHeader + "\n" + textBody + "\n" + textFooter, meta, nil
}

// getContentTypes reads a zip entry and decodes its XML content into a Type.
//
// f is the archive entry holding the content types document. A nil f (for
// example, the result of a failed map lookup when the archive has no such
// entry) is reported as an error rather than dereferenced, which would panic.
//
// It returns the decoded Type, or an error if the entry is missing, cannot be
// opened or read, or cannot be decoded as XML into Type.
func getContentTypes(f *zip.File) (*Type, error) {
	if f == nil {
		return nil, fmt.Errorf("error reading content types: file not found in archive")
	}

	rc, err := f.Open()
	if err != nil {
		return nil, fmt.Errorf("error opening '%v' from archive: %v", f.Name, err)
	}
	defer rc.Close()

	data, err := ioutil.ReadAll(rc)
	if err != nil {
		return nil, fmt.Errorf("error reading '%v' from archive: %v", f.Name, err)
	}

	var types Type
	if err := xml.Unmarshal(data, &types); err != nil {
		return nil, fmt.Errorf("error parsing '%v': %v", f.Name, err)
	}
	return &types, nil
}

// mapZipFiles indexes zip entries by name. Each entry is registered under two
// keys: its name as stored in the archive and the same name prefixed with
// "/", so a lookup succeeds with either form. When two entries share a name,
// the later one wins.
func mapZipFiles(files []*zip.File) map[string]*zip.File {
	byName := make(map[string]*zip.File, 2*len(files))
	for i := range files {
		entry := files[i]
		for _, key := range [...]string{entry.Name, "/" + entry.Name} {
			byName[key] = entry
		}
	}
	return byName
}

func parseDocxText(f *zip.File) (string, error) {
r, err := f.Open()
if err != nil {
Expand Down
52 changes: 52 additions & 0 deletions docx_test/docx_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
package docx_test

import (
"os"
"strings"
"testing"

"code.sajari.com/docconv"
_ "code.sajari.com/docconv/docx_test/resources"
)

// TestConvertDocx converts ./resources/sample.docx and checks that the
// resulting text contains the strings "Header", "Footer" and "Content".
func TestConvertDocx(t *testing.T) {
	f, err := os.Open("./resources/sample.docx")
	if err != nil {
		t.Fatalf("got error = %v, want nil", err)
	}
	// Release the file handle once the test finishes.
	defer f.Close()

	resp, _, err := docconv.ConvertDocx(f)
	if err != nil {
		t.Fatalf("got error = %v, want nil", err)
	}

	for _, want := range []string{"Header", "Footer", "Content"} {
		if !strings.Contains(resp, want) {
			t.Errorf("expected %v to contains %v", resp, want)
		}
	}
}

// TestConvertDocxWithUncommonValidStructure converts
// ./resources/sample_2.docx and checks that the resulting text contains the
// strings "Header", "Footer" and "Content".
// NOTE(review): the fixture presumably stores its parts under non-default
// file names — confirm against the fixture.
func TestConvertDocxWithUncommonValidStructure(t *testing.T) {
	f, err := os.Open("./resources/sample_2.docx")
	if err != nil {
		t.Fatalf("got error = %v, want nil", err)
	}
	// Release the file handle once the test finishes.
	defer f.Close()

	resp, _, err := docconv.ConvertDocx(f)
	if err != nil {
		t.Fatalf("got error = %v, want nil", err)
	}

	for _, want := range []string{"Header", "Footer", "Content"} {
		if !strings.Contains(resp, want) {
			t.Errorf("expected %v to contains %v", resp, want)
		}
	}
}
1 change: 1 addition & 0 deletions docx_test/resources/package.go
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
package resources
Binary file added docx_test/resources/sample.docx
Binary file not shown.
Binary file added docx_test/resources/sample_2.docx
Binary file not shown.

0 comments on commit 83716c3

Please sign in to comment.