diff --git a/libbeat/common/encoding/xml/safe_reader.go b/libbeat/common/encoding/xml/safe_reader.go
index 9c7c95c73c64..e49d8dc2b6c2 100644
--- a/libbeat/common/encoding/xml/safe_reader.go
+++ b/libbeat/common/encoding/xml/safe_reader.go
@@ -53,12 +53,22 @@ func (r *SafeReader) Read(d []byte) (n int, err error) {
return output(n)
}
if len(r.buf) == 0 {
- n, _ = r.inner.Read(r.backing[:])
+ n, _ := r.inner.Read(r.backing[:])
r.buf = r.backing[:n]
}
for i := 0; i < len(r.buf); {
+ if !utf8.FullRune(r.buf[i:]) {
+ n = copy(d, r.buf[:i])
+ rn := copy(r.backing[:], r.buf[i:])
+ nn, err := r.inner.Read(r.backing[rn:])
+ if err != nil && err != io.EOF {
+ return 0, err
+ }
+ r.buf = r.backing[:rn+nn]
+ return output(n)
+ }
code, size := utf8.DecodeRune(r.buf[i:])
- if !unicode.IsSpace(code) && unicode.IsControl(code) {
+ if code == utf8.RuneError || (!unicode.IsSpace(code) && unicode.IsControl(code)) {
n = copy(d, r.buf[:i])
r.buf = r.buf[n+1:]
r.code = []byte(fmt.Sprintf("\\u%04x", code))
diff --git a/libbeat/common/encoding/xml/safe_reader_test.go b/libbeat/common/encoding/xml/safe_reader_test.go
new file mode 100644
index 000000000000..18a93ec1a32c
--- /dev/null
+++ b/libbeat/common/encoding/xml/safe_reader_test.go
@@ -0,0 +1,139 @@
+// Licensed to Elasticsearch B.V. under one or more contributor
+// license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright
+// ownership. Elasticsearch B.V. licenses this file to you under
+// the Apache License, Version 2.0 (the "License"); you may
+// not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//go:build !integration
+// +build !integration
+
+package xml
+
+import (
+ "encoding/xml"
+ "strings"
+ "testing"
+
+ "github.com/stretchr/testify/assert"
+)
+
+func TestInvalidXMLIsSanitized(t *testing.T) {
+ tests := []struct {
+ name string
+ input []byte
+ want map[string]interface{}
+ err error
+ }{
+ {
+ name: "control space",
+ input: []byte(`John` + "\t" + `
+`),
+ want: map[string]interface{}{
+ "person": map[string]interface{}{
+ "Name": map[string]interface{}{
+ "#text": "John",
+ "ID": "123",
+ },
+ },
+ },
+ },
+ {
+ name: "crlf",
+ input: []byte(`John` + "\r\n" + ``),
+ want: map[string]interface{}{
+ "person": map[string]interface{}{
+ "Name": map[string]interface{}{
+ "#text": "John",
+ "ID": "123",
+ },
+ },
+ },
+ },
+ {
+ name: "single invalid",
+ input: []byte(`John` + "\x80" + ``),
+ want: map[string]interface{}{
+ "person": map[string]interface{}{
+ "Name": map[string]interface{}{
+ "#text": "John\\ufffd",
+ "ID": "123",
+ },
+ },
+ },
+ },
+ {
+ name: "double invalid",
+ input: []byte(`` + "\x80" + `John` + "\x80" + ``),
+ want: map[string]interface{}{
+ "person": map[string]interface{}{
+ "Name": map[string]interface{}{
+ "#text": "\\ufffdJohn\\ufffd",
+ "ID": "123",
+ },
+ },
+ },
+ },
+ {
+ name: "happy single invalid",
+ input: []byte(`😊John` + "\x80" + ``),
+ want: map[string]interface{}{
+ "person": map[string]interface{}{
+ "Name": map[string]interface{}{
+ "#text": "😊John\\ufffd",
+ "ID": "123",
+ },
+ },
+ },
+ },
+ {
+ name: "invalid tag",
+ input: []byte(`John`),
+ want: nil,
+ err: &xml.SyntaxError{Msg: "attribute name without = in element", Line: 1},
+ },
+ {
+ name: "invalid tag value",
+ input: []byte(`John`),
+ want: map[string]interface{}{
+ "person": map[string]interface{}{
+ "Name": map[string]interface{}{
+ "#text": "John",
+ "ID": "\\ufffd123",
+ },
+ },
+ },
+ },
+ {
+ name: "unhappy",
+ input: []byte(`John is` + strings.Repeat(" ", 223) + ` 😞`),
+ want: map[string]interface{}{
+ "person": map[string]interface{}{
+ "Name": map[string]interface{}{
+ "#text": "John is" + strings.Repeat(" ", 223) + " 😞",
+ "ID": "123",
+ },
+ },
+ },
+ },
+ }
+
+ for _, test := range tests {
+ t.Run(test.name, func(t *testing.T) {
+ d := NewDecoder(NewSafeReader(test.input))
+ out, err := d.Decode()
+ assert.Equal(t, test.err, err)
+ assert.Equal(t, test.want, out)
+ })
+ }
+}