diff --git a/src/main/scala/com/databricks/spark/xml/parsers/StaxXmlParserUtils.scala b/src/main/scala/com/databricks/spark/xml/parsers/StaxXmlParserUtils.scala
index 70fa26a1..8f770750 100644
--- a/src/main/scala/com/databricks/spark/xml/parsers/StaxXmlParserUtils.scala
+++ b/src/main/scala/com/databricks/spark/xml/parsers/StaxXmlParserUtils.scala
@@ -117,9 +117,9 @@ private[xml] object StaxXmlParserUtils {
// So, we need to check further to decide if this is a data or just
// a whitespace between them.
parser.next
- if (parser.peek.isStartElement) {
- skipChildren(parser)
- }
+ }
+ if (parser.peek.isStartElement) {
+ skipChildren(parser)
}
case _: EndElement =>
shouldStop = checkEndElement(parser)
diff --git a/src/test/resources/cars-no-indentation.xml b/src/test/resources/cars-no-indentation.xml
new file mode 100644
index 00000000..d603759d
--- /dev/null
+++ b/src/test/resources/cars-no-indentation.xml
@@ -0,0 +1,2 @@
+
+2012TeslaSNo comment
1997FordE350Go get one now they are going fast
2015ChevyVoltNo
diff --git a/src/test/scala/com/databricks/spark/xml/XmlSuite.scala b/src/test/scala/com/databricks/spark/xml/XmlSuite.scala
index f1d34893..497b4dae 100755
--- a/src/test/scala/com/databricks/spark/xml/XmlSuite.scala
+++ b/src/test/scala/com/databricks/spark/xml/XmlSuite.scala
@@ -43,6 +43,7 @@ class XmlSuite extends FunSuite with BeforeAndAfterAll {
val carsFile8859 = "src/test/resources/cars-iso-8859-1.xml"
val carsFileGzip = "src/test/resources/cars.xml.gz"
val carsFileBzip2 = "src/test/resources/cars.xml.bz2"
+ val carsNoIndentationFile = "src/test/resources/cars-no-indentation.xml"
val carsMixedAttrNoChildFile = "src/test/resources/cars-mixed-attr-no-child.xml"
val booksAttributesInNoChild = "src/test/resources/books-attributes-in-no-child.xml"
val carsUnbalancedFile = "src/test/resources/cars-unbalanced-elements.xml"
@@ -727,11 +728,16 @@ class XmlSuite extends FunSuite with BeforeAndAfterAll {
Seq(
StructField("child", StringType, nullable = true),
StructField("parent", nestedSchema, nullable = true)))
- df.schema.printTreeString()
- schema.printTreeString()
assert(df.schema == schema)
}
+ test("Skip and project correctly XML files without indentation") {
+ val df = sqlContext.xmlFile(carsNoIndentationFile)
+ val rows = df.select("model").collect() // project only the `model` column
+ val models = rows.map(_.toSeq.head).toSet
+ assert(models == Set("S", "E350", "Volt"))
+ }
+
test("Select correctly all child fields regardless of pushed down projection") {
val results = new XmlReader()
.withRowTag("book")
diff --git a/src/test/scala/com/databricks/spark/xml/parsers/StaxXmlParserUtilsSuite.scala b/src/test/scala/com/databricks/spark/xml/parsers/StaxXmlParserUtilsSuite.scala
index 91e81df3..a9632ea9 100644
--- a/src/test/scala/com/databricks/spark/xml/parsers/StaxXmlParserUtilsSuite.scala
+++ b/src/test/scala/com/databricks/spark/xml/parsers/StaxXmlParserUtilsSuite.scala
@@ -78,17 +78,20 @@ class StaxXmlParserUtilsSuite extends FunSuite with BeforeAndAfterAll {
}
test("Skip XML children") {
- val input = 2
- Sam Mad Dog Smith19
+ val input =
+ Sam Mad Dog Smith1
+ 922
val reader = new ByteArrayInputStream(input.toString().getBytes)
val parser = factory.createXMLEventReader(reader)
// We assume here it's reading the value within `id` field.
StaxXmlParserUtils.skipUntil(parser, XMLStreamConstants.CHARACTERS)
StaxXmlParserUtils.skipChildren(parser)
- assert(
- parser.nextEvent().asEndElement().getName.getLocalPart == "id")
+ assert(parser.nextEvent().asEndElement().getName.getLocalPart == "info")
+ parser.next()
+ StaxXmlParserUtils.skipChildren(parser)
+ assert(parser.nextEvent().asEndElement().getName.getLocalPart == "abc")
+ parser.next()
StaxXmlParserUtils.skipChildren(parser)
- assert(
- parser.nextEvent().asEndElement().getName.getLocalPart == "info")
+ assert(parser.nextEvent().asEndElement().getName.getLocalPart == "test")
}
}