diff --git a/src/main/scala/com/databricks/spark/xml/parsers/StaxXmlParserUtils.scala b/src/main/scala/com/databricks/spark/xml/parsers/StaxXmlParserUtils.scala index 70fa26a1..8f770750 100644 --- a/src/main/scala/com/databricks/spark/xml/parsers/StaxXmlParserUtils.scala +++ b/src/main/scala/com/databricks/spark/xml/parsers/StaxXmlParserUtils.scala @@ -117,9 +117,9 @@ private[xml] object StaxXmlParserUtils { // So, we need to check further to decide if this is a data or just // a whitespace between them. parser.next - if (parser.peek.isStartElement) { - skipChildren(parser) - } + } + if (parser.peek.isStartElement) { + skipChildren(parser) } case _: EndElement => shouldStop = checkEndElement(parser) diff --git a/src/test/resources/cars-no-indentation.xml b/src/test/resources/cars-no-indentation.xml new file mode 100644 index 00000000..d603759d --- /dev/null +++ b/src/test/resources/cars-no-indentation.xml @@ -0,0 +1,2 @@ + +2012TeslaSNo comment1997FordE350Go get one now they are going fast2015ChevyVoltNo diff --git a/src/test/scala/com/databricks/spark/xml/XmlSuite.scala b/src/test/scala/com/databricks/spark/xml/XmlSuite.scala index f1d34893..497b4dae 100755 --- a/src/test/scala/com/databricks/spark/xml/XmlSuite.scala +++ b/src/test/scala/com/databricks/spark/xml/XmlSuite.scala @@ -43,6 +43,7 @@ class XmlSuite extends FunSuite with BeforeAndAfterAll { val carsFile8859 = "src/test/resources/cars-iso-8859-1.xml" val carsFileGzip = "src/test/resources/cars.xml.gz" val carsFileBzip2 = "src/test/resources/cars.xml.bz2" + val carsNoIndentationFile = "src/test/resources/cars-no-indentation.xml" val carsMixedAttrNoChildFile = "src/test/resources/cars-mixed-attr-no-child.xml" val booksAttributesInNoChild = "src/test/resources/books-attributes-in-no-child.xml" val carsUnbalancedFile = "src/test/resources/cars-unbalanced-elements.xml" @@ -727,11 +728,16 @@ class XmlSuite extends FunSuite with BeforeAndAfterAll { Seq( StructField("child", StringType, nullable = true), StructField("parent", nestedSchema, nullable = true))) - df.schema.printTreeString() - schema.printTreeString() assert(df.schema == schema) } + test("Skip and project currecntly XML files without indentation") { + val df = sqlContext.xmlFile(carsNoIndentationFile) + val results = df.select("model").collect() + val years = results.map(_.toSeq.head).toSet + assert(years == Set("S", "E350", "Volt")) + } + test("Select correctly all child fields regardless of pushed down projection") { val results = new XmlReader() .withRowTag("book") diff --git a/src/test/scala/com/databricks/spark/xml/parsers/StaxXmlParserUtilsSuite.scala b/src/test/scala/com/databricks/spark/xml/parsers/StaxXmlParserUtilsSuite.scala index 91e81df3..a9632ea9 100644 --- a/src/test/scala/com/databricks/spark/xml/parsers/StaxXmlParserUtilsSuite.scala +++ b/src/test/scala/com/databricks/spark/xml/parsers/StaxXmlParserUtilsSuite.scala @@ -78,17 +78,20 @@ class StaxXmlParserUtilsSuite extends FunSuite with BeforeAndAfterAll { } test("Skip XML children") { - val input = 2 - Sam Mad Dog Smith19 + val input = + Sam Mad Dog Smith1 + 922 val reader = new ByteArrayInputStream(input.toString().getBytes) val parser = factory.createXMLEventReader(reader) // We assume here it's reading the value within `id` field. StaxXmlParserUtils.skipUntil(parser, XMLStreamConstants.CHARACTERS) StaxXmlParserUtils.skipChildren(parser) - assert( - parser.nextEvent().asEndElement().getName.getLocalPart == "id") + assert(parser.nextEvent().asEndElement().getName.getLocalPart == "info") + parser.next() + StaxXmlParserUtils.skipChildren(parser) + assert(parser.nextEvent().asEndElement().getName.getLocalPart == "abc") + parser.next() StaxXmlParserUtils.skipChildren(parser) - assert( - parser.nextEvent().asEndElement().getName.getLocalPart == "info") + assert(parser.nextEvent().asEndElement().getName.getLocalPart == "test") } }