Add Snowflake Plugin #818

Draft
wants to merge 1 commit into base: develop
5 changes: 5 additions & 0 deletions core/pom.xml
@@ -139,6 +139,11 @@
<artifactId>mongo-spark-connector_${scala.binary.version}</artifactId>
<optional>true</optional>
</dependency>
<dependency>
<groupId>net.snowflake</groupId>
<artifactId>spark-snowflake_${scala.binary.version}</artifactId>
<optional>true</optional>
</dependency>
<dependency>
<groupId>org.elasticsearch</groupId>
<artifactId>elasticsearch-hadoop</artifactId>
New file: SnowflakePlugin.scala
@@ -0,0 +1,72 @@
/*
* Copyright 2021 ABSA Group Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package za.co.absa.spline.harvester.plugin.embedded

import za.co.absa.spline.commons.reflect.ReflectionUtils.extractValue
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.execution.datasources.{LogicalRelation, SaveIntoDataSourceCommand}
import org.apache.spark.sql.sources.BaseRelation
import za.co.absa.spline.commons.reflect.extractors.SafeTypeMatchingExtractor
import za.co.absa.spline.harvester.builder.SourceIdentifier
import za.co.absa.spline.harvester.plugin.Plugin.{Precedence, ReadNodeInfo, WriteNodeInfo}
import za.co.absa.spline.harvester.plugin.embedded.SnowflakePlugin._
import za.co.absa.spline.harvester.plugin.{BaseRelationProcessing, Plugin, RelationProviderProcessing}

import javax.annotation.Priority
import scala.language.reflectiveCalls

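/**
 * Captures lineage of reads from and writes to Snowflake performed via the
 * spark-snowflake connector. Both directions are mapped to a
 * `snowflake://<url>.<warehouse>.<database>.<schema>.<table>` source identifier.
 */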
@Priority(Precedence.Normal)
class SnowflakePlugin(spark: SparkSession)
extends Plugin
with BaseRelationProcessing
with RelationProviderProcessing {

import za.co.absa.spline.commons.ExtractorImplicits._

override def baseRelationProcessor: PartialFunction[(BaseRelation, LogicalRelation), ReadNodeInfo] = {
case (`_: SnowflakeRelation`(r), _) =>
val params = extractValue[net.snowflake.spark.snowflake.Parameters.MergedParameters](r, "params")

val url: String = params.sfURL
val warehouse: String = params.sfWarehouse.getOrElse("")
val database: String = params.sfDatabase
val schema: String = params.sfSchema
val table: String = params.table.getOrElse("").toString

ReadNodeInfo(asSourceId(url, warehouse, database, schema, table), Map.empty)
}

override def relationProviderProcessor: PartialFunction[(AnyRef, SaveIntoDataSourceCommand), WriteNodeInfo] = {
case (rp, cmd) if rp == "net.snowflake.spark.snowflake.DefaultSource" || SnowflakeSourceExtractor.matches(rp) =>
val url: String = cmd.options("sfUrl")
val warehouse: String = cmd.options("sfWarehouse")
val database: String = cmd.options("sfDatabase")
val schema: String = cmd.options("sfSchema")
val table: String = cmd.options("dbtable")

WriteNodeInfo(asSourceId(url, warehouse, database, schema, table), cmd.mode, cmd.query, cmd.options)
}
}

object SnowflakePlugin {

private object `_: SnowflakeRelation` extends SafeTypeMatchingExtractor[AnyRef]("net.snowflake.spark.snowflake.SnowflakeRelation")

private object SnowflakeSourceExtractor extends SafeTypeMatchingExtractor(classOf[net.snowflake.spark.snowflake.DefaultSource])

private def asSourceId(url: String, warehouse: String, database: String, schema: String, table: String) =
SourceIdentifier(Some("snowflake"), s"snowflake://$url.$warehouse.$database.$schema.$table")
}
New file: SnowflakePluginSpec.scala
@@ -0,0 +1,79 @@
/*
* Copyright 2021 ABSA Group Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package za.co.absa.spline.harvester.plugin.embedded
Contributor:
Missing copyright/license header. That's why all the test builds fail.

Author:
Adding this!


import org.apache.spark.sql.{SaveMode, SparkSession}
import org.apache.spark.sql.execution.datasources.{LogicalRelation, SaveIntoDataSourceCommand}
import org.scalatest.flatspec.AnyFlatSpec
import org.scalatest.matchers.should.Matchers
import org.scalatestplus.mockito.MockitoSugar
import za.co.absa.spline.harvester.plugin.Plugin.{ReadNodeInfo, WriteNodeInfo}
import za.co.absa.spline.harvester.builder.SourceIdentifier
import org.mockito.Mockito.when
import net.snowflake.spark.snowflake.Parameters
import net.snowflake.spark.snowflake.Parameters.MergedParameters
import org.apache.spark.sql.sources.BaseRelation
import za.co.absa.spline.commons.reflect.extractors.SafeTypeMatchingExtractor
import za.co.absa.spline.commons.reflect.{ReflectionUtils, ValueExtractor}

class SnowflakePluginSpec extends AnyFlatSpec with Matchers with MockitoSugar {
Contributor:
Is there any possibility of adding end-to-end integration tests to verify compatibility with real Spark?
For example, a Snowflake Docker container or an official mock/test library?
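
A minimal sketch of how such a harness could be bootstrapped with testcontainers-java and the localstack/snowflake emulator image (the SnowflakeSpec added under integration-tests in this PR follows this pattern; the image choice and wait strategy here are assumptions, not a verified setup):

    import org.testcontainers.containers.GenericContainer
    import org.testcontainers.containers.wait.strategy.Wait
    import org.testcontainers.utility.DockerImageName

    // Spin up a throwaway Snowflake emulator for the duration of the test
    val snowflake = new GenericContainer(DockerImageName.parse("localstack/snowflake"))
    snowflake.waitingFor(Wait.forListeningPort())
    snowflake.start()
    try {
      // run the Spark write/read under lineage capture against the emulator here
    } finally {
      snowflake.stop()
    }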

"SnowflakePlugin" should "process Snowflake relation providers" in {
// Setup
val spark = mock[SparkSession]
val plugin = new SnowflakePlugin(spark)

val options = Map(
"sfUrl" -> "test-url",
"sfWarehouse" -> "test-warehouse",
"sfDatabase" -> "test-database",
"sfSchema" -> "test-schema",
"sfUser" -> "user1",
"dbtable" -> "test-table"
)

val cmd = mock[SaveIntoDataSourceCommand]
when(cmd.options).thenReturn(options)
when(cmd.mode).thenReturn(SaveMode.Overwrite)
when(cmd.query).thenReturn(null)

// Mocking the relation provider to be Snowflake
val snowflakeRP = "net.snowflake.spark.snowflake.DefaultSource"

// Execute
val result = plugin.relationProviderProcessor((snowflakeRP, cmd))

// Verify
val expectedSourceId = SourceIdentifier(Some("snowflake"), "snowflake://test-url.test-warehouse.test-database.test-schema.test-table")
result shouldEqual WriteNodeInfo(expectedSourceId, SaveMode.Overwrite, null, options)
}

it should "not process non-Snowflake relation providers" in {
// Setup
val spark = mock[SparkSession]
val plugin = new SnowflakePlugin(spark)

val cmd = mock[SaveIntoDataSourceCommand]

// Mocking the relation provider to be non-Snowflake
val nonSnowflakeRP = "some.other.datasource"

// Execute & Verify
assertThrows[MatchError] {
plugin.relationProviderProcessor((nonSnowflakeRP, cmd))
}
}
}
11 changes: 11 additions & 0 deletions integration-tests/pom.xml
@@ -131,6 +131,11 @@
<artifactId>spark-cobol_${scala.binary.version}</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>net.snowflake</groupId>
<artifactId>spark-snowflake_${scala.binary.version}</artifactId>
<optional>true</optional>
</dependency>

<!-- to force newer version of jackson-annotations - needed for testcontainers -->
<dependency>
@@ -163,6 +168,12 @@
<version>${testcontainers.version}</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.testcontainers</groupId>
<artifactId>localstack</artifactId>
<version>1.19.8</version>
<scope>test</scope>
</dependency>

<dependency>
<groupId>org.elasticsearch</groupId>
106 changes: 106 additions & 0 deletions integration-tests/src/test/scala/za/co/absa/spline/SnowflakeSpec.scala
@@ -0,0 +1,106 @@
/*
* Copyright 2019 ABSA Group Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package za.co.absa.spline

import org.apache.spark.sql.{Row, RowFactory}
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}
import org.scalatest.BeforeAndAfterAll
import org.scalatest.flatspec.AsyncFlatSpec
import org.scalatest.matchers.should.Matchers
import org.testcontainers.containers.GenericContainer
import org.testcontainers.containers.wait.strategy.Wait
import org.testcontainers.utility.DockerImageName
import za.co.absa.spline.commons.io.TempDirectory
import za.co.absa.spline.test.fixture.spline.SplineFixture
import za.co.absa.spline.test.fixture.{ReleasableResourceFixture, SparkFixture}

import java.util

class SnowflakeSpec
extends AsyncFlatSpec
with BeforeAndAfterAll
with Matchers
with SparkFixture
with SplineFixture
with ReleasableResourceFixture {

val tableName = "testTable"
val schemaName = "testSchema"
val warehouseName = "testWarehouse"
val databaseName = "test"
val sparkFormat = "net.snowflake.spark.snowflake"

it should "support snowflake as a read and write source" in {
usingResource(new GenericContainer(DockerImageName.parse("localstack/snowflake"))) { container =>
// Apply the healthcheck wait strategy so start() blocks until the emulator is ready
container.waitingFor(Wait.forHealthcheck())
container.start()

val host = container.getHost

withNewSparkSession { implicit spark =>

withLineageTracking { captor =>
val sfOptions = Map(
"sfURL" -> "snowflake.localhost.localstack.cloud",
"sfUser" -> "test",
"sfPassword" -> "test",
"sfDatabase" -> databaseName,
"sfWarehouse" -> warehouseName,
"sfSchema" -> schemaName
)

// Build a single-column DataFrame of test values from a Java list of Rows
val data = new util.ArrayList[Row]()
data.add(RowFactory.create(1.asInstanceOf[Object]))
data.add(RowFactory.create(2.asInstanceOf[Object]))
data.add(RowFactory.create(3.asInstanceOf[Object]))

val schema = StructType(Seq(StructField("id", IntegerType, nullable = false)))
val testData = spark.sqlContext.createDataFrame(data, schema)

for {
(writePlan, _) <- captor.lineageOf(
testData.write
.format(sparkFormat)
.options(sfOptions)
.option("dbtable", tableName)
.mode("overwrite")
.save()
)

(readPlan, _) <- captor.lineageOf {
val df = spark.read.format(sparkFormat)
.options(sfOptions)
.option("dbtable", tableName) // specify the source table
.load()

df.write.save(TempDirectory(pathOnly = true).deleteOnExit().path.toString)
}
} yield {
writePlan.operations.write.append shouldBe false
writePlan.operations.write.extra("destinationType") shouldBe Some("snowflake")
writePlan.operations.write.outputSource shouldBe s"snowflake://$host.$warehouseName.$databaseName.$schemaName.$tableName"

readPlan.operations.reads.head.inputSources.head shouldBe writePlan.operations.write.outputSource
readPlan.operations.reads.head.extra("sourceType") shouldBe Some("snowflake")
readPlan.operations.write.append shouldBe false
}
}
}
}
}

}
7 changes: 6 additions & 1 deletion pom.xml
@@ -83,7 +83,7 @@

<!-- Spark -->

- <spark.version>${spark-24.version}</spark.version>
+ <spark.version>${spark-33.version}</spark.version>
<spark-22.version>2.2.3</spark-22.version>
<spark-23.version>2.3.4</spark-23.version>
<spark-24.version>2.4.8</spark-24.version>
@@ -472,6 +472,11 @@
<artifactId>mongo-spark-connector_${scala.binary.version}</artifactId>
<version>2.4.1</version>
</dependency>
<dependency>
<groupId>net.snowflake</groupId>
<artifactId>spark-snowflake_${scala.binary.version}</artifactId>
<version>2.16.0-spark_3.3</version>
Contributor:
The connector version 2.16.0-spark_3.3 is explicitly compiled for Spark 3.3. What about the other Spark versions? Will the plugin still be binary compatible?
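
If cross-version binary compatibility is a concern, one option (a sketch only, not something this PR does) is to declare the provider extractor by fully-qualified class name, the same way `_: SnowflakeRelation` is declared in SnowflakePlugin, so the agent keeps no compile-time reference to the connector classes and simply skips Snowflake lineage when they are absent or incompatible:

    // drop-in alternative for the classOf-based SnowflakeSourceExtractor in SnowflakePlugin
    private object SnowflakeSourceExtractor
      extends SafeTypeMatchingExtractor[AnyRef]("net.snowflake.spark.snowflake.DefaultSource")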

</dependency>
<dependency>
<groupId>org.elasticsearch</groupId>
<artifactId>elasticsearch-hadoop</artifactId>