Skip to content

Commit

Permalink
SPYT-777 Fix non-latin symbols using in query tracker livy responses
Browse files Browse the repository at this point in the history
* Changelog entry
Type: fix
Component: spyt

Fix non-latin symbols using in query tracker livy responses
commit_hash:dd4cb163e6d623076d9be18961bc1b03d1c0519a
  • Loading branch information
alextokarew committed Feb 4, 2025
1 parent 184cd6c commit 44b5244
Show file tree
Hide file tree
Showing 4 changed files with 48 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import tech.ytsaurus.spyt.serialization.YsonEncoder
import tech.ytsaurus.spyt.serializers.SchemaConverter.Unordered
import tech.ytsaurus.spyt.types.YTsaurusTypes

import java.nio.charset.StandardCharsets
import java.util.Base64

// TODO(alex-shishkin): Supported type v1 only
Expand All @@ -27,7 +28,7 @@ class GenericRowSerializer(schema: StructType) {
sparkField.dataType match {
case BinaryType => boxValue(i, row.getAs[Array[Byte]](i))

case StringType => boxValue(i, row.getString(i).getBytes)
case StringType => boxValue(i, row.getString(i).getBytes(StandardCharsets.UTF_8))
case t@(ArrayType(_, _) | StructType(_) | MapType(_, _, _)) =>
val skipNulls = sparkField.metadata.contains("skipNulls") && sparkField.metadata.getBoolean("skipNulls")
boxValue(i, YsonEncoder.encode(row.get(i), t, skipNulls))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,11 @@ class DataFrameSerializerTest extends FlatSpec with Matchers with LocalSpark
.addValue("value", ColumnValueType.ANY)
.build()

private val nonLatinSchema = TableSchema.builder()
.addValue("id", ColumnValueType.INT64)
.addValue("value", ColumnValueType.STRING)
.build()

it should "serialize dataframe to byte array" in {
writeTableFromYson(Seq(
"""{a = 0; b = #; c = 0.0}"""
Expand Down Expand Up @@ -90,4 +95,21 @@ class DataFrameSerializerTest extends FlatSpec with Matchers with LocalSpark
)
tableBytes should contain theSameElementsAs answer
}

it should "serialize non-latin symbols in unicode" in {
writeTableFromYson(Seq(
"""{id = 1; value = "Номер один"}""",
"""{id = 2; value = "Номер два"}"""
), tmpPath, nonLatinSchema)

val res = spark.read.yt(tmpPath)
val resultBase64 = GenericRowSerializer.dfToYTFormatWithBase64(res)

resultBase64 should contain theSameElementsAs Seq(
"bgAAAAAAAAAKMgoCaWQQA0gAWih7InR5cGVfbmFtZSI9Im9wdGlvbmFsIjsiaXRlbSI9ImludDY0Ijt9CjYKBXZhbHVlEBBIAFopeyJ0e" +
"XBlX25hbWUiPSJvcHRpb25hbCI7Iml0ZW0iPSJzdHJpbmciO30YAAAAAgAAAAAAAAACAAAAAAAAAAAAAAAAAAAAAQAAAAAAAAATAAAA" +
"AAAAANCd0L7QvNC10YAg0L7QtNC40L0AAAAAAAIAAAAAAAAAAAAAAAAAAAACAAAAAAAAABEAAAAAAAAA0J3QvtC80LXRgCDQtNCy0LA" +
"AAAAAAAAA"
)
}
}
21 changes: 21 additions & 0 deletions e2e-test/tests/test_livy.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,3 +60,24 @@ def test_livy_server(yt_client, tmp_dir, livy_server):
assert extract_data(output) == 'NwAAAAAAAAAKMwoCaWQQEEgAWil7InR5cGVfbmFtZSI9Im9wdGlvbmFsIjsiaXRlbSI9InN0cm' \
'luZyI7fRgAAAIAAAAAAAAAAQAAAAAAAAAAAAAAAAAAAAUAAAAAAAAAMHgxMjMAAAABAAAAAAAA' \
'AAAAAAAAAAAACQAAAAAAAAAweEFCQUNBQkEAAAAAAAAA'


def test_read_non_latin_symbols(yt_client, tmp_dir, livy_server):
table = f"{tmp_dir}/non_latin_table"
yt_client.create("table", table, attributes={"schema": [{"name": "id", "type": "int64"},
{"name": "value", "type": "string"}]})
yt_client.write_table(table, [{"id": 1, "value": "Номер один"},
{"id": 2, "value": "Номер два"},
{"id": 3, "value": "Что то ещё"}])

host = "http://" + livy_server.rest()
session_url = create_session(host)

query = f"select * from yt.`ytTable:/{table}` WHERE value LIKE 'Номер%' ORDER BY id"
output = run_code(host, session_url, wrap_sql_query(query))
# The expected result is the same as in DataFrameSerializerTest::"serialize non-latin symbols in unicode" unit test
assert extract_data(output) == 'bgAAAAAAAAAKMgoCaWQQA0gAWih7InR5cGVfbmFtZSI9Im9wdGlvbmFsIjsiaXRlbSI9ImludD' \
'Y0Ijt9CjYKBXZhbHVlEBBIAFopeyJ0eXBlX25hbWUiPSJvcHRpb25hbCI7Iml0ZW0iPSJzdHJp' \
'bmciO30YAAAAAgAAAAAAAAACAAAAAAAAAAAAAAAAAAAAAQAAAAAAAAATAAAAAAAAANCd0L7QvN' \
'C10YAg0L7QtNC40L0AAAAAAAIAAAAAAAAAAAAAAAAAAAACAAAAAAAAABEAAAAAAAAA0J3QvtC8' \
'0LXRgCDQtNCy0LAAAAAAAAAA'
3 changes: 3 additions & 0 deletions spyt-package/src/main/python/spyt/enabler.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,9 @@ def apply_config(self, config):
self.enable_solomon_agent = self._get_enabler(self.enable_solomon_agent, "enable_solomon_agent",
self.SOLOMON_AGENT_KEY)
self.enable_tcp_proxy = self._get_enabler(self.enable_tcp_proxy, "enable_tcp_proxy", self.TCP_PROXY_KEY)
self.enable_preference_ipv6 = self._get_enabler(self.enable_preference_ipv6, "enable_preference_ipv6",
self.IPV6_KEY)
self.enable_squashfs = self._get_enabler(self.enable_squashfs, "enable_squashfs", self.SQUASHFS_KEY)

def __str__(self):
return json.dumps(self.get_conf())
Expand Down

0 comments on commit 44b5244

Please sign in to comment.