Skip to content

Commit

Permalink
Merge branch 'master' into parse_json
Browse files: browse the repository at this point in the history
  • Loading branch information
chenhao-db committed Mar 12, 2024
2 parents a56b0db + 8fcef16 commit 9546bbd
Show file tree
Hide file tree
Showing 186 changed files with 5,205 additions and 2,163 deletions.
7 changes: 3 additions & 4 deletions .github/workflows/build_and_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -252,7 +252,7 @@ jobs:
- name: Install Python packages (Python 3.9)
if: (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-')) || contains(matrix.modules, 'connect')
run: |
python3.9 -m pip install 'numpy>=1.20.0' pyarrow pandas scipy unittest-xml-reporting 'lxml==4.9.4' 'grpcio==1.59.3' 'grpcio-status==1.59.3' 'protobuf==4.25.1'
python3.9 -m pip install 'numpy>=1.20.0' pyarrow pandas scipy unittest-xml-reporting 'lxml==4.9.4' 'grpcio==1.62.0' 'grpcio-status==1.62.0' 'protobuf==4.25.1'
python3.9 -m pip list
# Run the tests.
- name: Run tests
Expand Down Expand Up @@ -574,9 +574,8 @@ jobs:
git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD
git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty
- name: Install Buf
uses: bufbuild/buf-setup-action@v1.29.0
uses: bufbuild/buf-setup-action@v1
with:
version: 1.29.0
github_token: ${{ secrets.GITHUB_TOKEN }}
- name: Protocol Buffers Linter
uses: bufbuild/buf-lint-action@v1
Expand Down Expand Up @@ -703,7 +702,7 @@ jobs:
python3.9 -m pip install 'sphinx==4.5.0' mkdocs 'pydata_sphinx_theme>=0.13' sphinx-copybutton nbsphinx numpydoc jinja2 markupsafe 'pyzmq<24.0.0' \
ipython ipython_genutils sphinx_plotly_directive 'numpy>=1.20.0' pyarrow pandas 'plotly>=4.8' 'docutils<0.18.0' \
'flake8==3.9.0' 'mypy==1.8.0' 'pytest==7.1.3' 'pytest-mypy-plugins==1.9.3' 'black==23.9.1' \
'pandas-stubs==1.2.0.53' 'grpcio==1.59.3' 'grpc-stubs==1.24.11' 'googleapis-common-protos-stubs==2.2.0' \
'pandas-stubs==1.2.0.53' 'grpcio==1.62.0' 'grpc-stubs==1.24.11' 'googleapis-common-protos-stubs==2.2.0' \
'sphinxcontrib-applehelp==1.0.4' 'sphinxcontrib-devhelp==1.0.2' 'sphinxcontrib-htmlhelp==2.0.1' 'sphinxcontrib-qthelp==1.0.3' 'sphinxcontrib-serializinghtml==1.1.5'
python3.9 -m pip list
- name: Python linter
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/maven_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,7 @@ jobs:
- name: Install Python packages (Python 3.11)
if: (contains(matrix.modules, 'sql#core')) || contains(matrix.modules, 'connect')
run: |
python3.11 -m pip install 'numpy>=1.20.0' pyarrow pandas scipy unittest-xml-reporting 'grpcio==1.59.3' 'grpcio-status==1.59.3' 'protobuf==4.25.1'
python3.11 -m pip install 'numpy>=1.20.0' pyarrow pandas scipy unittest-xml-reporting 'grpcio==1.62.0' 'grpcio-status==1.62.0' 'protobuf==4.25.1'
python3.11 -m pip list
# Run the tests.
- name: Run tests
Expand Down
3 changes: 1 addition & 2 deletions R/pkg/tests/fulltests/test_streaming.R
Original file line number Diff line number Diff line change
Expand Up @@ -257,8 +257,7 @@ test_that("Trigger", {
"Value for trigger.processingTime must be a non-empty string.")

expect_error(write.stream(df, "memory", queryName = "times", outputMode = "append",
trigger.processingTime = "invalid"),
"Error parsing 'invalid' to interval, unrecognized number 'invalid'")
trigger.processingTime = "invalid"))

expect_error(write.stream(df, "memory", queryName = "times", outputMode = "append",
trigger.once = ""), "Value for trigger.once must be TRUE.")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ public Collation(
// No custom comparators will be used for this collation.
// Instead, we rely on byte-for-byte comparison.
collationTable[0] = new Collation(
"UCS_BASIC",
"UTF8_BINARY",
null,
UTF8String::binaryCompare,
"1.0",
Expand All @@ -127,7 +127,7 @@ public Collation(
// Case-insensitive UTF8 binary collation.
// TODO: Do in-place comparisons instead of creating new strings.
collationTable[1] = new Collation(
"UCS_BASIC_LCASE",
"UTF8_BINARY_LCASE",
null,
(s1, s2) -> s1.toLowerCase().binaryCompare(s2.toLowerCase()),
"1.0",
Expand All @@ -138,11 +138,13 @@ public Collation(
collationTable[2] = new Collation(
"UNICODE", Collator.getInstance(ULocale.ROOT), "153.120.0.0", true);
collationTable[2].collator.setStrength(Collator.TERTIARY);
collationTable[2].collator.freeze();

// UNICODE case-insensitive comparison (ROOT locale, in ICU + Secondary strength).
collationTable[3] = new Collation(
"UNICODE_CI", Collator.getInstance(ULocale.ROOT), "153.120.0.0", false);
collationTable[3].collator.setStrength(Collator.SECONDARY);
collationTable[3].collator.freeze();

for (int i = 0; i < collationTable.length; i++) {
collationNameToIdMap.put(collationTable[i].collationName, i);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -379,10 +379,14 @@ public boolean matchAt(final UTF8String s, int pos) {
}

private boolean matchAt(final UTF8String s, int pos, int collationId) {
if (s.numBytes + pos > numBytes || pos < 0) {
if (s.numChars() + pos > this.numChars() || pos < 0) {
return false;
}
return this.substring(pos, pos + s.numBytes).semanticCompare(s, collationId) == 0;
if (s.numBytes == 0 || this.numBytes == 0) {
return s.numBytes == 0;
}
return CollationFactory.getStringSearch(this.substring(pos, pos + s.numChars()),
s, collationId).last() == 0;
}

public boolean startsWith(final UTF8String prefix) {
Expand Down Expand Up @@ -1456,7 +1460,7 @@ public int compareTo(@Nonnull final UTF8String other) {
}

/**
* Binary comparison of two UTF8String. Can only be used for default UCS_BASIC collation.
* Binary comparison of two UTF8String. Can only be used for default UTF8_BINARY collation.
*/
public int binaryCompare(final UTF8String other) {
return ByteArray.compareBinary(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,13 +29,13 @@ import org.apache.spark.unsafe.types.UTF8String.{fromString => toUTF8}

class CollationFactorySuite extends AnyFunSuite with Matchers { // scalastyle:ignore funsuite
test("collationId stability") {
val ucsBasic = fetchCollation(0)
assert(ucsBasic.collationName == "UCS_BASIC")
assert(ucsBasic.isBinaryCollation)
val utf8Binary = fetchCollation(0)
assert(utf8Binary.collationName == "UTF8_BINARY")
assert(utf8Binary.isBinaryCollation)

val ucsBasicLcase = fetchCollation(1)
assert(ucsBasicLcase.collationName == "UCS_BASIC_LCASE")
assert(!ucsBasicLcase.isBinaryCollation)
val utf8BinaryLcase = fetchCollation(1)
assert(utf8BinaryLcase.collationName == "UTF8_BINARY_LCASE")
assert(!utf8BinaryLcase.isBinaryCollation)

val unicode = fetchCollation(2)
assert(unicode.collationName == "UNICODE")
Expand All @@ -48,27 +48,27 @@ class CollationFactorySuite extends AnyFunSuite with Matchers { // scalastyle:ig

test("fetch invalid collation name") {
val error = intercept[SparkException] {
fetchCollation("UCS_BASIS")
fetchCollation("UTF8_BS")
}

assert(error.getErrorClass === "COLLATION_INVALID_NAME")
assert(error.getMessageParameters.asScala ===
Map("proposal" -> "UCS_BASIC", "collationName" -> "UCS_BASIS"))
Map("proposal" -> "UTF8_BINARY", "collationName" -> "UTF8_BS"))
}

case class CollationTestCase[R](collationName: String, s1: String, s2: String, expectedResult: R)

test("collation aware equality and hash") {
val checks = Seq(
CollationTestCase("UCS_BASIC", "aaa", "aaa", true),
CollationTestCase("UCS_BASIC", "aaa", "AAA", false),
CollationTestCase("UCS_BASIC", "aaa", "bbb", false),
CollationTestCase("UCS_BASIC_LCASE", "aaa", "aaa", true),
CollationTestCase("UCS_BASIC_LCASE", "aaa", "AAA", true),
CollationTestCase("UCS_BASIC_LCASE", "aaa", "AaA", true),
CollationTestCase("UCS_BASIC_LCASE", "aaa", "AaA", true),
CollationTestCase("UCS_BASIC_LCASE", "aaa", "aa", false),
CollationTestCase("UCS_BASIC_LCASE", "aaa", "bbb", false),
CollationTestCase("UTF8_BINARY", "aaa", "aaa", true),
CollationTestCase("UTF8_BINARY", "aaa", "AAA", false),
CollationTestCase("UTF8_BINARY", "aaa", "bbb", false),
CollationTestCase("UTF8_BINARY_LCASE", "aaa", "aaa", true),
CollationTestCase("UTF8_BINARY_LCASE", "aaa", "AAA", true),
CollationTestCase("UTF8_BINARY_LCASE", "aaa", "AaA", true),
CollationTestCase("UTF8_BINARY_LCASE", "aaa", "AaA", true),
CollationTestCase("UTF8_BINARY_LCASE", "aaa", "aa", false),
CollationTestCase("UTF8_BINARY_LCASE", "aaa", "bbb", false),
CollationTestCase("UNICODE", "aaa", "aaa", true),
CollationTestCase("UNICODE", "aaa", "AAA", false),
CollationTestCase("UNICODE", "aaa", "bbb", false),
Expand All @@ -89,16 +89,16 @@ class CollationFactorySuite extends AnyFunSuite with Matchers { // scalastyle:ig

test("collation aware compare") {
val checks = Seq(
CollationTestCase("UCS_BASIC", "aaa", "aaa", 0),
CollationTestCase("UCS_BASIC", "aaa", "AAA", 1),
CollationTestCase("UCS_BASIC", "aaa", "bbb", -1),
CollationTestCase("UCS_BASIC", "aaa", "BBB", 1),
CollationTestCase("UCS_BASIC_LCASE", "aaa", "aaa", 0),
CollationTestCase("UCS_BASIC_LCASE", "aaa", "AAA", 0),
CollationTestCase("UCS_BASIC_LCASE", "aaa", "AaA", 0),
CollationTestCase("UCS_BASIC_LCASE", "aaa", "AaA", 0),
CollationTestCase("UCS_BASIC_LCASE", "aaa", "aa", 1),
CollationTestCase("UCS_BASIC_LCASE", "aaa", "bbb", -1),
CollationTestCase("UTF8_BINARY", "aaa", "aaa", 0),
CollationTestCase("UTF8_BINARY", "aaa", "AAA", 1),
CollationTestCase("UTF8_BINARY", "aaa", "bbb", -1),
CollationTestCase("UTF8_BINARY", "aaa", "BBB", 1),
CollationTestCase("UTF8_BINARY_LCASE", "aaa", "aaa", 0),
CollationTestCase("UTF8_BINARY_LCASE", "aaa", "AAA", 0),
CollationTestCase("UTF8_BINARY_LCASE", "aaa", "AaA", 0),
CollationTestCase("UTF8_BINARY_LCASE", "aaa", "AaA", 0),
CollationTestCase("UTF8_BINARY_LCASE", "aaa", "aa", 1),
CollationTestCase("UTF8_BINARY_LCASE", "aaa", "bbb", -1),
CollationTestCase("UNICODE", "aaa", "aaa", 0),
CollationTestCase("UNICODE", "aaa", "AAA", -1),
CollationTestCase("UNICODE", "aaa", "bbb", -1),
Expand Down
Loading

0 comments on commit 9546bbd

Please sign in to comment.