From c0de416a1cb4c3daaf378dbed5877aefc1e2e620 Mon Sep 17 00:00:00 2001 From: Jonathan Ellis Date: Thu, 12 Dec 2024 09:16:49 -0600 Subject: [PATCH] detect and reject ambiguous equality predicates; testAmbiguousPredicates passes --- .../cassandra/cql3/SingleColumnRelation.java | 27 +++- .../apache/cassandra/index/IndexRegistry.java | 90 ++++++++++- .../analyzer/AnalyzerEqOperatorSupport.java | 9 +- .../org/apache/cassandra/cql3/CQLTester.java | 5 + .../cassandra/index/sai/cql/BM25Test.java | 146 +++++++++++++++--- 5 files changed, 242 insertions(+), 35 deletions(-) diff --git a/src/java/org/apache/cassandra/cql3/SingleColumnRelation.java b/src/java/org/apache/cassandra/cql3/SingleColumnRelation.java index 89ae1d38b8a3..4dc6aaa78a7c 100644 --- a/src/java/org/apache/cassandra/cql3/SingleColumnRelation.java +++ b/src/java/org/apache/cassandra/cql3/SingleColumnRelation.java @@ -21,6 +21,7 @@ import java.util.ArrayList; import java.util.List; import java.util.Objects; +import java.util.stream.Collectors; import org.apache.cassandra.db.marshal.VectorType; import org.apache.cassandra.index.IndexRegistry; @@ -194,15 +195,27 @@ protected Restriction newEQRestriction(TableMetadata table, VariableSpecificatio if (mapKey == null) { Term term = toTerm(toReceivers(columnDef), value, table.keyspace, boundNames); - var analyzedIndex = IndexRegistry.obtain(table).supportsAnalyzedEq(columnDef); - if (analyzedIndex == null) + // Leave the restriction as EQ if no analyzed index in backwards compatibility mode is present + var ebi = IndexRegistry.obtain(table).getEqBehavior(columnDef); + if (ebi.behavior == IndexRegistry.EqBehavior.EQ) return new SingleColumnRestriction.EQRestriction(columnDef, term); - ClientWarn.instance.warn(String.format(AnalyzerEqOperatorSupport.EQ_RESTRICTION_ON_ANALYZED_WARNING, - columnDef.toString(), - analyzedIndex.getIndexMetadata().name), - columnDef); - return new SingleColumnRestriction.AnalyzerMatchesRestriction(columnDef, term); + // the index is configured to transform EQ into MATCH for backwards compatibility + if (ebi.behavior == IndexRegistry.EqBehavior.MATCH) + { + ClientWarn.instance.warn(String.format(AnalyzerEqOperatorSupport.EQ_RESTRICTION_ON_ANALYZED_WARNING, + columnDef.toString(), + ebi.matchIndex.getIndexMetadata().name), + columnDef); + return new SingleColumnRestriction.AnalyzerMatchesRestriction(columnDef, term); + } + + // multiple indexes support EQ, this is unsupported + assert ebi.behavior == IndexRegistry.EqBehavior.AMBIGUOUS; + throw invalidRequest(AnalyzerEqOperatorSupport.EQ_AMBIGUOUS_ERROR, + columnDef.toString(), + ebi.matchIndex.getIndexMetadata().name, + ebi.eqIndex.getIndexMetadata().name); } List receivers = toReceivers(columnDef); Term entryKey = toTerm(Collections.singletonList(receivers.get(0)), mapKey, table.keyspace, boundNames); diff --git a/src/java/org/apache/cassandra/index/IndexRegistry.java b/src/java/org/apache/cassandra/index/IndexRegistry.java index ec41a068d9ac..0ba923d9b3c3 100644 --- a/src/java/org/apache/cassandra/index/IndexRegistry.java +++ b/src/java/org/apache/cassandra/index/IndexRegistry.java @@ -343,14 +343,94 @@ public static IndexRegistry obtain(TableMetadata table) return table.isVirtual() ? EMPTY : Keyspace.openAndGetStore(table).indexManager; } - default Index supportsAnalyzedEq(ColumnMetadata cm) + enum EqBehavior { + EQ, + MATCH, + AMBIGUOUS + } + + class EqBehaviorIndexes + { + public EqBehavior behavior; + public final Index eqIndex; + public final Index matchIndex; + + private EqBehaviorIndexes(Index eqIndex, Index matchIndex, EqBehavior behavior) + { + this.eqIndex = eqIndex; + this.matchIndex = matchIndex; + this.behavior = behavior; + } + + public static EqBehaviorIndexes eq(Index eqIndex) + { + return new EqBehaviorIndexes(eqIndex, null, EqBehavior.EQ); + } + + public static EqBehaviorIndexes match(Index eqAndMatchIndex) + { + return new EqBehaviorIndexes(eqAndMatchIndex, eqAndMatchIndex, EqBehavior.MATCH); + } + + public static EqBehaviorIndexes ambiguous(Index firstEqIndex, Index secondEqIndex) + { + return new EqBehaviorIndexes(firstEqIndex, secondEqIndex, EqBehavior.AMBIGUOUS); + } + } + + /** + * @return - EQ if a single index supports EQ + * - MATCHES if a single index supports both + * - AMBIGUOUS if multiple indexes support EQ + */ + default EqBehaviorIndexes getEqBehavior(ColumnMetadata cm) + { + // scan the indexes for MATCHES and EQ support + Index matchesIndex = null; + Index eqIndex = null; for (Index index : listIndexes()) { - if (index.supportsExpression(cm, Operator.ANALYZER_MATCHES) && - index.supportsExpression(cm, Operator.EQ)) - return index; + if (index.supportsExpression(cm, Operator.EQ)) + { + if (eqIndex == null) + { + eqIndex = index; + continue; + } + + // If we find a second EQ index, return AMBIGUOUS, taking care to assign the eqIndex and matchIndex correctly + if (index.supportsExpression(cm, Operator.ANALYZER_MATCHES)) + { + matchesIndex = index; + } + else + { + assert eqIndex.supportsExpression(cm, Operator.ANALYZER_MATCHES); + matchesIndex = eqIndex; + eqIndex = index; + } + return EqBehaviorIndexes.ambiguous(eqIndex, matchesIndex); + } + + if (index.supportsExpression(cm, Operator.ANALYZER_MATCHES)) + { + // should only ever have one + assert matchesIndex == null; + matchesIndex = index; + } } - return null; + + // If we didn't find any indexes that support EQ or MATCHES, return EQ + if (eqIndex == null && matchesIndex == null) + return EqBehaviorIndexes.eq(null); + + // If the same index supports both EQ and MATCHES, promote to MATCHES + if (eqIndex == matchesIndex) + return EqBehaviorIndexes.match(matchesIndex); + + // Otherwise we either have no MATCHES index, or it's distinct from the EQ index. + // In both cases we want to return EQ + return EqBehaviorIndexes.eq(eqIndex); } } diff --git a/src/java/org/apache/cassandra/index/sai/analyzer/AnalyzerEqOperatorSupport.java b/src/java/org/apache/cassandra/index/sai/analyzer/AnalyzerEqOperatorSupport.java index 116bc7f62832..30408c9b986f 100644 --- a/src/java/org/apache/cassandra/index/sai/analyzer/AnalyzerEqOperatorSupport.java +++ b/src/java/org/apache/cassandra/index/sai/analyzer/AnalyzerEqOperatorSupport.java @@ -49,7 +49,7 @@ public class AnalyzerEqOperatorSupport OPTION, Arrays.toString(Value.values())); public static final String EQ_RESTRICTION_ON_ANALYZED_WARNING = - String.format("Columns [%%s] are restricted by '=' and have analyzed indexes [%%s] able to process those restrictions. " + + String.format("Column [%%s] is restricted by '=' and has an analyzed index [%%s] able to process those restrictions. " + "Analyzed indexes might process '=' restrictions in a way that is inconsistent with non-indexed queries. " + "While '=' is still supported on analyzed indexes for backwards compatibility, " + "it is recommended to use the ':' operator instead to prevent the ambiguity. " + @@ -58,6 +58,13 @@ public class AnalyzerEqOperatorSupport "please use '%s':'%s' in the index options.", OPTION, Value.UNSUPPORTED.toString().toLowerCase()); + public static final String EQ_AMBIGUOUS_ERROR = + String.format("Column [%%s] equality predicate is ambiguous. It has both an analyzed index [%%s] configured with '%s':'%s', " + + "and an un-analyzed index [%%s]. " + + "To avoid ambiguity, drop the analyzed index and recreate it with option '%s':'%s'.", + OPTION, Value.MATCH.toString().toLowerCase(), OPTION, Value.UNSUPPORTED.toString().toLowerCase()); + + public static final String LWT_CONDITION_ON_ANALYZED_WARNING = "Index analyzers not applied to LWT conditions on columns [%s]."; diff --git a/test/unit/org/apache/cassandra/cql3/CQLTester.java b/test/unit/org/apache/cassandra/cql3/CQLTester.java index 5a6ec34b02ef..a28a76b95524 100644 --- a/test/unit/org/apache/cassandra/cql3/CQLTester.java +++ b/test/unit/org/apache/cassandra/cql3/CQLTester.java @@ -803,6 +803,11 @@ protected String currentIndex() return indexes.get(indexes.size() - 1); } + protected String getIndex(int i) + { + return indexes.get(i); + } + protected Collection currentTables() { if (tables == null || tables.isEmpty()) diff --git a/test/unit/org/apache/cassandra/index/sai/cql/BM25Test.java b/test/unit/org/apache/cassandra/index/sai/cql/BM25Test.java index cd6457f162fe..69d51a19be44 100644 --- a/test/unit/org/apache/cassandra/index/sai/cql/BM25Test.java +++ b/test/unit/org/apache/cassandra/index/sai/cql/BM25Test.java @@ -20,33 +20,16 @@ import org.junit.Test; -import org.apache.cassandra.exceptions.InvalidRequestException; import org.apache.cassandra.index.sai.SAITester; import org.apache.cassandra.index.sai.plan.QueryController; -import static org.assertj.core.api.Assertions.assertThatThrownBy; +import static org.apache.cassandra.index.sai.analyzer.AnalyzerEqOperatorSupport.EQ_AMBIGUOUS_ERROR; import static org.assertj.core.api.AssertionsForInterfaceTypes.assertThat; public class BM25Test extends SAITester { @Test - public void testMatchingAllowed() throws Throwable - { - // match operator should be allowed with BM25 on the same column - // (seems obvious but exercises a corner case in the internal RestrictionSet processing) - createSimpleTable(); - - execute("INSERT INTO %s (k, v) VALUES (1, 'apple')"); - - beforeAndAfterFlush(() -> - { - var result = execute("SELECT k FROM %s WHERE v : 'apple' ORDER BY v BM25 OF 'apple' LIMIT 3"); - assertRows(result, row(1)); - }); - } - - @Test - public void testTwoIndexes() throws Throwable + public void testTwoIndexes() { // create un-analyzed index createTable("CREATE TABLE %s (k int PRIMARY KEY, v text)"); @@ -54,9 +37,8 @@ public void testTwoIndexes() throws Throwable execute("INSERT INTO %s (k, v) VALUES (1, 'apple')"); // BM25 should fail with only an equality index - assertThatThrownBy(() -> execute("SELECT k FROM %s WHERE v : 'apple' ORDER BY v BM25 OF 'apple' LIMIT 3")) - .isInstanceOf(InvalidRequestException.class) - .hasMessage("BM25 ordering on column v requires an analyzed index"); + assertInvalidMessage("BM25 ordering on column v requires an analyzed index", + "SELECT k FROM %s WHERE v : 'apple' ORDER BY v BM25 OF 'apple' LIMIT 3"); // create analyzed index analyzeIndex(); @@ -65,6 +47,126 @@ public void testTwoIndexes() throws Throwable assertRows(result, row(1)); } + @Test + public void testTwoIndexesAmbiguousPredicate() throws Throwable + { + createTable("CREATE TABLE %s (k int PRIMARY KEY, v text)"); + + // Create analyzed and un-analyzed indexes + analyzeIndex(); + createIndex("CREATE CUSTOM INDEX ON %s(v) USING 'org.apache.cassandra.index.sai.StorageAttachedIndex'"); + + execute("INSERT INTO %s (k, v) VALUES (1, 'apple')"); + execute("INSERT INTO %s (k, v) VALUES (2, 'apple juice')"); + execute("INSERT INTO %s (k, v) VALUES (3, 'orange juice')"); + + // equality predicate is ambiguous (both analyzed and un-analyzed indexes could support it) so it should + // be rejected + beforeAndAfterFlush(() -> { + // Single predicate + assertInvalidMessage(String.format(EQ_AMBIGUOUS_ERROR, "v", getIndex(0), getIndex(1)), + "SELECT k FROM %s WHERE v = 'apple'"); + + // AND + assertInvalidMessage(String.format(EQ_AMBIGUOUS_ERROR, "v", getIndex(0), getIndex(1)), + "SELECT k FROM %s WHERE v = 'apple' AND v : 'juice'"); + + // OR + assertInvalidMessage(String.format(EQ_AMBIGUOUS_ERROR, "v", getIndex(0), getIndex(1)), + "SELECT k FROM %s WHERE v = 'apple' OR v : 'juice'"); + }); + } + + @Test + public void testTwoIndexesWithEqualsUnsupported() throws Throwable + { + createTable("CREATE TABLE %s (k int PRIMARY KEY, v text)"); + createIndex("CREATE CUSTOM INDEX ON %s(v) USING 'org.apache.cassandra.index.sai.StorageAttachedIndex'"); + // analyzed index with equals_behavior:unsupported option + createIndex("CREATE CUSTOM INDEX ON %s(v) USING 'org.apache.cassandra.index.sai.StorageAttachedIndex' " + + "WITH OPTIONS = { 'equals_behaviour_when_analyzed': 'unsupported', " + + "'index_analyzer':'{\"tokenizer\":{\"name\":\"standard\"},\"filters\":[{\"name\":\"porterstem\"}]}' }"); + + execute("INSERT INTO %s (k, v) VALUES (1, 'apple')"); + execute("INSERT INTO %s (k, v) VALUES (2, 'apple juice')"); + + beforeAndAfterFlush(() -> { + // combining two EQ predicates is not allowed + assertInvalid("SELECT k FROM %s WHERE v = 'apple' AND v = 'juice'"); + + // combining EQ and MATCH predicates is also not allowed (when we're not converting EQ to MATCH) + assertInvalid("SELECT k FROM %s WHERE v = 'apple' AND v : 'apple'"); + + // combining two MATCH predicates is fine + assertRows(execute("SELECT k FROM %s WHERE v : 'apple' AND v : 'juice'"), + row(2)); + + // = operator should use un-analyzed index since equals is unsupported in analyzed index + assertRows(execute("SELECT k FROM %s WHERE v = 'apple'"), + row(1)); + + // : operator should use analyzed index + assertRows(execute("SELECT k FROM %s WHERE v : 'apple'"), + row(1), row(2)); + }); + } + + @Test + public void testComplexQueriesWithMultipleIndexes() throws Throwable + { + createTable("CREATE TABLE %s (k int PRIMARY KEY, v1 text, v2 text, v3 int)"); + + // Create mix of analyzed, unanalyzed, and non-text indexes + createIndex("CREATE CUSTOM INDEX ON %s(v1) USING 'org.apache.cassandra.index.sai.StorageAttachedIndex'"); + createIndex("CREATE CUSTOM INDEX ON %s(v2) " + + "USING 'org.apache.cassandra.index.sai.StorageAttachedIndex' " + + "WITH OPTIONS = {" + + "'index_analyzer': '{" + + "\"tokenizer\" : {\"name\" : \"standard\"}, " + + "\"filters\" : [{\"name\" : \"porterstem\"}]" + + "}'" + + "}"); + createIndex("CREATE CUSTOM INDEX ON %s(v3) USING 'org.apache.cassandra.index.sai.StorageAttachedIndex'"); + + execute("INSERT INTO %s (k, v1, v2, v3) VALUES (1, 'apple', 'orange juice', 5)"); + execute("INSERT INTO %s (k, v1, v2, v3) VALUES (2, 'apple juice', 'apple', 10)"); + execute("INSERT INTO %s (k, v1, v2, v3) VALUES (3, 'banana', 'grape juice', 5)"); + + beforeAndAfterFlush(() -> { + // Complex query mixing different types of indexes and operators + assertRows(execute("SELECT k FROM %s WHERE v1 = 'apple' AND v2 : 'juice' AND v3 = 5"), + row(1)); + + // Mix of AND and OR conditions across different index types + assertRows(execute("SELECT k FROM %s WHERE v3 = 5 AND (v1 = 'apple' OR v2 : 'apple')"), + row(1)); + + // Multi-term analyzed query + assertRows(execute("SELECT k FROM %s WHERE v2 : 'orange juice'"), + row(1)); + + // Range query with text match + assertRows(execute("SELECT k FROM %s WHERE v3 >= 5 AND v2 : 'juice'"), + row(1), row(3)); + }); + } + + @Test + public void testMatchingAllowed() throws Throwable + { + // match operator should be allowed with BM25 on the same column + // (seems obvious but exercises a corner case in the internal RestrictionSet processing) + createSimpleTable(); + + execute("INSERT INTO %s (k, v) VALUES (1, 'apple')"); + + beforeAndAfterFlush(() -> + { + var result = execute("SELECT k FROM %s WHERE v : 'apple' ORDER BY v BM25 OF 'apple' LIMIT 3"); + assertRows(result, row(1)); + }); + } + @Test public void testTermFrequencyOrdering() throws Throwable {