From f1475b1e76de10b59902283a33592e4956ce4a2f Mon Sep 17 00:00:00 2001 From: Oleksandr Kolomiiets Date: Thu, 25 Apr 2024 08:09:08 -0700 Subject: [PATCH 1/8] Increase wait time SearchIdleIT#testSearchIdleStats (#107881) --- .../java/org/elasticsearch/index/shard/SearchIdleIT.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/src/internalClusterTest/java/org/elasticsearch/index/shard/SearchIdleIT.java b/server/src/internalClusterTest/java/org/elasticsearch/index/shard/SearchIdleIT.java index 199a397f52ad2..d9100e1e631db 100644 --- a/server/src/internalClusterTest/java/org/elasticsearch/index/shard/SearchIdleIT.java +++ b/server/src/internalClusterTest/java/org/elasticsearch/index/shard/SearchIdleIT.java @@ -224,7 +224,7 @@ public void testSearchIdleStats() throws InterruptedException { .get(); waitUntil( () -> Arrays.stream(indicesAdmin().prepareStats(indexName).get().getShards()).allMatch(ShardStats::isSearchIdle), - searchIdleAfter, + searchIdleAfter + 1, TimeUnit.SECONDS ); From c03dfb1ddd3bd4409e23f08d44b473dcb7c44c08 Mon Sep 17 00:00:00 2001 From: David Kyle Date: Thu, 25 Apr 2024 16:18:51 +0100 Subject: [PATCH 2/8] finish muting (#107906) Follow on from #107898 which left some tests unmuted --- .../elasticsearch/xpack/application/CohereServiceUpgradeIT.java | 1 + .../xpack/application/HuggingFaceServiceUpgradeIT.java | 1 + .../elasticsearch/xpack/application/OpenAiServiceUpgradeIT.java | 1 + 3 files changed, 3 insertions(+) diff --git a/x-pack/plugin/inference/qa/rolling-upgrade/src/javaRestTest/java/org/elasticsearch/xpack/application/CohereServiceUpgradeIT.java b/x-pack/plugin/inference/qa/rolling-upgrade/src/javaRestTest/java/org/elasticsearch/xpack/application/CohereServiceUpgradeIT.java index 73676aa730883..c73827dba2cbb 100644 --- a/x-pack/plugin/inference/qa/rolling-upgrade/src/javaRestTest/java/org/elasticsearch/xpack/application/CohereServiceUpgradeIT.java +++ b/x-pack/plugin/inference/qa/rolling-upgrade/src/javaRestTest/java/org/elasticsearch/xpack/application/CohereServiceUpgradeIT.java @@ -169,6 +169,7 @@ void assertEmbeddingInference(String inferenceId, CohereEmbeddingType type) thro } @SuppressWarnings("unchecked") + @AwaitsFix(bugUrl = "https://github.com/elastic/elasticsearch/issues/107887") public void testRerank() throws IOException { var rerankSupported = getOldClusterTestVersion().onOrAfter(COHERE_RERANK_ADDED); assumeTrue("Cohere rerank service added in " + COHERE_RERANK_ADDED, rerankSupported); diff --git a/x-pack/plugin/inference/qa/rolling-upgrade/src/javaRestTest/java/org/elasticsearch/xpack/application/HuggingFaceServiceUpgradeIT.java b/x-pack/plugin/inference/qa/rolling-upgrade/src/javaRestTest/java/org/elasticsearch/xpack/application/HuggingFaceServiceUpgradeIT.java index 7e78b83223acf..718678f97f37f 100644 --- a/x-pack/plugin/inference/qa/rolling-upgrade/src/javaRestTest/java/org/elasticsearch/xpack/application/HuggingFaceServiceUpgradeIT.java +++ b/x-pack/plugin/inference/qa/rolling-upgrade/src/javaRestTest/java/org/elasticsearch/xpack/application/HuggingFaceServiceUpgradeIT.java @@ -100,6 +100,7 @@ void assertEmbeddingInference(String inferenceId) throws IOException { } @SuppressWarnings("unchecked") + @AwaitsFix(bugUrl = "https://github.com/elastic/elasticsearch/issues/107887") public void testElser() throws IOException { var supported = getOldClusterTestVersion().onOrAfter(HF_ELSER_ADDED); assumeTrue("HF elser service added in " + HF_ELSER_ADDED, supported); diff --git 
a/x-pack/plugin/inference/qa/rolling-upgrade/src/javaRestTest/java/org/elasticsearch/xpack/application/OpenAiServiceUpgradeIT.java b/x-pack/plugin/inference/qa/rolling-upgrade/src/javaRestTest/java/org/elasticsearch/xpack/application/OpenAiServiceUpgradeIT.java index ac0a71ebb2c82..4e8e1c845b070 100644 --- a/x-pack/plugin/inference/qa/rolling-upgrade/src/javaRestTest/java/org/elasticsearch/xpack/application/OpenAiServiceUpgradeIT.java +++ b/x-pack/plugin/inference/qa/rolling-upgrade/src/javaRestTest/java/org/elasticsearch/xpack/application/OpenAiServiceUpgradeIT.java @@ -111,6 +111,7 @@ void assertEmbeddingInference(String inferenceId) throws IOException { } @SuppressWarnings("unchecked") + @AwaitsFix(bugUrl = "https://github.com/elastic/elasticsearch/issues/107887") public void testOpenAiCompletions() throws IOException { var openAiEmbeddingsSupported = getOldClusterTestVersion().onOrAfter(OPEN_AI_COMPLETIONS_ADDED); assumeTrue("OpenAI completions service added in " + OPEN_AI_COMPLETIONS_ADDED, openAiEmbeddingsSupported); From 4e7a833418f73c830ac583c67bdb2674ca5b7e22 Mon Sep 17 00:00:00 2001 From: Mark Vieira Date: Thu, 25 Apr 2024 08:42:39 -0700 Subject: [PATCH 3/8] Reduce inference rolling upgrade test parallelism --- x-pack/plugin/inference/qa/rolling-upgrade/build.gradle | 1 + 1 file changed, 1 insertion(+) diff --git a/x-pack/plugin/inference/qa/rolling-upgrade/build.gradle b/x-pack/plugin/inference/qa/rolling-upgrade/build.gradle index 328444dacaf53..381f46cdf22bd 100644 --- a/x-pack/plugin/inference/qa/rolling-upgrade/build.gradle +++ b/x-pack/plugin/inference/qa/rolling-upgrade/build.gradle @@ -28,6 +28,7 @@ BuildParams.bwcVersions.withWireCompatible(v -> v.after("8.11.0")) { bwcVersion, tasks.register(bwcTaskName(bwcVersion), StandaloneRestIntegTestTask) { usesBwcDistribution(bwcVersion) systemProperty("tests.old_cluster_version", bwcVersion) + maxParallelForks = 1 } } From a21242054b87e456f6e301dfa017e612b83ebbd6 Mon Sep 17 00:00:00 2001 From: Bogdan Pintea Date: Thu, 25 Apr 2024 18:38:12 +0200 Subject: [PATCH 4/8] ESQL: Document BUCKET as a grouping function (#107864) This adds the documentation for BUCKET as a grouping function and the addition of the "direct" invocation mode providing a span (in addition to the auto mode). 
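To make the two modes concrete before the diffs below, here is a rough illustrative sketch in Python (not the actual implementation; `pick_nice_span` is a hypothetical stand-in for the span-selection heuristic, which in reality rounds to calendar-friendly units such as weeks and months):

[source,python]
----
import math

def bucket_with_span(value, span):
    # Direct mode, BUCKET(field, span): floor the value down to a multiple of span.
    return math.floor(value / span) * span

def pick_nice_span(target_buckets, lo, hi):
    # Hypothetical stand-in for the auto-mode heuristic: round the raw span up
    # to a "nice" value so that at most target_buckets buckets are produced.
    raw = (hi - lo) / target_buckets
    nice = 1.0
    while nice < raw:
        nice *= 10.0
    return nice

def bucket_auto(value, target_buckets, lo, hi):
    # Auto mode, BUCKET(field, target_buckets, lo, hi): derive a span, then bucket.
    return bucket_with_span(value, pick_nice_span(target_buckets, lo, hi))

# BUCKET(salary, 5000.): a salary of 25324 lands in the 25000.0 bucket,
# matching the docsBucketNumericWithSpan example added below.
assert bucket_with_span(25324, 5000.0) == 25000.0
----
// NOTCONSOLE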
--- .../esql/esql-functions-operators.asciidoc | 7 ++ docs/reference/esql/esql-get-started.asciidoc | 7 -- .../functions/aggregation-functions.asciidoc | 2 +- .../functions/date-time-functions.asciidoc | 2 - .../esql/functions/examples/bucket.asciidoc | 48 +++++++++- .../functions/grouping-functions.asciidoc | 14 +++ .../functions/kibana/definition/bucket.json | 2 + .../src/main/resources/bucket.csv-spec | 90 ++++++++++++------- .../expression/function/grouping/Bucket.java | 18 +++- 9 files changed, 145 insertions(+), 45 deletions(-) create mode 100644 docs/reference/esql/functions/grouping-functions.asciidoc diff --git a/docs/reference/esql/esql-functions-operators.asciidoc b/docs/reference/esql/esql-functions-operators.asciidoc index ddc077f3b8ff8..3ad61a8d56455 100644 --- a/docs/reference/esql/esql-functions-operators.asciidoc +++ b/docs/reference/esql/esql-functions-operators.asciidoc @@ -16,6 +16,12 @@ The reference documentation is divided into the following categories: include::functions/aggregation-functions.asciidoc[tag=agg_list] ==== +.*Grouping functions* +[%collapsible] +==== +include::functions/grouping-functions.asciidoc[tag=group_list] +==== + .*Math functions* [%collapsible] ==== @@ -68,6 +74,7 @@ include::functions/operators.asciidoc[tag=op_list] ==== include::functions/aggregation-functions.asciidoc[] +include::functions/grouping-functions.asciidoc[] include::functions/math-functions.asciidoc[] include::functions/string-functions.asciidoc[] include::functions/date-time-functions.asciidoc[] diff --git a/docs/reference/esql/esql-get-started.asciidoc b/docs/reference/esql/esql-get-started.asciidoc index 0e23c0d97e61b..663b2f8ecd249 100644 --- a/docs/reference/esql/esql-get-started.asciidoc +++ b/docs/reference/esql/esql-get-started.asciidoc @@ -244,13 +244,6 @@ To track statistics over time, {esql} enables you to create histograms using the and returns a value for each row that corresponds to the resulting bucket the row falls into. -For example, to create hourly buckets for the data on October 23rd: - -[source,esql] ----- -include::{esql-specs}/bucket.csv-spec[tag=gs-bucket] ----- - Combine `BUCKET` with <> to create a histogram. 
For example, to count the number of events per hour: diff --git a/docs/reference/esql/functions/aggregation-functions.asciidoc b/docs/reference/esql/functions/aggregation-functions.asciidoc index 2fdc8582d6bfb..074fcce9ad43d 100644 --- a/docs/reference/esql/functions/aggregation-functions.asciidoc +++ b/docs/reference/esql/functions/aggregation-functions.asciidoc @@ -5,7 +5,7 @@ Aggregate functions ++++ -The <> function supports these aggregate functions: +The <> command supports these aggregate functions: // tag::agg_list[] * <> diff --git a/docs/reference/esql/functions/date-time-functions.asciidoc b/docs/reference/esql/functions/date-time-functions.asciidoc index 75aeea40bc608..8ce26eaabe381 100644 --- a/docs/reference/esql/functions/date-time-functions.asciidoc +++ b/docs/reference/esql/functions/date-time-functions.asciidoc @@ -8,7 +8,6 @@ {esql} supports these date-time functions: // tag::date_list[] -* <> * <> * <> * <> @@ -17,7 +16,6 @@ * <> // end::date_list[] -include::layout/bucket.asciidoc[] include::layout/date_diff.asciidoc[] include::layout/date_extract.asciidoc[] include::layout/date_format.asciidoc[] diff --git a/docs/reference/esql/functions/examples/bucket.asciidoc b/docs/reference/esql/functions/examples/bucket.asciidoc index 0854840ffda34..f66f737b7d4b5 100644 --- a/docs/reference/esql/functions/examples/bucket.asciidoc +++ b/docs/reference/esql/functions/examples/bucket.asciidoc @@ -2,6 +2,10 @@ *Examples* +`BUCKET` can work in two modes: one in which the bucket size is computed +automatically from a recommended bucket count and a range (four parameters), and +another in which the bucket size is provided directly (two parameters). + Using a target number of buckets, a start of a range, and an end of a range, `BUCKET` picks an appropriate bucket size to generate the target number of buckets or fewer. For example, asking for at most 20 buckets over a year results in monthly buckets: [source.merge.styled,esql] ---- include::{esql-specs}/bucket.csv-spec[tag=docsBucketMonth] ---- [%header.monospaced.styled,format=dsv,separator=|] |=== include::{esql-specs}/bucket.csv-spec[tag=docsBucketMonth-result] |=== The goal isn't to provide *exactly* the target number of buckets, it's to pick a range that people are comfortable with that provides at most the target number of buckets. -Combine `BUCKET` with <> to create a histogram: +Combine `BUCKET` with an <> to create a histogram: [source.merge.styled,esql] ---- include::{esql-specs}/bucket.csv-spec[tag=docsBucketMonthlyHistogram] ---- [%header.monospaced.styled,format=dsv,separator=|] |=== include::{esql-specs}/bucket.csv-spec[tag=docsBucketMonthlyHistogram-result] |=== NOTE: `BUCKET` does not create buckets that don't match any documents. -+ "That's why this example is missing `1985-03-01` and other dates. +That's why this example is missing `1985-03-01` and other dates. Asking for more buckets can result in a smaller range. For example, asking for at most 100 buckets in a year results in weekly buckets: [source.merge.styled,esql] ---- include::{esql-specs}/bucket.csv-spec[tag=docsBucketWeeklyHistogram] ---- [%header.monospaced.styled,format=dsv,separator=|] |=== include::{esql-specs}/bucket.csv-spec[tag=docsBucketWeeklyHistogram-result] |=== NOTE: `BUCKET` does not filter any rows. It only uses the provided range to pick a good bucket size. For rows with a value outside of the range, it returns a bucket value that corresponds to a bucket outside the range. Combine `BUCKET` with <> to filter rows.
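That note can be made concrete with a short illustrative sketch (plain Python, not the actual implementation): bucketing is a pure flooring operation, so values outside the provided range still map to buckets, just to buckets outside the range:

[source,python]
----
import math

# Illustrative only: BUCKET never drops a row. Assuming a span of 5 was picked
# from a range of [0, 100), out-of-range values still receive bucket keys.
def numeric_bucket(value, span):
    return math.floor(value / span) * span

assert numeric_bucket(-7, 5) == -10    # below the range start, still bucketed
assert numeric_bucket(103, 5) == 100   # beyond the range end, still bucketed
----
// NOTCONSOLE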
+If the desired bucket size is known in advance, simply provide it as the second +argument, leaving the range out: +[source.merge.styled,esql] +---- +include::{esql-specs}/bucket.csv-spec[tag=docsBucketWeeklyHistogramWithSpan] +---- +[%header.monospaced.styled,format=dsv,separator=|] +|=== +include::{esql-specs}/bucket.csv-spec[tag=docsBucketWeeklyHistogramWithSpan-result] +|=== + +NOTE: When providing the bucket size as the second parameter, it must be a time +duration or date period. + `BUCKET` can also operate on numeric fields. For example, to create a salary histogram: [source.merge.styled,esql] ---- @@ -58,6 +76,20 @@ include::{esql-specs}/bucket.csv-spec[tag=docsBucketNumeric-result] Unlike the earlier example that intentionally filters on a date range, you rarely want to filter on a numeric range. You have to find the `min` and `max` separately. {esql} doesn't yet have an easy way to do that automatically. +The range can be omitted if the desired bucket size is known in advance. Simply +provide it as the second argument: +[source.merge.styled,esql] +---- +include::{esql-specs}/bucket.csv-spec[tag=docsBucketNumericWithSpan] +---- +[%header.monospaced.styled,format=dsv,separator=|] +|=== +include::{esql-specs}/bucket.csv-spec[tag=docsBucketNumericWithSpan-result] +|=== + +NOTE: When providing the bucket size as the second parameter, it must be +of a floating point type. + Create hourly buckets for the last 24 hours, and calculate the number of events per hour: [source.merge.styled,esql] ---- @@ -77,3 +109,15 @@ include::{esql-specs}/bucket.csv-spec[tag=bucket_in_agg] include::{esql-specs}/bucket.csv-spec[tag=bucket_in_agg-result] |=== +`BUCKET` may be used in both the aggregating and grouping part of the +<> command provided that in the aggregating +part the function is referenced by an alias defined in the +grouping part, or that it is invoked with the exact same expression: +[source.merge.styled,esql] +---- +include::{esql-specs}/bucket.csv-spec[tag=reuseGroupingFunctionWithExpression] +---- +[%header.monospaced.styled,format=dsv,separator=|] +|=== +include::{esql-specs}/bucket.csv-spec[tag=reuseGroupingFunctionWithExpression-result] +|=== diff --git a/docs/reference/esql/functions/grouping-functions.asciidoc b/docs/reference/esql/functions/grouping-functions.asciidoc new file mode 100644 index 0000000000000..ed0caf5ec2a4c --- /dev/null +++ b/docs/reference/esql/functions/grouping-functions.asciidoc @@ -0,0 +1,14 @@ +[[esql-group-functions]] +==== {esql} grouping functions + +++++ +Grouping functions +++++ + +The <> command supports these grouping functions: + +// tag::group_list[] +* <> +// end::group_list[] + +include::layout/bucket.asciidoc[] diff --git a/docs/reference/esql/functions/kibana/definition/bucket.json b/docs/reference/esql/functions/kibana/definition/bucket.json index bab6cef538b07..986c0e8f91d33 100644 --- a/docs/reference/esql/functions/kibana/definition/bucket.json +++ b/docs/reference/esql/functions/kibana/definition/bucket.json @@ -939,7 +939,9 @@ "FROM employees\n| WHERE hire_date >= \"1985-01-01T00:00:00Z\" AND hire_date < \"1986-01-01T00:00:00Z\"\n| STATS hire_date = MV_SORT(VALUES(hire_date)) BY month = BUCKET(hire_date, 20, \"1985-01-01T00:00:00Z\", \"1986-01-01T00:00:00Z\")\n| SORT hire_date", "FROM employees\n| WHERE hire_date >= \"1985-01-01T00:00:00Z\" AND hire_date < \"1986-01-01T00:00:00Z\"\n| STATS hires_per_month = COUNT(*) BY month = BUCKET(hire_date, 20, \"1985-01-01T00:00:00Z\", \"1986-01-01T00:00:00Z\")\n| SORT month", "FROM employees\n| WHERE 
hire_date >= \"1985-01-01T00:00:00Z\" AND hire_date < \"1986-01-01T00:00:00Z\"\n| STATS hires_per_week = COUNT(*) BY week = BUCKET(hire_date, 100, \"1985-01-01T00:00:00Z\", \"1986-01-01T00:00:00Z\")\n| SORT week", + "FROM employees\n| WHERE hire_date >= \"1985-01-01T00:00:00Z\" AND hire_date < \"1986-01-01T00:00:00Z\"\n| STATS hires_per_week = COUNT(*) BY week = BUCKET(hire_date, 1 week)\n| SORT week", "FROM employees\n| STATS COUNT(*) by bs = BUCKET(salary, 20, 25324, 74999)\n| SORT bs", + "FROM employees\n| WHERE hire_date >= \"1985-01-01T00:00:00Z\" AND hire_date < \"1986-01-01T00:00:00Z\"\n| STATS c = COUNT(1) BY b = BUCKET(salary, 5000.)\n| SORT b", "FROM sample_data \n| WHERE @timestamp >= NOW() - 1 day and @timestamp < NOW()\n| STATS COUNT(*) BY bucket = BUCKET(@timestamp, 25, NOW() - 1 day, NOW())", "FROM employees\n| WHERE hire_date >= \"1985-01-01T00:00:00Z\" AND hire_date < \"1986-01-01T00:00:00Z\"\n| STATS AVG(salary) BY bucket = BUCKET(hire_date, 20, \"1985-01-01T00:00:00Z\", \"1986-01-01T00:00:00Z\")\n| SORT bucket" ] diff --git a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/bucket.csv-spec b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/bucket.csv-spec index ad8f16f89cf7e..f41bf3f020eb5 100644 --- a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/bucket.csv-spec +++ b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/bucket.csv-spec @@ -175,13 +175,13 @@ FROM employees ; //tag::docsBucketMonthlyHistogram-result[] - hires_per_month:long | month:date -2 |1985-02-01T00:00:00.000Z -1 |1985-05-01T00:00:00.000Z -1 |1985-07-01T00:00:00.000Z -1 |1985-09-01T00:00:00.000Z -2 |1985-10-01T00:00:00.000Z -4 |1985-11-01T00:00:00.000Z + hires_per_month:long | month:date +2 |1985-02-01T00:00:00.000Z +1 |1985-05-01T00:00:00.000Z +1 |1985-07-01T00:00:00.000Z +1 |1985-09-01T00:00:00.000Z +2 |1985-10-01T00:00:00.000Z +4 |1985-11-01T00:00:00.000Z //end::docsBucketMonthlyHistogram-result[] ; @@ -196,15 +196,36 @@ FROM employees //tag::docsBucketWeeklyHistogram-result[] hires_per_week:long | week:date -2 |1985-02-18T00:00:00.000Z -1 |1985-05-13T00:00:00.000Z -1 |1985-07-08T00:00:00.000Z -1 |1985-09-16T00:00:00.000Z -2 |1985-10-14T00:00:00.000Z -4 |1985-11-18T00:00:00.000Z +2 |1985-02-18T00:00:00.000Z +1 |1985-05-13T00:00:00.000Z +1 |1985-07-08T00:00:00.000Z +1 |1985-09-16T00:00:00.000Z +2 |1985-10-14T00:00:00.000Z +4 |1985-11-18T00:00:00.000Z //end::docsBucketWeeklyHistogram-result[] ; +// bucketing in span mode (identical results to above) +docsBucketWeeklyHistogramWithSpan#[skip:-8.13.99, reason:BUCKET renamed in 8.14] +//tag::docsBucketWeeklyHistogramWithSpan[] +FROM employees +| WHERE hire_date >= "1985-01-01T00:00:00Z" AND hire_date < "1986-01-01T00:00:00Z" +| STATS hires_per_week = COUNT(*) BY week = BUCKET(hire_date, 1 week) +| SORT week +//end::docsBucketWeeklyHistogramWithSpan[] +; + +//tag::docsBucketWeeklyHistogramWithSpan-result[] + hires_per_week:long | week:date +2 |1985-02-18T00:00:00.000Z +1 |1985-05-13T00:00:00.000Z +1 |1985-07-08T00:00:00.000Z +1 |1985-09-16T00:00:00.000Z +2 |1985-10-14T00:00:00.000Z +4 |1985-11-18T00:00:00.000Z +//end::docsBucketWeeklyHistogramWithSpan-result[] +; + docsBucketLast24hr#[skip:-8.13.99, reason:BUCKET renamed in 8.14] //tag::docsBucketLast24hr[] FROM sample_data @@ -218,17 +239,6 @@ FROM sample_data //end::docsBucketLast24hr-result[] ; -docsGettingStartedBucket#[skip:-8.13.99, reason:BUCKET renamed in 8.14] -// tag::gs-bucket[] -FROM sample_data -| STATS BY bucket = BUCKET(@timestamp, 24, "2023-10-23T00:00:00Z", NOW()) -// 
end::gs-bucket[] -| LIMIT 0 -; - -bucket:date -; - docsGettingStartedBucketStatsBy#[skip:-8.13.99, reason:BUCKET renamed in 8.14] // tag::gs-bucket-stats-by[] FROM sample_data @@ -352,12 +362,15 @@ FROM employees // bucketing in span mode (identical results to above) bucketNumericWithSpan#[skip:-8.13.99, reason:BUCKET extended in 8.14] +//tag::docsBucketNumericWithSpan[] FROM employees | WHERE hire_date >= "1985-01-01T00:00:00Z" AND hire_date < "1986-01-01T00:00:00Z" | STATS c = COUNT(1) BY b = BUCKET(salary, 5000.) | SORT b +//end::docsBucketNumericWithSpan[] ; +//tag::docsBucketNumericWithSpan-result[] c:long | b:double 1 |25000.0 1 |30000.0 1 |40000.0 2 |45000.0 2 |50000.0 1 |55000.0 1 |60000.0 1 |65000.0 1 |70000.0 +//end::docsBucketNumericWithSpan-result[] ; bucketNumericMixedTypes#[skip:-8.13.99, reason:BUCKET extended in 8.14] @@ -439,14 +453,28 @@ FROM employees ; reuseGroupingFunctionWithExpression#[skip:-8.13.99, reason:BUCKET renamed in 8.14] +//tag::reuseGroupingFunctionWithExpression[] FROM employees -| STATS sum = BUCKET(salary % 2 + 13, 1.) + 1 BY bucket = BUCKET(salary % 2 + 13, 1.) -| SORT sum -; - - sum:double | bucket:double -14.0 |13.0 -15.0 |14.0 +| STATS s1 = b1 + 1, s2 = BUCKET(salary / 1000 + 999, 50.) + 2 BY b1 = BUCKET(salary / 100 + 99, 50.), b2 = BUCKET(salary / 1000 + 999, 50.) +| SORT b1, b2 +| KEEP s1, b1, s2, b2 +//end::reuseGroupingFunctionWithExpression[] +; + +//tag::reuseGroupingFunctionWithExpression-result[] + s1:double | b1:double | s2:double | b2:double +351.0 |350.0 |1002.0 |1000.0 +401.0 |400.0 |1002.0 |1000.0 +451.0 |450.0 |1002.0 |1000.0 +501.0 |500.0 |1002.0 |1000.0 +551.0 |550.0 |1002.0 |1000.0 +601.0 |600.0 |1002.0 |1000.0 +601.0 |600.0 |1052.0 |1050.0 +651.0 |650.0 |1052.0 |1050.0 +701.0 |700.0 |1052.0 |1050.0 +751.0 |750.0 |1052.0 |1050.0 +801.0 |800.0 |1052.0 |1050.0 +//end::reuseGroupingFunctionWithExpression-result[] ; reuseGroupingFunctionWithinAggs#[skip:-8.13.99, reason:BUCKET renamed in 8.14] diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/grouping/Bucket.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/grouping/Bucket.java index f83c10e4ce1f6..218d469d626f9 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/grouping/Bucket.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/grouping/Bucket.java @@ -92,6 +92,10 @@ public class Bucket extends GroupingFunction implements Validatable, TwoOptional examples = { @Example( description = """ + `BUCKET` can work in two modes: one in which the bucket size is computed + automatically from a recommended bucket count and a range (four parameters), and + another in which the bucket size is provided directly (two parameters). + Using a target number of buckets, a start of a range, and an end of a range, `BUCKET` picks an appropriate bucket size to generate the target number of buckets or fewer.
For example, asking for at most 20 buckets over a year results in monthly buckets:""", @@ -102,12 +106,12 @@ public class Bucket extends GroupingFunction implements Validatable, TwoOptional it's to pick a range that people are comfortable with that provides at most the target number of buckets.""" ), @Example( - description = "Combine `BUCKET` with <> to create a histogram:", + description = "Combine `BUCKET` with an <> to create a histogram:", file = "bucket", tag = "docsBucketMonthlyHistogram", explanation = """ NOTE: `BUCKET` does not create buckets that don't match any documents. - + "That's why this example is missing `1985-03-01` and other dates.""" + That's why this example is missing `1985-03-01` and other dates.""" ), @Example( description = """ @@ -120,6 +124,11 @@ public class Bucket extends GroupingFunction implements Validatable, TwoOptional For rows with a value outside of the range, it returns a bucket value that corresponds to a bucket outside the range. Combine `BUCKET` with <> to filter rows.""" ), + @Example(description = """ + If the desired bucket size is known in advance, simply provide it as the second + argument, leaving the range out:""", file = "bucket", tag = "docsBucketWeeklyHistogramWithSpan", explanation = """ + NOTE: When providing the bucket size as the second parameter, it must be a time + duration or date period."""), @Example( description = "`BUCKET` can also operate on numeric fields. For example, to create a salary histogram:", file = "bucket", tag = "docsBucketNumeric", explanation = """ Unlike the earlier example that intentionally filters on a date range, you rarely want to filter on a numeric range. You have to find the `min` and `max` separately.
{esql} doesn't yet have an easy way to do that automatically.""" ), + @Example(description = """ + If the desired bucket size is known in advance, simply provide it as the second + argument, leaving the range out:""", file = "bucket", tag = "docsBucketNumericWithSpan", explanation = """ + NOTE: When providing the bucket size as the second parameter, it must be + of a floating point type."""), @Example( description = "Create hourly buckets for the last 24 hours, and calculate the number of events per hour:", file = "bucket", From cfc9e44b63d3a24b7f7b49579547ffe0951f624e Mon Sep 17 00:00:00 2001 From: Simon Cooper Date: Thu, 25 Apr 2024 17:53:13 +0100 Subject: [PATCH 5/8] Reduce parallelism for yaml and java esql rest tests (#107890) This fixes #107879 by reducing parallelism for the esql java rest tests as well. --- x-pack/plugin/esql/qa/server/mixed-cluster/build.gradle | 1 + 1 file changed, 1 insertion(+) diff --git a/x-pack/plugin/esql/qa/server/mixed-cluster/build.gradle b/x-pack/plugin/esql/qa/server/mixed-cluster/build.gradle index c25ef858534e0..5515ef0728a72 100644 --- a/x-pack/plugin/esql/qa/server/mixed-cluster/build.gradle +++ b/x-pack/plugin/esql/qa/server/mixed-cluster/build.gradle @@ -40,6 +40,7 @@ BuildParams.bwcVersions.withWireCompatible(supportedVersion) { bwcVersion, baseN usesBwcDistribution(bwcVersion) systemProperty("tests.old_cluster_version", bwcVersion) systemProperty("tests.version_parameter_unsupported", versionUnsupported(bwcVersion)) + maxParallelForks = 1 } def yamlRestTest = tasks.register("v${bwcVersion}#yamlRestTest", StandaloneRestIntegTestTask) { From fdefe090418aed09243fb150548909e3e0c4ff0e Mon Sep 17 00:00:00 2001 From: Panagiotis Bailis Date: Thu, 25 Apr 2024 20:11:44 +0300 Subject: [PATCH 6/8] Fix for from parameter when using sub_searches and rank (#106253) --- docs/changelog/106253.yaml | 6 + docs/reference/search/retriever.asciidoc | 4 +- docs/reference/search/rrf.asciidoc | 86 +- .../action/search/SearchPhaseController.java | 4 + .../action/search/SearchRequest.java | 6 +- .../search/query/QueryPhase.java | 2 +- .../search/rank/RankBuilder.java | 22 +- .../QueryPhaseRankCoordinatorContext.java | 8 +- .../context/QueryPhaseRankShardContext.java | 14 +- .../action/search/SearchRequestTests.java | 15 +- .../search/rank/TestRankBuilder.java | 2 +- .../RRFQueryPhaseRankCoordinatorContext.java | 12 +- .../rrf/RRFQueryPhaseRankShardContext.java | 13 +- .../xpack/rank/rrf/RRFRankBuilder.java | 11 +- .../xpack/rank/rrf/RRFRetrieverBuilder.java | 14 +- .../xpack/rank/rrf/RRFRankBuilderTests.java | 6 +- .../xpack/rank/rrf/RRFRankContextTests.java | 4 +- .../rrf/RRFRetrieverBuilderParsingTests.java | 2 +- .../rest-api-spec/test/rrf/100_rank_rrf.yml | 44 +- .../test/rrf/150_rank_rrf_pagination.yml | 1055 +++++++++++++++++ .../test/rrf/200_rank_rrf_script.yml | 6 +- .../test/rrf/300_rrf_retriever.yml | 8 +- .../test/rrf/400_rrf_retriever_script.yml | 6 +- 23 files changed, 1271 insertions(+), 79 deletions(-) create mode 100644 docs/changelog/106253.yaml create mode 100644 x-pack/plugin/rank-rrf/src/yamlRestTest/resources/rest-api-spec/test/rrf/150_rank_rrf_pagination.yml diff --git a/docs/changelog/106253.yaml b/docs/changelog/106253.yaml new file mode 100644 index 0000000000000..b80cda37f63c7 --- /dev/null +++ b/docs/changelog/106253.yaml @@ -0,0 +1,6 @@ +pr: 106253 +summary: Fix for from parameter when using `sub_searches` and rank +area: Ranking +type: bug +issues: + - 99011 diff --git a/docs/reference/search/retriever.asciidoc b/docs/reference/search/retriever.asciidoc
index 42d2129f0fdec..6301f439e9b5b 100644 --- a/docs/reference/search/retriever.asciidoc +++ b/docs/reference/search/retriever.asciidoc @@ -189,7 +189,7 @@ GET /index/_search } ], "rank_constant": ... - "window_size": ... + "rank_window_size": ... } } } @@ -202,7 +202,7 @@ The <> and <> parameters are provided globally as part of the general <>. They are applied to all retrievers in a retriever tree unless a specific retriever overrides the `size` parameter -using a different parameter such as `window_size`. Though, the final +using a different parameter such as `rank_window_size`. Though, the final search hits are always limited to `size`. ==== Using aggregations with a retriever tree diff --git a/docs/reference/search/rrf.asciidoc b/docs/reference/search/rrf.asciidoc index 96477cdee45f1..c541750fff789 100644 --- a/docs/reference/search/rrf.asciidoc +++ b/docs/reference/search/rrf.asciidoc @@ -74,7 +74,7 @@ GET example-index/_search } } ], - "window_size": 50, + "rank_window_size": 50, "rank_constant": 20 } } @@ -94,8 +94,8 @@ its global top 50 results. the query top documents and rank them based on the RRF formula using parameters from the `rrf` retriever to get the combined top documents using the default `size` of `10`. -Note that if `k` from a knn search is larger than `window_size`, the results are -truncated to `window_size`. If `k` is smaller than `window_size`, the results are +Note that if `k` from a knn search is larger than `rank_window_size`, the results are +truncated to `rank_window_size`. If `k` is smaller than `rank_window_size`, the results are `k` size. [[rrf-supported-features]] @@ -160,7 +160,7 @@ GET example-index/_search } } ], - "window_size": 50, + "rank_window_size": 50, "rank_constant": 20 } } @@ -289,7 +289,7 @@ GET example-index/_search } } ], - "window_size": 5, + "rank_window_size": 5, "rank_constant": 1 } }, @@ -510,8 +510,82 @@ _id: 5 = 1.0/(1+4) = 0.2000 ---- // NOTCONSOLE -We rank the documents based on the RRF formula with a `window_size` of `5` +We rank the documents based on the RRF formula with a `rank_window_size` of `5` truncating the bottom `2` docs in our RRF result set with a `size` of `3`. We end with `_id: 3` as `_rank: 1`, `_id: 2` as `_rank: 2`, and `_id: 4` as `_rank: 3`. This ranking matches the result set from the original RRF search as expected. + + +==== Pagination in RRF + +When using `rrf` you can paginate through the results using the `from` parameter. +As the final ranking is solely dependent on the original query ranks, to ensure +consistency when paginating, we have to make sure that while `from` changes, the order +of what we have already seen remains intact. To that end, we're using a fixed `rank_window_size` +as the whole available result set upon which we can paginate. +This essentially means that if: + +* `from + size` ≤ `rank_window_size` : we could get `results[from: from+size]` documents back from +the final `rrf` ranked result set + +* `from + size` > `rank_window_size` : we would get 0 results back, as the request would fall outside the +available `rank_window_size`-sized result set. + +An important thing to note here is that since `rank_window_size` is all the results that we'll get to see +from the individual query components, pagination guarantees consistency, i.e. no documents are skipped +or duplicated in multiple pages, iff `rank_window_size` remains the same. If `rank_window_size` changes, then the order +of the results might change as well, even for the same ranks. 
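Before working through the numbers, the coordinator-side behavior can be sketched in a few lines of Python (illustrative only; the `rrf_paginate` helper is hypothetical and ignores the real implementation's tie-breaking on per-query scores, shard, and doc id):

[source,python]
----
def rrf_paginate(rankings, rank_window_size, rank_constant, from_, size):
    # Merge per-query rankings with reciprocal rank fusion, then paginate
    # within the top rank_window_size results.
    scores = {}
    for ranking in rankings:
        # each query contributes only its top rank_window_size documents
        for rank, doc in enumerate(ranking[:rank_window_size], start=1):
            scores[doc] = scores.get(doc, 0.0) + 1.0 / (rank_constant + rank)
    merged = sorted(scores, key=scores.get, reverse=True)[:rank_window_size]
    return merged[from_:from_ + size]

# the simplified two-query example worked out below
query_a = ["1", "2", "3", "4"]
query_b = ["5", "4", "3", "1", "2"]
assert rrf_paginate([query_a, query_b], 5, 1, 0, 2) == ["1", "4"]
assert rrf_paginate([query_a, query_b], 5, 1, 6, 2) == []  # outside the window
----
// NOTCONSOLE

Note how each query contributes at most `rank_window_size` entries before merging; that truncation is exactly what drives the second scenario below.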
+ +To illustrate all of the above, let's consider the following simplified example where we have +two queries, `queryA` and `queryB`, and their ranked documents: +[source,python] +---- + | queryA | queryB | +_id: | 1 | 5 | +_id: | 2 | 4 | +_id: | 3 | 3 | +_id: | 4 | 1 | +_id: | | 2 | +---- +// NOTCONSOLE + +For `rank_window_size=5` we would get to see all documents from both `queryA` and `queryB`. +Assuming a `rank_constant=1`, the `rrf` scores would be: +[source,python] +---- +# doc | queryA | queryB | score +_id: 1 = 1.0/(1+1) + 1.0/(1+4) = 0.7 +_id: 2 = 1.0/(1+2) + 1.0/(1+5) = 0.5 +_id: 3 = 1.0/(1+3) + 1.0/(1+3) = 0.5 +_id: 4 = 1.0/(1+4) + 1.0/(1+2) = 0.533 +_id: 5 = 0 + 1.0/(1+1) = 0.5 +---- +// NOTCONSOLE + +So the final ranked result set would be [`1`, `4`, `2`, `3`, `5`] and we would paginate over that, since +`rank_window_size == len(results)`. In this scenario, we would have: + +* `from=0, size=2` would return documents [`1`, `4`] with ranks `[1, 2]` +* `from=2, size=2` would return documents [`2`, `3`] with ranks `[3, 4]` +* `from=4, size=2` would return document [`5`] with rank `[5]` +* `from=6, size=2` would return an empty result set as there are no more results to iterate over + +Now, if we had a `rank_window_size=2`, we would only get to see `[1, 2]` and `[5, 4]` documents +for queries `queryA` and `queryB` respectively. Working out the math, we would see that the results would now +be slightly different, because we would have no knowledge of the documents in positions `[3: end]` for either query. +[source,python] +---- +# doc | queryA | queryB | score +_id: 1 = 1.0/(1+1) + 0 = 0.5 +_id: 2 = 1.0/(1+2) + 0 = 0.33 +_id: 4 = 0 + 1.0/(1+2) = 0.33 +_id: 5 = 0 + 1.0/(1+1) = 0.5 +---- +// NOTCONSOLE + +The final ranked result set would be [`1`, `5`, `2`, `4`], and we would be able to paginate +on the top `rank_window_size` results, i.e. [`1`, `5`]. So for the same params as above, we would now have: + +* `from=0, size=2` would return [`1`, `5`] with ranks `[1, 2]` +* `from=2, size=2` would return an empty result set as it would fall outside the available `rank_window_size` results. diff --git a/server/src/main/java/org/elasticsearch/action/search/SearchPhaseController.java b/server/src/main/java/org/elasticsearch/action/search/SearchPhaseController.java index c9b08b5aed4ac..8cc3c6f003fb5 100644 --- a/server/src/main/java/org/elasticsearch/action/search/SearchPhaseController.java +++ b/server/src/main/java/org/elasticsearch/action/search/SearchPhaseController.java @@ -643,6 +643,10 @@ static ReducedQueryPhase reducedQueryPhase( ); sortedTopDocs = new SortedTopDocs(rankedDocs, false, null, null, null, 0); size = sortedTopDocs.scoreDocs.length; + // we need to reset `from` here as pagination and result trimming have already taken place + // within the `QueryPhaseRankCoordinatorContext#rankQueryPhaseResults` and we don't want + // to apply it again in the `getHits` method.
+ from = 0; } final TotalHits totalHits = topDocsStats.getTotalHits(); return new ReducedQueryPhase( diff --git a/server/src/main/java/org/elasticsearch/action/search/SearchRequest.java b/server/src/main/java/org/elasticsearch/action/search/SearchRequest.java index a20846ab98a4d..12167c8361513 100644 --- a/server/src/main/java/org/elasticsearch/action/search/SearchRequest.java +++ b/server/src/main/java/org/elasticsearch/action/search/SearchRequest.java @@ -397,10 +397,10 @@ public ActionRequestValidationException validate() { if (size == 0) { validationException = addValidationError("[rank] requires [size] greater than [0]", validationException); } - if (size > source.rankBuilder().windowSize()) { + if (size > source.rankBuilder().rankWindowSize()) { validationException = addValidationError( - "[rank] requires [window_size: " - + source.rankBuilder().windowSize() + "[rank] requires [rank_window_size: " + + source.rankBuilder().rankWindowSize() + "]" + " be greater than or equal to [size: " + size diff --git a/server/src/main/java/org/elasticsearch/search/query/QueryPhase.java b/server/src/main/java/org/elasticsearch/search/query/QueryPhase.java index cd8b5494ac31f..828c6d2b4f3e8 100644 --- a/server/src/main/java/org/elasticsearch/search/query/QueryPhase.java +++ b/server/src/main/java/org/elasticsearch/search/query/QueryPhase.java @@ -98,7 +98,7 @@ static void executeRank(SearchContext searchContext) throws QueryPhaseExecutionE RankSearchContext rankSearchContext = new RankSearchContext( searchContext, rankQuery, - queryPhaseRankShardContext.windowSize() + queryPhaseRankShardContext.rankWindowSize() ) ) { QueryPhase.addCollectorsAndSearch(rankSearchContext); diff --git a/server/src/main/java/org/elasticsearch/search/rank/RankBuilder.java b/server/src/main/java/org/elasticsearch/search/rank/RankBuilder.java index e0e04c563a9a8..7118c9f49b36d 100644 --- a/server/src/main/java/org/elasticsearch/search/rank/RankBuilder.java +++ b/server/src/main/java/org/elasticsearch/search/rank/RankBuilder.java @@ -30,22 +30,22 @@ */ public abstract class RankBuilder implements VersionedNamedWriteable, ToXContentObject { - public static final ParseField WINDOW_SIZE_FIELD = new ParseField("window_size"); + public static final ParseField RANK_WINDOW_SIZE_FIELD = new ParseField("rank_window_size"); public static final int DEFAULT_WINDOW_SIZE = SearchService.DEFAULT_SIZE; - private final int windowSize; + private final int rankWindowSize; - public RankBuilder(int windowSize) { - this.windowSize = windowSize; + public RankBuilder(int rankWindowSize) { + this.rankWindowSize = rankWindowSize; } public RankBuilder(StreamInput in) throws IOException { - windowSize = in.readVInt(); + rankWindowSize = in.readVInt(); } public final void writeTo(StreamOutput out) throws IOException { - out.writeVInt(windowSize); + out.writeVInt(rankWindowSize); doWriteTo(out); } @@ -55,7 +55,7 @@ public final void writeTo(StreamOutput out) throws IOException { public final XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException { builder.startObject(); builder.startObject(getWriteableName()); - builder.field(WINDOW_SIZE_FIELD.getPreferredName(), windowSize); + builder.field(RANK_WINDOW_SIZE_FIELD.getPreferredName(), rankWindowSize); doXContent(builder, params); builder.endObject(); builder.endObject(); @@ -64,8 +64,8 @@ public final XContentBuilder toXContent(XContentBuilder builder, Params params) protected abstract void doXContent(XContentBuilder builder, Params params) throws IOException; - public int 
windowSize() { - return windowSize; + public int rankWindowSize() { + return rankWindowSize; } /** @@ -88,14 +88,14 @@ public final boolean equals(Object obj) { } @SuppressWarnings("unchecked") RankBuilder other = (RankBuilder) obj; - return Objects.equals(windowSize, other.windowSize()) && doEquals(other); + return Objects.equals(rankWindowSize, other.rankWindowSize()) && doEquals(other); } protected abstract boolean doEquals(RankBuilder other); @Override public final int hashCode() { - return Objects.hash(getClass(), windowSize, doHashCode()); + return Objects.hash(getClass(), rankWindowSize, doHashCode()); } protected abstract int doHashCode(); diff --git a/server/src/main/java/org/elasticsearch/search/rank/context/QueryPhaseRankCoordinatorContext.java b/server/src/main/java/org/elasticsearch/search/rank/context/QueryPhaseRankCoordinatorContext.java index 181122380f22d..1be8544758a8f 100644 --- a/server/src/main/java/org/elasticsearch/search/rank/context/QueryPhaseRankCoordinatorContext.java +++ b/server/src/main/java/org/elasticsearch/search/rank/context/QueryPhaseRankCoordinatorContext.java @@ -22,15 +22,15 @@ */ public abstract class QueryPhaseRankCoordinatorContext { - protected final int windowSize; + protected final int rankWindowSize; - public QueryPhaseRankCoordinatorContext(int windowSize) { - this.windowSize = windowSize; + public QueryPhaseRankCoordinatorContext(int rankWindowSize) { + this.rankWindowSize = rankWindowSize; } /** * This is used to pull information passed back from the shards as part of {@link QuerySearchResult#getRankShardResult()} - * and return a {@link ScoreDoc[]} of the `window_size` ranked results. Note that {@link TopDocsStats} is included so that + * and return a {@link ScoreDoc[]} of the `rank_window_size` ranked results. Note that {@link TopDocsStats} is included so that * appropriate stats may be updated based on rank results. * This is called when reducing query results through {@code SearchPhaseController#reducedQueryPhase()}. */ diff --git a/server/src/main/java/org/elasticsearch/search/rank/context/QueryPhaseRankShardContext.java b/server/src/main/java/org/elasticsearch/search/rank/context/QueryPhaseRankShardContext.java index e8bac25009e8c..f562977afb857 100644 --- a/server/src/main/java/org/elasticsearch/search/rank/context/QueryPhaseRankShardContext.java +++ b/server/src/main/java/org/elasticsearch/search/rank/context/QueryPhaseRankShardContext.java @@ -22,27 +22,27 @@ public abstract class QueryPhaseRankShardContext { protected final List queries; - protected final int windowSize; + protected final int rankWindowSize; - public QueryPhaseRankShardContext(List queries, int windowSize) { + public QueryPhaseRankShardContext(List queries, int rankWindowSize) { this.queries = queries; - this.windowSize = windowSize; + this.rankWindowSize = rankWindowSize; } public List queries() { return queries; } - public int windowSize() { - return windowSize; + public int rankWindowSize() { + return rankWindowSize; } /** * This is used to reduce the number of required results that are serialized - * to the coordinating node. Normally we would have to serialize {@code queries * window_size} + * to the coordinating node. Normally we would have to serialize {@code queries * rank_window_size} * results, but we can infer that there will likely be overlap of document results. 
Given that we * know any searches that match the same document must be on the same shard, we can sort on the shard - * instead for a top window_size set of results and reduce the amount of data we serialize. + * instead for a top rank_window_size set of results and reduce the amount of data we serialize. */ public abstract RankShardResult combineQueryPhaseResults(List rankResults); } diff --git a/server/src/test/java/org/elasticsearch/action/search/SearchRequestTests.java b/server/src/test/java/org/elasticsearch/action/search/SearchRequestTests.java index f7f58ef06ccdc..9fd2cd1206ee8 100644 --- a/server/src/test/java/org/elasticsearch/action/search/SearchRequestTests.java +++ b/server/src/test/java/org/elasticsearch/action/search/SearchRequestTests.java @@ -400,7 +400,7 @@ public void testValidate() throws IOException { assertNotNull(validationErrors); assertEquals(1, validationErrors.validationErrors().size()); assertEquals( - "[rank] requires [window_size: 1] be greater than or equal to [size: 2]", + "[rank] requires [rank_window_size: 1] be greater than or equal to [size: 2]", validationErrors.validationErrors().get(0) ); } @@ -437,10 +437,21 @@ public void testValidate() throws IOException { assertNotNull(validationErrors); assertEquals(1, validationErrors.validationErrors().size()); assertEquals( - "[rank] requires [window_size: 9] be greater than or equal to [size: 10]", + "[rank] requires [rank_window_size: 9] be greater than or equal to [size: 10]", validationErrors.validationErrors().get(0) ); } + { + SearchRequest searchRequest = new SearchRequest().source( + new SearchSourceBuilder().rankBuilder(new TestRankBuilder(3)) + .query(QueryBuilders.termQuery("field", "term")) + .knnSearch(List.of(new KnnSearchBuilder("vector", new float[] { 0f }, 10, 100, null))) + .size(3) + .from(4) + ); + ActionRequestValidationException validationErrors = searchRequest.validate(); + assertNull(validationErrors); + } { SearchRequest searchRequest = new SearchRequest().source( new SearchSourceBuilder().rankBuilder(new TestRankBuilder(100)) diff --git a/test/framework/src/main/java/org/elasticsearch/search/rank/TestRankBuilder.java b/test/framework/src/main/java/org/elasticsearch/search/rank/TestRankBuilder.java index 691d541913716..8e2a2c96a31ab 100644 --- a/test/framework/src/main/java/org/elasticsearch/search/rank/TestRankBuilder.java +++ b/test/framework/src/main/java/org/elasticsearch/search/rank/TestRankBuilder.java @@ -35,7 +35,7 @@ public class TestRankBuilder extends RankBuilder { ); static { - PARSER.declareInt(optionalConstructorArg(), WINDOW_SIZE_FIELD); + PARSER.declareInt(optionalConstructorArg(), RANK_WINDOW_SIZE_FIELD); } public static TestRankBuilder fromXContent(XContentParser parser) throws IOException { diff --git a/x-pack/plugin/rank-rrf/src/main/java/org/elasticsearch/xpack/rank/rrf/RRFQueryPhaseRankCoordinatorContext.java b/x-pack/plugin/rank-rrf/src/main/java/org/elasticsearch/xpack/rank/rrf/RRFQueryPhaseRankCoordinatorContext.java index 1b3ebe19ce494..b6a1ad52d5d15 100644 --- a/x-pack/plugin/rank-rrf/src/main/java/org/elasticsearch/xpack/rank/rrf/RRFQueryPhaseRankCoordinatorContext.java +++ b/x-pack/plugin/rank-rrf/src/main/java/org/elasticsearch/xpack/rank/rrf/RRFQueryPhaseRankCoordinatorContext.java @@ -62,7 +62,7 @@ public ScoreDoc[] rankQueryPhaseResults(List querySearchResul for (int qi = 0; qi < queryCount; ++qi) { final int fqi = qi; - queues.add(new PriorityQueue<>(windowSize + from) { + queues.add(new PriorityQueue<>(rankWindowSize) { @Override protected boolean 
lessThan(RRFRankDoc a, RRFRankDoc b) { float score1 = a.scores[fqi]; @@ -105,7 +105,7 @@ protected boolean lessThan(RRFRankDoc a, RRFRankDoc b) { // score if we already saw it as part of a previous query's // doc set, otherwise we make a new doc and calculate the // initial score - Map results = Maps.newMapWithExpectedSize(queryCount * windowSize); + Map results = Maps.newMapWithExpectedSize(queryCount * rankWindowSize); final int fqc = queryCount; for (int qi = 0; qi < queryCount; ++qi) { PriorityQueue queue = queues.get(qi); @@ -127,6 +127,11 @@ protected boolean lessThan(RRFRankDoc a, RRFRankDoc b) { } } + // return if pagination requested is outside the results + if (results.values().size() - from <= 0) { + return new ScoreDoc[0]; + } + // sort the results based on rrf score, tiebreaker based on // larger individual query score from 1 to n, smaller shard then smaller doc id RRFRankDoc[] sortedResults = results.values().toArray(RRFRankDoc[]::new); @@ -151,9 +156,10 @@ protected boolean lessThan(RRFRankDoc a, RRFRankDoc b) { } return rrf1.doc < rrf2.doc ? -1 : 1; }); + // trim results to size RRFRankDoc[] topResults = new RRFRankDoc[Math.min(size, sortedResults.length - from)]; for (int rank = 0; rank < topResults.length; ++rank) { - topResults[rank] = sortedResults[rank]; + topResults[rank] = sortedResults[from + rank]; topResults[rank].rank = rank + 1 + from; } // update fetch hits for the fetch phase, so we gather any additional diff --git a/x-pack/plugin/rank-rrf/src/main/java/org/elasticsearch/xpack/rank/rrf/RRFQueryPhaseRankShardContext.java b/x-pack/plugin/rank-rrf/src/main/java/org/elasticsearch/xpack/rank/rrf/RRFQueryPhaseRankShardContext.java index 59307e62872fb..9843b14df6903 100644 --- a/x-pack/plugin/rank-rrf/src/main/java/org/elasticsearch/xpack/rank/rrf/RRFQueryPhaseRankShardContext.java +++ b/x-pack/plugin/rank-rrf/src/main/java/org/elasticsearch/xpack/rank/rrf/RRFQueryPhaseRankShardContext.java @@ -25,11 +25,9 @@ public class RRFQueryPhaseRankShardContext extends QueryPhaseRankShardContext { private final int rankConstant; - private final int from; - public RRFQueryPhaseRankShardContext(List queries, int from, int windowSize, int rankConstant) { - super(queries, windowSize); - this.from = from; + public RRFQueryPhaseRankShardContext(List queries, int rankWindowSize, int rankConstant) { + super(queries, rankWindowSize); this.rankConstant = rankConstant; } @@ -41,7 +39,7 @@ public RRFRankShardResult combineQueryPhaseResults(List rankResults) { // if a doc isn't part of a result set its position will be NO_RANK [0] and // its score is [0f] int queries = rankResults.size(); - Map docsToRankResults = Maps.newMapWithExpectedSize(windowSize); + Map docsToRankResults = Maps.newMapWithExpectedSize(rankWindowSize); int index = 0; for (TopDocs rrfRankResult : rankResults) { int rank = 1; @@ -92,8 +90,9 @@ public RRFRankShardResult combineQueryPhaseResults(List rankResults) { } return rrf1.doc < rrf2.doc ? -1 : 1; }); - // trim the results to window size - RRFRankDoc[] topResults = new RRFRankDoc[Math.min(windowSize + from, sortedResults.length)]; + // trim the results if needed, otherwise each shard will always return `rank_window_size` results. 
+ // pagination and all else will happen on the coordinator when combining the shard responses + RRFRankDoc[] topResults = new RRFRankDoc[Math.min(rankWindowSize, sortedResults.length)]; for (int rank = 0; rank < topResults.length; ++rank) { topResults[rank] = sortedResults[rank]; topResults[rank].rank = rank + 1; diff --git a/x-pack/plugin/rank-rrf/src/main/java/org/elasticsearch/xpack/rank/rrf/RRFRankBuilder.java b/x-pack/plugin/rank-rrf/src/main/java/org/elasticsearch/xpack/rank/rrf/RRFRankBuilder.java index fee3c7b5e9cf0..8f3ed15037c08 100644 --- a/x-pack/plugin/rank-rrf/src/main/java/org/elasticsearch/xpack/rank/rrf/RRFRankBuilder.java +++ b/x-pack/plugin/rank-rrf/src/main/java/org/elasticsearch/xpack/rank/rrf/RRFRankBuilder.java @@ -47,7 +47,7 @@ public class RRFRankBuilder extends RankBuilder { }); static { - PARSER.declareInt(optionalConstructorArg(), WINDOW_SIZE_FIELD); + PARSER.declareInt(optionalConstructorArg(), RANK_WINDOW_SIZE_FIELD); PARSER.declareInt(optionalConstructorArg(), RANK_CONSTANT_FIELD); } @@ -65,8 +65,8 @@ public void doXContent(XContentBuilder builder, Params params) throws IOExceptio private final int rankConstant; - public RRFRankBuilder(int windowSize, int rankConstant) { - super(windowSize); + public RRFRankBuilder(int rankWindowSize, int rankConstant) { + super(rankWindowSize); this.rankConstant = rankConstant; } @@ -94,14 +94,13 @@ public int rankConstant() { return rankConstant; } - @Override public QueryPhaseRankShardContext buildQueryPhaseShardContext(List queries, int from) { - return new RRFQueryPhaseRankShardContext(queries, from, windowSize(), rankConstant); + return new RRFQueryPhaseRankShardContext(queries, rankWindowSize(), rankConstant); } @Override public QueryPhaseRankCoordinatorContext buildQueryPhaseCoordinatorContext(int size, int from) { - return new RRFQueryPhaseRankCoordinatorContext(size, from, windowSize(), rankConstant); + return new RRFQueryPhaseRankCoordinatorContext(size, from, rankWindowSize(), rankConstant); } @Override diff --git a/x-pack/plugin/rank-rrf/src/main/java/org/elasticsearch/xpack/rank/rrf/RRFRetrieverBuilder.java b/x-pack/plugin/rank-rrf/src/main/java/org/elasticsearch/xpack/rank/rrf/RRFRetrieverBuilder.java index ea8255f73af88..077c933fa9add 100644 --- a/x-pack/plugin/rank-rrf/src/main/java/org/elasticsearch/xpack/rank/rrf/RRFRetrieverBuilder.java +++ b/x-pack/plugin/rank-rrf/src/main/java/org/elasticsearch/xpack/rank/rrf/RRFRetrieverBuilder.java @@ -38,7 +38,7 @@ public final class RRFRetrieverBuilder extends RetrieverBuilder { public static final NodeFeature RRF_RETRIEVER_SUPPORTED = new NodeFeature("rrf_retriever_supported"); public static final ParseField RETRIEVERS_FIELD = new ParseField("retrievers"); - public static final ParseField WINDOW_SIZE_FIELD = new ParseField("window_size"); + public static final ParseField RANK_WINDOW_SIZE_FIELD = new ParseField("rank_window_size"); public static final ParseField RANK_CONSTANT_FIELD = new ParseField("rank_constant"); public static final ObjectParser PARSER = new ObjectParser<>( @@ -54,7 +54,7 @@ public final class RRFRetrieverBuilder extends RetrieverBuilder { p.nextToken(); return retrieverBuilder; }, RETRIEVERS_FIELD); - PARSER.declareInt((r, v) -> r.windowSize = v, WINDOW_SIZE_FIELD); + PARSER.declareInt((r, v) -> r.rankWindowSize = v, RANK_WINDOW_SIZE_FIELD); PARSER.declareInt((r, v) -> r.rankConstant = v, RANK_CONSTANT_FIELD); RetrieverBuilder.declareBaseParserFields(NAME, PARSER); @@ -71,7 +71,7 @@ public static RRFRetrieverBuilder fromXContent(XContentParser 
parser, RetrieverP } List retrieverBuilders = Collections.emptyList(); - int windowSize = RRFRankBuilder.DEFAULT_WINDOW_SIZE; + int rankWindowSize = RRFRankBuilder.DEFAULT_WINDOW_SIZE; int rankConstant = RRFRankBuilder.DEFAULT_RANK_CONSTANT; @Override @@ -88,7 +88,7 @@ public void extractToSearchSourceBuilder(SearchSourceBuilder searchSourceBuilder retrieverBuilder.extractToSearchSourceBuilder(searchSourceBuilder, true); } - searchSourceBuilder.rankBuilder(new RRFRankBuilder(windowSize, rankConstant)); + searchSourceBuilder.rankBuilder(new RRFRankBuilder(rankWindowSize, rankConstant)); } // ---- FOR TESTING XCONTENT PARSING ---- @@ -113,21 +113,21 @@ public void doToXContent(XContentBuilder builder, Params params) throws IOExcept builder.endArray(); } - builder.field(WINDOW_SIZE_FIELD.getPreferredName(), windowSize); + builder.field(RANK_WINDOW_SIZE_FIELD.getPreferredName(), rankWindowSize); builder.field(RANK_CONSTANT_FIELD.getPreferredName(), rankConstant); } @Override public boolean doEquals(Object o) { RRFRetrieverBuilder that = (RRFRetrieverBuilder) o; - return windowSize == that.windowSize + return rankWindowSize == that.rankWindowSize && rankConstant == that.rankConstant && Objects.equals(retrieverBuilders, that.retrieverBuilders); } @Override public int doHashCode() { - return Objects.hash(retrieverBuilders, windowSize, rankConstant); + return Objects.hash(retrieverBuilders, rankWindowSize, rankConstant); } // ---- END FOR TESTING ---- diff --git a/x-pack/plugin/rank-rrf/src/test/java/org/elasticsearch/xpack/rank/rrf/RRFRankBuilderTests.java b/x-pack/plugin/rank-rrf/src/test/java/org/elasticsearch/xpack/rank/rrf/RRFRankBuilderTests.java index 001857385c5de..7f251416c0a44 100644 --- a/x-pack/plugin/rank-rrf/src/test/java/org/elasticsearch/xpack/rank/rrf/RRFRankBuilderTests.java +++ b/x-pack/plugin/rank-rrf/src/test/java/org/elasticsearch/xpack/rank/rrf/RRFRankBuilderTests.java @@ -29,9 +29,9 @@ protected RRFRankBuilder createTestInstance() { @Override protected RRFRankBuilder mutateInstance(RRFRankBuilder instance) throws IOException { if (randomBoolean()) { - return new RRFRankBuilder(instance.windowSize(), instance.rankConstant() == 1 ? 2 : instance.rankConstant() - 1); + return new RRFRankBuilder(instance.rankWindowSize(), instance.rankConstant() == 1 ? 2 : instance.rankConstant() - 1); } else { - return new RRFRankBuilder(instance.windowSize() == 0 ? 1 : instance.windowSize() - 1, instance.rankConstant()); + return new RRFRankBuilder(instance.rankWindowSize() == 0 ? 
1 : instance.rankWindowSize() - 1, instance.rankConstant()); } } @@ -61,7 +61,7 @@ public void testCreateRankContexts() { List queries = List.of(new TermQuery(new Term("field0", "test0")), new TermQuery(new Term("field1", "test1"))); QueryPhaseRankShardContext queryPhaseRankShardContext = rrfRankBuilder.buildQueryPhaseShardContext(queries, randomInt()); assertEquals(queries, queryPhaseRankShardContext.queries()); - assertEquals(rrfRankBuilder.windowSize(), queryPhaseRankShardContext.windowSize()); + assertEquals(rrfRankBuilder.rankWindowSize(), queryPhaseRankShardContext.rankWindowSize()); assertNotNull(rrfRankBuilder.buildQueryPhaseCoordinatorContext(randomInt(), randomInt())); } diff --git a/x-pack/plugin/rank-rrf/src/test/java/org/elasticsearch/xpack/rank/rrf/RRFRankContextTests.java b/x-pack/plugin/rank-rrf/src/test/java/org/elasticsearch/xpack/rank/rrf/RRFRankContextTests.java index 50aa1d257d214..61859e280acdf 100644 --- a/x-pack/plugin/rank-rrf/src/test/java/org/elasticsearch/xpack/rank/rrf/RRFRankContextTests.java +++ b/x-pack/plugin/rank-rrf/src/test/java/org/elasticsearch/xpack/rank/rrf/RRFRankContextTests.java @@ -35,7 +35,7 @@ private void assertRDEquals(RRFRankDoc rd0, RRFRankDoc rd1) { } public void testShardCombine() { - RRFQueryPhaseRankShardContext context = new RRFQueryPhaseRankShardContext(null, 0, 10, 1); + RRFQueryPhaseRankShardContext context = new RRFQueryPhaseRankShardContext(null, 10, 1); List topDocs = List.of( new TopDocs( null, @@ -266,7 +266,7 @@ public void testCoordinatorRank() { } public void testShardTieBreaker() { - RRFQueryPhaseRankShardContext context = new RRFQueryPhaseRankShardContext(null, 0, 10, 1); + RRFQueryPhaseRankShardContext context = new RRFQueryPhaseRankShardContext(null, 10, 1); List topDocs = List.of( new TopDocs(null, new ScoreDoc[] { new ScoreDoc(1, 10.0f, -1), new ScoreDoc(2, 9.0f, -1) }), diff --git a/x-pack/plugin/rank-rrf/src/test/java/org/elasticsearch/xpack/rank/rrf/RRFRetrieverBuilderParsingTests.java b/x-pack/plugin/rank-rrf/src/test/java/org/elasticsearch/xpack/rank/rrf/RRFRetrieverBuilderParsingTests.java index d63e8a14b59d5..330c936327b81 100644 --- a/x-pack/plugin/rank-rrf/src/test/java/org/elasticsearch/xpack/rank/rrf/RRFRetrieverBuilderParsingTests.java +++ b/x-pack/plugin/rank-rrf/src/test/java/org/elasticsearch/xpack/rank/rrf/RRFRetrieverBuilderParsingTests.java @@ -32,7 +32,7 @@ public static RRFRetrieverBuilder createRandomRRFRetrieverBuilder() { RRFRetrieverBuilder rrfRetrieverBuilder = new RRFRetrieverBuilder(); if (randomBoolean()) { - rrfRetrieverBuilder.windowSize = randomIntBetween(1, 10000); + rrfRetrieverBuilder.rankWindowSize = randomIntBetween(1, 10000); } if (randomBoolean()) { diff --git a/x-pack/plugin/rank-rrf/src/yamlRestTest/resources/rest-api-spec/test/rrf/100_rank_rrf.yml b/x-pack/plugin/rank-rrf/src/yamlRestTest/resources/rest-api-spec/test/rrf/100_rank_rrf.yml index e55a1897eb701..c9eaa01616175 100644 --- a/x-pack/plugin/rank-rrf/src/yamlRestTest/resources/rest-api-spec/test/rrf/100_rank_rrf.yml +++ b/x-pack/plugin/rank-rrf/src/yamlRestTest/resources/rest-api-spec/test/rrf/100_rank_rrf.yml @@ -75,7 +75,7 @@ setup: text: term rank: rrf: - window_size: 100 + rank_window_size: 100 rank_constant: 1 size: 10 @@ -121,7 +121,7 @@ setup: ] rank: rrf: - window_size: 100 + rank_window_size: 100 rank_constant: 1 size: 10 @@ -169,7 +169,7 @@ setup: ] rank: rrf: - window_size: 100 + rank_window_size: 100 rank_constant: 1 size: 10 @@ -189,3 +189,41 @@ setup: - match: { hits.hits.2._rank: 3 } - match: { 
hits.hits.2.fields.text.0: "other" } - match: { hits.hits.2.fields.keyword.0: "other" } + + +--- +"RRF rank should fail if size > rank_window_size": + + - do: + catch: "/\\[rank\\] requires \\[rank_window_size: 2\\] be greater than or equal to \\[size: 10\\]/" + search: + index: test + body: + track_total_hits: true + fields: [ "text", "keyword" ] + knn: + field: vector + query_vector: [ 0.0 ] + k: 3 + num_candidates: 3 + sub_searches: [ + { + "query": { + "term": { + "text": "term" + } + } + }, + { + "query": { + "match": { + "keyword": "keyword" + } + } + } + ] + rank: + rrf: + rank_window_size: 2 + rank_constant: 1 + size: 10 diff --git a/x-pack/plugin/rank-rrf/src/yamlRestTest/resources/rest-api-spec/test/rrf/150_rank_rrf_pagination.yml b/x-pack/plugin/rank-rrf/src/yamlRestTest/resources/rest-api-spec/test/rrf/150_rank_rrf_pagination.yml new file mode 100644 index 0000000000000..1c950be5bfbf9 --- /dev/null +++ b/x-pack/plugin/rank-rrf/src/yamlRestTest/resources/rest-api-spec/test/rrf/150_rank_rrf_pagination.yml @@ -0,0 +1,1055 @@ +setup: + - skip: + version: ' - 8.14.99' + reason: 'pagination for rrf was added in 8.15' + + - do: + indices.create: + index: test + body: + settings: + number_of_shards: 1 + number_of_replicas: 0 + mappings: + properties: + number_val: + type: keyword + char_val: + type: keyword + + - do: + index: + index: test + id: 1 + body: + number_val: "1" + char_val: "A" + + - do: + index: + index: test + id: 2 + body: + number_val: "2" + char_val: "B" + + - do: + index: + index: test + id: 3 + body: + number_val: "3" + char_val: "C" + + - do: + index: + index: test + id: 4 + body: + number_val: "4" + char_val: "D" + + - do: + index: + index: test + id: 5 + body: + number_val: "5" + char_val: "E" + + - do: + indices.refresh: {} + +--- +"Standard pagination within rank_window_size": + # this test retrieves the same results from two queries, and applies a simple pagination skipping the first result + - do: + search: + index: test + body: + track_total_hits: true + sub_searches: [ + { + # this should clause would generate the result set [1, 2, 3, 4] + "query": { + bool: { + should: [ + { + term: { + number_val: { + value: "1", + boost: 10.0 + } + } + }, + { + term: { + number_val: { + value: "2", + boost: 9.0 + } + } + }, + { + term: { + number_val: { + value: "3", + boost: 8.0 + } + } + }, + { + term: { + number_val: { + value: "4", + boost: 7.0 + } + } + } + ] + } + } + + }, + { + # this should clause would generate the result set [1, 2, 3, 4] + "query": { + bool: { + should: [ + { + term: { + char_val: { + value: "A", + boost: 10.0 + } + } + }, + { + term: { + char_val: { + value: "B", + boost: 9.0 + } + } + }, + { + term: { + char_val: { + value: "C", + boost: 8.0 + } + } + }, + { + term: { + char_val: { + value: "D", + boost: 7.0 + } + } + } + ] + } + } + + } + ] + rank: + rrf: + rank_window_size: 10 + rank_constant: 10 + from : 1 + size : 10 + + - match: { hits.total.value : 4 } + - length: { hits.hits : 3 } + - match: { hits.hits.0._id: "2" } + - match: { hits.hits.0._rank: 2 } + - match: { hits.hits.1._id: "3" } + - match: { hits.hits.1._rank: 3 } + - match: { hits.hits.2._id: "4" } + - match: { hits.hits.2._rank: 4 } + +--- +"Standard pagination outside rank_window_size": + # in this example, from starts *after* rank_window_size so, we expect 0 results to be returned + - do: + search: + index: test + body: + track_total_hits: true + sub_searches: [ + { + # this should clause would generate the result set [1, 2, 3, 4] + "query": { + bool: { + should: [ + { + 
term: { + number_val: { + value: "1", + boost: 10.0 + } + } + }, + { + term: { + number_val: { + value: "2", + boost: 9.0 + } + } + }, + { + term: { + number_val: { + value: "3", + boost: 8.0 + } + } + }, + { + term: { + number_val: { + value: "4", + boost: 7.0 + } + } + } + ] + } + } + + }, + { + # this should clause would generate the result set [1, 2, 3, 4] + "query": { + bool: { + should: [ + { + term: { + char_val: { + value: "A", + boost: 10.0 + } + } + }, + { + term: { + char_val: { + value: "B", + boost: 9.0 + } + } + }, + { + term: { + char_val: { + value: "C", + boost: 8.0 + } + } + }, + { + term: { + char_val: { + value: "D", + boost: 7.0 + } + } + } + ] + } + } + + } + ] + rank: + rrf: + rank_window_size: 2 + rank_constant: 10 + from : 10 + size : 2 + + - match: { hits.total.value : 4 } + - length: { hits.hits : 0 } + +--- +"Standard pagination partially outside rank_window_size": + # in this example we have that from starts *within* rank_window_size, but "from + size" goes over + - do: + search: + index: test + body: + track_total_hits: true + sub_searches: [ + { + # this should clause would generate the result set [1, 2, 3, 4] + "query": { + bool: { + should: [ + { + term: { + number_val: { + value: "1", + boost: 10.0 + } + } + }, + { + term: { + number_val: { + value: "2", + boost: 9.0 + } + } + }, + { + term: { + number_val: { + value: "3", + boost: 8.0 + } + } + }, + { + term: { + number_val: { + value: "4", + boost: 7.0 + } + } + } + ] + } + } + + }, + { + # this should clause would generate the result set [1, 2, 3, 4] + "query": { + bool: { + should: [ + { + term: { + char_val: { + value: "A", + boost: 10.0 + } + } + }, + { + term: { + char_val: { + value: "B", + boost: 9.0 + } + } + }, + { + term: { + char_val: { + value: "C", + boost: 8.0 + } + } + }, + { + term: { + char_val: { + value: "D", + boost: 7.0 + } + } + } + ] + } + } + + } + ] + rank: + rrf: + rank_window_size: 3 + rank_constant: 10 + from : 2 + size : 2 + + - match: { hits.total.value : 4 } + - length: { hits.hits : 1 } + - match: { hits.hits.0._id: "3" } + - match: { hits.hits.0._rank: 3 } + + +--- +"Pagination within interleaved results": + # perform two searches with different "from" parameter, ensuring that results are consistent + # rank_window_size covers the entire result set for both queries, so pagination should be consistent + # queryA has a result set of [1, 2, 3, 4] and + # queryB has a result set of [4, 3, 1, 2] + # so for rank_constant=10, the expected order is [1, 4, 3, 2] + - do: + search: + index: test + body: + track_total_hits: true + sub_searches: [ + { + # this should clause would generate the result set [1, 2, 3, 4] + "query": { + bool: { + should: [ + { + term: { + number_val: { + value: "1", + boost: 10.0 + } + } + }, + { + term: { + number_val: { + value: "2", + boost: 9.0 + } + } + }, + { + term: { + number_val: { + value: "3", + boost: 8.0 + } + } + }, + { + term: { + number_val: { + value: "4", + boost: 7.0 + } + } + } + ] + } + } + + }, + { + # this should clause would generate the result set [4, 3, 1, 2] + "query": { + bool: { + should: [ + { + term: { + char_val: { + value: "D", + boost: 10.0 + } + } + }, + { + term: { + char_val: { + value: "C", + boost: 9.0 + } + } + }, + { + term: { + char_val: { + value: "A", + boost: 8.0 + } + } + }, + { + term: { + char_val: { + value: "B", + boost: 7.0 + } + } + } + ] + } + } + + } + ] + rank: + rrf: + rank_window_size: 10 + rank_constant: 10 + from : 0 + size : 2 + + - match: { hits.total.value : 4 } + - length: { hits.hits : 2 } + - 
match: { hits.hits.0._id: "1" } + - match: { hits.hits.0._rank: 1 } + - match: { hits.hits.1._id: "4" } + - match: { hits.hits.1._rank: 2 } + + - do: + search: + index: test + body: + track_total_hits: true + sub_searches: [ + { + # this should clause would generate the result set [1, 2, 3, 4] + "query": { + bool: { + should: [ + { + term: { + number_val: { + value: "1", + boost: 10.0 + } + } + }, + { + term: { + number_val: { + value: "2", + boost: 9.0 + } + } + }, + { + term: { + number_val: { + value: "3", + boost: 8.0 + } + } + }, + { + term: { + number_val: { + value: "4", + boost: 7.0 + } + } + } + ] + } + } + + }, + { + # this should clause would generate the result set [4, 3, 1, 2] + "query": { + bool: { + should: [ + { + term: { + char_val: { + value: "D", + boost: 10.0 + } + } + }, + { + term: { + char_val: { + value: "C", + boost: 9.0 + } + } + }, + { + term: { + char_val: { + value: "A", + boost: 8.0 + } + } + }, + { + term: { + char_val: { + value: "B", + boost: 7.0 + } + } + } + ] + } + } + + } + ] + rank: + rrf: + rank_window_size: 10 + rank_constant: 10 + from : 2 + size : 2 + + - match: { hits.total.value : 4 } + - length: { hits.hits : 2 } + - match: { hits.hits.0._id: "3" } + - match: { hits.hits.0._rank: 3 } + - match: { hits.hits.1._id: "2" } + - match: { hits.hits.1._rank: 4 } + +--- +"Pagination within interleaved results, different result set sizes, rank_window_size covering all results": + # perform multiple searches with different "from" parameter, ensuring that results are consistent + # rank_window_size covers the entire result set for both queries, so pagination should be consistent + # queryA has a result set of [5, 1] and + # queryB has a result set of [4, 3, 1, 2] + # so for rank_constant=10, the expected order is [1, 5, 4, 3, 2] + - do: + search: + index: test + body: + track_total_hits: true + sub_searches: [ + { + # this should clause would generate the result set [5, 1] + "query": { + bool: { + should: [ + { + term: { + number_val: { + value: "5", + boost: 10.0 + } + } + }, + { + term: { + number_val: { + value: "1", + boost: 9.0 + } + } + } + ] + } + } + + }, + { + # this should clause would generate the result set [4, 3, 1, 2] + "query": { + bool: { + should: [ + { + term: { + char_val: { + value: "D", + boost: 10.0 + } + } + }, + { + term: { + char_val: { + value: "C", + boost: 9.0 + } + } + }, + { + term: { + char_val: { + value: "A", + boost: 8.0 + } + } + }, + { + term: { + char_val: { + value: "B", + boost: 7.0 + } + } + } + ] + } + } + + } + ] + rank: + rrf: + rank_window_size: 10 + rank_constant: 10 + from : 0 + size : 2 + + - match: { hits.total.value : 5 } + - length: { hits.hits : 2 } + - match: { hits.hits.0._id: "1" } + - match: { hits.hits.0._rank: 1 } + - match: { hits.hits.1._id: "5" } + - match: { hits.hits.1._rank: 2 } + + - do: + search: + index: test + body: + track_total_hits: true + sub_searches: [ + { + # this should clause would generate the result set [5, 1] + "query": { + bool: { + should: [ + { + term: { + number_val: { + value: "5", + boost: 10.0 + } + } + }, + { + term: { + number_val: { + value: "1", + boost: 9.0 + } + } + } + ] + } + } + + }, + { + # this should clause would generate the result set [4, 3, 1, 2] + "query": { + bool: { + should: [ + { + term: { + char_val: { + value: "D", + boost: 10.0 + } + } + }, + { + term: { + char_val: { + value: "C", + boost: 9.0 + } + } + }, + { + term: { + char_val: { + value: "A", + boost: 8.0 + } + } + }, + { + term: { + char_val: { + value: "B", + boost: 7.0 + } + } + } + ] + } + 
} + + } + ] + rank: + rrf: + rank_window_size: 10 + rank_constant: 10 + from : 2 + size : 2 + + - match: { hits.total.value : 5 } + - length: { hits.hits : 2 } + - match: { hits.hits.0._id: "4" } + - match: { hits.hits.0._rank: 3 } + - match: { hits.hits.1._id: "3" } + - match: { hits.hits.1._rank: 4 } + + - do: + search: + index: test + body: + track_total_hits: true + sub_searches: [ + { + # this should clause would generate the result set [5, 1] + "query": { + bool: { + should: [ + { + term: { + number_val: { + value: "5", + boost: 10.0 + } + } + }, + { + term: { + number_val: { + value: "1", + boost: 9.0 + } + } + } + ] + } + } + + }, + { + # this should clause would generate the result set [4, 3, 1, 2] + "query": { + bool: { + should: [ + { + term: { + char_val: { + value: "D", + boost: 10.0 + } + } + }, + { + term: { + char_val: { + value: "C", + boost: 9.0 + } + } + }, + { + term: { + char_val: { + value: "A", + boost: 8.0 + } + } + }, + { + term: { + char_val: { + value: "B", + boost: 7.0 + } + } + } + ] + } + } + + } + ] + rank: + rrf: + rank_window_size: 10 + rank_constant: 10 + from: 4 + size: 2 + + - match: { hits.total.value: 5 } + - length: { hits.hits: 1 } + - match: { hits.hits.0._id: "2" } + - match: { hits.hits.0._rank: 5 } + + +--- +"Pagination within interleaved results, different result set sizes, rank_window_size not covering all results": + # perform multiple searches with different "from" parameter, ensuring that results are consistent + # rank_window_size does not cover the entire result set for both queries, so the results should be different + # from the test above. More specifically, we'd get to collect 2 results from each query, so we'd have: + # queryA has a result set of [5, 1] and + # queryB has a result set of [4, 3] + # so for rank_constant=10, the expected order is [5, 4, 1, 3], + # and the rank_window_size-sized result set that we'd paginate over is [5, 4] + - do: + search: + index: test + body: + track_total_hits: true + sub_searches: [ + { + # this should clause would generate the result set [5, 1] + "query": { + bool: { + should: [ + { + term: { + number_val: { + value: "5", + boost: 10.0 + } + } + }, + { + term: { + number_val: { + value: "1", + boost: 9.0 + } + } + } + ] + } + } + + }, + { + # this should clause would generate the result set [4, 3, 1, 2] + "query": { + bool: { + should: [ + { + term: { + char_val: { + value: "D", + boost: 10.0 + } + } + }, + { + term: { + char_val: { + value: "C", + boost: 9.0 + } + } + }, + { + term: { + char_val: { + value: "A", + boost: 8.0 + } + } + }, + { + term: { + char_val: { + value: "B", + boost: 7.0 + } + } + } + ] + } + } + + } + ] + rank: + rrf: + rank_window_size: 2 + rank_constant: 10 + from : 0 + size : 2 + + - match: { hits.total.value : 5 } + - length: { hits.hits : 2 } + - match: { hits.hits.0._id: "5" } + - match: { hits.hits.0._rank: 1 } + - match: { hits.hits.1._id: "4" } + - match: { hits.hits.1._rank: 2 } + + - do: + search: + index: test + body: + track_total_hits: true + sub_searches: [ + { + # this should clause would generate the result set [5, 1] + "query": { + bool: { + should: [ + { + term: { + number_val: { + value: "5", + boost: 10.0 + } + } + }, + { + term: { + number_val: { + value: "1", + boost: 9.0 + } + } + } + ] + } + } + + }, + { + # this should clause would generate the result set [4, 3, 1, 2] + "query": { + bool: { + should: [ + { + term: { + char_val: { + value: "D", + boost: 10.0 + } + } + }, + { + term: { + char_val: { + value: "C", + boost: 9.0 + } + } + }, + { + term: { 
+ char_val: { + value: "A", + boost: 8.0 + } + } + }, + { + term: { + char_val: { + value: "B", + boost: 7.0 + } + } + } + ] + } + } + + } + ] + rank: + rrf: + rank_window_size: 2 + rank_constant: 10 + from : 2 + size : 2 + + - match: { hits.total.value : 5 } + - length: { hits.hits : 0 } diff --git a/x-pack/plugin/rank-rrf/src/yamlRestTest/resources/rest-api-spec/test/rrf/200_rank_rrf_script.yml b/x-pack/plugin/rank-rrf/src/yamlRestTest/resources/rest-api-spec/test/rrf/200_rank_rrf_script.yml index de5b29b21da72..0583e6d7ae51a 100644 --- a/x-pack/plugin/rank-rrf/src/yamlRestTest/resources/rest-api-spec/test/rrf/200_rank_rrf_script.yml +++ b/x-pack/plugin/rank-rrf/src/yamlRestTest/resources/rest-api-spec/test/rrf/200_rank_rrf_script.yml @@ -124,7 +124,7 @@ setup: ] rank: rrf: - window_size: 100 + rank_window_size: 100 rank_constant: 1 size: 5 @@ -189,7 +189,7 @@ setup: ] rank: rrf: - window_size: 3 + rank_window_size: 3 rank_constant: 1 size: 1 @@ -267,7 +267,7 @@ setup: ] rank: rrf: - window_size: 100 + rank_window_size: 100 rank_constant: 1 size: 5 diff --git a/x-pack/plugin/rank-rrf/src/yamlRestTest/resources/rest-api-spec/test/rrf/300_rrf_retriever.yml b/x-pack/plugin/rank-rrf/src/yamlRestTest/resources/rest-api-spec/test/rrf/300_rrf_retriever.yml index 1387c37349cd4..d3d45ef2b18e8 100644 --- a/x-pack/plugin/rank-rrf/src/yamlRestTest/resources/rest-api-spec/test/rrf/300_rrf_retriever.yml +++ b/x-pack/plugin/rank-rrf/src/yamlRestTest/resources/rest-api-spec/test/rrf/300_rrf_retriever.yml @@ -86,7 +86,7 @@ setup: } } ] - window_size: 100 + rank_window_size: 100 rank_constant: 1 size: 10 @@ -136,7 +136,7 @@ setup: } } ] - window_size: 100 + rank_window_size: 100 rank_constant: 1 size: 10 @@ -191,7 +191,7 @@ setup: } } ] - window_size: 100 + rank_window_size: 100 rank_constant: 1 size: 10 @@ -260,7 +260,7 @@ setup: } } ] - window_size: 2 + rank_window_size: 2 rank_constant: 1 - match: { hits.total.value : 3 } diff --git a/x-pack/plugin/rank-rrf/src/yamlRestTest/resources/rest-api-spec/test/rrf/400_rrf_retriever_script.yml b/x-pack/plugin/rank-rrf/src/yamlRestTest/resources/rest-api-spec/test/rrf/400_rrf_retriever_script.yml index 2c2b59f306ee3..520389d51b737 100644 --- a/x-pack/plugin/rank-rrf/src/yamlRestTest/resources/rest-api-spec/test/rrf/400_rrf_retriever_script.yml +++ b/x-pack/plugin/rank-rrf/src/yamlRestTest/resources/rest-api-spec/test/rrf/400_rrf_retriever_script.yml @@ -138,7 +138,7 @@ setup: } } ] - window_size: 100 + rank_window_size: 100 rank_constant: 1 aggs: sums: @@ -207,7 +207,7 @@ setup: } } ] - window_size: 3 + rank_window_size: 3 rank_constant: 1 aggs: sums: @@ -308,7 +308,7 @@ setup: } } ] - window_size: 100 + rank_window_size: 100 rank_constant: 1 aggs: sums: From 4ef8b3825ec40051ead58d2c6180d6b80be060be Mon Sep 17 00:00:00 2001 From: Oleksandr Kolomiiets Date: Thu, 25 Apr 2024 10:27:32 -0700 Subject: [PATCH 7/8] Revert "Format default values of IP ranges to match other range bound" (#107910) This reverts commit acb5139 (a part of #107081). This commit impacts search behaviour for IP range fields and so needs to be reverted. 
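
For reference, a minimal sketch of the behaviour this revert restores, based on
the 20_synthetic_source.yml expectation updated below (the request shape is
illustrative and assumes the indexed document leaves the lower bound null, as
in the reverted change's tests):

    PUT test/_doc/7
    { "ip_range": { "gte": null, "lte": "10.10.10.10" } }

    # With acb5139 applied, a null lower bound was formatted to match the
    # address family of the other bound, so synthetic source returned:
    #   "ip_range": { "gte": "0.0.0.0", "lte": "10.10.10.10" }
    #
    # After this revert, a null bound defaults to the range type's
    # minimum/maximum value again, which for IP ranges is always IPv6:
    #   "ip_range": { "gte": "::", "lte": "10.10.10.10" }

The reverted commit kept default bounds in the same address family as the
provided bound, but that also changed search behaviour for IP range fields,
hence the revert.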
--- .../test/range/20_synthetic_source.yml | 2 +- .../index/mapper/RangeFieldMapper.java | 112 ++++++++---------- .../elasticsearch/index/mapper/RangeType.java | 29 ----- .../index/mapper/IpRangeFieldMapperTests.java | 31 ----- .../index/mapper/RangeFieldMapperTests.java | 4 +- 5 files changed, 54 insertions(+), 124 deletions(-) diff --git a/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/range/20_synthetic_source.yml b/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/range/20_synthetic_source.yml index 60c61ddbb698e..3551d022c2f4a 100644 --- a/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/range/20_synthetic_source.yml +++ b/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/range/20_synthetic_source.yml @@ -515,7 +515,7 @@ setup: id: "7" - match: _source: - ip_range: { "gte": "0.0.0.0", "lte": "10.10.10.10" } + ip_range: { "gte": "::", "lte": "10.10.10.10" } - do: get: diff --git a/server/src/main/java/org/elasticsearch/index/mapper/RangeFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/RangeFieldMapper.java index 885e0c9f2642d..f84a1b540a2be 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/RangeFieldMapper.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/RangeFieldMapper.java @@ -381,76 +381,66 @@ protected boolean supportsParsingObject() { @Override protected void parseCreateField(DocumentParserContext context) throws IOException { + Range range; XContentParser parser = context.parser(); - if (parser.currentToken() == XContentParser.Token.VALUE_NULL) { - return; - } - - Range range = parseRange(parser); - context.doc().addAll(fieldType().rangeType.createFields(context, name(), range, index, hasDocValues, store)); - - if (hasDocValues == false && (index || store)) { - context.addToFieldNames(fieldType().name()); - } - } - - private Range parseRange(XContentParser parser) throws IOException { final XContentParser.Token start = parser.currentToken(); - if (fieldType().rangeType == RangeType.IP && start == XContentParser.Token.VALUE_STRING) { - return parseIpRangeFromCidr(parser); - } - - if (start != XContentParser.Token.START_OBJECT) { + if (start == XContentParser.Token.VALUE_NULL) { + return; + } else if (start == XContentParser.Token.START_OBJECT) { + RangeFieldType fieldType = fieldType(); + RangeType rangeType = fieldType.rangeType; + String fieldName = null; + Object from = rangeType.minValue(); + Object to = rangeType.maxValue(); + boolean includeFrom = DEFAULT_INCLUDE_LOWER; + boolean includeTo = DEFAULT_INCLUDE_UPPER; + XContentParser.Token token; + while ((token = parser.nextToken()) != XContentParser.Token.END_OBJECT) { + if (token == XContentParser.Token.FIELD_NAME) { + fieldName = parser.currentName(); + } else { + if (fieldName.equals(GT_FIELD.getPreferredName())) { + includeFrom = false; + if (parser.currentToken() != XContentParser.Token.VALUE_NULL) { + from = rangeType.parseFrom(fieldType, parser, coerce.value(), includeFrom); + } + } else if (fieldName.equals(GTE_FIELD.getPreferredName())) { + includeFrom = true; + if (parser.currentToken() != XContentParser.Token.VALUE_NULL) { + from = rangeType.parseFrom(fieldType, parser, coerce.value(), includeFrom); + } + } else if (fieldName.equals(LT_FIELD.getPreferredName())) { + includeTo = false; + if (parser.currentToken() != XContentParser.Token.VALUE_NULL) { + to = rangeType.parseTo(fieldType, parser, coerce.value(), includeTo); + } + } else if (fieldName.equals(LTE_FIELD.getPreferredName())) { + includeTo = true; + if 
(parser.currentToken() != XContentParser.Token.VALUE_NULL) { + to = rangeType.parseTo(fieldType, parser, coerce.value(), includeTo); + } + } else { + throw new DocumentParsingException( + parser.getTokenLocation(), + "error parsing field [" + name() + "], with unknown parameter [" + fieldName + "]" + ); + } + } + } + range = new Range(rangeType, from, to, includeFrom, includeTo); + } else if (fieldType().rangeType == RangeType.IP && start == XContentParser.Token.VALUE_STRING) { + range = parseIpRangeFromCidr(parser); + } else { throw new DocumentParsingException( parser.getTokenLocation(), "error parsing field [" + name() + "], expected an object but got " + parser.currentName() ); } + context.doc().addAll(fieldType().rangeType.createFields(context, name(), range, index, hasDocValues, store)); - RangeFieldType fieldType = fieldType(); - RangeType rangeType = fieldType.rangeType; - String fieldName = null; - Object parsedFrom = null; - Object parsedTo = null; - boolean includeFrom = DEFAULT_INCLUDE_LOWER; - boolean includeTo = DEFAULT_INCLUDE_UPPER; - XContentParser.Token token; - while ((token = parser.nextToken()) != XContentParser.Token.END_OBJECT) { - if (token == XContentParser.Token.FIELD_NAME) { - fieldName = parser.currentName(); - } else { - if (fieldName.equals(GT_FIELD.getPreferredName())) { - includeFrom = false; - if (parser.currentToken() != XContentParser.Token.VALUE_NULL) { - parsedFrom = rangeType.parseFrom(fieldType, parser, coerce.value(), includeFrom); - } - } else if (fieldName.equals(GTE_FIELD.getPreferredName())) { - includeFrom = true; - if (parser.currentToken() != XContentParser.Token.VALUE_NULL) { - parsedFrom = rangeType.parseFrom(fieldType, parser, coerce.value(), includeFrom); - } - } else if (fieldName.equals(LT_FIELD.getPreferredName())) { - includeTo = false; - if (parser.currentToken() != XContentParser.Token.VALUE_NULL) { - parsedTo = rangeType.parseTo(fieldType, parser, coerce.value(), includeTo); - } - } else if (fieldName.equals(LTE_FIELD.getPreferredName())) { - includeTo = true; - if (parser.currentToken() != XContentParser.Token.VALUE_NULL) { - parsedTo = rangeType.parseTo(fieldType, parser, coerce.value(), includeTo); - } - } else { - throw new DocumentParsingException( - parser.getTokenLocation(), - "error parsing field [" + name() + "], with unknown parameter [" + fieldName + "]" - ); - } - } + if (hasDocValues == false && (index || store)) { + context.addToFieldNames(fieldType().name()); } - Object from = parsedFrom != null ? parsedFrom : rangeType.defaultFrom(parsedTo); - Object to = parsedTo != null ? parsedTo : rangeType.defaultTo(parsedFrom); - - return new Range(rangeType, from, to, includeFrom, includeTo); } private static Range parseIpRangeFromCidr(final XContentParser parser) throws IOException { diff --git a/server/src/main/java/org/elasticsearch/index/mapper/RangeType.java b/server/src/main/java/org/elasticsearch/index/mapper/RangeType.java index 24a1eb869cf25..bb4fb1acc0b14 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/RangeType.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/RangeType.java @@ -31,7 +31,6 @@ import org.elasticsearch.xcontent.XContentParser; import java.io.IOException; -import java.net.Inet4Address; import java.net.InetAddress; import java.time.Instant; import java.time.ZoneId; @@ -65,26 +64,6 @@ public InetAddress parseTo(RangeFieldMapper.RangeFieldType fieldType, XContentPa return included ? 
address : nextDown(address); } - public Object defaultFrom(Object parsedTo) { - if (parsedTo == null) { - return minValue(); - } - - // Make sure that we keep the range inside the same address family. - // `minValue()` is always IPv6 so we need to adjust it. - return parsedTo instanceof Inet4Address ? InetAddressPoint.decode(new byte[4]) : minValue(); - } - - public Object defaultTo(Object parsedFrom) { - if (parsedFrom == null) { - return maxValue(); - } - - // Make sure that we keep the range inside the same address family. - // `maxValue()` is always IPv6 so we need to adjust it. - return parsedFrom instanceof Inet4Address ? InetAddressPoint.decode(new byte[] { -1, -1, -1, -1 }) : maxValue(); - } - @Override public InetAddress parseValue(Object value, boolean coerce, @Nullable DateMathParser dateMathParser) { if (value instanceof InetAddress) { @@ -865,14 +844,6 @@ public Object parseTo(RangeFieldMapper.RangeFieldType fieldType, XContentParser return included ? value : (Number) nextDown(value); } - public Object defaultFrom(Object parsedTo) { - return minValue(); - } - - public Object defaultTo(Object parsedFrom) { - return maxValue(); - } - public abstract Object minValue(); public abstract Object maxValue(); diff --git a/server/src/test/java/org/elasticsearch/index/mapper/IpRangeFieldMapperTests.java b/server/src/test/java/org/elasticsearch/index/mapper/IpRangeFieldMapperTests.java index 279c9263c98a9..ddec4b8ca65e5 100644 --- a/server/src/test/java/org/elasticsearch/index/mapper/IpRangeFieldMapperTests.java +++ b/server/src/test/java/org/elasticsearch/index/mapper/IpRangeFieldMapperTests.java @@ -87,37 +87,6 @@ public void testStoreCidr() throws Exception { } } - @Override - public void testNullBounds() throws IOException { - DocumentMapper mapper = createDocumentMapper(fieldMapping(b -> { - minimalMapping(b); - b.field("store", true); - })); - - ParsedDocument bothNull = mapper.parse(source(b -> b.startObject("field").nullField("gte").nullField("lte").endObject())); - assertThat(storedValue(bothNull), equalTo("[:: : ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff]")); - - ParsedDocument onlyFromIPv4 = mapper.parse( - source(b -> b.startObject("field").field("gte", rangeValue()).nullField("lte").endObject()) - ); - assertThat(storedValue(onlyFromIPv4), equalTo("[192.168.1.7 : 255.255.255.255]")); - - ParsedDocument onlyToIPv4 = mapper.parse( - source(b -> b.startObject("field").nullField("gte").field("lte", rangeValue()).endObject()) - ); - assertThat(storedValue(onlyToIPv4), equalTo("[0.0.0.0 : 192.168.1.7]")); - - ParsedDocument onlyFromIPv6 = mapper.parse( - source(b -> b.startObject("field").field("gte", "2001:db8::").nullField("lte").endObject()) - ); - assertThat(storedValue(onlyFromIPv6), equalTo("[2001:db8:: : ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff]")); - - ParsedDocument onlyToIPv6 = mapper.parse( - source(b -> b.startObject("field").nullField("gte").field("lte", "2001:db8::").endObject()) - ); - assertThat(storedValue(onlyToIPv6), equalTo("[:: : 2001:db8::]")); - } - @SuppressWarnings("unchecked") public void testValidSyntheticSource() throws IOException { CheckedConsumer mapping = b -> { diff --git a/server/src/test/java/org/elasticsearch/index/mapper/RangeFieldMapperTests.java b/server/src/test/java/org/elasticsearch/index/mapper/RangeFieldMapperTests.java index d87d8dbc2bb4e..54c2d93ab73fa 100644 --- a/server/src/test/java/org/elasticsearch/index/mapper/RangeFieldMapperTests.java +++ b/server/src/test/java/org/elasticsearch/index/mapper/RangeFieldMapperTests.java @@ -236,14 
+236,14 @@ private void assertNullBounds(CheckedConsumer toCh } } - protected static String storedValue(ParsedDocument doc) { + private static String storedValue(ParsedDocument doc) { assertEquals(3, doc.rootDoc().getFields("field").size()); List fields = doc.rootDoc().getFields("field"); IndexableField storedField = fields.get(2); return storedField.stringValue(); } - public void testNullBounds() throws IOException { + public final void testNullBounds() throws IOException { // null, null => min, max assertNullBounds(b -> b.startObject("field").nullField("gte").nullField("lte").endObject(), true, true); From e1d902d33ba3773123d651e2f60b51b0e9e95670 Mon Sep 17 00:00:00 2001 From: Oleksandr Kolomiiets Date: Thu, 25 Apr 2024 10:31:27 -0700 Subject: [PATCH 8/8] Implement synthetic source support for annotated text field (#107735) This PR adds synthetic source support for annotated_text fields. Existing implementation for text is reused including test infrastructure so the majority of the change is moving and making things accessible. Contributes to #106460, #78744. --- docs/changelog/107735.yaml | 5 + docs/plugins/mapper-annotated-text.asciidoc | 161 +++++++++++--- .../mapping/fields/synthetic-source.asciidoc | 1 + .../src/main/java/module-info.java | 19 ++ .../AnnotatedTextFieldMapper.java | 52 ++++- .../index/mapper/annotatedtext/Features.java | 26 +++ ...lasticsearch.features.FeatureSpecification | 9 + .../AnnotatedTextFieldMapperTests.java | 25 ++- .../20_synthetic_source.yml | 197 +++++++++++++++++ .../index/mapper/KeywordFieldMapper.java | 2 +- .../index/mapper/TextFieldMapper.java | 46 ++-- .../index/mapper/KeywordFieldMapperTests.java | 110 +--------- .../index/mapper/TextFieldMapperTests.java | 136 +----------- .../KeywordFieldSyntheticSourceSupport.java | 126 +++++++++++ .../index/mapper/MapperTestCase.java | 2 +- ...xtFieldFamilySyntheticSourceTestSetup.java | 207 ++++++++++++++++++ 16 files changed, 824 insertions(+), 300 deletions(-) create mode 100644 docs/changelog/107735.yaml create mode 100644 plugins/mapper-annotated-text/src/main/java/module-info.java create mode 100644 plugins/mapper-annotated-text/src/main/java/org/elasticsearch/index/mapper/annotatedtext/Features.java create mode 100644 plugins/mapper-annotated-text/src/main/resources/META-INF/services/org.elasticsearch.features.FeatureSpecification create mode 100644 plugins/mapper-annotated-text/src/yamlRestTest/resources/rest-api-spec/test/mapper_annotatedtext/20_synthetic_source.yml create mode 100644 test/framework/src/main/java/org/elasticsearch/index/mapper/KeywordFieldSyntheticSourceSupport.java create mode 100644 test/framework/src/main/java/org/elasticsearch/index/mapper/TextFieldFamilySyntheticSourceTestSetup.java diff --git a/docs/changelog/107735.yaml b/docs/changelog/107735.yaml new file mode 100644 index 0000000000000..372cb59ba8b1f --- /dev/null +++ b/docs/changelog/107735.yaml @@ -0,0 +1,5 @@ +pr: 107735 +summary: Implement synthetic source support for annotated text field +area: Mapping +type: feature +issues: [] diff --git a/docs/plugins/mapper-annotated-text.asciidoc b/docs/plugins/mapper-annotated-text.asciidoc index 14669d857817c..900eaa5e97a04 100644 --- a/docs/plugins/mapper-annotated-text.asciidoc +++ b/docs/plugins/mapper-annotated-text.asciidoc @@ -6,7 +6,7 @@ experimental[] The mapper-annotated-text plugin provides the ability to index text that is a combination of free-text and special markup that is typically used to identify items of interest such as people or organisations (see NER or Named 
Entity Recognition -tools). +tools). The elasticsearch markup allows one or more additional tokens to be injected, unchanged, into the token @@ -18,7 +18,7 @@ include::install_remove.asciidoc[] [[mapper-annotated-text-usage]] ==== Using the `annotated-text` field -The `annotated-text` tokenizes text content as per the more common {ref}/text.html[`text`] field (see +The `annotated-text` tokenizes text content as per the more common {ref}/text.html[`text`] field (see "limitations" below) but also injects any marked-up annotation tokens directly into the search index: @@ -49,7 +49,7 @@ in the search index: -------------------------- GET my-index-000001/_analyze { - "field": "my_field", + "field": "my_field", "text":"Investors in [Apple](Apple+Inc.) rejoiced." } -------------------------- @@ -76,7 +76,7 @@ Response: "position": 1 }, { - "token": "Apple Inc.", <1> + "token": "Apple Inc.", <1> "start_offset": 13, "end_offset": 18, "type": "annotation", @@ -106,7 +106,7 @@ the token stream and at the same position (position 2) as the text token (`apple We can now perform searches for annotations using regular `term` queries that don't tokenize -the provided search values. Annotations are a more precise way of matching as can be seen +the provided search values. Annotations are a more precise way of matching as can be seen in this example where a search for `Beck` will not match `Jeff Beck` : [source,console] @@ -133,18 +133,119 @@ GET my-index-000001/_search } -------------------------- -<1> As well as tokenising the plain text into single words e.g. `beck`, here we +<1> As well as tokenising the plain text into single words e.g. `beck`, here we inject the single token value `Beck` at the same position as `beck` in the token stream. <2> Note annotations can inject multiple tokens at the same position - here we inject both the very specific value `Jeff Beck` and the broader term `Guitarist`. This enables broader positional queries e.g. finding mentions of a `Guitarist` near to `strat`. -<3> A benefit of searching with these carefully defined annotation tokens is that a query for +<3> A benefit of searching with these carefully defined annotation tokens is that a query for `Beck` will not match document 2 that contains the tokens `jeff`, `beck` and `Jeff Beck` -WARNING: Any use of `=` signs in annotation values eg `[Prince](person=Prince)` will +WARNING: Any use of `=` signs in annotation values eg `[Prince](person=Prince)` will cause the document to be rejected with a parse failure. In future we hope to have a use for the equals signs so wil actively reject documents that contain this today. +[[annotated-text-synthetic-source]] +===== Synthetic `_source` + +IMPORTANT: Synthetic `_source` is Generally Available only for TSDB indices +(indices that have `index.mode` set to `time_series`). For other indices +synthetic `_source` is in technical preview. Features in technical preview may +be changed or removed in a future release. Elastic will work to fix +any issues, but features in technical preview are not subject to the support SLA +of official GA features. + +`annotated_text` fields support {ref}/mapping-source-field.html#synthetic-source[synthetic `_source`] if they have +a {ref}/keyword.html#keyword-synthetic-source[`keyword`] sub-field that supports synthetic +`_source` or if the `text` field sets `store` to `true`. Either way, it may +not have {ref}/copy-to.html[`copy_to`]. + +If using a sub-`keyword` field then the values are sorted in the same way as +a `keyword` field's values are sorted. 
By default, that means sorted with +duplicates removed. So: +[source,console,id=synthetic-source-text-example-default] +---- +PUT idx +{ + "mappings": { + "_source": { "mode": "synthetic" }, + "properties": { + "text": { + "type": "annotated_text", + "fields": { + "raw": { + "type": "keyword" + } + } + } + } + } +} +PUT idx/_doc/1 +{ + "text": [ + "the quick brown fox", + "the quick brown fox", + "jumped over the lazy dog" + ] +} +---- +// TEST[s/$/\nGET idx\/_doc\/1?filter_path=_source\n/] + +Will become: +[source,console-result] +---- +{ + "text": [ + "jumped over the lazy dog", + "the quick brown fox" + ] +} +---- +// TEST[s/^/{"_source":/ s/\n$/}/] + +NOTE: Reordering text fields can have an effect on {ref}/query-dsl-match-query-phrase.html[phrase] +and {ref}/span-queries.html[span] queries. See the discussion about {ref}/position-increment-gap.html[`position_increment_gap`] for more detail. You +can avoid this by making sure the `slop` parameter on the phrase queries +is lower than the `position_increment_gap`. This is the default. + +If the `annotated_text` field sets `store` to true then order and duplicates +are preserved. +[source,console,id=synthetic-source-text-example-stored] +---- +PUT idx +{ + "mappings": { + "_source": { "mode": "synthetic" }, + "properties": { + "text": { "type": "annotated_text", "store": true } + } + } +} +PUT idx/_doc/1 +{ + "text": [ + "the quick brown fox", + "the quick brown fox", + "jumped over the lazy dog" + ] +} +---- +// TEST[s/$/\nGET idx\/_doc\/1?filter_path=_source\n/] + +Will become: +[source,console-result] +---- +{ + "text": [ + "the quick brown fox", + "the quick brown fox", + "jumped over the lazy dog" + ] +} +---- +// TEST[s/^/{"_source":/ s/\n$/}/] + [[mapper-annotated-text-tips]] ==== Data modelling tips @@ -153,13 +254,13 @@ the equals signs so wil actively reject documents that contain this today. Annotations are normally a way of weaving structured information into unstructured text for higher-precision search. -`Entity resolution` is a form of document enrichment undertaken by specialist software or people +`Entity resolution` is a form of document enrichment undertaken by specialist software or people where references to entities in a document are disambiguated by attaching a canonical ID. The ID is used to resolve any number of aliases or distinguish between people with the -same name. The hyperlinks connecting Wikipedia's articles are a good example of resolved -entity IDs woven into text. +same name. The hyperlinks connecting Wikipedia's articles are a good example of resolved +entity IDs woven into text. -These IDs can be embedded as annotations in an annotated_text field but it often makes +These IDs can be embedded as annotations in an annotated_text field but it often makes sense to include them in dedicated structured fields to support discovery via aggregations: [source,console] @@ -214,20 +315,20 @@ GET my-index-000001/_search -------------------------- <1> Note the `my_twitter_handles` contains a list of the annotation values -also used in the unstructured text. (Note the annotated_text syntax requires escaping). -By repeating the annotation values in a structured field this application has ensured that -the tokens discovered in the structured field can be used for search and highlighting -in the unstructured field. +also used in the unstructured text. (Note the annotated_text syntax requires escaping). 
+By repeating the annotation values in a structured field this application has ensured that +the tokens discovered in the structured field can be used for search and highlighting +in the unstructured field. <2> In this example we search for documents that talk about components of the elastic stack <3> We use the `my_twitter_handles` field here to discover people who are significantly associated with the elastic stack. ===== Avoiding over-matching annotations -By design, the regular text tokens and the annotation tokens co-exist in the same indexed +By design, the regular text tokens and the annotation tokens co-exist in the same indexed field but in rare cases this can lead to some over-matching. The value of an annotation often denotes a _named entity_ (a person, place or company). -The tokens for these named entities are inserted untokenized, and differ from typical text +The tokens for these named entities are inserted untokenized, and differ from typical text tokens because they are normally: * Mixed case e.g. `Madonna` @@ -235,19 +336,19 @@ tokens because they are normally: * Can have punctuation or numbers e.g. `Apple Inc.` or `@kimchy` This means, for the most part, a search for a named entity in the annotated text field will -not have any false positives e.g. when selecting `Apple Inc.` from an aggregation result -you can drill down to highlight uses in the text without "over matching" on any text tokens +not have any false positives e.g. when selecting `Apple Inc.` from an aggregation result +you can drill down to highlight uses in the text without "over matching" on any text tokens like the word `apple` in this context: the apple was very juicy - -However, a problem arises if your named entity happens to be a single term and lower-case e.g. the + +However, a problem arises if your named entity happens to be a single term and lower-case e.g. the company `elastic`. In this case, a search on the annotated text field for the token `elastic` may match a text document such as this: they fired an elastic band -To avoid such false matches users should consider prefixing annotation values to ensure +To avoid such false matches users should consider prefixing annotation values to ensure they don't name clash with text tokens e.g. [elastic](Company_elastic) released version 7.0 of the elastic stack today @@ -273,7 +374,7 @@ GET my-index-000001/_search { "query": { "query_string": { - "query": "cats" + "query": "cats" } }, "highlight": { @@ -291,21 +392,21 @@ GET my-index-000001/_search The annotated highlighter is based on the `unified` highlighter and supports the same settings but does not use the `pre_tags` or `post_tags` parameters. Rather than using -html-like markup such as `cat` the annotated highlighter uses the same +html-like markup such as `cat` the annotated highlighter uses the same markdown-like syntax used for annotations and injects a key=value annotation where `_hit_term` -is the key and the matched search term is the value e.g. +is the key and the matched search term is the value e.g. 
The [cat](_hit_term=cat) sat on the [mat](sku3578) -The annotated highlighter tries to be respectful of any existing markup in the original +The annotated highlighter tries to be respectful of any existing markup in the original text: -* If the search term matches exactly the location of an existing annotation then the +* If the search term matches exactly the location of an existing annotation then the `_hit_term` key is merged into the url-like syntax used in the `(...)` part of the -existing annotation. +existing annotation. * However, if the search term overlaps the span of an existing annotation it would break the markup formatting so the original annotation is removed in favour of a new annotation -with just the search hit information in the results. +with just the search hit information in the results. * Any non-overlapping annotations in the original text are preserved in highlighter selections diff --git a/docs/reference/mapping/fields/synthetic-source.asciidoc b/docs/reference/mapping/fields/synthetic-source.asciidoc index ec6f51f78eda5..21e98cd55bf3a 100644 --- a/docs/reference/mapping/fields/synthetic-source.asciidoc +++ b/docs/reference/mapping/fields/synthetic-source.asciidoc @@ -41,6 +41,7 @@ There are a couple of restrictions to be aware of: types: ** <> +** {plugins}/mapper-annotated-text-usage.html#annotated-text-synthetic-source[`annotated-text`] ** <> ** <> ** <> diff --git a/plugins/mapper-annotated-text/src/main/java/module-info.java b/plugins/mapper-annotated-text/src/main/java/module-info.java new file mode 100644 index 0000000000000..3aa8e46e2980c --- /dev/null +++ b/plugins/mapper-annotated-text/src/main/java/module-info.java @@ -0,0 +1,19 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0 and the Server Side Public License, v 1; you may not use this file except + * in compliance with, at your election, the Elastic License 2.0 or the Server + * Side Public License, v 1. 
+ */ + +module org.elasticsearch.index.mapper.annotatedtext { + requires org.elasticsearch.base; + requires org.elasticsearch.server; + requires org.elasticsearch.xcontent; + requires org.apache.lucene.core; + requires org.apache.lucene.highlighter; + + // exports nothing + + provides org.elasticsearch.features.FeatureSpecification with org.elasticsearch.index.mapper.annotatedtext.Features; +} diff --git a/plugins/mapper-annotated-text/src/main/java/org/elasticsearch/index/mapper/annotatedtext/AnnotatedTextFieldMapper.java b/plugins/mapper-annotated-text/src/main/java/org/elasticsearch/index/mapper/annotatedtext/AnnotatedTextFieldMapper.java index fae2ab19aee39..6d2b83185d5b7 100644 --- a/plugins/mapper-annotated-text/src/main/java/org/elasticsearch/index/mapper/annotatedtext/AnnotatedTextFieldMapper.java +++ b/plugins/mapper-annotated-text/src/main/java/org/elasticsearch/index/mapper/annotatedtext/AnnotatedTextFieldMapper.java @@ -21,17 +21,22 @@ import org.apache.lucene.document.FieldType; import org.apache.lucene.index.IndexOptions; import org.elasticsearch.ElasticsearchParseException; +import org.elasticsearch.features.NodeFeature; import org.elasticsearch.index.IndexVersion; import org.elasticsearch.index.analysis.AnalyzerScope; import org.elasticsearch.index.analysis.IndexAnalyzers; import org.elasticsearch.index.analysis.NamedAnalyzer; import org.elasticsearch.index.mapper.DocumentParserContext; import org.elasticsearch.index.mapper.FieldMapper; +import org.elasticsearch.index.mapper.KeywordFieldMapper; import org.elasticsearch.index.mapper.MapperBuilderContext; +import org.elasticsearch.index.mapper.SourceLoader; +import org.elasticsearch.index.mapper.StringStoredFieldFieldLoader; import org.elasticsearch.index.mapper.TextFieldMapper; import org.elasticsearch.index.mapper.TextParams; import org.elasticsearch.index.mapper.TextSearchInfo; import org.elasticsearch.index.similarity.SimilarityProvider; +import org.elasticsearch.xcontent.XContentBuilder; import java.io.IOException; import java.io.Reader; @@ -41,6 +46,7 @@ import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.List; +import java.util.Locale; import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -58,6 +64,8 @@ **/ public class AnnotatedTextFieldMapper extends FieldMapper { + public static final NodeFeature SYNTHETIC_SOURCE_SUPPORT = new NodeFeature("mapper.annotated_text.synthetic_source"); + public static final String CONTENT_TYPE = "annotated_text"; private static Builder builder(FieldMapper in) { @@ -114,7 +122,7 @@ protected Parameter[] getParameters() { meta }; } - private AnnotatedTextFieldType buildFieldType(FieldType fieldType, MapperBuilderContext context) { + private AnnotatedTextFieldType buildFieldType(FieldType fieldType, MapperBuilderContext context, MultiFields multiFields) { TextSearchInfo tsi = new TextSearchInfo( fieldType, similarity.get(), @@ -126,12 +134,14 @@ private AnnotatedTextFieldType buildFieldType(FieldType fieldType, MapperBuilder store.getValue(), tsi, context.isSourceSynthetic(), + TextFieldMapper.SyntheticSourceHelper.syntheticSourceDelegate(fieldType, multiFields), meta.getValue() ); } @Override public AnnotatedTextFieldMapper build(MapperBuilderContext context) { + MultiFields multiFields = multiFieldsBuilder.build(this, context); FieldType fieldType = TextParams.buildFieldType(() -> true, store, indexOptions, norms, termVectors); if (fieldType.indexOptions() == IndexOptions.NONE) { throw new IllegalArgumentException("[" + 
CONTENT_TYPE + "] fields must be indexed"); @@ -146,8 +156,8 @@ public AnnotatedTextFieldMapper build(MapperBuilderContext context) { return new AnnotatedTextFieldMapper( name(), fieldType, - buildFieldType(fieldType, context), - multiFieldsBuilder.build(this, context), + buildFieldType(fieldType, context, multiFields), + multiFields, copyTo, this ); @@ -472,15 +482,15 @@ private void emitAnnotation(int firstSpannedTextPosInc, int annotationPosLen) th } public static final class AnnotatedTextFieldType extends TextFieldMapper.TextFieldType { - private AnnotatedTextFieldType( String name, boolean store, TextSearchInfo tsi, boolean isSyntheticSource, + KeywordFieldMapper.KeywordFieldType syntheticSourceDelegate, Map meta ) { - super(name, true, store, tsi, isSyntheticSource, null, meta, false, false); + super(name, true, store, tsi, isSyntheticSource, syntheticSourceDelegate, meta, false, false); } public AnnotatedTextFieldType(String name, Map meta) { @@ -544,4 +554,36 @@ protected String contentType() { public FieldMapper.Builder getMergeBuilder() { return new Builder(simpleName(), builder.indexCreatedVersion, builder.analyzers.indexAnalyzers).init(this); } + + @Override + public SourceLoader.SyntheticFieldLoader syntheticFieldLoader() { + if (copyTo.copyToFields().isEmpty() != true) { + throw new IllegalArgumentException( + "field [" + name() + "] of type [" + typeName() + "] doesn't support synthetic source because it declares copy_to" + ); + } + if (fieldType.stored()) { + return new StringStoredFieldFieldLoader(name(), simpleName(), null) { + @Override + protected void write(XContentBuilder b, Object value) throws IOException { + b.value((String) value); + } + }; + } + + var kwd = TextFieldMapper.SyntheticSourceHelper.getKeywordFieldMapperForSyntheticSource(this); + if (kwd != null) { + return kwd.syntheticFieldLoader(simpleName()); + } + + throw new IllegalArgumentException( + String.format( + Locale.ROOT, + "field [%s] of type [%s] doesn't support synthetic source unless it is stored or has a sub-field of" + + " type [keyword] with doc values or stored and without a normalizer", + name(), + typeName() + ) + ); + } } diff --git a/plugins/mapper-annotated-text/src/main/java/org/elasticsearch/index/mapper/annotatedtext/Features.java b/plugins/mapper-annotated-text/src/main/java/org/elasticsearch/index/mapper/annotatedtext/Features.java new file mode 100644 index 0000000000000..1c4bd22e88145 --- /dev/null +++ b/plugins/mapper-annotated-text/src/main/java/org/elasticsearch/index/mapper/annotatedtext/Features.java @@ -0,0 +1,26 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0 and the Server Side Public License, v 1; you may not use this file except + * in compliance with, at your election, the Elastic License 2.0 or the Server + * Side Public License, v 1. + */ + +package org.elasticsearch.index.mapper.annotatedtext; + +import org.elasticsearch.features.FeatureSpecification; +import org.elasticsearch.features.NodeFeature; + +import java.util.Set; + +/** + * Provides features for annotated text mapper. 
+ */ +public class Features implements FeatureSpecification { + @Override + public Set getFeatures() { + return Set.of( + AnnotatedTextFieldMapper.SYNTHETIC_SOURCE_SUPPORT // Added in 8.15 + ); + } +} diff --git a/plugins/mapper-annotated-text/src/main/resources/META-INF/services/org.elasticsearch.features.FeatureSpecification b/plugins/mapper-annotated-text/src/main/resources/META-INF/services/org.elasticsearch.features.FeatureSpecification new file mode 100644 index 0000000000000..a19d9deb9c522 --- /dev/null +++ b/plugins/mapper-annotated-text/src/main/resources/META-INF/services/org.elasticsearch.features.FeatureSpecification @@ -0,0 +1,9 @@ +# +# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one +# or more contributor license agreements. Licensed under the Elastic License +# 2.0 and the Server Side Public License, v 1; you may not use this file except +# in compliance with, at your election, the Elastic License 2.0 or the Server +# Side Public License, v 1. +# + +org.elasticsearch.index.mapper.annotatedtext.Features diff --git a/plugins/mapper-annotated-text/src/test/java/org/elasticsearch/index/mapper/annotatedtext/AnnotatedTextFieldMapperTests.java b/plugins/mapper-annotated-text/src/test/java/org/elasticsearch/index/mapper/annotatedtext/AnnotatedTextFieldMapperTests.java index 9f1d063433d88..3b27cdb132851 100644 --- a/plugins/mapper-annotated-text/src/test/java/org/elasticsearch/index/mapper/annotatedtext/AnnotatedTextFieldMapperTests.java +++ b/plugins/mapper-annotated-text/src/test/java/org/elasticsearch/index/mapper/annotatedtext/AnnotatedTextFieldMapperTests.java @@ -14,6 +14,7 @@ import org.apache.lucene.analysis.core.WhitespaceAnalyzer; import org.apache.lucene.analysis.en.EnglishAnalyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.DocValuesType; import org.apache.lucene.index.IndexOptions; import org.apache.lucene.index.IndexableField; @@ -29,6 +30,7 @@ import org.elasticsearch.index.analysis.CharFilterFactory; import org.elasticsearch.index.analysis.CustomAnalyzer; import org.elasticsearch.index.analysis.IndexAnalyzers; +import org.elasticsearch.index.analysis.LowercaseNormalizer; import org.elasticsearch.index.analysis.NamedAnalyzer; import org.elasticsearch.index.analysis.StandardTokenizerFactory; import org.elasticsearch.index.analysis.TokenFilterFactory; @@ -38,6 +40,7 @@ import org.elasticsearch.index.mapper.MapperService; import org.elasticsearch.index.mapper.MapperTestCase; import org.elasticsearch.index.mapper.ParsedDocument; +import org.elasticsearch.index.mapper.TextFieldFamilySyntheticSourceTestSetup; import org.elasticsearch.index.mapper.TextFieldMapper; import org.elasticsearch.plugins.Plugin; import org.elasticsearch.xcontent.ToXContent; @@ -54,6 +57,7 @@ import java.util.List; import java.util.Map; import java.util.Set; +import java.util.function.Function; import static org.hamcrest.Matchers.containsString; import static org.hamcrest.Matchers.equalTo; @@ -144,7 +148,8 @@ public TokenStream create(TokenStream tokenStream) { ) ); return IndexAnalyzers.of( - Map.of("default", dflt, "standard", standard, "keyword", keyword, "whitespace", whitespace, "my_stop_analyzer", stop) + Map.of("default", dflt, "standard", standard, "keyword", keyword, "whitespace", whitespace, "my_stop_analyzer", stop), + Map.of("lowercase", new NamedAnalyzer("lowercase", AnalyzerScope.INDEX, new LowercaseNormalizer())) ); } @@ -595,7 +600,23 @@ protected boolean 
supportsIgnoreMalformed() { @Override protected SyntheticSourceSupport syntheticSourceSupport(boolean ignoreMalformed) { - throw new AssumptionViolatedException("not supported"); + assumeFalse("ignore_malformed not supported", ignoreMalformed); + return TextFieldFamilySyntheticSourceTestSetup.syntheticSourceSupport("annotated_text", false); + } + + @Override + protected BlockReaderSupport getSupportedReaders(MapperService mapper, String loaderFieldName) { + return TextFieldFamilySyntheticSourceTestSetup.getSupportedReaders(mapper, loaderFieldName); + } + + @Override + protected Function loadBlockExpected(BlockReaderSupport blockReaderSupport, boolean columnReader) { + return TextFieldFamilySyntheticSourceTestSetup.loadBlockExpected(blockReaderSupport, columnReader); + } + + @Override + protected void validateRoundTripReader(String syntheticSource, DirectoryReader reader, DirectoryReader roundTripReader) { + TextFieldFamilySyntheticSourceTestSetup.validateRoundTripReader(syntheticSource, reader, roundTripReader); } @Override diff --git a/plugins/mapper-annotated-text/src/yamlRestTest/resources/rest-api-spec/test/mapper_annotatedtext/20_synthetic_source.yml b/plugins/mapper-annotated-text/src/yamlRestTest/resources/rest-api-spec/test/mapper_annotatedtext/20_synthetic_source.yml new file mode 100644 index 0000000000000..54a51e60f56df --- /dev/null +++ b/plugins/mapper-annotated-text/src/yamlRestTest/resources/rest-api-spec/test/mapper_annotatedtext/20_synthetic_source.yml @@ -0,0 +1,197 @@ +--- +setup: + - requires: + cluster_features: ["mapper.annotated_text.synthetic_source"] + reason: introduced in 8.15.0 + +--- +stored annotated_text field: + - do: + indices.create: + index: test + body: + mappings: + _source: + mode: synthetic + properties: + annotated_text: + type: annotated_text + store: true + + - do: + index: + index: test + id: 1 + refresh: true + body: + annotated_text: the quick brown fox + + - do: + search: + index: test + + - match: + hits.hits.0._source: + annotated_text: the quick brown fox + +--- +annotated_text field with keyword multi-field: + - do: + indices.create: + index: test + body: + mappings: + _source: + mode: synthetic + properties: + annotated_text: + type: annotated_text + fields: + keyword: + type: keyword + + - do: + index: + index: test + id: 1 + refresh: true + body: + annotated_text: the quick brown fox + + - do: + search: + index: test + + - match: + hits.hits.0._source: + annotated_text: the quick brown fox + +--- +multiple values in stored annotated_text field: + - do: + indices.create: + index: test + body: + mappings: + _source: + mode: synthetic + properties: + annotated_text: + type: annotated_text + store: true + + - do: + index: + index: test + id: 1 + refresh: true + body: + annotated_text: ["world", "hello", "world"] + + - do: + search: + index: test + + - match: + hits.hits.0._source: + annotated_text: ["world", "hello", "world"] + +--- +multiple values in annotated_text field with keyword multi-field: + - do: + indices.create: + index: test + body: + mappings: + _source: + mode: synthetic + properties: + annotated_text: + type: annotated_text + fields: + keyword: + type: keyword + + - do: + index: + index: test + id: 1 + refresh: true + body: + annotated_text: ["world", "hello", "world"] + + - do: + search: + index: test + + - match: + hits.hits.0._source: + annotated_text: ["hello", "world"] + + +--- +multiple values in annotated_text field with stored keyword multi-field: + - do: + indices.create: + index: test + body: + mappings: + _source: 
+              mode: synthetic
+            properties:
+              annotated_text:
+                type: annotated_text
+                fields:
+                  keyword:
+                    type: keyword
+                    store: true
+                    doc_values: false
+
+  - do:
+      index:
+        index: test
+        id: 1
+        refresh: true
+        body:
+          annotated_text: ["world", "hello", "world"]
+
+  - do:
+      search:
+        index: test
+
+  - match:
+      hits.hits.0._source:
+        annotated_text: ["world", "hello", "world"]
+
+---
+multiple values in stored annotated_text field with keyword multi-field:
+  - do:
+      indices.create:
+        index: test
+        body:
+          mappings:
+            _source:
+              mode: synthetic
+            properties:
+              annotated_text:
+                type: annotated_text
+                store: true
+                fields:
+                  keyword:
+                    type: keyword
+
+  - do:
+      index:
+        index: test
+        id: 1
+        refresh: true
+        body:
+          annotated_text: ["world", "hello", "world"]
+
+  - do:
+      search:
+        index: test
+
+  - match:
+      hits.hits.0._source:
+        annotated_text: ["world", "hello", "world"]
diff --git a/server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java
index bdf25307d3343..eeb452204091d 100644
--- a/server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java
+++ b/server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java
@@ -1026,7 +1026,7 @@ public SourceLoader.SyntheticFieldLoader syntheticFieldLoader() {
         return syntheticFieldLoader(simpleName());
     }
 
-    SourceLoader.SyntheticFieldLoader syntheticFieldLoader(String simpleName) {
+    public SourceLoader.SyntheticFieldLoader syntheticFieldLoader(String simpleName) {
         if (hasScript()) {
             return SourceLoader.SyntheticFieldLoader.NOTHING;
         }
diff --git a/server/src/main/java/org/elasticsearch/index/mapper/TextFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/TextFieldMapper.java
index ef512e2bbd46b..57dd2fa0b920d 100644
--- a/server/src/main/java/org/elasticsearch/index/mapper/TextFieldMapper.java
+++ b/server/src/main/java/org/elasticsearch/index/mapper/TextFieldMapper.java
@@ -390,7 +390,7 @@ private TextFieldType buildFieldType(
             store.getValue(),
             tsi,
             context.isSourceSynthetic(),
-            syntheticSourceDelegate(fieldType, multiFields),
+            SyntheticSourceHelper.syntheticSourceDelegate(fieldType, multiFields),
             meta.getValue(),
             eagerGlobalOrdinals.getValue(),
             indexPhrases.getValue()
@@ -402,17 +402,6 @@ private TextFieldType buildFieldType(
         return ft;
     }
 
-    private static KeywordFieldMapper.KeywordFieldType syntheticSourceDelegate(FieldType fieldType, MultiFields multiFields) {
-        if (fieldType.stored()) {
-            return null;
-        }
-        var kwd = getKeywordFieldMapperForSyntheticSource(multiFields);
-        if (kwd != null) {
-            return kwd.fieldType();
-        }
-        return null;
-    }
-
     private SubFieldInfo buildPrefixInfo(MapperBuilderContext context, FieldType fieldType, TextFieldType tft) {
         if (indexPrefixes.get() == null) {
             return null;
@@ -1094,7 +1083,7 @@ public boolean isSyntheticSource() {
             return isSyntheticSource;
         }
 
-        KeywordFieldMapper.KeywordFieldType syntheticSourceDelegate() {
+        public KeywordFieldMapper.KeywordFieldType syntheticSourceDelegate() {
             return syntheticSourceDelegate;
         }
     }
@@ -1473,7 +1462,7 @@ protected void write(XContentBuilder b, Object value) throws IOException {
             };
         }
 
-        var kwd = getKeywordFieldMapperForSyntheticSource(this);
+        var kwd = SyntheticSourceHelper.getKeywordFieldMapperForSyntheticSource(this);
         if (kwd != null) {
             return kwd.syntheticFieldLoader(simpleName());
         }
@@ -1489,16 +1478,29 @@ protected void write(XContentBuilder b, Object value) throws IOException {
         );
     }
 
-    private static KeywordFieldMapper getKeywordFieldMapperForSyntheticSource(Iterable<Mapper> multiFields) {
-        for (Mapper sub : multiFields) {
-            if (sub.typeName().equals(KeywordFieldMapper.CONTENT_TYPE)) {
-                KeywordFieldMapper kwd = (KeywordFieldMapper) sub;
-                if (kwd.hasNormalizer() == false && (kwd.fieldType().hasDocValues() || kwd.fieldType().isStored())) {
-                    return kwd;
-                }
+    public static class SyntheticSourceHelper {
+        public static KeywordFieldMapper.KeywordFieldType syntheticSourceDelegate(FieldType fieldType, MultiFields multiFields) {
+            if (fieldType.stored()) {
+                return null;
+            }
+            var kwd = getKeywordFieldMapperForSyntheticSource(multiFields);
+            if (kwd != null) {
+                return kwd.fieldType();
             }
+            return null;
         }
-        return null;
+
+        public static KeywordFieldMapper getKeywordFieldMapperForSyntheticSource(Iterable<Mapper> multiFields) {
+            for (Mapper sub : multiFields) {
+                if (sub.typeName().equals(KeywordFieldMapper.CONTENT_TYPE)) {
+                    KeywordFieldMapper kwd = (KeywordFieldMapper) sub;
+                    if (kwd.hasNormalizer() == false && (kwd.fieldType().hasDocValues() || kwd.fieldType().isStored())) {
+                        return kwd;
+                    }
+                }
+            }
+
+            return null;
+        }
     }
 }
diff --git a/server/src/test/java/org/elasticsearch/index/mapper/KeywordFieldMapperTests.java b/server/src/test/java/org/elasticsearch/index/mapper/KeywordFieldMapperTests.java
index 70e375a89d5e7..4824bd337f5b0 100644
--- a/server/src/test/java/org/elasticsearch/index/mapper/KeywordFieldMapperTests.java
+++ b/server/src/test/java/org/elasticsearch/index/mapper/KeywordFieldMapperTests.java
@@ -25,7 +25,6 @@
 import org.elasticsearch.common.Strings;
 import org.elasticsearch.common.lucene.Lucene;
 import org.elasticsearch.common.settings.Settings;
-import org.elasticsearch.core.Tuple;
 import org.elasticsearch.index.IndexSettings;
 import org.elasticsearch.index.IndexVersion;
 import org.elasticsearch.index.analysis.AnalyzerScope;
@@ -45,14 +44,11 @@
 import org.elasticsearch.xcontent.XContentBuilder;
 
 import java.io.IOException;
-import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collection;
-import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
 import java.util.function.Function;
-import java.util.stream.Collectors;
 
 import static java.util.Collections.singletonList;
 import static java.util.Collections.singletonMap;
@@ -658,7 +654,7 @@ protected Function<Object, Object> loadBlockExpected() {
 
     @Override
     protected SyntheticSourceSupport syntheticSourceSupport(boolean ignoreMalformed) {
         assertFalse("keyword doesn't support ignore_malformed", ignoreMalformed);
-        return new KeywordSyntheticSourceSupport(
+        return new KeywordFieldSyntheticSourceSupport(
             randomBoolean() ? null : between(10, 100),
             randomBoolean(),
             usually() ? null : randomAlphaOfLength(2),
@@ -666,110 +662,6 @@ protected SyntheticSourceSupport syntheticSourceSupport(boolean ignoreMalformed)
         );
     }
 
-    static class KeywordSyntheticSourceSupport implements SyntheticSourceSupport {
-        private final Integer ignoreAbove;
-        private final boolean allIgnored;
-        private final boolean store;
-        private final boolean docValues;
-        private final String nullValue;
-        private final boolean exampleSortsUsingIgnoreAbove;
-
-        KeywordSyntheticSourceSupport(Integer ignoreAbove, boolean store, String nullValue, boolean exampleSortsUsingIgnoreAbove) {
-            this.ignoreAbove = ignoreAbove;
-            this.allIgnored = ignoreAbove != null && rarely();
-            this.store = store;
-            this.nullValue = nullValue;
-            this.exampleSortsUsingIgnoreAbove = exampleSortsUsingIgnoreAbove;
-            this.docValues = store ? randomBoolean() : true;
-        }
-
-        @Override
-        public SyntheticSourceExample example(int maxValues) {
-            return example(maxValues, false);
-        }
-
-        public SyntheticSourceExample example(int maxValues, boolean loadBlockFromSource) {
-            if (randomBoolean()) {
-                Tuple<String, String> v = generateValue();
-                Object loadBlock = v.v2();
-                if (loadBlockFromSource == false && ignoreAbove != null && v.v2().length() > ignoreAbove) {
-                    loadBlock = null;
-                }
-                return new SyntheticSourceExample(v.v1(), v.v2(), loadBlock, this::mapping);
-            }
-            List<Tuple<String, String>> values = randomList(1, maxValues, this::generateValue);
-            List<String> in = values.stream().map(Tuple::v1).toList();
-            List<String> outPrimary = new ArrayList<>();
-            List<String> outExtraValues = new ArrayList<>();
-            values.stream().map(Tuple::v2).forEach(v -> {
-                if (exampleSortsUsingIgnoreAbove && ignoreAbove != null && v.length() > ignoreAbove) {
-                    outExtraValues.add(v);
-                } else {
-                    outPrimary.add(v);
-                }
-            });
-            List<String> outList = store ? outPrimary : new HashSet<>(outPrimary).stream().sorted().collect(Collectors.toList());
-            List<String> loadBlock;
-            if (loadBlockFromSource) {
-                // The block loader infrastructure will never return nulls. Just zap them all.
-                loadBlock = in.stream().filter(m -> m != null).toList();
-            } else if (docValues) {
-                loadBlock = new HashSet<>(outPrimary).stream().sorted().collect(Collectors.toList());
-            } else {
-                loadBlock = List.copyOf(outList);
-            }
-            Object loadBlockResult = loadBlock.size() == 1 ? loadBlock.get(0) : loadBlock;
-            outList.addAll(outExtraValues);
-            Object out = outList.size() == 1 ? outList.get(0) : outList;
-            return new SyntheticSourceExample(in, out, loadBlockResult, this::mapping);
-        }
-
-        private Tuple<String, String> generateValue() {
-            if (nullValue != null && randomBoolean()) {
-                return Tuple.tuple(null, nullValue);
-            }
-            int length = 5;
-            if (ignoreAbove != null && (allIgnored || randomBoolean())) {
-                length = ignoreAbove + 5;
-            }
-            String v = randomAlphaOfLength(length);
-            return Tuple.tuple(v, v);
-        }
-
-        private void mapping(XContentBuilder b) throws IOException {
-            b.field("type", "keyword");
-            if (nullValue != null) {
-                b.field("null_value", nullValue);
-            }
-            if (ignoreAbove != null) {
-                b.field("ignore_above", ignoreAbove);
-            }
-            if (store) {
-                b.field("store", true);
-            }
-            if (docValues == false) {
-                b.field("doc_values", false);
-            }
-        }
-
-        @Override
-        public List<SyntheticSourceInvalidExample> invalidExample() throws IOException {
-            return List.of(
-                new SyntheticSourceInvalidExample(
-                    equalTo(
-                        "field [field] of type [keyword] doesn't support synthetic source because "
-                            + "it doesn't have doc values and isn't stored"
-                    ),
-                    b -> b.field("type", "keyword").field("doc_values", false)
-                ),
-                new SyntheticSourceInvalidExample(
-                    equalTo("field [field] of type [keyword] doesn't support synthetic source because it declares a normalizer"),
-                    b -> b.field("type", "keyword").field("normalizer", "lowercase")
-                )
-            );
-        }
-    }
-
     @Override
     protected IngestScriptSupport ingestScriptSupport() {
         return new IngestScriptSupport() {
diff --git a/server/src/test/java/org/elasticsearch/index/mapper/TextFieldMapperTests.java b/server/src/test/java/org/elasticsearch/index/mapper/TextFieldMapperTests.java
index 5d0c1c01ecdcf..50d15be2256ed 100644
--- a/server/src/test/java/org/elasticsearch/index/mapper/TextFieldMapperTests.java
+++ b/server/src/test/java/org/elasticsearch/index/mapper/TextFieldMapperTests.java
@@ -75,7 +75,6 @@
 import org.elasticsearch.xcontent.ToXContent;
 import org.elasticsearch.xcontent.XContentBuilder;
 import org.elasticsearch.xcontent.XContentFactory;
-import org.hamcrest.Matcher;
 import org.junit.AssumptionViolatedException;
 
 import java.io.IOException;
@@ -1178,120 +1177,12 @@ protected boolean supportsIgnoreMalformed() {
     @Override
     protected SyntheticSourceSupport syntheticSourceSupport(boolean ignoreMalformed) {
         assumeFalse("ignore_malformed not supported", ignoreMalformed);
-        boolean storeTextField = randomBoolean();
-        boolean storedKeywordField = storeTextField || randomBoolean();
-        boolean indexText = randomBoolean();
-        Integer ignoreAbove = randomBoolean() ? null : between(10, 100);
-        KeywordFieldMapperTests.KeywordSyntheticSourceSupport keywordSupport = new KeywordFieldMapperTests.KeywordSyntheticSourceSupport(
-            ignoreAbove,
-            storedKeywordField,
-            null,
-            false == storeTextField
-        );
-        return new SyntheticSourceSupport() {
-            @Override
-            public SyntheticSourceExample example(int maxValues) {
-                if (storeTextField) {
-                    SyntheticSourceExample delegate = keywordSupport.example(maxValues, true);
-                    return new SyntheticSourceExample(
-                        delegate.inputValue(),
-                        delegate.expectedForSyntheticSource(),
-                        delegate.expectedForBlockLoader(),
-                        b -> {
-                            b.field("type", "text");
-                            b.field("store", true);
-                            if (indexText == false) {
-                                b.field("index", false);
-                            }
-                        }
-                    );
-                }
-                // We'll load from _source if ignore_above is defined, otherwise we load from the keyword field.
-                boolean loadingFromSource = ignoreAbove != null;
-                SyntheticSourceExample delegate = keywordSupport.example(maxValues, loadingFromSource);
-                return new SyntheticSourceExample(
-                    delegate.inputValue(),
-                    delegate.expectedForSyntheticSource(),
-                    delegate.expectedForBlockLoader(),
-                    b -> {
-                        b.field("type", "text");
-                        if (indexText == false) {
-                            b.field("index", false);
-                        }
-                        b.startObject("fields");
-                        {
-                            b.startObject(randomAlphaOfLength(4));
-                            delegate.mapping().accept(b);
-                            b.endObject();
-                        }
-                        b.endObject();
-                    }
-                );
-            }
-
-            @Override
-            public List<SyntheticSourceInvalidExample> invalidExample() throws IOException {
-                Matcher<String> err = equalTo(
-                    "field [field] of type [text] doesn't support synthetic source unless it is stored or"
-                        + " has a sub-field of type [keyword] with doc values or stored and without a normalizer"
-                );
-                return List.of(
-                    new SyntheticSourceInvalidExample(err, TextFieldMapperTests.this::minimalMapping),
-                    new SyntheticSourceInvalidExample(err, b -> {
-                        b.field("type", "text");
-                        b.startObject("fields");
-                        {
-                            b.startObject("l");
-                            b.field("type", "long");
-                            b.endObject();
-                        }
-                        b.endObject();
-                    }),
-                    new SyntheticSourceInvalidExample(err, b -> {
-                        b.field("type", "text");
-                        b.startObject("fields");
-                        {
-                            b.startObject("kwd");
-                            b.field("type", "keyword");
-                            b.field("normalizer", "lowercase");
-                            b.endObject();
-                        }
-                        b.endObject();
-                    }),
-                    new SyntheticSourceInvalidExample(err, b -> {
-                        b.field("type", "text");
-                        b.startObject("fields");
-                        {
-                            b.startObject("kwd");
-                            b.field("type", "keyword");
-                            b.field("doc_values", "false");
-                            b.endObject();
-                        }
-                        b.endObject();
-                    })
-                );
-            }
-        };
+        return TextFieldFamilySyntheticSourceTestSetup.syntheticSourceSupport("text", true);
     }
 
     @Override
     protected Function<Object, Object> loadBlockExpected(BlockReaderSupport blockReaderSupport, boolean columnReader) {
-        if (nullLoaderExpected(blockReaderSupport.mapper(), blockReaderSupport.loaderFieldName())) {
-            return null;
-        }
-        return v -> ((BytesRef) v).utf8ToString();
-    }
-
-    private boolean nullLoaderExpected(MapperService mapper, String fieldName) {
-        MappedFieldType type = mapper.fieldType(fieldName);
-        if (type instanceof TextFieldType t) {
-            if (t.isSyntheticSource() == false || t.canUseSyntheticSourceDelegateForQuerying() || t.isStored()) {
-                return false;
-            }
-            String parentField = mapper.mappingLookup().parentField(fieldName);
-            return parentField == null || nullLoaderExpected(mapper, parentField);
-        }
-        return false;
+        return TextFieldFamilySyntheticSourceTestSetup.loadBlockExpected(blockReaderSupport, columnReader);
     }
 
     @Override
@@ -1300,9 +1191,8 @@ protected IngestScriptSupport ingestScriptSupport() {
     }
 
     @Override
-    protected void validateRoundTripReader(String syntheticSource, DirectoryReader reader, DirectoryReader roundTripReader)
-        throws IOException {
-        // Disabled because it currently fails
+    protected void validateRoundTripReader(String syntheticSource, DirectoryReader reader, DirectoryReader roundTripReader) {
+        TextFieldFamilySyntheticSourceTestSetup.validateRoundTripReader(syntheticSource, reader, roundTripReader);
     }
 
     public void testUnknownAnalyzerOnLegacyIndex() throws IOException {
@@ -1433,21 +1323,7 @@ public void testEmpty() throws Exception {
 
     @Override
     protected BlockReaderSupport getSupportedReaders(MapperService mapper, String loaderFieldName) {
-        MappedFieldType ft = mapper.fieldType(loaderFieldName);
-        String parentName = mapper.mappingLookup().parentField(ft.name());
-        if (parentName == null) {
-            TextFieldMapper.TextFieldType text = (TextFieldType) ft;
-            boolean supportsColumnAtATimeReader = text.syntheticSourceDelegate() != null
-                && text.syntheticSourceDelegate().hasDocValues()
-                && text.canUseSyntheticSourceDelegateForQuerying();
-            return new BlockReaderSupport(supportsColumnAtATimeReader, mapper, loaderFieldName);
-        }
-        MappedFieldType parent = mapper.fieldType(parentName);
-        if (false == parent.typeName().equals(KeywordFieldMapper.CONTENT_TYPE)) {
-            throw new UnsupportedOperationException();
-        }
-        KeywordFieldMapper.KeywordFieldType kwd = (KeywordFieldMapper.KeywordFieldType) parent;
-        return new BlockReaderSupport(kwd.hasDocValues(), mapper, loaderFieldName);
+        return TextFieldFamilySyntheticSourceTestSetup.getSupportedReaders(mapper, loaderFieldName);
     }
 
     public void testBlockLoaderFromParentColumnReader() throws IOException {
@@ -1460,7 +1336,7 @@ public void testBlockLoaderParentFromRowStrideReader() throws IOException {
 
     private void testBlockLoaderFromParent(boolean columnReader, boolean syntheticSource) throws IOException {
         boolean storeParent = randomBoolean();
-        KeywordFieldMapperTests.KeywordSyntheticSourceSupport kwdSupport = new KeywordFieldMapperTests.KeywordSyntheticSourceSupport(
+        KeywordFieldSyntheticSourceSupport kwdSupport = new KeywordFieldSyntheticSourceSupport(
             null,
             storeParent,
             null,
diff --git a/test/framework/src/main/java/org/elasticsearch/index/mapper/KeywordFieldSyntheticSourceSupport.java b/test/framework/src/main/java/org/elasticsearch/index/mapper/KeywordFieldSyntheticSourceSupport.java
new file mode 100644
index 0000000000000..53ecb75c18d9a
--- /dev/null
+++ b/test/framework/src/main/java/org/elasticsearch/index/mapper/KeywordFieldSyntheticSourceSupport.java
@@ -0,0 +1,126 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0 and the Server Side Public License, v 1; you may not use this file except
+ * in compliance with, at your election, the Elastic License 2.0 or the Server
+ * Side Public License, v 1.
+ */
+
+package org.elasticsearch.index.mapper;
+
+import org.apache.lucene.tests.util.LuceneTestCase;
+import org.elasticsearch.core.Tuple;
+import org.elasticsearch.test.ESTestCase;
+import org.elasticsearch.xcontent.XContentBuilder;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.stream.Collectors;
+
+import static org.hamcrest.Matchers.equalTo;
+
+public class KeywordFieldSyntheticSourceSupport implements MapperTestCase.SyntheticSourceSupport {
+    private final Integer ignoreAbove;
+    private final boolean allIgnored;
+    private final boolean store;
+    private final boolean docValues;
+    private final String nullValue;
+    private final boolean exampleSortsUsingIgnoreAbove;
+
+    KeywordFieldSyntheticSourceSupport(Integer ignoreAbove, boolean store, String nullValue, boolean exampleSortsUsingIgnoreAbove) {
+        this.ignoreAbove = ignoreAbove;
+        this.allIgnored = ignoreAbove != null && LuceneTestCase.rarely();
+        this.store = store;
+        this.nullValue = nullValue;
+        this.exampleSortsUsingIgnoreAbove = exampleSortsUsingIgnoreAbove;
+        this.docValues = store ? ESTestCase.randomBoolean() : true;
+    }
+
+    @Override
+    public MapperTestCase.SyntheticSourceExample example(int maxValues) {
+        return example(maxValues, false);
+    }
+
+    public MapperTestCase.SyntheticSourceExample example(int maxValues, boolean loadBlockFromSource) {
+        if (ESTestCase.randomBoolean()) {
+            Tuple<String, String> v = generateValue();
+            Object loadBlock = v.v2();
+            if (loadBlockFromSource == false && ignoreAbove != null && v.v2().length() > ignoreAbove) {
+                loadBlock = null;
+            }
+            return new MapperTestCase.SyntheticSourceExample(v.v1(), v.v2(), loadBlock, this::mapping);
+        }
+        List<Tuple<String, String>> values = ESTestCase.randomList(1, maxValues, this::generateValue);
+        List<String> in = values.stream().map(Tuple::v1).toList();
+        List<String> outPrimary = new ArrayList<>();
+        List<String> outExtraValues = new ArrayList<>();
+        values.stream().map(Tuple::v2).forEach(v -> {
+            if (exampleSortsUsingIgnoreAbove && ignoreAbove != null && v.length() > ignoreAbove) {
+                outExtraValues.add(v);
+            } else {
+                outPrimary.add(v);
+            }
+        });
+        List<String> outList = store ? outPrimary : new HashSet<>(outPrimary).stream().sorted().collect(Collectors.toList());
+        List<String> loadBlock;
+        if (loadBlockFromSource) {
+            // The block loader infrastructure will never return nulls. Just zap them all.
+            loadBlock = in.stream().filter(m -> m != null).toList();
+        } else if (docValues) {
+            loadBlock = new HashSet<>(outPrimary).stream().sorted().collect(Collectors.toList());
+        } else {
+            loadBlock = List.copyOf(outList);
+        }
+        Object loadBlockResult = loadBlock.size() == 1 ? loadBlock.get(0) : loadBlock;
+        outList.addAll(outExtraValues);
+        Object out = outList.size() == 1 ? outList.get(0) : outList;
+        return new MapperTestCase.SyntheticSourceExample(in, out, loadBlockResult, this::mapping);
+    }
+
+    private Tuple<String, String> generateValue() {
+        if (nullValue != null && ESTestCase.randomBoolean()) {
+            return Tuple.tuple(null, nullValue);
+        }
+        int length = 5;
+        if (ignoreAbove != null && (allIgnored || ESTestCase.randomBoolean())) {
+            length = ignoreAbove + 5;
+        }
+        String v = ESTestCase.randomAlphaOfLength(length);
+        return Tuple.tuple(v, v);
+    }
+
+    private void mapping(XContentBuilder b) throws IOException {
+        b.field("type", "keyword");
+        if (nullValue != null) {
+            b.field("null_value", nullValue);
+        }
+        if (ignoreAbove != null) {
+            b.field("ignore_above", ignoreAbove);
+        }
+        if (store) {
+            b.field("store", true);
+        }
+        if (docValues == false) {
+            b.field("doc_values", false);
+        }
+    }
+
+    @Override
+    public List<MapperTestCase.SyntheticSourceInvalidExample> invalidExample() throws IOException {
+        return List.of(
+            new MapperTestCase.SyntheticSourceInvalidExample(
+                equalTo(
+                    "field [field] of type [keyword] doesn't support synthetic source because "
+                        + "it doesn't have doc values and isn't stored"
+                ),
+                b -> b.field("type", "keyword").field("doc_values", false)
+            ),
+            new MapperTestCase.SyntheticSourceInvalidExample(
+                equalTo("field [field] of type [keyword] doesn't support synthetic source because it declares a normalizer"),
+                b -> b.field("type", "keyword").field("normalizer", "lowercase")
+            )
+        );
+    }
+}
diff --git a/test/framework/src/main/java/org/elasticsearch/index/mapper/MapperTestCase.java b/test/framework/src/main/java/org/elasticsearch/index/mapper/MapperTestCase.java
index fa0f0e1b95f54..097c23b96bb76 100644
--- a/test/framework/src/main/java/org/elasticsearch/index/mapper/MapperTestCase.java
+++ b/test/framework/src/main/java/org/elasticsearch/index/mapper/MapperTestCase.java
@@ -1286,7 +1286,7 @@ protected BlockReaderSupport getSupportedReaders(MapperService mapper, String lo
      * @param loaderFieldName the field name to use for loading the field
      */
     public record BlockReaderSupport(boolean columnAtATimeReader, boolean syntheticSource, MapperService mapper, String loaderFieldName) {
-        BlockReaderSupport(boolean columnAtATimeReader, MapperService mapper, String loaderFieldName) {
+        public BlockReaderSupport(boolean columnAtATimeReader, MapperService mapper, String loaderFieldName) {
             this(columnAtATimeReader, true, mapper, loaderFieldName);
         }
 
diff --git a/test/framework/src/main/java/org/elasticsearch/index/mapper/TextFieldFamilySyntheticSourceTestSetup.java b/test/framework/src/main/java/org/elasticsearch/index/mapper/TextFieldFamilySyntheticSourceTestSetup.java
new file mode 100644
index 0000000000000..df4377adc3e35
--- /dev/null
+++ b/test/framework/src/main/java/org/elasticsearch/index/mapper/TextFieldFamilySyntheticSourceTestSetup.java
@@ -0,0 +1,207 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0 and the Server Side Public License, v 1; you may not use this file except
+ * in compliance with, at your election, the Elastic License 2.0 or the Server
+ * Side Public License, v 1.
+ */
+
+package org.elasticsearch.index.mapper;
+
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.util.BytesRef;
+import org.hamcrest.Matcher;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Locale;
+import java.util.function.Function;
+
+import static org.elasticsearch.test.ESTestCase.between;
+import static org.elasticsearch.test.ESTestCase.randomAlphaOfLength;
+import static org.elasticsearch.test.ESTestCase.randomBoolean;
+import static org.hamcrest.Matchers.equalTo;
+
+/**
+ * Provides functionality needed to test synthetic source support in text and text-like fields (e.g. "text", "annotated_text").
+ */
+public final class TextFieldFamilySyntheticSourceTestSetup {
+    public static MapperTestCase.SyntheticSourceSupport syntheticSourceSupport(String fieldType, boolean supportsCustomIndexConfiguration) {
+        return new TextFieldFamilySyntheticSourceSupport(fieldType, supportsCustomIndexConfiguration);
+    }
+
+    public static MapperTestCase.BlockReaderSupport getSupportedReaders(MapperService mapper, String loaderFieldName) {
+        MappedFieldType ft = mapper.fieldType(loaderFieldName);
+        String parentName = mapper.mappingLookup().parentField(ft.name());
+        if (parentName == null) {
+            TextFieldMapper.TextFieldType text = (TextFieldMapper.TextFieldType) ft;
+            boolean supportsColumnAtATimeReader = text.syntheticSourceDelegate() != null
+                && text.syntheticSourceDelegate().hasDocValues()
+                && text.canUseSyntheticSourceDelegateForQuerying();
+            return new MapperTestCase.BlockReaderSupport(supportsColumnAtATimeReader, mapper, loaderFieldName);
+        }
+        MappedFieldType parent = mapper.fieldType(parentName);
+        if (false == parent.typeName().equals(KeywordFieldMapper.CONTENT_TYPE)) {
+            throw new UnsupportedOperationException();
+        }
+        KeywordFieldMapper.KeywordFieldType kwd = (KeywordFieldMapper.KeywordFieldType) parent;
+        return new MapperTestCase.BlockReaderSupport(kwd.hasDocValues(), mapper, loaderFieldName);
+    }
+
+    public static Function<Object, Object> loadBlockExpected(MapperTestCase.BlockReaderSupport blockReaderSupport, boolean columnReader) {
+        if (nullLoaderExpected(blockReaderSupport.mapper(), blockReaderSupport.loaderFieldName())) {
+            return null;
+        }
+        return v -> ((BytesRef) v).utf8ToString();
+    }
+
+    private static boolean nullLoaderExpected(MapperService mapper, String fieldName) {
+        MappedFieldType type = mapper.fieldType(fieldName);
+        if (type instanceof TextFieldMapper.TextFieldType t) {
+            if (t.isSyntheticSource() == false || t.canUseSyntheticSourceDelegateForQuerying() || t.isStored()) {
+                return false;
+            }
+            String parentField = mapper.mappingLookup().parentField(fieldName);
+            return parentField == null || nullLoaderExpected(mapper, parentField);
+        }
+        return false;
+    }
+
+    public static void validateRoundTripReader(String syntheticSource, DirectoryReader reader, DirectoryReader roundTripReader) {
+        // `reader` here is the reader of the original document, while `roundTripReader` reads the document
+        // created from synthetic source.
+        // This check fails when synthetic source is constructed using a keyword sub-field, because in that
+        // case the values are sorted (as they are read from doc values) while the original document's
+        // values are not.
+        //
+        // The check is therefore disabled.
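+        //
+        // Concretely (see the annotated_text yaml tests above): a multi-valued input of
+        // ["world", "hello", "world"] round-trips as ["hello", "world"] when the delegate
+        // is backed by doc values, so a reader-level comparison cannot pass.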
+    }
+
+    private static class TextFieldFamilySyntheticSourceSupport implements MapperTestCase.SyntheticSourceSupport {
+        private final String fieldType;
+        private final boolean storeTextField;
+        private final boolean storedKeywordField;
+        private final boolean indexText;
+        private final Integer ignoreAbove;
+        private final KeywordFieldSyntheticSourceSupport keywordSupport;
+
+        TextFieldFamilySyntheticSourceSupport(String fieldType, boolean supportsCustomIndexConfiguration) {
+            this.fieldType = fieldType;
+            this.storeTextField = randomBoolean();
+            this.storedKeywordField = storeTextField || randomBoolean();
+            this.indexText = supportsCustomIndexConfiguration ? randomBoolean() : true;
+            this.ignoreAbove = randomBoolean() ? null : between(10, 100);
+            this.keywordSupport = new KeywordFieldSyntheticSourceSupport(ignoreAbove, storedKeywordField, null, false == storeTextField);
+        }
+
+        @Override
+        public MapperTestCase.SyntheticSourceExample example(int maxValues) {
+            if (storeTextField) {
+                MapperTestCase.SyntheticSourceExample delegate = keywordSupport.example(maxValues, true);
+                return new MapperTestCase.SyntheticSourceExample(
+                    delegate.inputValue(),
+                    delegate.expectedForSyntheticSource(),
+                    delegate.expectedForBlockLoader(),
+                    b -> {
+                        b.field("type", fieldType);
+                        b.field("store", true);
+                        if (indexText == false) {
+                            b.field("index", false);
+                        }
+                    }
+                );
+            }
+            // We'll load from _source if ignore_above is defined, otherwise we load from the keyword field.
+            boolean loadingFromSource = ignoreAbove != null;
+            MapperTestCase.SyntheticSourceExample delegate = keywordSupport.example(maxValues, loadingFromSource);
+            return new MapperTestCase.SyntheticSourceExample(
+                delegate.inputValue(),
+                delegate.expectedForSyntheticSource(),
+                delegate.expectedForBlockLoader(),
+                b -> {
+                    b.field("type", fieldType);
+                    if (indexText == false) {
+                        b.field("index", false);
+                    }
+                    b.startObject("fields");
+                    {
+                        b.startObject(randomAlphaOfLength(4));
+                        delegate.mapping().accept(b);
+                        b.endObject();
+                    }
+                    b.endObject();
+                }
+            );
+        }
+
+        @Override
+        public List<MapperTestCase.SyntheticSourceInvalidExample> invalidExample() throws IOException {
+            Matcher<String> err = equalTo(
+                String.format(
+                    Locale.ROOT,
+                    "field [field] of type [%s] doesn't support synthetic source unless it is stored or"
+                        + " has a sub-field of type [keyword] with doc values or stored and without a normalizer",
+                    fieldType
+                )
+            );
+            return List.of(
+                new MapperTestCase.SyntheticSourceInvalidExample(err, b -> b.field("type", fieldType)),
+                new MapperTestCase.SyntheticSourceInvalidExample(err, b -> {
+                    b.field("type", fieldType);
+                    b.startObject("fields");
+                    {
+                        b.startObject("l");
+                        b.field("type", "long");
+                        b.endObject();
+                    }
+                    b.endObject();
+                }),
+                new MapperTestCase.SyntheticSourceInvalidExample(err, b -> {
+                    b.field("type", fieldType);
+                    b.startObject("fields");
+                    {
+                        b.startObject("kwd");
+                        b.field("type", "keyword");
+                        b.field("normalizer", "lowercase");
+                        b.endObject();
+                    }
+                    b.endObject();
+                }),
+                new MapperTestCase.SyntheticSourceInvalidExample(err, b -> {
+                    b.field("type", fieldType);
+                    b.startObject("fields");
+                    {
+                        b.startObject("kwd");
+                        b.field("type", "keyword");
+                        b.field("doc_values", "false");
+                        b.endObject();
+                    }
+                    b.endObject();
+                }),
+                new MapperTestCase.SyntheticSourceInvalidExample(err, b -> {
+                    b.field("type", fieldType);
+                    b.field("store", "false");
+                    b.startObject("fields");
+                    {
+                        b.startObject("kwd");
+                        b.field("type", "keyword");
+                        b.field("doc_values", "false");
+                        b.endObject();
+                    }
+                    b.endObject();
+                }),
+                new MapperTestCase.SyntheticSourceInvalidExample(err, b -> {
+                    b.field("type", fieldType);
+                    b.startObject("fields");
+                    {
+                        b.startObject("kwd");
+                        b.field("type", "keyword");
+                        b.field("doc_values", "false");
+                        b.field("store", "false");
+                        b.endObject();
+                    }
+                    b.endObject();
+                })
+            );
+        }
+    }
+}
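
Taken together, the visibility changes in this patch (public KeywordFieldMapper#syntheticFieldLoader(String), public TextFieldType#syntheticSourceDelegate(), and the new public TextFieldMapper.SyntheticSourceHelper) are what allow text-like mappers and test setup living outside org.elasticsearch.index.mapper to reuse the keyword-delegate machinery. A minimal sketch of that consumption pattern from a hypothetical text-like mapper; the surrounding class and the exception message are assumptions for illustration, not part of this patch:

    // Sketch: a text-like FieldMapper delegating synthetic source loading to an
    // eligible keyword multi-field. FieldMapper iterates its multi-fields, so
    // `this` is a valid Iterable<Mapper> argument here, mirroring how
    // TextFieldMapper itself calls the helper above.
    @Override
    public SourceLoader.SyntheticFieldLoader syntheticFieldLoader() {
        // An eligible delegate is a keyword sub-field without a normalizer that
        // has doc values or is stored (see SyntheticSourceHelper).
        var kwd = TextFieldMapper.SyntheticSourceHelper.getKeywordFieldMapperForSyntheticSource(this);
        if (kwd != null) {
            // syntheticFieldLoader(String) is public after this change, so the keyword
            // sub-field can reconstruct this field's value across package boundaries.
            return kwd.syntheticFieldLoader(simpleName());
        }
        // Assumed error handling; the real mappers throw with a type-specific message.
        throw new IllegalArgumentException("field [" + name() + "] doesn't support synthetic source");
    }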