Skip to content

Commit

Permalink
Make sequence threshold configurable
Browse files Browse the repository at this point in the history
  • Loading branch information
nineinchnick committed Dec 31, 2024
1 parent a72150d commit 6f1cb32
Show file tree
Hide file tree
Showing 7 changed files with 55 additions and 3 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ public class FakerConfig
private double nullProbability = 0.5;
private long defaultLimit = 1000L;
private Locale locale = Locale.ENGLISH;
private double sequenceMinDistinctValuesRatio = 0.98;
private long maxDictionarySize = 1000L;

@Max(1)
Expand Down Expand Up @@ -70,6 +71,24 @@ public FakerConfig setLocale(String value)
return this;
}

/**
 * Returns the configured minimum ratio of distinct values to total rows
 * above which a column is treated as a sequence.
 *
 * <p>The validation annotations restrict the value to the inclusive range 0 to 2.
 */
@Min(0)
@Max(2)
public double getSequenceMinDistinctValuesRatio()
{
    return this.sequenceMinDistinctValuesRatio;
}

/**
 * Sets the minimum distinct-values ratio, bound to the
 * {@code faker.sequence-min-distinct-values-ratio} configuration key.
 *
 * @param value the new ratio; per the description below, a value greater than 1 disables sequences
 * @return this config instance, so setter calls can be chained
 */
@Config("faker.sequence-min-distinct-values-ratio")
@ConfigDescription("""
        Minimum ratio of distinct values of a column to total number of rows in a table to treat the columns as a sequence
        when creating a table using existing data. Set to a value greater than 1 to disable using sequences""")
public FakerConfig setSequenceMinDistinctValuesRatio(double value)
{
    sequenceMinDistinctValuesRatio = value;
    return this;
}

@Min(0)
public long getMaxDictionarySize()
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,17 @@ public List<PropertyMetadata<?>> getSchemaProperties()
null,
defaultLimit -> checkProperty(1 <= defaultLimit, INVALID_SCHEMA_PROPERTY, "default_limit value must be equal or greater than 1"),
false),
doubleProperty(
SchemaInfo.SEQUENCE_MIN_DISTINCT_VALUES_RATIO,
"""
Minimum ratio of distinct values of a column to total number of rows in a table to treat the columns as a sequence
when creating a table in this schema using existing data. Set to a value greater than 1 to disable using sequences""",
null,
sequenceMinDistinctValuesRatio -> checkProperty(
0 <= sequenceMinDistinctValuesRatio && sequenceMinDistinctValuesRatio <= 2,
INVALID_SCHEMA_PROPERTY,
SchemaInfo.SEQUENCE_MIN_DISTINCT_VALUES_RATIO + " value must be between 0 and 2, inclusive"),
false),
longProperty(
SchemaInfo.MAX_DICTIONARY_SIZE,
"""
Expand All @@ -152,6 +163,17 @@ public List<PropertyMetadata<?>> getTableProperties()
null,
defaultLimit -> checkProperty(1 <= defaultLimit, INVALID_TABLE_PROPERTY, "default_limit value must be equal or greater than 1"),
false),
doubleProperty(
TableInfo.SEQUENCE_MIN_DISTINCT_VALUES_RATIO,
"""
Minimum ratio of distinct values of a column to total number of rows in a table to treat the columns as a sequence
when creating a table using existing data. Set to a value greater than 1 to disable using sequences""",
null,
sequenceMinDistinctValuesRatio -> checkProperty(
0 <= sequenceMinDistinctValuesRatio && sequenceMinDistinctValuesRatio <= 2,
INVALID_TABLE_PROPERTY,
TableInfo.SEQUENCE_MIN_DISTINCT_VALUES_RATIO + " value must be between 0 and 2, inclusive"),
false),
longProperty(
TableInfo.MAX_DICTIONARY_SIZE,
"""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,7 @@ public class FakerMetadata
private final List<SchemaInfo> schemas = new ArrayList<>();
private final double nullProbability;
private final long defaultLimit;
private final double sequenceMinDistinctValuesRatio;
private final long maxDictionarySize;
private final FakerFunctionProvider functionsProvider;

Expand All @@ -132,6 +133,7 @@ public FakerMetadata(FakerConfig config, FakerFunctionProvider functionProvider)
this.schemas.add(new SchemaInfo(SCHEMA_NAME, Map.of()));
this.nullProbability = config.getNullProbability();
this.defaultLimit = config.getDefaultLimit();
this.sequenceMinDistinctValuesRatio = config.getSequenceMinDistinctValuesRatio();
this.maxDictionarySize = config.getMaxDictionarySize();
this.functionsProvider = requireNonNull(functionProvider, "functionProvider is null");
this.random = new Random(1);
Expand Down Expand Up @@ -494,6 +496,10 @@ else if (metadata.getStatisticType().equals(ColumnStatisticType.NUMBER_OF_DISTIN
}

Map<String, List<Object>> columnValues = getColumnValues(tableName, info, distinctValues, minimums, maximums);
SchemaInfo schema = getSchema(tableName.getSchemaName());
double schemaMinSequenceRatio = (double) schema.properties().getOrDefault(SchemaInfo.SEQUENCE_MIN_DISTINCT_VALUES_RATIO, sequenceMinDistinctValuesRatio);
double tableMinSequenceRatio = (double) info.properties().getOrDefault(TableInfo.SEQUENCE_MIN_DISTINCT_VALUES_RATIO, schemaMinSequenceRatio);

return info.withColumns(columns.stream().map(column -> {
if (isCharacterType(column.type()) || !minimums.containsKey(column.name())) {
return column;
Expand All @@ -517,7 +523,7 @@ else if (metadata.getStatisticType().equals(ColumnStatisticType.NUMBER_OF_DISTIN
// Only include types that support generating sequences in FakerPageSource,
// but don't include types with configurable precision, dates, or intervals.
// The number of distinct values is an approximation, so compare it with a margin.
if (isIntegerType(column.type()) && (double) distinctValues.get(column.name()) / rowCount.get() >= 0.98) {
if (isIntegerType(column.type()) && (double) distinctValues.get(column.name()) / rowCount.get() >= tableMinSequenceRatio) {
handle = handle.withStep(ValueSet.of(column.type(), 1L));
properties.put(STEP_PROPERTY, "1");
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ public record SchemaInfo(String name, Map<String, Object> properties)
{
public static final String NULL_PROBABILITY_PROPERTY = "null_probability";
public static final String DEFAULT_LIMIT_PROPERTY = "default_limit";
public static final String SEQUENCE_MIN_DISTINCT_VALUES_RATIO = "sequence_min_distinct_values_ratio";
public static final String MAX_DICTIONARY_SIZE = "max_dictionary_size";

public SchemaInfo
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ public record TableInfo(List<ColumnInfo> columns, Map<String, Object> properties
{
public static final String NULL_PROBABILITY_PROPERTY = "null_probability";
public static final String DEFAULT_LIMIT_PROPERTY = "default_limit";
public static final String SEQUENCE_MIN_DISTINCT_VALUES_RATIO = "sequence_min_distinct_values_ratio";
public static final String MAX_DICTIONARY_SIZE = "max_dictionary_size";

public TableInfo
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ void testDefaults()
.setNullProbability(0.5)
.setDefaultLimit(1000L)
.setLocale("en")
.setSequenceMinDistinctValuesRatio(0.98)
.setMaxDictionarySize(1000L));
}

Expand All @@ -41,13 +42,15 @@ void testExplicitPropertyMappings()
.put("faker.null-probability", "1.0")
.put("faker.default-limit", "10")
.put("faker.locale", "pl-PL")
.put("faker.sequence-min-distinct-values-ratio", "2")
.put("faker.max-dictionary-size", "0")
.buildOrThrow();

FakerConfig expected = new FakerConfig()
.setNullProbability(1.0)
.setDefaultLimit(10L)
.setLocale("pl-PL")
.setSequenceMinDistinctValuesRatio(2.0)
.setMaxDictionarySize(0L);

assertFullMapping(properties, expected);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -495,14 +495,14 @@ void testCreateTableAsSelect()

for (TestDataType testCase : testCases) {
try (TestTable sourceTable = new TestTable(getQueryRunner()::execute, "ctas_src_" + testCase.name(), "(%s) WITH (null_probability = 0, default_limit = 1000)".formatted(testCase.columnSchema()));
TestTable table = new TestTable(getQueryRunner()::execute, "ctas_" + testCase.name(), "WITH (null_probability = 0, default_limit = 1000, max_dictionary_size = 0) AS SELECT %s FROM %s".formatted(testCase.name(), sourceTable.getName()))) {
TestTable table = new TestTable(getQueryRunner()::execute, "ctas_" + testCase.name(), "WITH (null_probability = 0, default_limit = 1000, max_dictionary_size = 0, sequence_min_distinct_values_ratio = 2) AS SELECT %s FROM %s".formatted(testCase.name(), sourceTable.getName()))) {
assertQuery("SELECT %s FROM %s".formatted(testCase.queryExpression(), table.getName()), "VALUES (%s)".formatted(testCase.expectedValue()));
}
}

for (TestDataType testCase : testCases) {
try (TestTable sourceTable = new TestTable(getQueryRunner()::execute, "ctas_src_" + testCase.name(), "(%s) WITH (null_probability = 0, default_limit = 2)".formatted(testCase.name() + " " + testCase.type()));
TestTable table = new TestTable(getQueryRunner()::execute, "ctas_" + testCase.name(), "WITH (null_probability = 0, default_limit = 1000, max_dictionary_size = 2) AS SELECT %s FROM %s".formatted(testCase.name(), sourceTable.getName()))) {
TestTable table = new TestTable(getQueryRunner()::execute, "ctas_" + testCase.name(), "WITH (null_probability = 0, default_limit = 1000, max_dictionary_size = 2, sequence_min_distinct_values_ratio = 2) AS SELECT %s FROM %s".formatted(testCase.name(), sourceTable.getName()))) {
assertQuery("SELECT %s FROM %s".formatted(testCase.queryExpression(), table.getName()), "VALUES (%s)".formatted(testCase.expectedValue()));
}
}
Expand Down

0 comments on commit 6f1cb32

Please sign in to comment.