Skip to content

Commit

Permalink
Merge pull request #158 from diffix/piotr/build-example-sqls
Browse files Browse the repository at this point in the history
SQL examples draft
  • Loading branch information
pdobacz authored Nov 30, 2022
2 parents 1e8f3ee + 1c57920 commit 1370379
Show file tree
Hide file tree
Showing 4 changed files with 265 additions and 5 deletions.
6 changes: 1 addition & 5 deletions src/main/metabase/api.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import { ClientRequestConstructorOptions, net } from 'electron';
import { isPostgresIdentifier } from '../../shared';
import { postgresQuote } from '../../shared';
import { InitialQueryPayloads } from '../../types';
import { metabaseConfig, postgresConfig } from '../config';
import { getAppLanguage } from '../language';
Expand Down Expand Up @@ -29,10 +29,6 @@ function findAnonymizedAccessDbId(databases: Database[]) {
}
}

/**
 * Returns `name` in a form that is safe to use as an identifier in a SQL query.
 *
 * Names that already are plain identifiers are returned unchanged; anything else
 * is wrapped in double quotes, with embedded double quotes doubled so that they
 * cannot terminate the quoted identifier early.
 */
function postgresQuote(name: string) {
  return isPostgresIdentifier(name) ? name : `"${name.replace(/"/g, '""')}"`;
}

const sqlHint = `
-- HINTS
-- Change, add, or remove columns as desired.
Expand Down
162 changes: 162 additions & 0 deletions src/main/metabase/examples.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
import { postgresQuote } from '../../shared';
import { Field, Table } from './types';

/** Visualization kind of an example query card (see `ExampleQuery.display`). */
type Display = 'table' | 'bar' | 'row' | 'scalar' | 'map'; // Other types TBD.

/** An example query card: a titled SQL query plus its size and display settings. */
type ExampleQuery = {
name: string; // Title of the card.
sql: string; // SQL query shown by the card.
sizeX: number; // Width of the card in grid units; the grid is 18 units wide.
sizeY: number; // Height of the card in grid units.
display: Display;
visualizationSettings: Record<string, unknown>; // To be typed later.

// There are also row/col properties, but the plan is to use a rectangle
// packing algorithm to arrange cards automatically within a section.
};

/** A section grouping several example query cards under an optional heading. */
type ExamplesSection = {
title: string | null; // Markdown text used as the section heading, or null.
queries: ExampleQuery[]; // Cards in the section.
};

/**
 * The title and SQL of a generated example, before size and display
 * properties are attached to turn it into an `ExampleQuery`.
 */
type ExampleInfo = {
sql: string;
name: string;
};

/** Glues the given fragments together into one string, one fragment per line. */
function lines(...lines: string[]): string {
  const lineBreak = '\n';
  const joined = lines.join(lineBreak);
  return joined;
}

// `database_type` values that `makeExampleInfos` treats as numeric columns.
const numberFieldTypes = ['int2', 'int4', 'int8', 'float4', 'float8', 'numeric'];

/**
 * Builds an example that selects `column` from `table`, grouped by the raw
 * (ungeneralized) column values.
 *
 * @param column Column name; quoted as needed.
 * @param table Table name; quoted as needed.
 * @param displayName Human-readable table name used in the example's title.
 */
function rawGroupBySQL(column: string, table: string, displayName: string): ExampleInfo {
  const quotedColumn = postgresQuote(column);
  const selectClause = `SELECT ${quotedColumn}`;
  const fromClause = `FROM ${postgresQuote(table)}`;
  const groupByClause = `GROUP BY ${quotedColumn}`;

  return {
    name: `${displayName} by ${column}`,
    sql: lines(selectClause, fromClause, groupByClause),
  };
}

/**
 * Builds an example that counts the distinct values of `column` in `table`.
 * The result column is aliased as `distinct_<column>`.
 */
function countDistinctSQL(column: string, table: string): ExampleInfo {
  const alias = postgresQuote(`distinct_${column}`);
  const selectClause = `SELECT count(distinct ${postgresQuote(column)}) as ${alias}`;
  const fromClause = `FROM ${postgresQuote(table)}`;

  return { name: `Distinct ${column}`, sql: lines(selectClause, fromClause) };
}

/**
 * Builds an example that computes the average of `column` over `table`.
 * The result column is aliased as `avg_<column>`.
 */
function avgSQL(column: string, table: string): ExampleInfo {
  const alias = postgresQuote(`avg_${column}`);
  const selectClause = `SELECT avg(${postgresQuote(column)}) as ${alias}`;
  const fromClause = `FROM ${postgresQuote(table)}`;

  return { name: `Average ${column}`, sql: lines(selectClause, fromClause) };
}

/**
 * Builds an example that generalizes a text column by keeping only a prefix of
 * each value and counting rows per prefix.
 *
 * The prefix is the first quarter (rounded up) of the average value length; the
 * remainder of the average length is shown as a run of `*` appended to the prefix.
 *
 * @param column Text column name; quoted as needed.
 * @param table Table name; quoted as needed.
 * @param displayName Human-readable table name used in the example's title.
 * @param averageLength Average length of the values in `column`.
 */
function textGeneralizedSQL(column: string, table: string, displayName: string, averageLength: number): ExampleInfo {
  const nChars = Math.ceil(averageLength / 4);
  // Clamp at zero so that tiny average lengths never produce a negative repeat count.
  const stars = '*'.repeat(Math.max(0, Math.ceil(averageLength - nChars)));
  const bucket = `substring(${postgresQuote(column)}, 1, ${nChars})`;

  return {
    name: `${displayName} by ${column}`,
    // The mask must be a SQL string literal; bare asterisks after `||` are a syntax error.
    sql: lines(`SELECT ${bucket} || '${stars}', count(*)`, `FROM ${postgresQuote(table)}`, `GROUP BY ${bucket}`),
  };
}

/**
 * Builds an example that buckets `column` by year and counts rows per year.
 * The year column is aliased as `<column>_year`.
 *
 * @param column Column name; quoted as needed.
 * @param table Table name; quoted as needed.
 * @param displayName Human-readable table name used in the example's title.
 */
function yearlyGeneralizedSQL(column: string, table: string, displayName: string): ExampleInfo {
  const yearExpression = `extract(year from ${postgresQuote(column)})`;
  const yearAlias = postgresQuote(`${column}_year`);
  const clauses = [
    `SELECT ${yearExpression} as ${yearAlias}, count(*)`,
    `FROM ${postgresQuote(table)}`,
    `GROUP BY ${yearExpression}`,
  ];

  return { name: `${displayName} by ${column} year`, sql: lines(...clauses) };
}

/**
 * Picks the example queries to generate for a single column of `table`.
 *
 * The choice depends on the column's role and type:
 * - primary keys / `serial` columns get no example;
 * - AID columns only get a distinct count;
 * - text and numeric columns with a fingerprint are grouped directly when they
 *   have fewer than 10 distinct values, otherwise generalized or aggregated;
 * - `timestamp` columns are bucketed by year;
 * - everything else falls back to a distinct count.
 *
 * Any error raised while building the example is logged as a warning and
 * results in an empty list.
 */
function makeExampleInfos(field: Field, table: Table, aidColumns: string[]): ExampleInfo[] {
  try {
    // No sensible example for columns being just row IDs.
    const isRowId = field.semantic_type === 'type/PK' || field.database_type === 'serial';
    if (isRowId) {
      return [];
    }

    // Never SELECT/GROUP BY AIDs directly, also no point in generalizing.
    if (aidColumns.includes(field.name)) {
      return [countDistinctSQL(field.name, table.name)];
    }

    if (field.database_type === 'text' && field.fingerprint) {
      const distinctCount = field.fingerprint.global['distinct-count'];
      if (distinctCount && distinctCount < 10) {
        // Few distinct values - can GROUP BY directly.
        return [rawGroupBySQL(field.name, table.name, table.display_name)];
      }

      const averageLength = field.fingerprint.type?.['type/Text']?.['average-length'];

      // The `< 20`: we want to generalize surnames and categories but not sentences, paragraphs or addresses.
      return averageLength && averageLength < 20
        ? [textGeneralizedSQL(field.name, table.name, table.display_name, averageLength)]
        : [countDistinctSQL(field.name, table.name)];
    }

    if (numberFieldTypes.includes(field.database_type) && field.fingerprint) {
      const distinctCount = field.fingerprint.global['distinct-count'];

      // Few distinct values - can GROUP BY directly.
      // TODO: Construct stable generalization. Temporarily revert to the average.
      return distinctCount && distinctCount < 10
        ? [rawGroupBySQL(field.name, table.name, table.display_name)]
        : [avgSQL(field.name, table.name)];
    }

    if (field.database_type === 'timestamp') {
      // TODO: using timestamps fingerprint is possible, but we need to pull in some datetime lib.
      return [yearlyGeneralizedSQL(field.name, table.name, table.display_name)];
    }

    // Fallback to the count distinct for anything else.
    return [countDistinctSQL(field.name, table.name)];
  } catch (err) {
    console.warn(`Unable to make example query for ${table.name}, ${field.name}`, err);
    return [];
  }
}

/**
 * Builds the example query sections for `table`.
 *
 * Two sections are returned: an "Overview" section with a single row-count
 * card, and a per-column section with one card for every example produced by
 * `makeExampleInfos`.
 *
 * @param table Table metadata, including its fields.
 * @param aidColumns Names of the table's AID columns; these are never selected directly.
 */
export function exampleQueries(table: Table, aidColumns: string[]): ExamplesSection[] {
  const exampleInfos = table.fields.flatMap((field) => makeExampleInfos(field, table, aidColumns));
  // const t = getT('example-queries'); // Let's worry about i18n later...

  // For now we show per-column results only as 'table'.
  const columnDisplay: Display = 'table';

  return [
    {
      title: 'Overview',
      queries: [
        {
          name: `Count of ${table.display_name}`,
          // Quote the table name like every other generated query does, so that
          // reserved words and mixed-case names do not break the count query.
          sql: lines('SELECT count(*)', `FROM ${postgresQuote(table.name)}`),
          sizeX: 6, // 6 is a good default (3 cards per row).
          sizeY: 4, // 4 is a good default.
          display: 'scalar',
          visualizationSettings: {}, // No visualizations for now. To be done after we finish SQL.
        },
      ],
    },
    {
      title: `Overview of ${table.display_name} columns`,
      queries: exampleInfos.map(({ name, sql }) => ({
        name,
        sql,
        sizeX: 6,
        sizeY: 4, // TODO: For a table we might need something taller.
        display: columnDisplay,
        visualizationSettings: {},
      })),
    },
  ];
}
98 changes: 98 additions & 0 deletions src/main/metabase/types.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
/*
* API response types extracted from sample responses and metabase source.
* Fields that we don't care about are commented out or marked as unknown.
*/

// Timestamp carried as a string; presumably ISO 8601 formatted, as the name says - TODO confirm.
type ISO8601Time = string;

/**
 * Table metadata as returned by the Metabase API.
 * Properties we do not use are left commented out.
 */
export type Table = {
description: string;
entity_type: string;
schema: string;
// db: unknown;
show_in_getting_started: boolean;
name: string; // Name used when building SQL queries.
fields: Field[]; // Columns of the table.
// caveats: unknown;
// segments: unknown[];
dimension_options: Record<number, unknown>;
updated_at: ISO8601Time;
active: boolean;
id: number;
db_id: number;
// visibility_type: unknown;
// field_order: unknown;
initial_sync_status: string;
display_name: string; // Human-readable name used in card and section titles.
// metrics: unknown[];
created_at: ISO8601Time;
// points_of_interest: unknown;
};

/** Fingerprint statistics for a text field (the `type/Text` entry of `FieldFingerprint.type`). */
export type TextFieldFingerprint = {
'percent-json': number;
'percent-url': number;
'percent-email': number;
'percent-state': number;
'average-length': number; // Used to decide how to generalize text columns.
};

/** Fingerprint statistics for a numeric field (the `type/Number` entry of `FieldFingerprint.type`). */
export type NumberFieldFingerprint = {
avg: number;
max: number;
min: number;
q1: number; // presumably the first quartile - TODO confirm
q3: number; // presumably the third quartile - TODO confirm
sd: number; // presumably the standard deviation - TODO confirm
};

/** Fingerprint statistics for a date/time field (the `type/DateTime` entry of `FieldFingerprint.type`). */
export type DateTimeFieldFingerprint = {
earliest: ISO8601Time;
latest: ISO8601Time;
};

/**
 * Statistics describing a field's values: global statistics that apply to any
 * field, plus optional statistics specific to the field's type.
 */
export interface FieldFingerprint {
global: {
'distinct-count'?: number; // May be absent; consumers must check before comparing.
'nil%': number;
};
// Type-specific statistics; every entry is optional.
type?: {
'type/Text'?: TextFieldFingerprint;
'type/Number'?: NumberFieldFingerprint;
'type/DateTime'?: DateTimeFieldFingerprint;
};
}

/**
 * Field (column) metadata as returned by the Metabase API.
 * Properties we do not use are left commented out.
 */
export type Field = {
description: string | null;
database_type: string; // See https://github.com/metabase/metabase/blob/master/src/metabase/driver/postgres.clj#L504-L566
semantic_type: string | null; // See https://github.com/metabase/metabase/blob/master/shared/src/metabase/types.cljc
// coercion_strategy: unknown;
name: string; // Column name used when building SQL queries.
fingerprint_version: number;
// has_field_values: string;
// settings: unknown;
// caveats: unknown;
// fk_target_field_id: unknown;
// dimensions: unknown[];
dimension_options: string[];
updated_at: ISO8601Time;
// custom_position: number;
effective_type: string;
active: boolean;
// nfc_path: unknown;
// parent_id: unknown;
id: number;
last_analyzed: ISO8601Time;
position: number;
visibility_type: 'details-only' | 'hidden' | 'normal' | 'retired';
// default_dimension_option: unknown;
// target: unknown;
preview_display: boolean;
display_name: string;
database_position: number;
// NOTE(review): makeExampleInfos in examples.ts checks this for truthiness before use,
// so the API presumably can return null here - confirm and widen the type if so.
fingerprint: FieldFingerprint;
created_at: ISO8601Time;
base_type: string;
// points_of_interest: unknown;
};
4 changes: 4 additions & 0 deletions src/shared/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -71,3 +71,7 @@ const tableNameRE = /^[a-z_][a-z0-9$_]*$/;
/**
 * Tells whether `name` can be used in a query as-is, without quoting:
 * it must not be a reserved keyword and must match the plain identifier pattern.
 */
export function isPostgresIdentifier(name: string): boolean {
  if (postgresReservedKeywords.includes(name)) {
    return false;
  }

  return tableNameRE.test(name);
}

/**
 * Returns `name` in a form that is safe to use as an identifier in a SQL query.
 *
 * Names that already are plain identifiers are returned unchanged; anything else
 * is wrapped in double quotes, with embedded double quotes doubled so that they
 * cannot terminate the quoted identifier early.
 */
export function postgresQuote(name: string): string {
  if (isPostgresIdentifier(name)) {
    return name;
  }

  return `"${name.replace(/"/g, '""')}"`;
}

0 comments on commit 1370379

Please sign in to comment.