Skip to content

Latest commit

 

History

History
126 lines (91 loc) · 2.79 KB

link-table.markdown

File metadata and controls

126 lines (91 loc) · 2.79 KB

Let's do PushEvent

CREATE TABLE pushes (
  id SERIAL PRIMARY KEY,
  event_id INTEGER NOT NULL
    REFERENCES events(event_id),
  payload JSONB NOT NULL);

INSERT INTO pushes (event_id, payload)
SELECT event_id, payload FROM events WHERE type = 'PushEvent';

Most of these are mechanical

SELECT * FROM key_count('pushes', 'payload');
SELECT explode_json_column('pushes', 'payload');
SELECT set_concrete_type('pushes', 'before', 'TEXT');
SELECT set_concrete_type('pushes', 'distinct_size', 'INTEGER');
SELECT set_concrete_type('pushes', 'head', 'TEXT');
SELECT set_concrete_type('pushes', 'push_id', 'BIGINT');
SELECT set_concrete_type('pushes', 'ref', 'TEXT');
SELECT set_concrete_type('pushes', 'size', 'INTEGER');

But what's that commits column?

SELECT * FROM type_count('pushes', 'commits');
SELECT commits->0 FROM pushes WHERE id = 1;
SELECT jsonb_array_elements(commits) FROM pushes WHERE id = 1;

Sounds like a link table

CREATE TABLE push_commits (
  id SERIAL PRIMARY KEY,
  push_id INTEGER NOT NULL
    REFERENCES PUSHES(id),
  commit JSONB NOT NULL
);

INSERT INTO push_commits (push_id, commit)
SELECT id, jsonb_array_elements(commits) FROM pushes;

Link target

CREATE TABLE commits (
  id SERIAL PRIMARY KEY,
  payload JSONB NOT NULL
);

INSERT INTO commits (payload)
SELECT DISTINCT commit FROM push_commits;

Large values are hard

ALTER TABLE push_commits
  ADD COLUMN commit_id INTEGER
    REFERENCES commits (id);

CREATE UNIQUE INDEX ON commits (payload);
CREATE UNIQUE INDEX ON commits (MD5(payload::TEXT));
CREATE INDEX ON push_commits (MD5(commit::TEXT));

Note: we don't need a unique index on push_commits because the same commit can be in multiple pushes.

UPDATE push_commits SET commit_id =
  (SELECT id FROM commits WHERE MD5(commit::TEXT) = MD5(payload::TEXT));

ALTER TABLE push_commits
  ALTER COLUMN commit_id SET NOT NULL,
  DROP COLUMN commit,
  DROP COLUMN id,
  ADD PRIMARY KEY (push_id, commit_id);

Finish with commits

SELECT * FROM type_count('commits', 'payload');
SELECT * FROM key_count('commits', 'payload');
SELECT explode_json_column('commits', 'payload');
SELECT set_concrete_type('commits', 'distinct', 'BOOLEAN');
SELECT set_concrete_type('commits', 'message', 'TEXT');
SELECT set_concrete_type('commits', 'sha', 'TEXT');
SELECT set_concrete_type('commits', 'url', 'TEXT');

Good break point

We could extract the authors table here.

SELECT * FROM type_count('commits', 'author');
SELECT * FROM key_count('commits', 'author');

But we won't, as it doesn't cover anything new.

Clean up

ALTER TABLE commits DROP COLUMN payload;
ALTER TABLE pushes DROP COLUMN payload,
  DROP COLUMN commits;
UPDATE events SET payload = NULL WHERE type = 'PushEvent';