From 6e0ffbaf5e1098690a55f2d8d5332fe666ab071a Mon Sep 17 00:00:00 2001 From: Ori Hoch Date: Sun, 21 Nov 2021 10:25:05 +0200 Subject: [PATCH 1/3] remove the rows limit on process_files to allow importing historical data --- anyway_etl/cbs/cli.py | 5 ++--- anyway_etl/cbs/config.py | 2 -- anyway_etl/cbs/process_files.py | 20 +++----------------- 3 files changed, 5 insertions(+), 22 deletions(-) diff --git a/anyway_etl/cbs/cli.py b/anyway_etl/cbs/cli.py index abfbe5e..02571cb 100644 --- a/anyway_etl/cbs/cli.py +++ b/anyway_etl/cbs/cli.py @@ -14,11 +14,10 @@ def import_emails(): @cbs.command() -@click.option('--limit-rows') -def process_files(**kwargs): +def process_files(): """Extract and process the cbs files""" from . import process_files - process_files.main(**kwargs) + process_files.main() @cbs.command() diff --git a/anyway_etl/cbs/config.py b/anyway_etl/cbs/config.py index 078aa4a..3b19689 100644 --- a/anyway_etl/cbs/config.py +++ b/anyway_etl/cbs/config.py @@ -15,5 +15,3 @@ IMAP_MAIL_USER = os.environ.get('IMAP_MAIL_USER') IMAP_MAIL_PASSWORD = os.environ.get('IMAP_MAIL_PASSWORD') IMAP_MAIL_HOST = os.environ.get('IMAP_MAIL_HOST', "imap.gmail.com") - -PREPROCESS_FILES_DEFAULT_LIMIT_ROWS = int(os.environ.get('PREPROCESS_FILES_DEFAULT_LIMIT_ROWS', '2')) diff --git a/anyway_etl/cbs/process_files.py b/anyway_etl/cbs/process_files.py index 33a11cb..2d7f49e 100644 --- a/anyway_etl/cbs/process_files.py +++ b/anyway_etl/cbs/process_files.py @@ -9,7 +9,6 @@ from .config import ( CBS_EMAILS_DATA_ROOT_PATH, CBS_FILES_ROOT_PATH, - PREPROCESS_FILES_DEFAULT_LIMIT_ROWS, CBS_YEARLY_DIRECTORIES_ROOT_PATH ) @@ -26,16 +25,6 @@ } -def limit_last_rows(limit_rows): - - def _iterator(rows): - for i, row in enumerate(rows): - if i < limit_rows: - yield row - - return _iterator - - def extract_zip_files(row): zip_filepath = os.path.join(CBS_FILES_ROOT_PATH, row['filename']) row['extracted_path'] = row['filename'].replace('.zip', '') @@ -92,14 +81,11 @@ def save_to_directory_structure(row): shutil.copy(file.path, target_file_path) - -def main(limit_rows=None): - limit_rows = int(limit_rows) if limit_rows else PREPROCESS_FILES_DEFAULT_LIMIT_ROWS - stats = defaultdict +def main(): + stats = defaultdict(int) _, df_stats = DF.Flow( DF.load(os.path.join(CBS_EMAILS_DATA_ROOT_PATH, 'datapackage.json')), DF.sort_rows('mtime', reverse=True), - limit_last_rows(limit_rows), DF.add_field('extracted_path', 'string'), extract_zip_files, update_cbs_files_names, @@ -111,4 +97,4 @@ def main(limit_rows=None): DF.printer() ).process() pprint(df_stats) - pprint(stats) + pprint(dict(stats)) From dff0e29b42e8d145d01b8d8aa832e1842bd46be1 Mon Sep 17 00:00:00 2001 From: Ori Hoch Date: Sun, 21 Nov 2021 16:28:32 +0200 Subject: [PATCH 2/3] README: add note regarding testing on dev before merging --- README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README.md b/README.md index a5a6fb6..d8884b4 100644 --- a/README.md +++ b/README.md @@ -17,3 +17,9 @@ For more advanced documentation see the [docs](docs) directory. * Every [release](https://github.com/hasadna/anyway-etl/releases) causes deployment to the Kubernetes cluster's `anyway` environment (the production environment) +## Testing on dev environment before merging + +To test changes on dev environment before merging them to main branch - +edit the `airflow-scheduler` deployment on `anyway-dev` namespace and set +`ANYWAY_ETL_BRANCH` env var to the name of the branch with changes you want +to test. Once testing is done, revert back to `main`. From 710eb532eee2b0e838f003c01f6a98cc3362a573 Mon Sep 17 00:00:00 2001 From: Ori Hoch Date: Mon, 27 Dec 2021 10:18:21 +0200 Subject: [PATCH 3/3] fix bugs in process_files for importing old data (+add cbs jupyter notebook) --- .gitignore | 1 + anyway_etl/cbs/config.py | 1 + anyway_etl/cbs/parse_common.py | 2 + anyway_etl/cbs/process_files.py | 51 +- notebooks/cbs.ipynb | 1549 +++++++++++++++++++++++++++++++ notebooks/requirements.txt | 2 + 6 files changed, 1585 insertions(+), 21 deletions(-) create mode 100644 notebooks/cbs.ipynb create mode 100644 notebooks/requirements.txt diff --git a/.gitignore b/.gitignore index b5acdd4..5aae6b1 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,4 @@ __pycache__ .airflow.env .etl.env .vscode +.ipynb_checkpoints diff --git a/anyway_etl/cbs/config.py b/anyway_etl/cbs/config.py index 3b19689..ac44c35 100644 --- a/anyway_etl/cbs/config.py +++ b/anyway_etl/cbs/config.py @@ -6,6 +6,7 @@ CBS_DATA_ROOT_PATH = os.path.join(ANYWAY_ETL_DATA_ROOT_PATH, 'cbs') CBS_EMAILS_DATA_ROOT_PATH = os.path.join(CBS_DATA_ROOT_PATH, 'emails') CBS_FILES_ROOT_PATH = os.path.join(CBS_DATA_ROOT_PATH, 'files') +CBS_PROCESSED_FILES_ROOT_PATH = os.path.join(CBS_DATA_ROOT_PATH, 'processed_files') CBS_YEARLY_DIRECTORIES_ROOT_PATH = os.path.join(CBS_DATA_ROOT_PATH, 'yearly') CBS_ACCIDENT_MARKERS_ROOT_PATH = os.path.join(CBS_DATA_ROOT_PATH, 'accident_markers') CBS_INVOLVED_ROOT_PATH = os.path.join(CBS_DATA_ROOT_PATH, 'involved') diff --git a/anyway_etl/cbs/parse_common.py b/anyway_etl/cbs/parse_common.py index 30e6b4c..f370a75 100644 --- a/anyway_etl/cbs/parse_common.py +++ b/anyway_etl/cbs/parse_common.py @@ -30,6 +30,7 @@ def common_get_data_iterator(load_start_year, stats, get_data_iterator): 'accidents_type_{}'.format(provider_code), str(year) ) + print("cbs_files_dir={}".format(cbs_files_dir)) files_from_cbs = get_files(cbs_files_dir) if len(files_from_cbs) == 0: stats['invalid_directories_without_cbs_files'] += 1 @@ -46,6 +47,7 @@ def common_get_data_iterator(load_start_year, stats, get_data_iterator): def common_main(load_start_year, output_path, get_data_iterator): load_start_year = int(load_start_year) if load_start_year else datetime.datetime.now().year - 1 + print("load_start_year={} output_path={}".format(load_start_year, output_path)) stats = defaultdict(int) _, df_stats = DF.Flow( common_get_data_iterator(load_start_year, stats, get_data_iterator), diff --git a/anyway_etl/cbs/process_files.py b/anyway_etl/cbs/process_files.py index 2d7f49e..71b3e3a 100644 --- a/anyway_etl/cbs/process_files.py +++ b/anyway_etl/cbs/process_files.py @@ -9,7 +9,8 @@ from .config import ( CBS_EMAILS_DATA_ROOT_PATH, CBS_FILES_ROOT_PATH, - CBS_YEARLY_DIRECTORIES_ROOT_PATH + CBS_YEARLY_DIRECTORIES_ROOT_PATH, + CBS_PROCESSED_FILES_ROOT_PATH ) @@ -29,7 +30,7 @@ def extract_zip_files(row): zip_filepath = os.path.join(CBS_FILES_ROOT_PATH, row['filename']) row['extracted_path'] = row['filename'].replace('.zip', '') extracted_path = os.path.join(CBS_FILES_ROOT_PATH, row['extracted_path']) - print("Extracting {} -> {}".format(zip_filepath, extracted_path)) + # print("Extracting {} -> {}".format(zip_filepath, extracted_path)) shutil.rmtree(extracted_path, ignore_errors=True) os.makedirs(extracted_path) with zipfile.ZipFile(zip_filepath, "r") as zf: @@ -38,7 +39,7 @@ def extract_zip_files(row): def update_cbs_files_names(row): extracted_path = os.path.join(CBS_FILES_ROOT_PATH, row['extracted_path']) - print('updating cbs file names {}'.format(extracted_path)) + # print('updating cbs file names {}'.format(extracted_path)) files = sorted([path for path in os.listdir(extracted_path)]) for file in files: file_path = os.path.join(extracted_path, file) @@ -63,37 +64,45 @@ def get_provider_code_and_year(row): row['year'] = int(year) -def save_to_directory_structure(row): - extracted_path = os.path.join(CBS_FILES_ROOT_PATH, row['extracted_path']) - provider_code = row['provider_code'] - year = row['year'] - base_file_path = os.path.join( - CBS_YEARLY_DIRECTORIES_ROOT_PATH, - 'accidents_type_{}'.format(provider_code), - str(year) - ) - shutil.rmtree(base_file_path, ignore_errors=True) - os.makedirs(base_file_path) - row['num_files'] = 0 - for file in os.scandir(extracted_path): - row['num_files'] += 1 - target_file_path = os.path.join(base_file_path, os.path.basename(file.path)) - shutil.copy(file.path, target_file_path) +def save_to_directory_structure(rows): + updated_year_provider_codes = set() + for row in rows: + row['num_saved_files'] = 0 + provider_code = row['provider_code'] + year = row['year'] + if '{}|{}'.format(year, provider_code) not in updated_year_provider_codes: + updated_year_provider_codes.add('{}|{}'.format(year, provider_code)) + extracted_path = os.path.join(CBS_FILES_ROOT_PATH, row['extracted_path']) + base_file_path = os.path.join( + CBS_YEARLY_DIRECTORIES_ROOT_PATH, + 'accidents_type_{}'.format(provider_code), + str(year) + ) + print('Saving msgId {} mtime {} to {}'.format(row['msgId'], row['mtime'], base_file_path)) + shutil.rmtree(base_file_path, ignore_errors=True) + os.makedirs(base_file_path) + file: os.DirEntry + for file in os.scandir(extracted_path): + row['num_saved_files'] += 1 + target_file_path = os.path.join(base_file_path, os.path.basename(file.path)) + shutil.copy(file.path, target_file_path) + yield row def main(): stats = defaultdict(int) _, df_stats = DF.Flow( DF.load(os.path.join(CBS_EMAILS_DATA_ROOT_PATH, 'datapackage.json')), - DF.sort_rows('mtime', reverse=True), DF.add_field('extracted_path', 'string'), extract_zip_files, update_cbs_files_names, DF.add_field('provider_code', 'integer'), DF.add_field('year', 'integer'), get_provider_code_and_year, - DF.add_field('num_files', 'integer'), + DF.sort_rows('{year}|{provider_code}', reverse=True), + DF.add_field('num_saved_files', 'integer'), save_to_directory_structure, + DF.dump_to_path(CBS_PROCESSED_FILES_ROOT_PATH), DF.printer() ).process() pprint(df_stats) diff --git a/notebooks/cbs.ipynb b/notebooks/cbs.ipynb new file mode 100644 index 0000000..9d2a864 --- /dev/null +++ b/notebooks/cbs.ipynb @@ -0,0 +1,1549 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "0b190d1d-df67-4eed-a149-b32aa3258707", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import dataflows as df\n", + "from pprint import pprint\n", + "from collections import defaultdict\n", + "\n", + "if not os.environ.get('IMAP_MAIL_USER') or not os.environ.get('IMAP_MAIL_PASSWORD'):\n", + " print('missing IMAP_MAIL_USER or IMAP_MAIL_PASSWORD env vars')" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "47699aa0-ab98-48bb-83a3-ab051339731e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Initializing imap session...\n", + "Search imap messages...\n", + "Processing 85 messages..\n", + "{'bytes': 6655,\n", + " 'count_of_rows': 93,\n", + " 'dataset_name': None,\n", + " 'hash': 'd6a1142459271c5e26c32792495c14c0'}\n", + "{'invalid_content_type': 416,\n", + " 'invalid_filename': 92,\n", + " 'msgIds': 85,\n", + " 'valid_files': 93}\n" + ] + } + ], + "source": [ + "!anyway-etl cbs import-emails" + ] + }, + { + "cell_type": "code", + "execution_count": 140, + "id": "42dbebc0-1abc-474c-9c28-199858ce2b1c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "res_1:\n", + "# msgId mtime filename filesize year\n", + " (any) (datetime) (string) (integer) (any)\n", + "--- ------- ------------------- ----------------------- ----------- -------\n", + "1 b'1' 2019-02-07 06:39:43 2019/02/07_06_39_43.zip 1140230 2019\n", + "2 b'2' 2019-02-26 13:53:51 2019/02/26_13_53_51.zip 2905508 2019\n", + "...\n", + "93 b'85' 2021-12-15 15:48:42 2021/12/15_15_48_42.zip 2238841 2021\n", + "{'year_2019': 25, 'year_2020': 34, 'year_2021': 34}\n" + ] + } + ], + "source": [ + "stats = defaultdict(int)\n", + "\n", + "def update_stats(row):\n", + " stats['year_{}'.format(row['year'])] += 1\n", + "\n", + "emails = df.Flow(\n", + " df.load('../.data/cbs/emails/datapackage.json'), \n", + " df.add_computed_field(target='year', operation=lambda row: row['mtime'].year),\n", + " update_stats,\n", + " df.printer(num_rows=1)\n", + ").process()\n", + " \n", + "pprint(dict(stats))" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "96e6329c-9a1d-4a13-9551-16654cedf825", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "res_1:\n", + "Saving msgId b'85' mtime 2021-12-15 15:48:42 to /home/ori/anyway-etl/.data/cbs/yearly/accidents_type_3/2021\n", + "Saving msgId b'84' mtime 2021-12-05 07:07:16 to /home/ori/anyway-etl/.data/cbs/yearly/accidents_type_1/2021\n", + "Saving msgId b'62' mtime 2021-02-09 11:26:23 to /home/ori/anyway-etl/.data/cbs/yearly/accidents_type_3/2020\n", + "Saving msgId b'68' mtime 2021-05-02 14:09:55 to /home/ori/anyway-etl/.data/cbs/yearly/accidents_type_1/2020\n", + "Saving msgId b'39' mtime 2020-05-03 09:23:31 to /home/ori/anyway-etl/.data/cbs/yearly/accidents_type_3/2019\n", + "Saving msgId b'70' mtime 2021-05-18 05:03:00 to /home/ori/anyway-etl/.data/cbs/yearly/accidents_type_1/2019\n", + "Saving msgId b'8' mtime 2019-04-07 14:23:10 to /home/ori/anyway-etl/.data/cbs/yearly/accidents_type_3/2018\n", + "Saving msgId b'45' mtime 2020-06-15 14:23:31 to /home/ori/anyway-etl/.data/cbs/yearly/accidents_type_1/2018\n", + "Saving msgId b'44' mtime 2020-06-15 14:19:48 to /home/ori/anyway-etl/.data/cbs/yearly/accidents_type_1/2015\n", + "# msgId mtime filename filesize extracted_path provider_code year num_saved_files\n", + " (any) (datetime) (string) (integer) (string) (integer) (integer) (integer)\n", + "--- ------- ------------------- ----------------------- ----------- ------------------- --------------- ----------- -----------------\n", + "1 b'85' 2021-12-15 15:48:42 2021/12/15_15_48_42.zip 2238841 2021/12/15_15_48_42 3 2021 8\n", + "2 b'83' 2021-11-08 14:49:03 2021/11/08_14_49_03.zip 1933627 2021/11/08_14_49_03 3 2021 0\n", + "3 b'81' 2021-10-13 10:01:32 2021/10/13_10_01_32.zip 1711707 2021/10/13_10_01_32 3 2021 0\n", + "4 b'79' 2021-09-09 13:45:22 2021/09/09_13_45_22.zip 1448538 2021/09/09_13_45_22 3 2021 0\n", + "5 b'77' 2021-08-15 05:53:25 2021/08/15_05_53_25.zip 1173252 2021/08/15_05_53_25 3 2021 0\n", + "6 b'75' 2021-07-13 10:58:17 2021/07/13_10_58_17.zip 886222 2021/07/13_10_58_17 3 2021 0\n", + "7 b'72' 2021-06-16 04:21:10 2021/06/16_04_21_10.zip 665709 2021/06/16_04_21_10 3 2021 0\n", + "8 b'69' 2021-05-12 12:48:19 2021/05/12_12_48_19.zip 448953 2021/05/12_12_48_19 3 2021 0\n", + "9 b'66' 2021-04-13 04:58:05 2021/04/13_04_58_05.zip 262514 2021/04/13_04_58_05 3 2021 0\n", + "10 b'64' 2021-03-04 09:17:33 2021/03/04_09_17_33.zip 111151 2021/03/04_09_17_33 3 2021 0\n", + "11 b'84' 2021-12-05 07:07:16 2021/12/05_07_07_16.zip 907564 2021/12/05_07_07_16 1 2021 8\n", + "12 b'82' 2021-11-02 12:07:03 2021/11/02_12_07_03.zip 812871 2021/11/02_12_07_03 1 2021 0\n", + "13 b'80' 2021-10-10 07:38:55 2021/10/10_07_38_55.zip 738532 2021/10/10_07_38_55 1 2021 0\n", + "14 b'78' 2021-08-29 13:24:09 2021/08/29_13_24_09.zip 646225 2021/08/29_13_24_09 1 2021 0\n", + "15 b'76' 2021-08-04 06:16:04 2021/08/04_06_16_04.zip 551295 2021/08/04_06_16_04 1 2021 0\n", + "16 b'74' 2021-07-07 13:39:34 2021/07/07_13_39_34.zip 440351 2021/07/07_13_39_34 1 2021 0\n", + "17 b'71' 2021-06-06 12:43:24 2021/06/06_12_43_24.zip 354413 2021/06/06_12_43_24 1 2021 0\n", + "18 b'67' 2021-05-02 08:36:07 2021/05/02_08_36_07.zip 260778 2021/05/02_08_36_07 1 2021 0\n", + "19 b'65' 2021-03-22 11:31:44 2021/03/22_11_31_44.zip 164600 2021/03/22_11_31_44 1 2021 0\n", + "20 b'63' 2021-03-01 07:44:08 2021/03/01_07_44_08.zip 84296 2021/03/01_07_44_08 1 2021 0\n", + "...\n", + "84 b'45' 2020-06-15 14:23:31 2020/06/15_14_23_31.zip 1207077 2020/06/15_14_23_31 1 2018 0\n", + "85 b'45' 2020-06-15 14:23:31 2020/06/15_14_23_31.zip 3330353 2020/06/15_14_23_31 1 2018 0\n", + "86 b'7' 2019-03-31 12:19:28 2019/03/31_12_19_28.zip 1173976 2019/03/31_12_19_28 1 2018 0\n", + "87 b'1' 2019-02-07 06:39:43 2019/02/07_06_39_43.zip 1140230 2019/02/07_06_39_43 1 2018 0\n", + "88 b'44' 2020-06-15 14:19:48 2020/06/15_14_19_48.zip 1226768 2020/06/15_14_19_48 1 2015 8\n", + "89 b'44' 2020-06-15 14:19:48 2020/06/15_14_19_48.zip 3381217 2020/06/15_14_19_48 1 2015 0\n", + "90 b'44' 2020-06-15 14:19:48 2020/06/15_14_19_48.zip 1160519 2020/06/15_14_19_48 1 2015 0\n", + "91 b'44' 2020-06-15 14:19:48 2020/06/15_14_19_48.zip 3079144 2020/06/15_14_19_48 1 2015 0\n", + "92 b'44' 2020-06-15 14:19:48 2020/06/15_14_19_48.zip 1225275 2020/06/15_14_19_48 1 2015 0\n", + "93 b'44' 2020-06-15 14:19:48 2020/06/15_14_19_48.zip 3026879 2020/06/15_14_19_48 1 2015 0\n", + "{'bytes': 9923,\n", + " 'count_of_rows': 93,\n", + " 'dataset_name': None,\n", + " 'hash': '476f6e19be4ab74a4faed83b77f7d25d'}\n", + "{}\n" + ] + } + ], + "source": [ + "!anyway-etl cbs process-files" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "e655fc34-85ef-4063-9e10-9d36c5747293", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "res_1:\n", + " # msgId mtime filename filesize extracted_path provider_code year num_saved_files\n", + " (any) (datetime) (string) (integer) (string) (integer) (integer) (integer)\n", + "--- ------- ------------------- ----------------------- ----------- ------------------- --------------- ----------- -----------------\n", + " 1 b'85' 2021-12-15 15:48:42 2021/12/15_15_48_42.zip 2238841 2021/12/15_15_48_42 3 2021 8\n", + " 2 b'84' 2021-12-05 07:07:16 2021/12/05_07_07_16.zip 907564 2021/12/05_07_07_16 1 2021 8\n", + " 3 b'62' 2021-02-09 11:26:23 2021/02/09_11_26_23.zip 2219095 2021/02/09_11_26_23 3 2020 8\n", + " 4 b'68' 2021-05-02 14:09:55 2021/05/02_14_09_55.zip 999922 2021/05/02_14_09_55 1 2020 8\n", + " 5 b'39' 2020-05-03 09:23:31 2020/05/03_09_23_31.zip 3138235 2020/05/03_09_23_31 3 2019 8\n", + " 6 b'70' 2021-05-18 05:03:00 2021/05/18_05_03_00.zip 1183108 2021/05/18_05_03_00 1 2019 8\n", + " 7 b'8' 2019-04-07 14:23:10 2019/04/07_14_23_10.zip 2964716 2019/04/07_14_23_10 3 2018 8\n", + " 8 b'45' 2020-06-15 14:23:31 2020/06/15_14_23_31.zip 1173648 2020/06/15_14_23_31 1 2018 8\n", + " 9 b'44' 2020-06-15 14:19:48 2020/06/15_14_19_48.zip 1226768 2020/06/15_14_19_48 1 2015 8\n" + ] + }, + { + "data": { + "text/plain": [ + "(, {})" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.Flow(\n", + " df.load('../.data/cbs/processed_files/datapackage.json'),\n", + " df.filter_rows(lambda row: row['num_saved_files'] > 0),\n", + " df.printer(num_rows=9999),\n", + ").process()" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "79d5fe15-55d2-4bab-9848-375f0d135b6d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Parsing accidents...\n", + "load_start_year=2019 output_path=/home/ori/anyway-etl/.data/cbs/accident_markers\n", + "cbs_files_dir=/home/ori/anyway-etl/.data/cbs/yearly/accidents_type_1/2019\n", + "cbs_files_dir=/home/ori/anyway-etl/.data/cbs/yearly/accidents_type_1/2020\n", + "cbs_files_dir=/home/ori/anyway-etl/.data/cbs/yearly/accidents_type_1/2021\n", + "cbs_files_dir=/home/ori/anyway-etl/.data/cbs/yearly/accidents_type_3/2019\n", + "cbs_files_dir=/home/ori/anyway-etl/.data/cbs/yearly/accidents_type_3/2020\n", + "cbs_files_dir=/home/ori/anyway-etl/.data/cbs/yearly/accidents_type_3/2021\n", + "{'bytes': 79633901,\n", + " 'count_of_rows': 141629,\n", + " 'dataset_name': None,\n", + " 'hash': '1715c6ea809e8413c6dada3d7003998f'}\n", + "{'accidents': 141629, 'valid_directories': 6}\n", + "Parsing vehicles...\n", + "load_start_year=2019 output_path=/home/ori/anyway-etl/.data/cbs/vehicles\n", + "cbs_files_dir=/home/ori/anyway-etl/.data/cbs/yearly/accidents_type_1/2019\n", + "cbs_files_dir=/home/ori/anyway-etl/.data/cbs/yearly/accidents_type_1/2020\n", + "cbs_files_dir=/home/ori/anyway-etl/.data/cbs/yearly/accidents_type_1/2021\n", + "cbs_files_dir=/home/ori/anyway-etl/.data/cbs/yearly/accidents_type_3/2019\n", + "cbs_files_dir=/home/ori/anyway-etl/.data/cbs/yearly/accidents_type_3/2020\n", + "cbs_files_dir=/home/ori/anyway-etl/.data/cbs/yearly/accidents_type_3/2021\n", + "{'bytes': 16760535,\n", + " 'count_of_rows': 255845,\n", + " 'dataset_name': None,\n", + " 'hash': '8f609e45f254f50296a57f2da4f79c32'}\n", + "{'valid_directories': 6, 'valid_vehicles': 255845}\n", + "Parsing involved...\n", + "load_start_year=2019 output_path=/home/ori/anyway-etl/.data/cbs/involved\n", + "cbs_files_dir=/home/ori/anyway-etl/.data/cbs/yearly/accidents_type_1/2019\n", + "cbs_files_dir=/home/ori/anyway-etl/.data/cbs/yearly/accidents_type_1/2020\n", + "cbs_files_dir=/home/ori/anyway-etl/.data/cbs/yearly/accidents_type_1/2021\n", + "cbs_files_dir=/home/ori/anyway-etl/.data/cbs/yearly/accidents_type_3/2019\n", + "cbs_files_dir=/home/ori/anyway-etl/.data/cbs/yearly/accidents_type_3/2020\n", + "cbs_files_dir=/home/ori/anyway-etl/.data/cbs/yearly/accidents_type_3/2021\n", + "{'bytes': 33193161,\n", + " 'count_of_rows': 329199,\n", + " 'dataset_name': None,\n", + " 'hash': 'd34da79029328f803bcc4a578e5ead20'}\n", + "{'valid_directories': 6, 'valid_lines': 329199}\n" + ] + } + ], + "source": [ + "!anyway-etl cbs parse-all --load-start-year 2019" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "132dff59-ef5b-499e-b0ad-75458743a4f5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "### accident_markers ###\n", + "num_rows_per_provider_year:\n", + "{'1_2019': 12670,\n", + " '1_2020': 10836,\n", + " '1_2021': 9541,\n", + " '3_2019': 44828,\n", + " '3_2020': 31867,\n", + " '3_2021': 31887}\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
__provider_code__yearidprovider_and_idprovider_codefile_type_policetitledescriptionaddresslatitude...accident_monthaccident_dayaccident_hour_rawaccident_houraccident_minutexyvehicle_type_rsaviolation_type_rsageom
0120192019000014120190000141NoneAccident{\"REHOV1\": \"\\u05d0\\u05dc \\u05d7\\u05e0\\u05e1\\u0...אל חנסה, ירושלים31.814130078339943...413501215221906.0635701.0NoneNoneNone
1120192019000025120190000251NoneAccident{\"SHEM_ZOMET\": \"\\u05e6\\u05d5\\u05de\\u05ea \\u05d...None32.271115176105305...62857140200474.0686393.0NoneNoneNone
2120202019047557120190475571NoneAccident{\"REHOV1\": \"\\u05d3\\u05e8\\u05da \\u05d1\\u05e8 \\u...דרך בר אילן, קריית אונו32.06505086242481...12961150185530.0663581.0NoneNoneNone
3120202019061987120190619871NoneAccident{\"REHOV1\": \"\\u05d9\\u05e4\\u05d4 \\u05e0\\u05d5\\u0...יפה נוף, חדרה32.431339930929425...12330715194561.0704174.0NoneNoneNone
4120212020031644120200316441NoneAccident{\"REHOV1\": \"\\u05de\\u05e2\\u05dc\\u05d4 \\u05d9\\u0...מעלה יצחק, נצרת עילית32.70184800537962...13173180230441.0734148.0NoneNoneNone
5120212020079871120200798711NoneAccident{\"SUG_DEREH\": 4.0, \"YEHIDA\": 12.0, \"SUG_YOM\": ...None33.04566728624023...11457140245802.0772308.0NoneNoneNone
63201920180247803201802478033Accident{\"SUG_DEREH\": 4.0, \"YEHIDA\": 38.0, \"SUG_YOM\": ...None31.628287309655853...117801945160775.0615260.0NoneNoneNone
73201920180462393201804623933Accident{\"SUG_DEREH\": 4.0, \"YEHIDA\": 12.0, \"SUG_YOM\": ...None32.96017023157084...12845110209083.0762796.0NoneNoneNone
8320202019004237320190042373NoneAccident{\"SUG_DEREH\": 2.0, \"YEHIDA\": 51.0, \"SUG_YOM\": ...None32.33332661282791...14761845201495.0693290.0NoneNoneNone
9320202019005143320190051433NoneAccident{\"SHEM_ZOMET\": \"\\u05e6\\u05d5\\u05de\\u05ea \\u05d...None31.989513076695104...1573180191317.0655187.0NoneNoneNone
10320212020007043320200070433NoneAccident{\"SUG_DEREH\": 2.0, \"YEHIDA\": 51.0, \"SUG_YOM\": ...None32.31627091121968...129441045187258.0691434.0NoneNoneNone
11320212020044999320200449993NoneAccident{\"REHOV1\": \"\\u05d0\\u05d1\\u05df \\u05d2\\u05d1\\u0...אבן גבירול, תל אביב -יפו32.0853980829789...11885210179553.0665859.0NoneNoneNone
\n", + "

12 rows × 74 columns

\n", + "
" + ], + "text/plain": [ + " __provider_code __year id provider_and_id provider_code \\\n", + "0 1 2019 2019000014 12019000014 1 \n", + "1 1 2019 2019000025 12019000025 1 \n", + "2 1 2020 2019047557 12019047557 1 \n", + "3 1 2020 2019061987 12019061987 1 \n", + "4 1 2021 2020031644 12020031644 1 \n", + "5 1 2021 2020079871 12020079871 1 \n", + "6 3 2019 2018024780 32018024780 3 \n", + "7 3 2019 2018046239 32018046239 3 \n", + "8 3 2020 2019004237 32019004237 3 \n", + "9 3 2020 2019005143 32019005143 3 \n", + "10 3 2021 2020007043 32020007043 3 \n", + "11 3 2021 2020044999 32020044999 3 \n", + "\n", + " file_type_police title \\\n", + "0 None Accident \n", + "1 None Accident \n", + "2 None Accident \n", + "3 None Accident \n", + "4 None Accident \n", + "5 None Accident \n", + "6 3 Accident \n", + "7 3 Accident \n", + "8 None Accident \n", + "9 None Accident \n", + "10 None Accident \n", + "11 None Accident \n", + "\n", + " description \\\n", + "0 {\"REHOV1\": \"\\u05d0\\u05dc \\u05d7\\u05e0\\u05e1\\u0... \n", + "1 {\"SHEM_ZOMET\": \"\\u05e6\\u05d5\\u05de\\u05ea \\u05d... \n", + "2 {\"REHOV1\": \"\\u05d3\\u05e8\\u05da \\u05d1\\u05e8 \\u... \n", + "3 {\"REHOV1\": \"\\u05d9\\u05e4\\u05d4 \\u05e0\\u05d5\\u0... \n", + "4 {\"REHOV1\": \"\\u05de\\u05e2\\u05dc\\u05d4 \\u05d9\\u0... \n", + "5 {\"SUG_DEREH\": 4.0, \"YEHIDA\": 12.0, \"SUG_YOM\": ... \n", + "6 {\"SUG_DEREH\": 4.0, \"YEHIDA\": 38.0, \"SUG_YOM\": ... \n", + "7 {\"SUG_DEREH\": 4.0, \"YEHIDA\": 12.0, \"SUG_YOM\": ... \n", + "8 {\"SUG_DEREH\": 2.0, \"YEHIDA\": 51.0, \"SUG_YOM\": ... \n", + "9 {\"SHEM_ZOMET\": \"\\u05e6\\u05d5\\u05de\\u05ea \\u05d... \n", + "10 {\"SUG_DEREH\": 2.0, \"YEHIDA\": 51.0, \"SUG_YOM\": ... \n", + "11 {\"REHOV1\": \"\\u05d0\\u05d1\\u05df \\u05d2\\u05d1\\u0... \n", + "\n", + " address latitude ... accident_month \\\n", + "0 אל חנסה, ירושלים 31.814130078339943 ... 4 \n", + "1 None 32.271115176105305 ... 6 \n", + "2 דרך בר אילן, קריית אונו 32.06505086242481 ... 1 \n", + "3 יפה נוף, חדרה 32.431339930929425 ... 1 \n", + "4 מעלה יצחק, נצרת עילית 32.70184800537962 ... 1 \n", + "5 None 33.04566728624023 ... 1 \n", + "6 None 31.628287309655853 ... 1 \n", + "7 None 32.96017023157084 ... 1 \n", + "8 None 32.33332661282791 ... 1 \n", + "9 None 31.989513076695104 ... 1 \n", + "10 None 32.31627091121968 ... 1 \n", + "11 אבן גבירול, תל אביב -יפו 32.0853980829789 ... 1 \n", + "\n", + " accident_day accident_hour_raw accident_hour accident_minute x \\\n", + "0 13 50 12 15 221906.0 \n", + "1 28 57 14 0 200474.0 \n", + "2 29 61 15 0 185530.0 \n", + "3 23 30 7 15 194561.0 \n", + "4 31 73 18 0 230441.0 \n", + "5 14 57 14 0 245802.0 \n", + "6 17 80 19 45 160775.0 \n", + "7 28 45 11 0 209083.0 \n", + "8 4 76 18 45 201495.0 \n", + "9 5 73 18 0 191317.0 \n", + "10 29 44 10 45 187258.0 \n", + "11 18 85 21 0 179553.0 \n", + "\n", + " y vehicle_type_rsa violation_type_rsa geom \n", + "0 635701.0 None None None \n", + "1 686393.0 None None None \n", + "2 663581.0 None None None \n", + "3 704174.0 None None None \n", + "4 734148.0 None None None \n", + "5 772308.0 None None None \n", + "6 615260.0 None None None \n", + "7 762796.0 None None None \n", + "8 693290.0 None None None \n", + "9 655187.0 None None None \n", + "10 691434.0 None None None \n", + "11 665859.0 None None None \n", + "\n", + "[12 rows x 74 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "### involved ###\n", + "num_rows_per_provider_year:\n", + "{'1_2019': 33333,\n", + " '1_2020': 26975,\n", + " '1_2021': 24407,\n", + " '3_2019': 101544,\n", + " '3_2020': 71086,\n", + " '3_2021': 71854}\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
__provider_code__yearaccident_idprovider_and_idprovider_codefile_type_policeinvolved_typelicense_acquiring_dateage_groupsex...home_yishuv_shapehospital_timemedical_typerelease_destsafety_measures_uselate_deceasedcar_idinvolve_idaccident_yearaccident_month
0120192019000014120190000141None30162...12.0NoneNoneNoneNaN1.00120194
1120192019000014120190000141None1201651...12.0NoneNoneNoneNaNNaN1220194
2120202020028982120200289821None2200282...15.0NoneNoneNone1.0NaN1220201
3120202020028982120200289821None21991112...31.0NoneNoneNoneNaNNaN2120201
4120212020031644120200316441None3042...16.0NoneNoneNoneNaNNaN0220211
5120212020031644120200316441None1200581...25.0NoneNoneNoneNaNNaN1120211
632019201802478032018024780332201261...13.0NoneNoneNoneNaNNaN1120191
7320192018024780320180247803310990...NaNNoneNoneNoneNaNNaN2220191
8320202019004237320190042373None10990...NaNNoneNoneNoneNaNNaN1220201
9320202019004237320190042373None2201761...26.0NoneNoneNoneNaNNaN2120201
10320212020007043320200070433None2201361...26.0NoneNoneNoneNaNNaN1120211
11320212020044999320200449993None1201171...15.0NoneNoneNoneNaNNaN1220211
\n", + "

12 rows × 32 columns

\n", + "
" + ], + "text/plain": [ + " __provider_code __year accident_id provider_and_id provider_code \\\n", + "0 1 2019 2019000014 12019000014 1 \n", + "1 1 2019 2019000014 12019000014 1 \n", + "2 1 2020 2020028982 12020028982 1 \n", + "3 1 2020 2020028982 12020028982 1 \n", + "4 1 2021 2020031644 12020031644 1 \n", + "5 1 2021 2020031644 12020031644 1 \n", + "6 3 2019 2018024780 32018024780 3 \n", + "7 3 2019 2018024780 32018024780 3 \n", + "8 3 2020 2019004237 32019004237 3 \n", + "9 3 2020 2019004237 32019004237 3 \n", + "10 3 2021 2020007043 32020007043 3 \n", + "11 3 2021 2020044999 32020044999 3 \n", + "\n", + " file_type_police involved_type license_acquiring_date age_group sex \\\n", + "0 None 3 0 16 2 \n", + "1 None 1 2016 5 1 \n", + "2 None 2 2002 8 2 \n", + "3 None 2 1991 11 2 \n", + "4 None 3 0 4 2 \n", + "5 None 1 2005 8 1 \n", + "6 3 2 2012 6 1 \n", + "7 3 1 0 99 0 \n", + "8 None 1 0 99 0 \n", + "9 None 2 2017 6 1 \n", + "10 None 2 2013 6 1 \n", + "11 None 1 2011 7 1 \n", + "\n", + " ... home_yishuv_shape hospital_time medical_type release_dest \\\n", + "0 ... 12.0 None None None \n", + "1 ... 12.0 None None None \n", + "2 ... 15.0 None None None \n", + "3 ... 31.0 None None None \n", + "4 ... 16.0 None None None \n", + "5 ... 25.0 None None None \n", + "6 ... 13.0 None None None \n", + "7 ... NaN None None None \n", + "8 ... NaN None None None \n", + "9 ... 26.0 None None None \n", + "10 ... 26.0 None None None \n", + "11 ... 15.0 None None None \n", + "\n", + " safety_measures_use late_deceased car_id involve_id accident_year \\\n", + "0 NaN 1.0 0 1 2019 \n", + "1 NaN NaN 1 2 2019 \n", + "2 1.0 NaN 1 2 2020 \n", + "3 NaN NaN 2 1 2020 \n", + "4 NaN NaN 0 2 2021 \n", + "5 NaN NaN 1 1 2021 \n", + "6 NaN NaN 1 1 2019 \n", + "7 NaN NaN 2 2 2019 \n", + "8 NaN NaN 1 2 2020 \n", + "9 NaN NaN 2 1 2020 \n", + "10 NaN NaN 1 1 2021 \n", + "11 NaN NaN 1 2 2021 \n", + "\n", + " accident_month \n", + "0 4 \n", + "1 4 \n", + "2 1 \n", + "3 1 \n", + "4 1 \n", + "5 1 \n", + "6 1 \n", + "7 1 \n", + "8 1 \n", + "9 1 \n", + "10 1 \n", + "11 1 \n", + "\n", + "[12 rows x 32 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "### vehicles ###\n", + "num_rows_per_provider_year:\n", + "{'1_2019': 22773,\n", + " '1_2020': 19498,\n", + " '1_2021': 17413,\n", + " '3_2019': 80747,\n", + " '3_2020': 57481,\n", + " '3_2021': 57933}\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
__provider_code__yearaccident_idprovider_and_idprovider_codefile_type_policeengine_volumemanufacturing_yeardriving_directionsvehicle_statusvehicle_attributionvehicle_typeseatstotal_weightcar_idaccident_yearaccident_monthvehicle_damage
0120192019070360120190703601None8200211220111NaN1201912
1120192019074088120190740881None120182211018991.01201942
2120202020068206120200682061None72017112201132.02202062
3120202020089614120200896141None62015443301131.01202071
4120212021005980120210059801None42020221101131.02202111
5120212021043950120210439501None620033344011991.01202121
6320192019099566320190995663382014001131.01201954
7320192019099566320190995663362018001131.02201951
8320202020056894320200568943None52015001191NaN1202014
9320202020054123320200541233None0000117990.02202020
10320212021091011320210910113None122016001113911.01202114
11320212021033792320210337923None62020001131.02202124
\n", + "
" + ], + "text/plain": [ + " __provider_code __year accident_id provider_and_id provider_code \\\n", + "0 1 2019 2019070360 12019070360 1 \n", + "1 1 2019 2019074088 12019074088 1 \n", + "2 1 2020 2020068206 12020068206 1 \n", + "3 1 2020 2020089614 12020089614 1 \n", + "4 1 2021 2021005980 12021005980 1 \n", + "5 1 2021 2021043950 12021043950 1 \n", + "6 3 2019 2019099566 32019099566 3 \n", + "7 3 2019 2019099566 32019099566 3 \n", + "8 3 2020 2020056894 32020056894 3 \n", + "9 3 2020 2020054123 32020054123 3 \n", + "10 3 2021 2021091011 32021091011 3 \n", + "11 3 2021 2021033792 32021033792 3 \n", + "\n", + " file_type_police engine_volume manufacturing_year driving_directions \\\n", + "0 None 8 2002 1122 \n", + "1 None 1 2018 2211 \n", + "2 None 7 2017 1122 \n", + "3 None 6 2015 4433 \n", + "4 None 4 2020 2211 \n", + "5 None 6 2003 3344 \n", + "6 3 8 2014 0 \n", + "7 3 6 2018 0 \n", + "8 None 5 2015 0 \n", + "9 None 0 0 0 \n", + "10 None 12 2016 0 \n", + "11 None 6 2020 0 \n", + "\n", + " vehicle_status vehicle_attribution vehicle_type seats total_weight \\\n", + "0 0 1 1 1 NaN \n", + "1 0 1 8 99 1.0 \n", + "2 0 1 1 3 2.0 \n", + "3 0 1 1 3 1.0 \n", + "4 0 1 1 3 1.0 \n", + "5 0 1 1 99 1.0 \n", + "6 0 1 1 3 1.0 \n", + "7 0 1 1 3 1.0 \n", + "8 0 1 19 1 NaN \n", + "9 0 1 17 99 0.0 \n", + "10 0 1 11 39 11.0 \n", + "11 0 1 1 3 1.0 \n", + "\n", + " car_id accident_year accident_month vehicle_damage \n", + "0 1 2019 1 2 \n", + "1 1 2019 4 2 \n", + "2 2 2020 6 2 \n", + "3 1 2020 7 1 \n", + "4 2 2021 1 1 \n", + "5 1 2021 2 1 \n", + "6 1 2019 5 4 \n", + "7 2 2019 5 1 \n", + "8 1 2020 1 4 \n", + "9 2 2020 2 0 \n", + "10 1 2021 1 4 \n", + "11 2 2021 2 4 " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import pandas as pd\n", + "from IPython.display import display\n", + "\n", + "for parse_type in ['accident_markers', 'involved', 'vehicles']:\n", + " print('### {} ###'.format(parse_type))\n", + " num_rows_per_provider_year = defaultdict(int)\n", + " top_rows = []\n", + "\n", + " def get_rows_per_provider_year(row):\n", + " num_rows_per_provider_year['{}_{}'.format(row['__provider_code'], row['__year'])] += 1\n", + " if num_rows_per_provider_year['{}_{}'.format(row['__provider_code'], row['__year'])] <= 2:\n", + " top_rows.append(row)\n", + "\n", + " df.Flow(\n", + " df.load('../.data/cbs/{}/datapackage.json'.format(parse_type)),\n", + " get_rows_per_provider_year\n", + " ).process()\n", + " \n", + " print('num_rows_per_provider_year:')\n", + " pprint(dict(num_rows_per_provider_year))\n", + " display(pd.DataFrame(top_rows))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/requirements.txt b/notebooks/requirements.txt new file mode 100644 index 0000000..6f96538 --- /dev/null +++ b/notebooks/requirements.txt @@ -0,0 +1,2 @@ +jupyter==1.0.0 +jupyterlab==3.2.5