dataset API
ebursztein committed Oct 28, 2021
1 parent f896e85 commit 3523a55
Showing 10 changed files with 1,011 additions and 2 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -19,6 +19,8 @@ __pycache__/
 *.prof
 # C extensions
 *.so
+tmp/
+algo_arch_implem_v1_train/
 
 # Distribution / packaging
 .Python
279 changes: 279 additions & 0 deletions notebooks/dataset_api.ipynb
```python
from scaaml.io import Dataset
import numpy as np
```
# Specify dummy data
```python
root_path = './'
architecture = 'arch'
implementation = 'implem'
algorithm = 'algo'
version = 1

# Measurement info: each example carries one 1024-point power trace.
minfo = {
    "trace1": {
        "type": "power",
        "len": 1024,
    }
}

# Attack-point info: a 16-byte key whose byte values lie in [0, 256).
apinfo = {
    "key": {
        "len": 16,
        "max_val": 256
    }
}

chip_id = 1  # which chip this was captured on
comment = "this is a test"
purpose = "train"
examples_per_shard = 1
```
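
These two dicts are the contract every example must satisfy: `minfo` fixes the trace length, `apinfo` the key length and its value range. A quick sanity check before writing might look like the following sketch (`check_example` is our own illustrative helper, not part of the scaaml API):

```python
# Illustrative helper (not scaaml API): verify one example against the
# declared attack-point and measurement shapes before it is written.
def check_example(attack_points, measurements):
    for name, info in apinfo.items():
        values = attack_points[name]
        assert len(values) == info["len"]
        assert all(0 <= v < info["max_val"] for v in values)
    for name, info in minfo.items():
        assert len(measurements[name]) == info["len"]
```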
## Generate fake data
```python
# randint's upper bound is exclusive, so use 256 to cover the full
# byte range [0, 255] declared by apinfo's max_val.
key = np.random.randint(0, 256, 16)
key2 = np.random.randint(0, 256, 16)
trace1 = np.random.rand(1024)
```
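
For a bigger smoke test, the same fake captures can be generated in bulk with plain NumPy; the `fake_keys`/`fake_traces` names below are ours, not anything scaaml expects:

```python
# Hypothetical bulk variant of the cell above: n random captures at once.
n_examples = 4
fake_keys = np.random.randint(0, 256, (n_examples, 16))
fake_traces = np.random.rand(n_examples, 1024)
```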
# Creating a dataset
## Init
```python
ds = Dataset(root_path=root_path,
             architecture=architecture,
             implementation=implementation,
             algorithm=algorithm,
             version=version,
             purpose=purpose,
             comment=comment,
             chip_id=chip_id,
             examples_per_shard=examples_per_shard,
             measurements_info=minfo,
             attack_points_info=apinfo)
```

Output:

```
[Warning] Path exist, some files might be over-written
Dataset path: algo_arch_implem_v1_train
```
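
Judging from the printed path, the dataset directory name is assembled from the constructor arguments. The pattern below reproduces this run's output; it is an observation, not documented scaaml behaviour:

```python
# Observed naming pattern (assumption inferred from the output above).
expected_dir = f"{algorithm}_{architecture}_{implementation}_v{version}_{purpose}"
assert expected_dir == "algo_arch_implem_v1_train"
```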
## Writing a shard
```python
ds.new_shard(key, 1, split='train')
ds.write_example({"key": key,
                  # "sub_byte_in": key
                  }, {"trace1": trace1})
ds.close_shard()
```

Output:

```
train
defaultdict(<class 'int'>, {})
```
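
This is where the so-far-unused `key2` would come in: presumably the same `new_shard` / `write_example` / `close_shard` sequence yields a second shard. A sketch reusing only the calls shown above; treating the second argument as a shard index (mirroring the `1` before) is our assumption:

```python
# Hypothetical second shard for key2, one fresh random trace as its example.
ds.new_shard(key2, 2, split='train')
ds.write_example({"key": key2}, {"trace1": np.random.rand(1024)})
ds.close_shard()
```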
# Using a dataset
```python
dataset_path = './algo_arch_implem_v1_train'
```
## Display dataset info
```python
Dataset.summary(dataset_path)
```

Output:

```
[Dataset Summary]
Info
--------------  --------------
architecture    arch
implementation  implem
algorithm       algo
version         1
chip_id         1
comment         this is a test
purpose         train
compression     GZIP
--------------  --------------

Attack Points
ap      len    max_val
----  -----  ---------
key      16        256

Measurements
name    type      len
------  ------  -----
trace1  power    1024

Content
split      num_keys    num_examples
-------  ----------  --------------
train             1               1
```
```python
trace_len = 1024

train_ds, inputs, outputs = Dataset.as_tfdataset(dataset_path,
                                                 split='train',
                                                 attack_points='key',
                                                 traces='trace1',
                                                 traces_max_len=trace_len,
                                                 bytes=1,
                                                 shards=1)
```

Output:

```
reloading algo_arch_implem_v1_train\info.json
Dataset path: algo_arch_implem_v1_train\algo_arch_implem_v1_train
```
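
Since `train_ds` is an ordinary `tf.data.Dataset` yielding `({'trace1': ...}, {'key_1': ...})` batches, it can be fed straight to Keras by naming the input and output layers after those dict keys. A minimal sketch, assuming TensorFlow is installed; the input shape is read off the dataset's element spec rather than hard-coded:

```python
import tensorflow as tf

# Derive the per-example trace shape from the dataset itself (drop batch dim).
x_spec, y_spec = train_ds.element_spec
trace_shape = x_spec['trace1'].shape[1:]

# Toy classifier: layer names must match the dataset's dict keys.
inp = tf.keras.Input(shape=trace_shape, name='trace1')
hidden = tf.keras.layers.Dense(32, activation='relu')(tf.keras.layers.Flatten()(inp))
out = tf.keras.layers.Dense(256, activation='softmax', name='key_1')(hidden)

model = tf.keras.Model(inp, out)
model.compile(optimizer='adam', loss='categorical_crossentropy')
model.fit(train_ds.take(1), epochs=1)
```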
```python
for batch in train_ds.take(1):
    print('x ->', batch[0].keys(), batch[0]['trace1'].shape)
    print('y ->', batch[1].keys(), batch[1]['key_1'].shape)
```

Output:

```
x -> dict_keys(['trace1']) (32, 32, 32)
y -> dict_keys(['key_1']) (32, 256)
```
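
To inspect raw values outside the training loop, one batch can be pulled and converted to NumPy with standard TensorFlow calls:

```python
# Grab a single batch and move it to NumPy for ad-hoc inspection.
x, y = next(iter(train_ds))
traces = x['trace1'].numpy()
labels = y['key_1'].numpy()
print(traces.shape, labels.argmax(axis=-1)[:5])  # one-hot labels -> byte values
```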
Empty file added scaaml/io/README.md
1 change: 1 addition & 0 deletions scaaml/io/__init__.py
@@ -0,0 +1 @@
+from .dataset import Dataset # noqa