From 39eb5767cc76bc9a76ac003e6eaef29444f5b18e Mon Sep 17 00:00:00 2001
From: Hiroaki Ogasawara <13391129+xhiroga@users.noreply.github.com>
Date: Fri, 12 Jan 2024 21:48:11 +0900
Subject: [PATCH] chore: (wip) CNN

---
 .../deep-learning-from-scratch/.gitignore     |   1 +
 .../_src/deep-learning-from-scratch/README.md |   5 +
 .../deep-learning-from-scratch/latest.bat     |  11 +
 .../notebooks/ch07.ipynb                      | 360 ++++++++++++++++++
 4 files changed, 377 insertions(+)
 create mode 100644 computer-science/machine-learning/_src/deep-learning-from-scratch/.gitignore
 create mode 100644 computer-science/machine-learning/_src/deep-learning-from-scratch/README.md
 create mode 100644 computer-science/machine-learning/_src/deep-learning-from-scratch/latest.bat
 create mode 100644 computer-science/machine-learning/_src/deep-learning-from-scratch/notebooks/ch07.ipynb

diff --git a/computer-science/machine-learning/_src/deep-learning-from-scratch/.gitignore b/computer-science/machine-learning/_src/deep-learning-from-scratch/.gitignore
new file mode 100644
index 000000000..f5b8c0bf1
--- /dev/null
+++ b/computer-science/machine-learning/_src/deep-learning-from-scratch/.gitignore
@@ -0,0 +1 @@
+deep-learning-from-scratch
diff --git a/computer-science/machine-learning/_src/deep-learning-from-scratch/README.md b/computer-science/machine-learning/_src/deep-learning-from-scratch/README.md
new file mode 100644
index 000000000..cab8fbd6a
--- /dev/null
+++ b/computer-science/machine-learning/_src/deep-learning-from-scratch/README.md
@@ -0,0 +1,5 @@
+# [O'Reilly Japan - ゼロから作るDeep Learning](https://www.oreilly.co.jp/books/9784873117584/)
+
+```powershell
+./latest.bat
+```
diff --git a/computer-science/machine-learning/_src/deep-learning-from-scratch/latest.bat b/computer-science/machine-learning/_src/deep-learning-from-scratch/latest.bat
new file mode 100644
index 000000000..00490afe6
--- /dev/null
+++ b/computer-science/machine-learning/_src/deep-learning-from-scratch/latest.bat
@@ -0,0 +1,11 @@
+@echo off
+SET repo_path=deep-learning-from-scratch
+
+IF NOT EXIST "%repo_path%" (
+    echo Repository not found. Cloning...
+    git clone https://github.com/oreilly-japan/deep-learning-from-scratch.git
+) ELSE (
+    echo Repository found. Updating...
+    cd %repo_path%
+    git pull
+)
diff --git a/computer-science/machine-learning/_src/deep-learning-from-scratch/notebooks/ch07.ipynb b/computer-science/machine-learning/_src/deep-learning-from-scratch/notebooks/ch07.ipynb
new file mode 100644
index 000000000..a5d86e651
--- /dev/null
+++ b/computer-science/machine-learning/_src/deep-learning-from-scratch/notebooks/ch07.ipynb
@@ -0,0 +1,360 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Chapter 7: Convolutional Neural Networks"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import sys, os\n",
+    "sys.path.append(os.pardir)\n",
+    "sys.path.append(f\"{os.pardir}/deep-learning-from-scratch\")\n",
+    "\n",
+    "import numpy as np\n",
+    "from beartype import beartype\n",
+    "from collections import OrderedDict\n",
+    "from nptyping import NDArray, Shape, Float, Int\n",
+    "from common.util import im2col, col2im\n",
+    "from common.layers import SoftmaxWithLoss  # used by SimpleConvNet below"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(10, 1, 28, 28)"
+      ]
+     },
+     "execution_count": 18,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "x = np.random.rand(10,1,28,28)\n",
+    "x.shape\n",
+    "# (10, 1, 28, 28): mini-batch size, channels, height, width"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "(9, 75)\n"
+     ]
+    }
+   ],
+   "source": [
+    "x1 = np.random.rand(1,3,7,7)\n",
+    "col1 = im2col(x1, 5, 5, stride=1, pad=0)\n",
+    "print(col1.shape) # (9, 75): number of filter positions (= 1*(((7-5)/1)+1)**2) by elements per filter window (= 3*5*5)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[[0.77557663 0.33806582 0.37237046 0.83112068 0.49650122 0.03914491\n",
+      "  0.20773407]\n",
+      " [0.51288944 0.50293884 0.85671902 0.72090202 0.27010591 0.5556635\n",
+      "  0.59713498]\n",
+      " [0.02457308 0.90281899 0.87469388 0.02009157 0.13361658 0.19458219\n",
+      "  0.02499915]\n",
+      " [0.0209891  0.9720402  0.55244596 0.95761181 0.27203654 0.97884945\n",
+      "  0.37895102]\n",
+      " [0.19315752 0.15809505 0.68105605 0.29481389 0.35691675 0.23714536\n",
+      "  0.03193322]\n",
+      " [0.61227175 0.0194606  0.16553847 0.74204764 0.42913091 0.26977271\n",
+      "  0.19511624]\n",
+      " [0.60038899 0.99727459 0.99125549 0.69182925 0.95594471 0.56926757\n",
+      "  0.72815871]]\n",
+      "[0.77557663 0.33806582 0.37237046 0.83112068 0.49650122 0.51288944\n",
+      " 0.50293884 0.85671902 0.72090202 0.27010591 0.02457308 0.90281899\n",
+      " 0.87469388 0.02009157 0.13361658 0.0209891  0.9720402  0.55244596\n",
+      " 0.95761181 0.27203654 0.19315752 0.15809505 0.68105605 0.29481389\n",
+      " 0.35691675 0.56177801 0.51579877 0.55239822 0.66654575 0.49598721\n",
+      " 0.07402139 0.5829163  0.38529097 0.04022566 0.6660402  0.04880685\n",
+      " 0.29328377 0.72894727 0.40273677 0.58444065 0.23715671 0.76501373\n",
+      " 0.2617112  0.43930511 0.8667773  0.87877624 0.21587922 0.25574579\n",
+      " 0.14144719 0.10852229 0.61531416 0.63418458 0.52175757 0.04859568\n",
+      " 0.00995961 0.62378664 0.53930438 0.68877897 0.70155326 0.37456113\n",
+      " 0.91382188 0.08136818 0.64643685 0.82147964 0.72842887 0.45922596\n",
+      " 0.46440204 0.88385712 0.36370997 0.23976922 0.87628869 0.13027836\n",
+      " 0.43845715 0.42018312 0.57520152]\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(x1[0][0])\n",
+    "print(col1[0])"
+   ]
+  },
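+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Not in the book's text, just a quick sketch: col2im (imported above from common.util)\n",
+    "# maps an im2col matrix back to the input tensor shape. Because overlapping patches are\n",
+    "# accumulated, the values are generally not identical to the original; here we only\n",
+    "# check that the shape is restored.\n",
+    "x1_back = col2im(col1, x1.shape, 5, 5, stride=1, pad=0)\n",
+    "x1_back.shape  # expected: (1, 3, 7, 7)"
+   ]
+  },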
"outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(90, 75)\n" + ] + } + ], + "source": [ + "x2 = np.random.rand(10,3,7,7)\n", + "col2 = im2col(x2, 5, 5, stride=1, pad=0)\n", + "print(col2.shape) # (90, 75) フィルターの適用領域の数(=10*(((7-5)/1)+1)**2), 入力特徴マップの要素数" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "def output_size(x_len, pad, filter_len, stride):\n", + " rest = ((x_len + 2 * pad) - filter_len)\n", + " if rest < 0:\n", + " raise Exception(f\"Filter length {filter_len} is longer than input size {x_len} + {pad}!\")\n", + " elif rest % stride != 0:\n", + " raise Exception(f\"Rest length {rest} and stride {stride} are conflicted!\")\n", + " else:\n", + " return rest / stride + 1\n", + "\n", + "class Convolution:\n", + " # 今回はフィルターのチャンネル数を3で固定している\n", + " @beartype\n", + " def __init__(self, W: NDArray[Shape['FN,3,FH,FW'],Float], b: NDArray[Shape['FN'], Float], stride=1, pad=0):\n", + " # FN: Filter Number\n", + " self.W = W\n", + " self.b = b\n", + " self.stride = stride\n", + " self.pad = pad\n", + "\n", + " # 中間データ(backward時に使用)\n", + " self.x = None \n", + " self.col = None\n", + " self.col_W = None\n", + " \n", + " # 重み・バイアスパラメータの勾配\n", + " self.dW = None\n", + " self.db = None\n", + "\n", + " @beartype\n", + " def forward(self, x: NDArray[Shape['N,3,H,W'], Float]):\n", + " FN, C, FH, FW = self.W.shape\n", + " N, C, H, W = x.shape\n", + " out_h = output_size(H, self.pad, FH, self.stride)\n", + " out_w = output_size(W, self.pad, FW, self.stride)\n", + "\n", + " col: NDArray[Shape['N*out_h*out_w,3*FH*FW'], Float] = im2col(x, FH, FW, self.stride, self.pad)\n", + " col_W: NDArray[Shape['3*FH*FW,FN'], Float] = self.W.reshape(FN, -1).T\n", + " out: NDArray[Shape['N*out_h*out_w,FN', Float]] = np.dot(col, col_W) + self.b\n", + " reshaped_out: NDArray[Shape['N,FN,out_h,out_w'], float] = out.reshape(N, out_h, out_w, FN).transpose(0,3,1,2)\n", + "\n", + " self.x = x\n", + " self.col = col\n", + " self.col_W = col_W\n", + "\n", + " return reshaped_out\n", + "\n", + " @beartype\n", + " def backward(self, dout: NDArray[Shape['N,FN,out_h,out_w'], Float]):\n", + " FN, C, FH, FW = self.W.shape\n", + " dout_matrix: NDArray[Shape['N*out_h*out_w,FN'], Float] = dout.transpose(0,2,3,1).reshape(-1, FN)\n", + "\n", + " self.db: NDArray[Shape['1,N'], Float] = np.sum(dout_matrix, axis=0)\n", + " dW_matrix: NDArray[Shape['C*FH*FW,FN'], Float] = np.dot(self.col.T, dout_matrix)\n", + " self.dW = self.dW_matrix.transpose(1, 0).reshape(FN, C, FH, FW)\n", + "\n", + " dcol: NDArray[Shape['N*out_h*out_w,3*FH*FN'], Float] = np.dot(dout_matrix, self.col_W.T)\n", + " dx: NDArray[Shape['N,3,H,W']] = col2im(dcol, self.x.shape, FH, FW, self.stride, self.pad)\n", + "\n", + " return dx\n" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "class Pooling:\n", + " def __init__(self, pool_h, pool_w, stride=2, pad=0):\n", + " # pool_h, pool_wはそれぞれプーリング適用領域の高さ・幅。例えば3x3=9からmaxを取るなら、pool_h=3, pool_w=3\n", + " self.pool_h = pool_h\n", + " self.pool_w = pool_w\n", + " self.stride = stride\n", + " self.pad = pad\n", + "\n", + " @beartype\n", + " def forward(self, x: NDArray[Shape['N,C,H,W'], Float]):\n", + " # 出力特徴マップの奥行きを、対象が色ではないのにチャンネルと呼ぶのは個人的にまだ違和感があるが、そのうち慣れる。\n", + " N, C, H, W = x.shape\n", + " out_h = int(1 + (H - self.pool_h) / self.stride)\n", + " out_w = int(1 + (W - self.pool_w) / self.stride)\n", + "\n", + " col: NDArray[Shape['N,C,H*W'], Float] = im2col(x, self.pool_h, 
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class Pooling:\n",
+    "    def __init__(self, pool_h, pool_w, stride=2, pad=0):\n",
+    "        # pool_h and pool_w are the height and width of the pooling window.\n",
+    "        # For example, taking the max over a 3x3 = 9 region means pool_h=3, pool_w=3.\n",
+    "        self.pool_h = pool_h\n",
+    "        self.pool_w = pool_w\n",
+    "        self.stride = stride\n",
+    "        self.pad = pad\n",
+    "\n",
+    "    @beartype\n",
+    "    def forward(self, x: NDArray[Shape['N,C,H,W'], Float]):\n",
+    "        # Calling the depth of the output feature map 'channels' even though it no longer\n",
+    "        # refers to color still feels odd to me, but I will get used to it.\n",
+    "        N, C, H, W = x.shape\n",
+    "        out_h = int(1 + (H - self.pool_h) / self.stride)\n",
+    "        out_w = int(1 + (W - self.pool_w) / self.stride)\n",
+    "\n",
+    "        col: NDArray[Shape['N*out_h*out_w,C*pool_h*pool_w'], Float] = im2col(x, self.pool_h, self.pool_w, self.stride, self.pad)\n",
+    "        reshaped_col: NDArray[Shape['N*C*out_h*out_w,pool_h*pool_w'], Float] = col.reshape(-1, self.pool_h*self.pool_w)\n",
+    "        out: NDArray[Shape['N*C*out_h*out_w'], Float] = np.max(reshaped_col, axis=1)\n",
+    "        reshaped_out: NDArray[Shape['N,C,out_h,out_w'], Float] = out.reshape(N, out_h, out_w, C).transpose(0,3,1,2)\n",
+    "\n",
+    "        return reshaped_out\n",
+    "\n",
+    "    @beartype\n",
+    "    def backward(self, dout: NDArray[Shape['N,C,out_h,out_w'], Float]):\n",
+    "        # TODO: route dout back to the max positions recorded in forward, then apply col2im\n",
+    "        raise NotImplementedError\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 'Rectify' is named after rectifying an alternating current: AC flows alternately in the\n",
+    "# positive and negative directions, while a rectified current flows only in the positive direction.\n",
+    "class Relu:\n",
+    "    def __init__(self):\n",
+    "        self.mask = None\n",
+    "\n",
+    "    @beartype\n",
+    "    def forward(self, x: NDArray[Shape['*, ...'], Float]):\n",
+    "        # Accepts arrays of any shape (the conv feature maps passed through it are 4-D)\n",
+    "        self.mask = (x <= 0)\n",
+    "        out = x.copy()\n",
+    "        out[self.mask] = 0\n",
+    "\n",
+    "        return out\n",
+    "\n",
+    "    @beartype\n",
+    "    def backward(self, dout: NDArray[Shape['*, ...'], Float]):\n",
+    "        dout[self.mask] = 0\n",
+    "        dx = dout\n",
+    "\n",
+    "        return dx\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class Affine:\n",
+    "    def __init__(self, W: NDArray[Shape['S,WS'], Float], b: NDArray[Shape['WS'], Float]):\n",
+    "        self.W = W\n",
+    "        self.b = b\n",
+    "        self.x = None\n",
+    "        self.dW = None\n",
+    "        self.db = None\n",
+    "\n",
+    "    @beartype\n",
+    "    def forward(self, x: NDArray[Shape['N,S'], Float]):\n",
+    "        self.x = x\n",
+    "        out = np.dot(x, self.W) + self.b\n",
+    "\n",
+    "        return out\n",
+    "\n",
+    "    @beartype\n",
+    "    def backward(self, dout: NDArray[Shape['N,WS'], Float]):\n",
+    "        dx: NDArray[Shape['N,S'], Float] = np.dot(dout, self.W.T)\n",
+    "        self.dW: NDArray[Shape['S,WS'], Float] = np.dot(self.x.T, dout)\n",
+    "        self.db: NDArray[Shape['WS'], Float] = np.sum(dout, axis=0)\n",
+    "\n",
+    "        return dx"
+   ]
+  },
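+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Not in the book, just a sketch of the size arithmetic that SimpleConvNet below relies on:\n",
+    "# a 28x28 MNIST image, 30 filters of 5x5 (pad 0, stride 1), then 2x2 max pooling with stride 2.\n",
+    "conv_out = output_size(28, 0, 5, 1)        # (28 - 5) / 1 + 1 = 24\n",
+    "pool_out = output_size(conv_out, 0, 2, 2)  # (24 - 2) / 2 + 1 = 12\n",
+    "conv_out, pool_out, 30 * pool_out * pool_out  # expected: (24, 12, 4320)"
+   ]
+  },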
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Assumed architecture: convolution layer → fully connected layer (ReLU) → fully connected layer (Softmax)\n",
+    "\n",
+    "class SimpleConvNet:\n",
+    "    def __init__(self, input_dim=(1,28,28),\n",
+    "                 # filter_size: 5 means a 5x5 filter; square filters are the most common choice.\n",
+    "                 conv_param={'filter_num':30, 'filter_size':5, 'pad':0, 'stride':1},\n",
+    "                 hidden_size=100, output_size=10, weight_init_std=0.01):\n",
+    "        filter_num = conv_param['filter_num']\n",
+    "        filter_size = conv_param['filter_size']\n",
+    "        filter_pad = conv_param['pad']\n",
+    "        filter_stride = conv_param['stride']\n",
+    "        input_size = input_dim[1]\n",
+    "        conv_output_size = int(1 + (input_size + 2*filter_pad - filter_size) / filter_stride)\n",
+    "        pool_output_size = int(filter_num * (conv_output_size/2) * (conv_output_size/2))\n",
+    "\n",
+    "        self.params = {}\n",
+    "        self.params['W1']: NDArray[Shape['FN,C,FS,FS'], Float] = weight_init_std * np.random.randn(filter_num, input_dim[0], filter_size, filter_size)\n",
+    "        self.params['b1'] = np.zeros(filter_num)\n",
+    "        self.params['W2']: NDArray[Shape['PS,HS'], Float] = weight_init_std * np.random.randn(pool_output_size, hidden_size)\n",
+    "        self.params['b2'] = np.zeros(hidden_size)\n",
+    "        self.params['W3']: NDArray[Shape['HS,OS'], Float] = weight_init_std * np.random.randn(hidden_size, output_size)\n",
+    "        self.params['b3'] = np.zeros(output_size)\n",
+    "\n",
+    "        self.layers = OrderedDict()\n",
+    "        self.layers['Conv1'] = Convolution(self.params['W1'], self.params['b1'], conv_param['stride'], conv_param['pad'])\n",
+    "        self.layers['Relu1'] = Relu()\n",
+    "        self.layers['Pool1'] = Pooling(pool_h=2, pool_w=2, stride=2)\n",
+    "        self.layers['Affine1'] = Affine(self.params['W2'], self.params['b2'])\n",
+    "        self.layers['Relu2'] = Relu()\n",
+    "        self.layers['Affine2'] = Affine(self.params['W3'], self.params['b3'])\n",
+    "\n",
+    "        self.last_layer = SoftmaxWithLoss()\n",
+    "\n",
+    "    def predict(self, x: NDArray):\n",
+    "        for layer in self.layers.values():\n",
+    "            x = layer.forward(x)\n",
+    "        return x\n",
+    "\n",
+    "    def loss(self, x, t):\n",
+    "        y = self.predict(x)\n",
+    "        return self.last_layer.forward(y, t)\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "til-machine-learning",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}