From abe25d09a297467497120f37c6bc43dc0c1ff05d Mon Sep 17 00:00:00 2001 From: Alphonsus Adu-Bredu Date: Fri, 17 Apr 2020 13:59:49 -0400 Subject: [PATCH] uploaded ranking notebook --- .../rank_paired_scores.ipynb | 734 ++++++++++++++++++ 1 file changed, 734 insertions(+) create mode 100644 neural_net_training_and_inference/rank_paired_scores.ipynb diff --git a/neural_net_training_and_inference/rank_paired_scores.ipynb b/neural_net_training_and_inference/rank_paired_scores.ipynb new file mode 100644 index 0000000..319e50a --- /dev/null +++ b/neural_net_training_and_inference/rank_paired_scores.ipynb @@ -0,0 +1,734 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd \n", + "import trueskill as ts" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "scores_df = pd.read_csv('aug_depressing.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(4030216, 3)" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "scores_df.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
leftrightwinner
050e5f7d4d7c3df413b00056d50f43537fdc9f065f0002d0eleft
150e5f7d4d7c3df413b00056d50f42c09fdc9f065f000175cright
250e5f7d4d7c3df413b00056d50f4352cfdc9f065f0002c2bleft
350e5f7d4d7c3df413b00056d50e748e4d7c3df413b0013cbright
450e5f7d4d7c3df413b00056d50f447d2fdc9f065f0003a81right
\n", + "
" + ], + "text/plain": [ + " left right winner\n", + "0 50e5f7d4d7c3df413b00056d 50f43537fdc9f065f0002d0e left\n", + "1 50e5f7d4d7c3df413b00056d 50f42c09fdc9f065f000175c right\n", + "2 50e5f7d4d7c3df413b00056d 50f4352cfdc9f065f0002c2b left\n", + "3 50e5f7d4d7c3df413b00056d 50e748e4d7c3df413b0013cb right\n", + "4 50e5f7d4d7c3df413b00056d 50f447d2fdc9f065f0003a81 right" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "scores_df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Making a list of all unique image ids" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "8060432" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "all_images = scores_df['left']\n", + "all_images = list(all_images)\n", + "all_images+= list(scores_df['right'])\n", + "len(all_images)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "all_images = list(set(all_images))" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "101676" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(all_images)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "image_ranks={}\n", + "for img in all_images:\n", + " image_ranks[img] = ts.Rating()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "101676" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(image_ranks)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Ranking image pairs" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "collapsed": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0\n", + "100000\n", + "200000\n", + "300000\n", + "400000\n", + "500000\n", + "600000\n", + "700000\n", + "800000\n", + "900000\n", + "1000000\n", + "1100000\n", + "1200000\n", + "1300000\n", + "1400000\n", + "1500000\n", + "1600000\n", + "1700000\n", + "1800000\n", + "1900000\n", + "2000000\n", + "2100000\n", + "2200000\n", + "2300000\n", + "2400000\n", + "2500000\n", + "2600000\n", + "2700000\n", + "2800000\n", + "2900000\n", + "3000000\n", + "3100000\n", + "3200000\n", + "3300000\n", + "3400000\n", + "3500000\n", + "3600000\n", + "3700000\n", + "3800000\n", + "3900000\n", + "4000000\n" + ] + } + ], + "source": [ + "for index, row in scores_df.iterrows():\n", + " l = row['left']; r = row['right']; w = row['winner']\n", + " if w == 'left':\n", + " image_ranks[l], image_ranks[r] = ts.rate_1vs1(image_ranks[l], image_ranks[r])\n", + " elif w == 'right':\n", + " image_ranks[r], image_ranks[l] = ts.rate_1vs1(image_ranks[r], image_ranks[l])\n", + " if index%100000 == 0:\n", + " print(index)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "101676" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(image_ranks)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Writing ranked images to pandas dataframe" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "10000\n", + "20000\n", + "30000\n", + "40000\n", + "50000\n", + "60000\n", + "70000\n", + "80000\n", + "90000\n", + "100000\n" + ] + } + ], + "source": [ + "column_names = ['image','mean_rank','std_rank']\n", + "rank_df = pd.DataFrame(columns = column_names)\n", + "index = 0\n", + "for key in image_ranks:\n", + " img = key\n", + " mu = image_ranks[key].mu\n", + " sigma = image_ranks[key].sigma\n", + " rank_df.loc[index] = [img, mu, sigma]\n", + " index += 1\n", + " if (index%10000 == 0):\n", + " print(index)\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
imagemean_rankstd_rank
050f43514fdc9f065f0002ac523.6857120.897259
150f44800fdc9f065f0003beb26.6518150.978426
251414c52fdc9f049260072e128.8242660.908291
351392705818ede129600109f25.1810590.890290
451414fcbfdc9f04926007ba826.7778190.940551
\n", + "
" + ], + "text/plain": [ + " image mean_rank std_rank\n", + "0 50f43514fdc9f065f0002ac5 23.685712 0.897259\n", + "1 50f44800fdc9f065f0003beb 26.651815 0.978426\n", + "2 51414c52fdc9f049260072e1 28.824266 0.908291\n", + "3 51392705818ede129600109f 25.181059 0.890290\n", + "4 51414fcbfdc9f04926007ba8 26.777819 0.940551" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "rank_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(101676, 3)" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "rank_df.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "rank_df.sort_values('mean_rank',inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
imagemean_rankstd_rank
31218513f29a5fdc9f0358700d3e011.5016654.391888
51261513f3357fdc9f0358700e13f11.7250683.997632
7507513cdf61fdc9f0358700212511.7745033.991489
62687513d56ebfdc9f0358700308411.7895854.052904
96872513d67e5fdc9f035870041bb11.8069574.065757
\n", + "
" + ], + "text/plain": [ + " image mean_rank std_rank\n", + "31218 513f29a5fdc9f0358700d3e0 11.501665 4.391888\n", + "51261 513f3357fdc9f0358700e13f 11.725068 3.997632\n", + "7507 513cdf61fdc9f03587002125 11.774503 3.991489\n", + "62687 513d56ebfdc9f03587003084 11.789585 4.052904\n", + "96872 513d67e5fdc9f035870041bb 11.806957 4.065757" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "rank_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [], + "source": [ + "rank_df.to_csv('ranked_depressing_images.csv',index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
imagemean_rankstd_rank
932715140c9f5fdc9f049260026bf38.6245613.888543
64695513d61bafdc9f03587003ea338.6020803.908796
83864513d5e13fdc9f035870039dd38.2235703.902311
46509513cdc10fdc9f0358700206c38.1869454.139627
400025140cccdfdc9f04926002ea638.1465404.046895
\n", + "
" + ], + "text/plain": [ + " image mean_rank std_rank\n", + "93271 5140c9f5fdc9f049260026bf 38.624561 3.888543\n", + "64695 513d61bafdc9f03587003ea3 38.602080 3.908796\n", + "83864 513d5e13fdc9f035870039dd 38.223570 3.902311\n", + "46509 513cdc10fdc9f0358700206c 38.186945 4.139627\n", + "40002 5140cccdfdc9f04926002ea6 38.146540 4.046895" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "rank_df.sort_values('mean_rank',inplace=True, ascending=False)\n", + "rank_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [], + "source": [ + "shuffled_df = rank_df.sample(frac=1)\n", + "traindf = shuffled_df.iloc[:81340,:]\n", + "testdf = shuffled_df.iloc[81340:,:]" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [], + "source": [ + "traindf.to_csv('ranked_depressing_train.csv',index=False)\n", + "testdf.to_csv('ranked_depressing_test.csv',index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(20336, 3)" + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "testdf.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(81340, 3)" + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "traindf.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}