cleaning up old stuff

bha38 · Nov 21, 2017 · 5d904ec · 5d904ec
1 parent 78af396
commit 5d904ec
Show file tree

Hide file tree

Showing 36 changed files with 858 additions and 499 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,5 +1,6 @@
 **/.pyc
 **/.ipynb_checkpoints
+.ipynb_checkpoints
 **/.DS_Store
 **/.DS_Store?
 **/.h5

diff --git a/notebooks/explore/Untitled.ipynb b/notebooks/explore/Untitled.ipynb
@@ -0,0 +1,32 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Try a CNN with 7x7 input, and then visualize these filters. Make sure to scale them to 1-255"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.5.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/...oks/explore/CNN_features_prediction.ipynb → .../VGG_features_hyperparameter_search.ipynb b/...oks/explore/CNN_features_prediction.ipynb → .../VGG_features_hyperparameter_search.ipynb
@@ -5,7 +5,7 @@
    "metadata": {},
    "source": [
     "# Predictions based on features\n",
-    "Test models both on the 8 class and the two-class classification problems. Grid search for parameters for 2 and and 8 class.\n",
+    "Grid search for parameters for both the 8-class and the 2-class classification problems.\n",
     "\n",
     "This is based off of http://ieeexplore.ieee.org.proxy.lib.uwaterloo.ca/stamp/stamp.jsp?arnumber=7312934, where they use a five-fold cross validation with the 5 train and test sets already defined by them. The hyperparameter tuning is done using a random subset of the training set (without dividing by patient)\n",
     "\n",
@@ -19,6 +19,44 @@
     "Another method is to extract patches from the original image to train the neural net on. For testing, the predictions of multiple patches can be averaged."
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import sys\n",
+    "import glob\n",
+    "import random\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "from PIL import Image\n",
+    "import matplotlib.pyplot as plt \n",
+    "from matplotlib.pyplot import imshow\n",
+    "from IPython.display import display, HTML\n",
+    "from sklearn.metrics import accuracy_score\n",
+    "\n",
+    "% matplotlib inline\n",
+    "\n",
+    "\n",
+    "# Import modules every time you run code imported using %aimport\n",
+    "%load_ext autoreload\n",
+    "%autoreload 1\n",
+    "\n",
+    "# Add the src directory for functions\n",
+    "src_dir = os.path.join(os.path.dirname(os.path.dirname(os.getcwd())), 'src')\n",
+    "print(src_dir)\n",
+    "sys.path.append(src_dir)\n",
+    "\n",
+    "# import my functions:\n",
+    "%aimport models\n",
+    "from models import*\n",
+    "\n",
+    "# Base Directory where data is stored\n",
+    "base_data_dir = '/Users/rb/Documents/waterloo/projects/breakHis/'"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 3,
@@ -291,25 +329,14 @@
     }
    ],
    "source": [
-    "import os\n",
-    "import sys\n",
-    "import glob\n",
-    "import random\n",
-    "import numpy as np\n",
-    "import pandas as pd\n",
-    "from PIL import Image\n",
-    "import matplotlib.pyplot as plt \n",
-    "from matplotlib.pyplot import imshow\n",
-    "from IPython.display import display, HTML\n",
-    "from sklearn.metrics import accuracy_score\n",
     "\n",
-    "% matplotlib inline\n",
     "\n",
     "sys.path.insert(0, '/Users/rb/Google_Drive/Waterloo/projects/breakHis/src')\n",
     "from models import*\n",
     "\n",
     "# load the train data\n",
-    "train_features = np.load('/Users/rb/Documents/waterloo/projects/breakHis/features/vgg/fold1/100/train/train_feat_vgg_100_aug1.npy')\n",
+    "train_dir = os.path.join(base_data_dir, 'features/vgg/fold1/100/train/train_feat_vgg_100_aug1.npy')\n",
+    "train_features = np.load(train_dir)\n",
     "print('train_features.shape', train_features.shape)\n",
     "\n",
     "y_train = train_features[:,:8]\n",
@@ -326,7 +353,8 @@
     "        y_bin_train[index, 1] = 1\n",
     "         \n",
     "# load the valid data\n",
-    "valid_features = np.load('/Users/rb/Documents/waterloo/projects/breakHis/features/vgg/fold1/100/valid/valid_feat_vgg_100_aug1.npy')\n",
+    "valid_dir = os.path.join(base_data_dir, 'features/vgg/fold1/100/valid/valid_feat_vgg_100_aug1.npy')\n",
+    "valid_features = np.load(valid_dir)\n",
     "print('valid_features.shape', valid_features.shape)\n",
     "y_valid = valid_features[:,:8]\n",
     "x_valid = valid_features[:,8:]\n",

diff --git a/notebooks/explore/baseline_feature_extraction.ipynb b/notebooks/explore/baseline_feature_extraction.ipynb
@@ -97,7 +97,6 @@
     "out_loc = os.path.join(base_data_dir, 'features', 'vgg') \n",
     "size = 100\n",
     "\n",
-    "# be lazy and do 1:\n",
     "n_folds = 6\n",
     "\n",
     "for i in range(1, n_folds, 1):\n",
@@ -130,7 +129,6 @@
     "\n",
     "    all_features = get_freatures_vgg(generator, cur_loc, samples=8, classes=8, batch_size=1)\n",
     "    np.save(os.path.join(new_loc, new_dir+'_feat_vgg_'+str(size)+'_aug1.npy'), all_features)\n",
-    "\n",
     "    \n",
     "for i in range(1, n_folds, 1):\n",
     "    new_dir = 'valid'\n",
@@ -161,7 +159,7 @@
     "\n",
     "    all_features = get_freatures_vgg(generator, cur_loc, samples=8, classes=8, batch_size=1)\n",
     "    np.save(os.path.join(new_loc, new_dir+'_feat_vgg_'+str(size)+'_aug1.npy'), all_features)\n",
-    "    \n",
+    "\n",
     "    \n",
     "for i in range(1, n_folds, 1):\n",
     "    new_dir = 'test'\n",

diff --git a/notebooks/final/test_vgg_features_cv.ipynb b/notebooks/final/test_vgg_features_cv.ipynb
@@ -0,0 +1,200 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Cross Validation for models on VGG features \n",
+    "* Using the folds defined in http://ieeexplore.ieee.org.proxy.lib.uwaterloo.ca/stamp/stamp.jsp?arnumber=7312934\n",
+    "* For binary and 8-class\n",
+    "* Using best hyperparameters found in VGG_features_hyperparameter_search\n",
+    "* Training is done using the full training set, with no validation set.\n",
+    "* This was re-run on a newly created dataset compared to the hyperparameter search, because it got overly high accuracy for fold 1."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "The autoreload extension is already loaded. To reload it, use:\n",
+      "  %reload_ext autoreload\n",
+      "/home/rbbidart/breakHis/src\n"
+     ]
+    }
+   ],
+   "source": [
+    "import os\n",
+    "import sys\n",
+    "import glob\n",
+    "import random\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "from PIL import Image\n",
+    "import matplotlib.pyplot as plt \n",
+    "from matplotlib.pyplot import imshow\n",
+    "from IPython.display import display, HTML\n",
+    "from sklearn.metrics import accuracy_score\n",
+    "% matplotlib inline\n",
+    "\n",
+    "\n",
+    "# Import modules every time you run code imported using %aimport\n",
+    "%load_ext autoreload\n",
+    "%autoreload 1\n",
+    "\n",
+    "# Add the src directory for functions\n",
+    "src_dir = os.path.join(os.path.dirname(os.path.dirname(os.getcwd())), 'src')\n",
+    "print(src_dir)\n",
+    "sys.path.append(src_dir)\n",
+    "\n",
+    "# import my functions:\n",
+    "%aimport models\n",
+    "from models import*\n",
+    "%aimport functions\n",
+    "from functions import*\n",
+    "\n",
+    "# Base Directory where data is stored\n",
+    "base_data_dir = '/home/rbbidart/project/rbbidart/breakHis/'"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Logistic Regression\n",
+    "* Binary C=.1\n",
+    "* 8-class C=1"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fold  0\n"
+     ]
+    }
+   ],
+   "source": [
+    "from sklearn.linear_model import LogisticRegression\n",
+    "\n",
+    "model_2 = LogisticRegression(C=.1)\n",
+    "model_8 = LogisticRegression(C=1)\n",
+    "cv_features(model_2, model_8, base_data_dir)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Random Forest\n",
+    "* Binary 160, 3\n",
+    "* 8-class 160, 3"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fold  0\n",
+      "Fold  1\n",
+      "Fold  2\n",
+      "Fold  3\n",
+      "Fold  4\n"
+     ]
+    }
+   ],
+   "source": [
+    "from sklearn.ensemble import RandomForestClassifier\n",
+    "\n",
+    "model_2 = RandomForestClassifier(n_estimators=160, min_samples_split=3)\n",
+    "model_8 = RandomForestClassifier(n_estimators=160, min_samples_split=3)\n",
+    "cv_features(model_2, model_8, base_data_dir)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## XGBoost\n",
+    "* Binary: n_estimators=200\tmax_depth=8\tlearning_rate=0.3\treg_lambda=2\n",
+    "* 8-class: n_estimators=250 max_depth=9 learning_rate=0.3\treg_lambda=2 (guess)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from xgboost import XGBClassifier\n",
+    "\n",
+    "model_2 = XGBClassifier(n_estimators=200, max_depth=8, learning_rate=0.3, reg_lambda=2)\n",
+    "model_8 = XGBClassifier(n_estimators=200, max_depth=8, learning_rate=0.3, reg_lambda=2)\n",
+    "cv_features(model_2, model_8, base_data_dir)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## KNN\n",
+    "* 9 neighbours"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn.neighbors import KNeighborsClassifier\n",
+    "\n",
+    "model_2 = KNeighborsClassifier(n_neighbors=9)\n",
+    "model_8 = KNeighborsClassifier(n_neighbors=9)\n",
+    "cv_features(model_2, model_8, base_data_dir)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.5.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/src/._train_models_k.py b/src/._train_models_k.py
diff --git a/src/__pycache__/functions.cpython-35.pyc b/src/__pycache__/functions.cpython-35.pyc
diff --git a/src/__pycache__/models.cpython-35.pyc b/src/__pycache__/models.cpython-35.pyc