added week0_03 materials

Binpord · Mar 1, 2021 · 0e30b8d · 0e30b8d
1 parent 5e303cb
commit 0e30b8d
Show file tree

Hide file tree

Showing 3 changed files with 819 additions and 0 deletions.
diff --git a/week0_03_linear_classification/README.md b/week0_03_linear_classification/README.md
@@ -0,0 +1,2 @@
+PyTorch intro:
+[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/girafe-ai/ml-mipt/blob/basic_f20/week0_03_linear_classification/week0_03_intro_to_pytorch.ipynb)
diff --git a/week0_03_linear_classification/notmnist.py b/week0_03_linear_classification/notmnist.py
@@ -0,0 +1,43 @@
+import os
+import numpy as np
+from matplotlib.pyplot import imread
+from sklearn.model_selection import train_test_split
+from glob import glob
+
+def load_notmnist(path='./notMNIST_small',letters='ABCDEFGHIJ',
+                  img_shape=(28,28),test_size=0.25,one_hot=False):
+
+    # download data if it's missing. If you have any problems, go to the urls and load it manually.
+    if not os.path.exists(path):
+        if not os.path.exists('./notMNIST_small.tar.gz'):
+            print("Downloading data...")
+            assert os.system('curl http://yaroslavvb.com/upload/notMNIST/notMNIST_small.tar.gz > notMNIST_small.tar.gz') == 0
+        print("Extracting ...")
+        assert os.system('tar -zxvf notMNIST_small.tar.gz > untar_notmnist.log') == 0
+
+    data,labels = [],[]
+    print("Parsing...")
+    for img_path in glob(os.path.join(path,'*/*')):
+        class_i = img_path.split(os.sep)[-2]
+        if class_i not in letters: continue
+        try:
+            data.append(imread(img_path))
+            labels.append(class_i,)
+        except:
+            print("found broken img: %s [it's ok if <10 images are broken]" % img_path)
+
+    data = np.stack(data)[:,None].astype('float32')
+    data = (data - np.mean(data)) / np.std(data)
+
+    #convert classes to ints
+    letter_to_i = {l:i for i,l in enumerate(letters)}
+    labels = np.array(list(map(letter_to_i.get, labels)))
+
+    if one_hot:
+        labels = (np.arange(np.max(labels) + 1)[None,:] == labels[:, None]).astype('float32')
+
+    #split into train/test
+    X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=test_size, random_state=42)
+
+    print("Done")
+    return X_train, y_train, X_test, y_test
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		PyTorch intro:
		[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/girafe-ai/ml-mipt/blob/basic_f20/week0_03_linear_classification/week0_03_intro_to_pytorch.ipynb)