Commit 0ee7d16, committed by Ivan Habernal on Nov 13, 2024 (1 parent: 53f1b0e).
Showing 8 changed files with 631 additions and 0 deletions.
@@ -0,0 +1,6 @@
build
venv
nlpwdlfw.egg-info
.idea
__pycache__
dist
@@ -0,0 +1,60 @@
# NLPwDL 2024/25 Exercise 04

Gradient of a log-linear function with input (features), binary output value (gold label), and logistic loss.

Note: This repository re-uses our code from Exercise 3, such as the `ScalarNode` and the efficient implementation of backpropagation with caching.

We renamed the unit test file `test_nodes.py` from Exercise 2 to `test_tasks_ex2.py`. These unit tests must keep working!

Changes from ex03: I renamed `arguments` in `ScalarNode` to `children`. We will use not only the arguments of a function (e.g., $x$ is an argument of $y = a x + b$) but also the parameters (e.g., $a$ is a parameter in the previous example). Both arguments and parameters are children of a node and thus must smoothly propagate gradients during backpropagation; see the small example below.
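
For illustration, here is a minimal sketch of this design (not part of the exercise code; the import path is an assumption, since file names are collapsed in this diff view): for $y = a x + b$, the argument $x$ and the parameters $a$ and $b$ all end up as children in the computational graph.

```python
# Assumed import path; adjust to wherever the node classes actually live.
from nlpwdlfw.nodes import ConstantNode, ParameterNode, ProductNode, SumNode

x = ConstantNode(2.0)      # argument
a = ParameterNode(0.5)     # trainable parameter
b = ParameterNode(-1.0)    # trainable parameter

ax = ProductNode([a, x])   # children: a and x
y = SumNode([ax, b])       # children: ax and b

print(y.value())                        # 0.5 * 2.0 + (-1.0) = 0.0
print(a.global_derivative_wrt_self())   # dy/da = x = 2.0, found via backpropagation
```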

## Tasks

### Task 0 (warm-up, do at home before the exercise class)

Implement a new scalar node (named `ParameterNode`) which is almost identical to a `ConstantNode`, except that its value can be changed.

### Task 1

Implement a linear function node (compute the output value).

Recall: A linear function $y = f(x_1, x_2, ..., x_n)$ has $n$ weight parameters $w_1, ..., w_n$ and a single bias parameter $b$:

$y = w_1 x_1 + w_2 x_2 + ... + w_n x_n + b$
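
For example, with $n = 2$, weights $w_1 = 2$, $w_2 = -1$, bias $b = 0.5$, and inputs $x_1 = 1$, $x_2 = 3$, the output is $y = 2 \cdot 1 + (-1) \cdot 3 + 0.5 = -0.5$.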

I extended `ScalarNode` and created a `LinearNode`. The arguments `x_1, ..., x_n` of `LinearNode` will again be just a list of other nodes. However, we will also pass a list of parameters `w_1, ..., w_n` and the bias `b`, which should be created using the `ParameterNode` (see the construction sketch below).
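
A construction-only sketch of how such a node might be instantiated (this only exercises the constructor from the skeleton; computing the value is Task 1, and the import path is again an assumption):

```python
from nlpwdlfw.nodes import ConstantNode, ParameterNode, LinearNode  # assumed path

x1, x2 = ConstantNode(1.0), ConstantNode(3.0)        # arguments
w1, w2 = ParameterNode(2.0), ParameterNode(-1.0)     # weight parameters
b = ParameterNode(0.5)                               # bias parameter

# All of x1, x2, w1, w2, and b become children of the LinearNode.
y = LinearNode(arguments=[x1, x2], weights=[w1, w2], bias=b)
# Once Task 1 is solved, y.value() should return 2*1 + (-1)*3 + 0.5 = -0.5.
```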

### Task 2

Implement the rest of the linear function node, namely the partial derivatives.

### Task 3

Implement a sigmoid node.
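
Recall: The sigmoid function and its derivative (a standard identity, stated here for convenience) are $\sigma(z) = \frac{1}{1 + e^{-z}}$ and $\frac{d\sigma}{dz} = \sigma(z)(1 - \sigma(z))$.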

### Task 4

Implement a per-example binary logistic loss (cross-entropy loss).
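
Recall: For a gold label $y \in \{0, 1\}$ and a predicted probability $\hat{y}$, the per-example binary cross-entropy loss is $L(\hat{y}, y) = -\big(y \log \hat{y} + (1 - y) \log (1 - \hat{y})\big)$, with partial derivative $\frac{\partial L}{\partial \hat{y}} = -\frac{y}{\hat{y}} + \frac{1 - y}{1 - \hat{y}}$.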

### Task 5

Implement updating the parameters by taking the step determined by the gradient.
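
Recall: The standard gradient-descent update with learning rate $\eta$ is $w \leftarrow w - \eta \, \frac{\partial L}{\partial w}$ for each trainable parameter $w$, where $\frac{\partial L}{\partial w}$ is the global derivative of the loss with respect to that parameter.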

## Solutions

Solutions will be included in the codebase in the next exercise.

## Setup

Create a virtual environment:

```bash
$ virtualenv venv
$ source venv/bin/activate
```
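
If the unit tests cannot import the `nlpwdlfw` package in your environment, installing it in editable mode may help (this is an assumption about your local setup, not a step from the original instructions):

```bash
$ pip install -e .
```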

Run the unit tests from the command line:

```bash
$ python -m unittest
```
Empty file.
@@ -0,0 +1,293 @@
import math
from typing import List


class ScalarNodeCache:
    value = None
    local_partial_derivatives_wrt_children = None
    global_derivative_wrt_self = None


class ScalarNode:

    def __init__(self, children: List['ScalarNode']) -> None:
        # We need to "wire up" all the children nodes. Each of them must know its parent
        # (which is this node), otherwise they would not be part of the computational
        # graph and backpropagation on them would not work.
        self._parents = []
        self._children = children
        for child in self._children:
            child._parents.append(self)

        # Empty the cache
        self._cache = ScalarNodeCache()

    def value(self) -> float:
        raise NotImplementedError()

    def local_partial_derivatives_wrt_children(self) -> List[float]:
        raise NotImplementedError()

    def find_self_position_in_parents_children(self, parent: 'ScalarNode') -> int:
        for i, child in enumerate(parent._children):
            if self == child:
                return i

        raise Exception("Self not found in parent's children")

    def global_derivative_wrt_self(self) -> float:
        # Look up in the cache first
        if self._cache.global_derivative_wrt_self is not None:
            return self._cache.global_derivative_wrt_self

        if len(self._parents) == 0:
            # no parent, this must be the output node, and d out/d out = 1.0
            return 1.0
        else:
            result = 0.0
            # Generalized chain rule: For each parent, get its "global" derivative and multiply it
            # by the parent's partial derivative with respect to this node; sum the products up
            for p in self._parents:
                index_in_parents_children = self.find_self_position_in_parents_children(p)
                parent_to_self_derivative = p.local_partial_derivatives_wrt_children()[index_in_parents_children]
                parent_global_derivative = p.global_derivative_wrt_self()

                result += parent_to_self_derivative * parent_global_derivative

            # Save to the cache
            self._cache.global_derivative_wrt_self = result

            return result

    def reset_cache(self) -> None:
        self._cache = ScalarNodeCache()

    def update_parameters_by_gradient_step(self, learning_rate: float) -> None:
        # Update if this is a trainable parameter; this is a little hacky, we should implement
        # this part simply as a method in ParameterNode, but it's here for simplicity, to keep
        # the update in one place
        if isinstance(self, ParameterNode):
            # --- TODO TASK_5 ---
            pass

            # --- TASK_5 ---

        # And call it recursively on all children
        for child in self._children:
            child.update_parameters_by_gradient_step(learning_rate)

    def clean_cache_recursively(self) -> None:
        # We need to do a certain operation with every node
        # --- TODO TASK_5 ---
        pass

        # --- TASK_5 ---

        # And call it recursively on all children
        for child in self._children:
            child.clean_cache_recursively()


class ConstantNode(ScalarNode):

    def __init__(self, value: float) -> None:
        super().__init__([])
        self._value = value

    def value(self) -> float:
        return self._value


class SumNode(ScalarNode):

    def value(self) -> float:
        if self._cache.value is not None:
            return self._cache.value

        result = 0.0
        # Sum all arguments' values
        for child in self._children:
            result += child.value()

        # Save to the cache
        self._cache.value = result

        return result

    def local_partial_derivatives_wrt_children(self) -> List[float]:
        # Partial derivative wrt. each argument is 1.0, for example
        # y = w_1 + w_2 + w_3
        # dy/dw_1 = 1
        # dy/dw_2 = 1
        # dy/dw_3 = 1

        return [1.0] * len(self._children)


class ProductNode(ScalarNode):

    def value(self) -> float:
        if self._cache.value is not None:
            return self._cache.value

        result = 1.0
        # Multiply all arguments' values
        for child in self._children:
            result *= child.value()

        # Save to the cache
        self._cache.value = result

        return result

    def local_partial_derivatives_wrt_children(self) -> List[float]:
        # Partial derivative wrt. each argument is the product of all other arguments, for example
        # y = w_1 * w_2 * w_3
        # dy/dw_1 = w_2 * w_3
        # dy/dw_2 = w_1 * w_3
        # dy/dw_3 = w_1 * w_2

        if self._cache.local_partial_derivatives_wrt_children is not None:
            return self._cache.local_partial_derivatives_wrt_children

        # zero-filled result
        result = [0.0] * len(self._children)

        # For each i-th argument, compute the product of all other arguments
        for i in range(len(self._children)):
            ith_result = 1.0
            for j in range(len(self._children)):
                if i != j:  # Skip the i-th argument in the product computation
                    j_value = self._children[j].value()
                    ith_result *= j_value
            result[i] = ith_result

        # Save to the cache
        self._cache.local_partial_derivatives_wrt_children = result

        return result


class ParameterNode(ConstantNode):

    def set_value(self, value: float) -> None:
        # --- TODO TASK_0 ---
        pass

        # --- TASK_0 ---


class LinearNode(ScalarNode):

    def __init__(self, arguments: List[ScalarNode], weights: List[ParameterNode], bias: ParameterNode) -> None:
        # This is an important but arbitrary design choice!
        # We pack arguments, weights, and bias into a single list of children
        super().__init__(arguments + weights + [bias])
        self._arguments = arguments
        self._weights = weights
        self._bias = bias

        # We must have the same number of weights as arguments
        assert len(weights) == len(arguments)

    def value(self) -> float:
        if self._cache.value is not None:
            return self._cache.value

        result = 0.0

        # --- TODO TASK_1 ---

        # --- TASK_1 ---

        # Save to the cache
        self._cache.value = result

        return result

    def local_partial_derivatives_wrt_children(self) -> List[float]:
        if self._cache.local_partial_derivatives_wrt_children is not None:
            return self._cache.local_partial_derivatives_wrt_children

        result = None
        # --- TODO TASK_2 ---

        # --- TASK_2 ---

        # Save to the cache
        self._cache.local_partial_derivatives_wrt_children = result

        return result


class SigmoidNode(ScalarNode):

    def __init__(self, argument: ScalarNode) -> None:
        # Single-item list of children
        super().__init__([argument])

    def value(self) -> float:
        if self._cache.value is not None:
            return self._cache.value

        result = 0.0
        # --- TODO TASK_3 ---

        # --- TASK_3 ---

        # Save to the cache
        self._cache.value = result

        return result

    def local_partial_derivatives_wrt_children(self) -> List[float]:
        if self._cache.local_partial_derivatives_wrt_children is not None:
            return self._cache.local_partial_derivatives_wrt_children

        result = []
        # --- TODO TASK_3 ---

        # --- TASK_3 ---

        # Save to the cache
        self._cache.local_partial_derivatives_wrt_children = result

        return result


class CrossEntropyLoss(ScalarNode):

    def __init__(self, y_hat: ScalarNode, gold_label: ConstantNode) -> None:
        # Single-item list of children
        super().__init__([y_hat])

        self._gold_label = gold_label

    def value(self) -> float:
        if self._cache.value is not None:
            return self._cache.value

        result = 0.0
        # --- TODO TASK_4 ---

        # --- TASK_4 ---

        # Save to the cache
        self._cache.value = result

        return result

    def local_partial_derivatives_wrt_children(self) -> List[float]:
        if self._cache.local_partial_derivatives_wrt_children is not None:
            return self._cache.local_partial_derivatives_wrt_children

        result = []
        # --- TODO TASK_4 ---

        # --- TASK_4 ---

        # Save to the cache
        self._cache.local_partial_derivatives_wrt_children = result

        return result
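
The finished nodes carried over from ex03 (`ConstantNode`, `SumNode`, `ProductNode`) already allow a quick sanity check of the caching and backpropagation machinery. A minimal sketch, assuming the classes above are importable (the exact module path is not shown in this diff):

```python
from nlpwdlfw.nodes import ConstantNode, SumNode, ProductNode  # assumed path

a = ConstantNode(2.0)
b = ConstantNode(3.0)
prod = ProductNode([a, b])                  # 2 * 3
out = SumNode([prod, ConstantNode(1.0)])    # 2 * 3 + 1

print(out.value())                       # 7.0
print(a.global_derivative_wrt_self())    # d out / d a = b = 3.0
```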
@@ -0,0 +1,12 @@
from setuptools import setup

setup(
    name='nlpwdlfw',
    version='0.3',
    packages=['nlpwdlfw'],
    license='Apache License, Version 2.0',
    author='Ivan Habernal',
    description='NLPwDL Framework',
    # enable unittest discovery
    test_suite='tests',
)
Empty file.