TIE-76: Install Build Pipeline (#34)
* remove model from repo

* add new workflows

* reformat steps

* disable type checking

* fix build yaml syntax

* fix yaml syntax again

* update step names

* patch python version used for python tests

* update formatting to align with ruff

* address formatting issues

* fix unit tests

* test readme edit

* add paths to `pull_request` clause

* test readme edit again

* fix typo

* only run tests when changes are made to source code

* test readme edit

* test readme edit

* test source edit

* test source edit

* test build workflow

* update paths

* test readme edit

* update readme

* test external readme

* update readme

* update test paths
mikecarenzo authored Jul 19, 2024
1 parent 6e4fa04 commit 69e7932
Showing 13 changed files with 92 additions and 61 deletions.
80 changes: 53 additions & 27 deletions .github/workflows/build_website.yml
@@ -3,7 +3,13 @@ name: Build Website
on:
push:
branches: [main]
paths:
- 'src/**'
- 'data/**/*.json'
pull_request:
paths:
- 'src/**'
- 'data/**/*.json'
workflow_dispatch:

# If another web build starts for the same branch, cancel the previous build. This
@@ -26,35 +32,55 @@ jobs:
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
steps:

# Configure Environment
- uses: actions/checkout@v3
- uses: actions/setup-node@v3
id: "setup-node"
with:
node-version: '19'
cache: 'npm'
cache-dependency-path: 'src/tie-web-interface/package-lock.json'
- name: Install dependencies
working-directory: src/tie-web-interface/
run: npm ci
# Configure Environment
- uses: actions/checkout@v3

# Lint
- name: Lint
working-directory: src/tie-web-interface/
run: npm run lint
# Configure Node
- uses: actions/setup-node@v3
id: "setup-node"
with:
node-version: '19'
cache: 'npm'
cache-dependency-path: 'src/tie-web-interface/package-lock.json'
- name: Install dependencies
working-directory: src/tie-web-interface/
run: npm ci

# Lint
- name: Lint
working-directory: src/tie-web-interface/
run: npm run lint

# Build and Upload Artifact
- name: Type Check
working-directory: src/tie-web-interface/
run: npm run type-check
- name: Build
working-directory: src/tie-web-interface/
run: npm run build-only -- --base /$BRANCH_NAME/
- name: Upload artifact
uses: actions/upload-artifact@v3
with:
name: tie_website
path: src/tie-web-interface/dist/
# Run Type Checks
- name: Type Check
working-directory: src/tie-web-interface/
run: npm run type-check

# Configure Python
- uses: actions/setup-python@v4
with:
python-version: '3.11.8'
- name: Install Poetry
run: curl -sSL https://install.python-poetry.org/ | python -
- name: Add Poetry to PATH
run: echo "$HOME/.poetry/bin" >> $GITHUB_PATH
- name: Install dependencies
run: poetry install

# Train Model
- name: Retrain Technique Inference Engine Model
working-directory: src/tie-web-interface/
run: npm run build-model

# Build Website
- name: Compile Website
working-directory: src/tie-web-interface/
run: npm run build-only -- --base /$BRANCH_NAME/
- name: Upload artifact
uses: actions/upload-artifact@v3
with:
name: tie_website
path: src/tie-web-interface/dist/

# Publish to Azure blob only on PRs, not main.
azure_blob:
8 changes: 6 additions & 2 deletions .github/workflows/python_test.yml
@@ -3,7 +3,11 @@ name: Python Tests
on:
push:
branches: [main]
paths:
- 'src/**'
pull_request:
paths:
- 'src/**'

jobs:
test:
@@ -13,7 +17,7 @@ jobs:
- uses: actions/checkout@v3
- uses: actions/setup-python@v4
with:
python-version: '3.8'
python-version: '3.11.8'
- name: Install Poetry
run: curl -sSL https://install.python-poetry.org/ | python -
- name: Add Poetry to PATH
@@ -24,7 +28,7 @@
run: poetry run ruff format --check src/
- name: Check imports formatting
run: poetry run ruff check src/
- name: Check type annotations
# - name: Check type annotations
# run: poetry run mypy --check src/
- name: Run unit tests
run: poetry run make test-ci
3 changes: 3 additions & 0 deletions src/tie-web-interface/.gitignore
@@ -28,3 +28,6 @@ coverage
*.sw?

*.tsbuildinfo

# Ignore Model
app.trained.model.zip
Binary file removed src/tie-web-interface/public/app.trained.model.zip
Binary file not shown.
1 change: 1 addition & 0 deletions src/tie-web-interface/src/App.vue
@@ -11,6 +11,7 @@
</template>

<script lang="ts">
// Test
// Dependencies
import { defineComponent, provide } from "vue";
import { useInferenceEngineStore } from "./stores/InferenceEngineStore";
3 changes: 2 additions & 1 deletion src/tie/engine.py
@@ -1,12 +1,13 @@
import copy

import numpy as np
import pandas as pd
import tensorflow as tf

from tie.constants import PredictionMethod
from tie.exceptions import TechniqueNotFoundException
from tie.matrix import ReportTechniqueMatrix
from tie.recommender import Recommender
from tie.exceptions import TechniqueNotFoundException
from tie.utils import (
get_mitre_technique_ids_to_names,
normalized_discounted_cumulative_gain,
27 changes: 16 additions & 11 deletions src/tie/matrix_builder.py
@@ -131,8 +131,9 @@ def build_train_test_validation(
interactions.
Ensures that each report has at least one technique example.
To support this, requires the number of reports m > (1-test_ratio-validation_ratio) * num_observations,
where num_observations is the number of observed report-technique interactions.
To support this, requires the number of reports
m > (1-test_ratio-validation_ratio) * num_observations, where num_observations
is the number of observed report-technique interactions.
Args:
test_ratio: The ratio of positive interactions to include in the test
@@ -153,19 +154,23 @@
data = self.build()

num_observations = data.to_numpy().sum()
# make sure that we have enough observations to at least provide a single one per report
# make sure that we have enough observations
# to at least provide a single one per report
assert data.m <= num_observations * (1 - test_ratio - validation_ratio)
# use floor since we need to have at least one example in the training set for each report
# may mean slightly less (by 1) items in test or validation set
# use floor since we need to have at least one example in the training set for
# each report may mean slightly less (by 1) items in test or validation set
num_validation_samples = math.floor(validation_ratio * num_observations)
num_test_samples = math.floor(test_ratio * num_observations)

# stategy:
# sample one index per row to make sure we have at least one training item per row
# remove these from the indices available from which to sample the test and validation data
# sample test and validation data
# training data = all indices - test indices - validation indices
# make sure to sample at least one index per row by splitting indices by row and sampling from each
# Strategy:
# - sample one index per row to make sure we have at least one training item per
# row
# - remove these from the indices available from which to sample the test and
# validation data
# - sample test and validation data
# - training data = all indices - test indices - validation indices
# - make sure to sample at least one index per row by splitting indices by row
# and sampling from each
indices_by_row = {index[0]: [] for index in data.indices}
for index in data.indices:
row, _ = index
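The strategy comments above describe how the split is made. A standalone Python sketch of that sampling scheme follows; the function name split_indices, the seed handling, and the use of the random module are illustrative assumptions rather than the library's actual implementation:

import math
import random
from collections import defaultdict

def split_indices(indices, test_ratio, validation_ratio, seed=0):
    """Split observed (row, col) interactions into train/validation/test sets,
    guaranteeing at least one training interaction per row."""
    rng = random.Random(seed)
    num_observations = len(indices)

    # Reserve one observed index per row so every report keeps a training example.
    indices_by_row = defaultdict(list)
    for row, col in indices:
        indices_by_row[row].append((row, col))
    reserved = {rng.choice(row_indices) for row_indices in indices_by_row.values()}

    # Sample test and validation interactions from the remaining pool.
    pool = [index for index in indices if index not in reserved]
    rng.shuffle(pool)
    num_validation = math.floor(validation_ratio * num_observations)
    num_test = math.floor(test_ratio * num_observations)
    validation = set(pool[:num_validation])
    test = set(pool[num_validation:num_validation + num_test])

    # Training data = all indices - test indices - validation indices.
    train = set(indices) - test - validation
    return train, validation, test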
7 changes: 4 additions & 3 deletions src/tie/recommender/implicit_bpr_recommender.py
@@ -89,9 +89,10 @@ def fit(
data: An mxn tensor of training data.
learning_rate: The learning rate.
Requires learning_rate > 0.
epochs: Number of training epochs, where each the model is trained on the cardinality
dataset in each epoch.
regularization_coefficient: Coefficient on the embedding regularization term.
epochs: Number of training epochs, where each the model is trained on the
cardinality dataset in each epoch.
regularization_coefficient: Coefficient on the embedding regularization
term.
Mutates:
The recommender to the new trained state.
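For reference, a single BPR-style stochastic update in the standard Rendle et al. formulation can be sketched as below; the names U, V, and bpr_step are illustrative assumptions, not ImplicitBprRecommender's internals:

import numpy as np

def bpr_step(U, V, u, i, j, learning_rate, reg):
    """One SGD step for a sampled (user u, positive item i, negative item j) triple."""
    u_f = U[u].copy()
    diff = V[i] - V[j]
    x_uij = u_f @ diff                    # score difference between positive and negative item
    s = 1.0 / (1.0 + np.exp(x_uij))       # sigmoid(-x_uij), the gradient scale of ln sigmoid(x_uij)
    U[u] += learning_rate * (s * diff - reg * U[u])
    V[i] += learning_rate * (s * u_f - reg * V[i])
    V[j] += learning_rate * (-s * u_f - reg * V[j])

In such a sketch, an epoch would repeat this step roughly once per observed interaction, which appears to be what the epochs description above intends.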
4 changes: 2 additions & 2 deletions src/tie/recommender/implicit_wals_recommender.py
@@ -84,8 +84,8 @@ def fit(
Args:
data: an mxn tensor of training data.
epochs: number of training epochs, where each the model is trained on the cardinality
dataset in each epoch.
epochs: number of training epochs, where each the model is trained on the
cardinality dataset in each epoch.
c: weight for negative training examples. Requires 0 < c < 1.
regularization_coefficient: coefficient on the embedding regularization
term.
1 change: 0 additions & 1 deletion src/tie/recommender/wals_recommender.py
@@ -210,7 +210,6 @@ def fit(
alpha = (1 / c) - 1

for _ in range(epochs):

# step 1: update U
self._U = self._update_factor(
self._V, P.T, alpha, regularization_coefficient
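The loop above alternates closed-form factor updates. A hedged numpy sketch of one such weighted-ALS update, assuming the common implicit-feedback formulation of Hu, Koren & Volinsky, is shown below; the name update_factor and its signature are assumptions and not necessarily those of _update_factor:

import numpy as np

def update_factor(V, P, alpha, reg):
    """Solve for row factors given fixed column factors V (n x k) and a binary
    preference matrix P (m x n), with confidence 1 + alpha on observed entries."""
    n, k = V.shape
    VtV = V.T @ V                                   # shared term across all rows
    U = np.zeros((P.shape[0], k))
    for u in range(P.shape[0]):
        p_u = P[u]
        # A = V^T C_u V + reg * I, with C_u = I + alpha * diag(p_u)
        A = VtV + (V.T * (alpha * p_u)) @ V + reg * np.eye(k)
        # b = V^T C_u p_u
        b = (V.T * (1.0 + alpha * p_u)) @ p_u
        U[u] = np.linalg.solve(A, b)
    return U

In this formulation, one epoch consists of solving for U with V fixed and then for V with U fixed.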
6 changes: 4 additions & 2 deletions src/tie/utils.py
@@ -78,7 +78,8 @@ def precision_at_k(predictions: pd.DataFrame, test_data: pd.DataFrame, k: int) -
k: the number of predictions to include in the top k. Requires 0 < k <= n.
Returns:
The computed precision for the top k predictions, or np.nan if the test set is empty.
The computed precision for the top k predictions, or np.nan if the test set is
empty.
"""
m, n = test_data.shape
assert m > 0
@@ -113,7 +114,8 @@ def recall_at_k(predictions: pd.DataFrame, test_data: pd.DataFrame, k: int) -> f
k: the number of predictions to include in the top k. Requires 0 < k <= n.
Returns:
The computed recall for the top k predictions, or np.nan if the test set is empty.
The computed recall for the top k predictions, or np.nan if the test set is
empty.
"""
m, n = test_data.shape
assert m > 0
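As a simplified, self-contained illustration of the two metrics documented above, operating on plain numpy arrays rather than the DataFrames the library uses:

import numpy as np

def precision_at_k(scores, relevant, k):
    """scores, relevant: length-n arrays, relevant is 0/1. Requires 0 < k <= n."""
    top_k = np.argsort(scores)[::-1][:k]        # indices of the k highest scores
    return relevant[top_k].sum() / k

def recall_at_k(scores, relevant, k):
    total_relevant = relevant.sum()
    if total_relevant == 0:
        return np.nan                           # empty test set, as in the docstrings
    top_k = np.argsort(scores)[::-1][:k]
    return relevant[top_k].sum() / total_relevant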
11 changes: 0 additions & 11 deletions tests/test_init.py

This file was deleted.

2 changes: 1 addition & 1 deletion tests/test_utils.py
@@ -1,7 +1,7 @@
import unittest
import math
import pandas as pd
from models import utils
import tie.utils as utils
import numpy as np
from sklearn.metrics import ndcg_score
