Merge pull request #139 from CDU-data-science-team/137_230908_models

137 230908 models
The-Strategy-Unit · Sep 12, 2023 · a4b7758 · a4b7758
2 parents 6221306 + 95ec0f3
commit a4b7758
Show file tree

Hide file tree

Showing 18 changed files with 121 additions and 113 deletions.
diff --git a/current_best_multilabel/bert_sentiment.txt → current_best_model/sentiment/bert_sentiment.txt b/current_best_multilabel/bert_sentiment.txt → current_best_model/sentiment/bert_sentiment.txt
@@ -1,8 +1,9 @@
 
  *****************
- Random state seed for train test split is: 639
+ Random state seed for train test split is: 75
 
 
+Model: "model"
 __________________________________________________________________________________________________
  Layer (type)                   Output Shape         Param #     Connected to
 ==================================================================================================
@@ -14,15 +15,18 @@ ________________________________________________________________________________
                                  hidden_states=None
                                 , attentions=None)
 
+ input_cat (InputLayer)         [(None, 1)]          0           []
+
  tf.__operators__.getitem (Slic  (None, 768)         0           ['distilbert[0][0]']
  ingOpLambda)
 
- input_cat (InputLayer)         [(None, 3)]          0           []
+ category_encoding (CategoryEnc  (None, 3)           0           ['input_cat[0][0]']
+ oding)
 
  pooled_output (Dropout)        (None, 768)          0           ['tf.__operators__.getitem[0][0]'
                                                                  ]
 
- dense (Dense)                  (None, 10)           40          ['input_cat[0][0]']
+ dense (Dense)                  (None, 10)           40          ['category_encoding[0][0]']
 
  concatenate (Concatenate)      (None, 778)          0           ['pooled_output[0][0]',
                                                                   'dense[0][0]']
@@ -33,34 +37,21 @@ ________________________________________________________________________________
 Total params: 66,366,815
 Trainable params: 66,366,815
 Non-trainable params: 0
+__________________________________________________________________________________________________
 
 
-Training time: 4:02:31
+Training time: 5:20:56
 
 
  Classification report:
                precision    recall  f1-score   support
 
-very positive       0.83      0.70      0.76      1253
-     positive       0.61      0.68      0.64       733
-      neutral       0.57      0.65      0.61       350
-     negative       0.68      0.84      0.75       377
-very negative       0.73      0.46      0.56       104
-
-     accuracy                           0.70      2817
-    macro avg       0.68      0.67      0.67      2817
- weighted avg       0.72      0.70      0.70      2817
-
-
- Combining 'very positive' and 'positive' together,
- 'negative' and 'very negative' together:
-
-              precision    recall  f1-score   support
-
-    positive       0.96      0.91      0.94      1986
-     neutral       0.57      0.65      0.61       350
-    negative       0.79      0.87      0.83       481
+very positive       0.80      0.79      0.80      1746
+     positive       0.63      0.52      0.57       841
+      neutral       0.52      0.71      0.60       551
+     negative       0.79      0.68      0.73       639
+very negative       0.52      0.64      0.57       166
 
-    accuracy                           0.87      2817
-   macro avg       0.77      0.81      0.79      2817
-weighted avg       0.88      0.87      0.88      2817
+     accuracy                           0.70      3943
+    macro avg       0.65      0.67      0.65      3943
+ weighted avg       0.71      0.70      0.70      3943
diff --git a/current_best_model/sentiment/confusion_matrix_3_counts.png b/current_best_model/sentiment/confusion_matrix_3_counts.png
diff --git a/current_best_model/sentiment/confusion_matrix_3_percentages.png b/current_best_model/sentiment/confusion_matrix_3_percentages.png
diff --git a/current_best_model/sentiment/confusion_matrix_5_counts.png b/current_best_model/sentiment/confusion_matrix_5_counts.png
diff --git a/current_best_model/sentiment/confusion_matrix_5_percentages.png b/current_best_model/sentiment/confusion_matrix_5_percentages.png
diff --git a/current_best_multilabel/svc_minorcats_perf.xlsx → current_best_model/svc_minorcats_perf.xlsx b/current_best_multilabel/svc_minorcats_perf.xlsx → current_best_model/svc_minorcats_perf.xlsx
diff --git a/current_best_multilabel/svc_minorcats_v5.sav → current_best_model/svc_minorcats_v5.sav b/current_best_multilabel/svc_minorcats_v5.sav → current_best_model/svc_minorcats_v5.sav
diff --git a/current_best_multilabel/svc_minorcats_v5.txt → current_best_model/svc_minorcats_v5.txt b/current_best_multilabel/svc_minorcats_v5.txt → current_best_model/svc_minorcats_v5.txt
@@ -1,6 +1,6 @@
 
  *****************
- Random state seed for train test split is: 299 
+ Random state seed for train test split is: 299
 
 
 Pipeline(steps=[('columntransformer',

diff --git a/current_best_multilabel/v6_framework/bert.txt → current_best_model/v6_framework/bert.txt b/current_best_multilabel/v6_framework/bert.txt → current_best_model/v6_framework/bert.txt
@@ -1,35 +1,35 @@
 
  *****************
- Random state seed for train test split is: 42 
+ Random state seed for train test split is: 42
 
 
 Model: "model"
 __________________________________________________________________________________________________
- Layer (type)                   Output Shape         Param #     Connected to                     
+ Layer (type)                   Output Shape         Param #     Connected to
 ==================================================================================================
- input_ids (InputLayer)         [(None, 150)]        0           []                               
-                                                                                                  
- distilbert (TFDistilBertMainLa  TFBaseModelOutput(l  66362880   ['input_ids[0][0]']              
- yer)                           ast_hidden_state=(N                                               
-                                one, 150, 768),                                                   
-                                 hidden_states=None                                               
-                                , attentions=None)                                                
-                                                                                                  
- tf.__operators__.getitem (Slic  (None, 768)         0           ['distilbert[0][0]']             
- ingOpLambda)                                                                                     
-                                                                                                  
- input_cat (InputLayer)         [(None, 3)]          0           []                               
-                                                                                                  
+ input_ids (InputLayer)         [(None, 150)]        0           []
+
+ distilbert (TFDistilBertMainLa  TFBaseModelOutput(l  66362880   ['input_ids[0][0]']
+ yer)                           ast_hidden_state=(N
+                                one, 150, 768),
+                                 hidden_states=None
+                                , attentions=None)
+
+ tf.__operators__.getitem (Slic  (None, 768)         0           ['distilbert[0][0]']
+ ingOpLambda)
+
+ input_cat (InputLayer)         [(None, 3)]          0           []
+
  pooled_output (Dropout)        (None, 768)          0           ['tf.__operators__.getitem[0][0]'
-                                                                 ]                                
-                                                                                                  
- dense (Dense)                  (None, 10)           40          ['input_cat[0][0]']              
-                                                                                                  
- concatenate (Concatenate)      (None, 778)          0           ['pooled_output[0][0]',          
-                                                                  'dense[0][0]']                  
-                                                                                                  
- output (Dense)                 (None, 44)           34276       ['concatenate[0][0]']            
-                                                                                                  
+                                                                 ]
+
+ dense (Dense)                  (None, 10)           40          ['input_cat[0][0]']
+
+ concatenate (Concatenate)      (None, 778)          0           ['pooled_output[0][0]',
+                                                                  'dense[0][0]']
+
+ output (Dense)                 (None, 44)           34276       ['concatenate[0][0]']
+
 ==================================================================================================
 Total params: 66,397,196
 Trainable params: 66,397,196

diff --git a/current_best_multilabel/v6_framework/svc.txt → current_best_model/v6_framework/svc.txt b/current_best_multilabel/v6_framework/svc.txt → current_best_model/v6_framework/svc.txt
@@ -1,6 +1,6 @@
 
  *****************
- Random state seed for train test split is: 42 
+ Random state seed for train test split is: 42
 
 
 Pipeline(steps=[('columntransformer',

diff --git a/current_best_multilabel/v6_framework/v6_230724_qtype.xlsx → current_best_model/v6_framework/v6_230724_qtype.xlsx b/current_best_multilabel/v6_framework/v6_230724_qtype.xlsx → current_best_model/v6_framework/v6_230724_qtype.xlsx
diff --git a/current_best_multilabel/v6_framework/v6_performance.xlsx → current_best_model/v6_framework/v6_performance.xlsx b/current_best_multilabel/v6_framework/v6_performance.xlsx → current_best_model/v6_framework/v6_performance.xlsx
diff --git a/current_best_multilabel/v6_framework/xgb.txt → current_best_model/v6_framework/xgb.txt b/current_best_multilabel/v6_framework/xgb.txt → current_best_model/v6_framework/xgb.txt
@@ -1,6 +1,6 @@
 
  *****************
- Random state seed for train test split is: 42 
+ Random state seed for train test split is: 42
 
 
 Pipeline(steps=[('columntransformer',

diff --git a/pxtextmining/factories/factory_pipeline.py b/pxtextmining/factories/factory_pipeline.py
@@ -7,7 +7,7 @@
 from sklearn.compose import make_column_transformer
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.feature_extraction.text import TfidfVectorizer
-from sklearn.model_selection import RandomizedSearchCV
+from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
 from sklearn.multioutput import MultiOutputClassifier
 from sklearn.naive_bayes import MultinomialNB
 from sklearn.neighbors import KNeighborsClassifier
@@ -105,7 +105,7 @@ def create_sklearn_pipeline_sentiment(
                 cache_size=1000,
             ),
         )
-        params["svc__C"] = stats.uniform(0.1, 20)
+        params["svc__C"] = [1, 5, 10, 15, 20]
         params["svc__kernel"] = [
             "linear",
             "rbf",
@@ -350,7 +350,6 @@ def create_sklearn_pipeline(model_type, tokenizer=None, additional_features=True
                     max_iter=1000,
                     cache_size=1000,
                 ),
-                n_jobs=-1,
             ),
         )
         params["multioutputclassifier__estimator__C"] = [1, 5, 10, 15, 20]
@@ -359,6 +358,10 @@ def create_sklearn_pipeline(model_type, tokenizer=None, additional_features=True
             "rbf",
             "sigmoid",
         ]
+        if "columntransformer__tfidfvectorizer__min_df" in params:
+            params["columntransformer__tfidfvectorizer__min_df"] = [0, 1, 2, 3, 4, 5]
+        else:
+            params["tfidfvectorizer__min_df"] = [0, 1, 2, 3, 4, 5]
     if model_type == "rfc":
         pipe = make_pipeline(preproc, RandomForestClassifier(n_jobs=-1))
         params["randomforestclassifier__max_depth"] = stats.randint(5, 50)
@@ -418,16 +421,26 @@ def search_sklearn_pipelines(
                     model_type, additional_features=additional_features
                 )
             start_time = time.time()
-            search = RandomizedSearchCV(
-                pipe,
-                params,
-                scoring="average_precision",
-                n_iter=100,
-                cv=4,
-                n_jobs=-2,
-                refit=True,
-                verbose=1,
-            )
+            if model_type == "svm":
+                search = GridSearchCV(
+                    pipe,
+                    params,
+                    scoring="average_precision",
+                    cv=4,
+                    refit=True,
+                    verbose=1,
+                )
+            else:
+                search = RandomizedSearchCV(
+                    pipe,
+                    params,
+                    scoring="average_precision",
+                    n_iter=100,
+                    cv=4,
+                    n_jobs=-2,
+                    refit=True,
+                    verbose=1,
+                )
             search.fit(X_train, Y_train)
             models.append(search.best_estimator_)
             training_time = round(time.time() - start_time, 0)

diff --git a/pxtextmining/pipelines/multilabel_pipeline.py b/pxtextmining/pipelines/multilabel_pipeline.py
@@ -333,42 +333,42 @@ def run_bert_pipeline(
 
 
 if __name__ == "__main__":
-    run_svc_pipeline(
-        additional_features=False,
-        target=minor_cats,
-        path="test_multilabel/0906threshold/svc_noq",
-        include_analysis=True,
-        custom_threshold=True,
-    )
-    run_svc_pipeline(
-        additional_features=True,
-        target=minor_cats,
-        path="test_multilabel/0906threshold/svc",
-        include_analysis=True,
-        custom_threshold=True,
-    )
-    run_sklearn_pipeline(
-        additional_features=True,
-        target=minor_cats,
-        models_to_try=["xgb", "knn"],
-        path="test_multilabel/0906threshold/xgb",
-        include_analysis=True,
-        custom_threshold=True,
-    )
-    run_bert_pipeline(
-        additional_features=True,
-        path="test_multilabel/0906threshold/bert",
-        target=minor_cats,
-        include_analysis=True,
-        custom_threshold=True,
-    )
-    run_bert_pipeline(
-        additional_features=False,
-        path="test_multilabel/0906threshold/bert_noq",
-        target=minor_cats,
-        include_analysis=True,
-        custom_threshold=True,
-    )
+    # run_svc_pipeline(
+    #     additional_features=False,
+    #     target=minor_cats,
+    #     path="test_multilabel/0906threshold/svc_noq",
+    #     include_analysis=True,
+    #     custom_threshold=True,
+    # )
+    # run_svc_pipeline(
+    #     additional_features=True,
+    #     target=minor_cats,
+    #     path="test_multilabel/0906threshold/svc",
+    #     include_analysis=True,
+    #     custom_threshold=True,
+    # )
+    # run_sklearn_pipeline(
+    #     additional_features=True,
+    #     target=minor_cats,
+    #     models_to_try=["xgb", "knn"],
+    #     path="test_multilabel/0906threshold/xgb",
+    #     include_analysis=True,
+    #     custom_threshold=True,
+    # )
+    # run_bert_pipeline(
+    #     additional_features=True,
+    #     path="test_multilabel/0906threshold/bert",
+    #     target=minor_cats,
+    #     include_analysis=True,
+    #     custom_threshold=True,
+    # )
+    # run_bert_pipeline(
+    #     additional_features=False,
+    #     path="test_multilabel/0906threshold/bert_noq",
+    #     target=minor_cats,
+    #     include_analysis=True,
+    #     custom_threshold=True,
+    # )
     run_sklearn_pipeline(
         additional_features=True,
         target=minor_cats,

diff --git a/pxtextmining/pipelines/sentiment_pipeline.py b/pxtextmining/pipelines/sentiment_pipeline.py
@@ -1,6 +1,4 @@
-import random
 import numpy as np
-
 from sklearn.model_selection import train_test_split
 from sklearn.utils.class_weight import compute_class_weight
 from tensorflow.keras.utils import to_categorical
@@ -12,17 +10,17 @@
 )
 from pxtextmining.factories.factory_model_performance import get_multiclass_metrics
 from pxtextmining.factories.factory_pipeline import (
-    search_sklearn_pipelines,
+    create_bert_model,
     create_bert_model_additional_features,
+    search_sklearn_pipelines,
     train_bert_model,
-    create_bert_model,
 )
 from pxtextmining.factories.factory_write_results import (
     write_multilabel_models_and_metrics,
 )
 from pxtextmining.params import dataset
 
-random_state = random.randint(1, 999)
+random_state = 75
 
 
 def run_sentiment_pipeline(
@@ -134,5 +132,5 @@ def run_sentiment_bert_pipeline(
 if __name__ == "__main__":
     # run_sentiment_pipeline(additional_features=False)
     run_sentiment_bert_pipeline(
-        additional_features=True, path="test_multilabel/sentiment_bert"
+        additional_features=True, path="test_multilabel/230908_sentiment_bert"
     )
diff --git a/tests/test_factory_pipeline.py b/tests/test_factory_pipeline.py
@@ -67,9 +67,13 @@ def test_create_sklearn_pipeline(model_type, tokenizer, additional_features):
 
 @pytest.mark.parametrize("target", ["sentiment", None])
 @patch("pxtextmining.factories.factory_pipeline.RandomizedSearchCV")
-def test_search_sklearn_pipelines(mock_search, target, grab_test_X_additional_feats):
+@patch("pxtextmining.factories.factory_pipeline.GridSearchCV")
+def test_search_sklearn_pipelines(
+    mock_gridsearch, mock_randomsearch, target, grab_test_X_additional_feats
+):
     mock_instance = MagicMock()
-    mock_search.return_value = mock_instance
+    mock_gridsearch.return_value = mock_instance
+    mock_randomsearch.return_value = mock_instance
     models_to_try = ["svm"]
     X_train = grab_test_X_additional_feats
     Y_train = np.array(
@@ -97,11 +101,13 @@ def test_search_sklearn_pipelines(mock_search, target, grab_test_X_additional_fe
 
 @pytest.mark.parametrize("target", ["sentiment", None])
 @patch("pxtextmining.factories.factory_pipeline.RandomizedSearchCV")
+@patch("pxtextmining.factories.factory_pipeline.GridSearchCV")
 def test_search_sklearn_pipelines_no_feats(
-    mock_search, target, grab_test_X_additional_feats
+    mock_gridsearch, mock_randomsearch, target, grab_test_X_additional_feats
 ):
     mock_instance = MagicMock()
-    mock_search.return_value = mock_instance
+    mock_gridsearch.return_value = mock_instance
+    mock_randomsearch.return_value = mock_instance
     models_to_try = ["svm"]
     X_train = grab_test_X_additional_feats["FFT answer"]
     Y_train = np.array(

diff --git a/tests/test_model_performance.py b/tests/test_model_performance.py
@@ -187,7 +187,7 @@ def test_accuracy_per_class():
 
 
 def test_parse_metrics_file():
-    metrics_file = "current_best_multilabel/bert_sentiment.txt"
+    metrics_file = "current_best_model/sentiment/bert_sentiment.txt"
     labels = ["very positive", "positive", "neutral", "negative", "very negative"]
     metrics_df = factory_model_performance.parse_metrics_file(metrics_file, labels)
     assert metrics_df.shape == (5, 5)