Skip to content

Commit

Permalink
Merge pull request #139 from CDU-data-science-team/137_230908_models
Browse files Browse the repository at this point in the history
137 230908 models
  • Loading branch information
yiwen-h authored Sep 12, 2023
2 parents 6221306 + 95ec0f3 commit a4b7758
Show file tree
Hide file tree
Showing 18 changed files with 121 additions and 113 deletions.
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@

*****************
Random state seed for train test split is: 639
Random state seed for train test split is: 75


Model: "model"
__________________________________________________________________________________________________
Layer (type) Output Shape Param # Connected to
==================================================================================================
Expand All @@ -14,15 +15,18 @@ ________________________________________________________________________________
hidden_states=None
, attentions=None)

input_cat (InputLayer) [(None, 1)] 0 []

tf.__operators__.getitem (Slic (None, 768) 0 ['distilbert[0][0]']
ingOpLambda)

input_cat (InputLayer) [(None, 3)] 0 []
category_encoding (CategoryEnc (None, 3) 0 ['input_cat[0][0]']
oding)

pooled_output (Dropout) (None, 768) 0 ['tf.__operators__.getitem[0][0]'
]

dense (Dense) (None, 10) 40 ['input_cat[0][0]']
dense (Dense) (None, 10) 40 ['category_encoding[0][0]']

concatenate (Concatenate) (None, 778) 0 ['pooled_output[0][0]',
'dense[0][0]']
Expand All @@ -33,34 +37,21 @@ ________________________________________________________________________________
Total params: 66,366,815
Trainable params: 66,366,815
Non-trainable params: 0
__________________________________________________________________________________________________


Training time: 4:02:31
Training time: 5:20:56


Classification report:
precision recall f1-score support

very positive 0.83 0.70 0.76 1253
positive 0.61 0.68 0.64 733
neutral 0.57 0.65 0.61 350
negative 0.68 0.84 0.75 377
very negative 0.73 0.46 0.56 104

accuracy 0.70 2817
macro avg 0.68 0.67 0.67 2817
weighted avg 0.72 0.70 0.70 2817


Combining 'very positive' and 'positive' together,
'negative' and 'very negative' together:

precision recall f1-score support

positive 0.96 0.91 0.94 1986
neutral 0.57 0.65 0.61 350
negative 0.79 0.87 0.83 481
very positive 0.80 0.79 0.80 1746
positive 0.63 0.52 0.57 841
neutral 0.52 0.71 0.60 551
negative 0.79 0.68 0.73 639
very negative 0.52 0.64 0.57 166

accuracy 0.87 2817
macro avg 0.77 0.81 0.79 2817
weighted avg 0.88 0.87 0.88 2817
accuracy 0.70 3943
macro avg 0.65 0.67 0.65 3943
weighted avg 0.71 0.70 0.70 3943
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
File renamed without changes.
File renamed without changes.
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@

*****************
Random state seed for train test split is: 299
Random state seed for train test split is: 299


Pipeline(steps=[('columntransformer',
Expand Down
Original file line number Diff line number Diff line change
@@ -1,35 +1,35 @@

*****************
Random state seed for train test split is: 42
Random state seed for train test split is: 42


Model: "model"
__________________________________________________________________________________________________
Layer (type) Output Shape Param # Connected to
Layer (type) Output Shape Param # Connected to
==================================================================================================
input_ids (InputLayer) [(None, 150)] 0 []
distilbert (TFDistilBertMainLa TFBaseModelOutput(l 66362880 ['input_ids[0][0]']
yer) ast_hidden_state=(N
one, 150, 768),
hidden_states=None
, attentions=None)
tf.__operators__.getitem (Slic (None, 768) 0 ['distilbert[0][0]']
ingOpLambda)
input_cat (InputLayer) [(None, 3)] 0 []
input_ids (InputLayer) [(None, 150)] 0 []

distilbert (TFDistilBertMainLa TFBaseModelOutput(l 66362880 ['input_ids[0][0]']
yer) ast_hidden_state=(N
one, 150, 768),
hidden_states=None
, attentions=None)

tf.__operators__.getitem (Slic (None, 768) 0 ['distilbert[0][0]']
ingOpLambda)

input_cat (InputLayer) [(None, 3)] 0 []

pooled_output (Dropout) (None, 768) 0 ['tf.__operators__.getitem[0][0]'
]
dense (Dense) (None, 10) 40 ['input_cat[0][0]']
concatenate (Concatenate) (None, 778) 0 ['pooled_output[0][0]',
'dense[0][0]']
output (Dense) (None, 44) 34276 ['concatenate[0][0]']
]

dense (Dense) (None, 10) 40 ['input_cat[0][0]']

concatenate (Concatenate) (None, 778) 0 ['pooled_output[0][0]',
'dense[0][0]']

output (Dense) (None, 44) 34276 ['concatenate[0][0]']

==================================================================================================
Total params: 66,397,196
Trainable params: 66,397,196
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@

*****************
Random state seed for train test split is: 42
Random state seed for train test split is: 42


Pipeline(steps=[('columntransformer',
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@

*****************
Random state seed for train test split is: 42
Random state seed for train test split is: 42


Pipeline(steps=[('columntransformer',
Expand Down
39 changes: 26 additions & 13 deletions pxtextmining/factories/factory_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from sklearn.compose import make_column_transformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.multioutput import MultiOutputClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
Expand Down Expand Up @@ -105,7 +105,7 @@ def create_sklearn_pipeline_sentiment(
cache_size=1000,
),
)
params["svc__C"] = stats.uniform(0.1, 20)
params["svc__C"] = [1, 5, 10, 15, 20]
params["svc__kernel"] = [
"linear",
"rbf",
Expand Down Expand Up @@ -350,7 +350,6 @@ def create_sklearn_pipeline(model_type, tokenizer=None, additional_features=True
max_iter=1000,
cache_size=1000,
),
n_jobs=-1,
),
)
params["multioutputclassifier__estimator__C"] = [1, 5, 10, 15, 20]
Expand All @@ -359,6 +358,10 @@ def create_sklearn_pipeline(model_type, tokenizer=None, additional_features=True
"rbf",
"sigmoid",
]
if "columntransformer__tfidfvectorizer__min_df" in params:
params["columntransformer__tfidfvectorizer__min_df"] = [0, 1, 2, 3, 4, 5]
else:
params["tfidfvectorizer__min_df"] = [0, 1, 2, 3, 4, 5]
if model_type == "rfc":
pipe = make_pipeline(preproc, RandomForestClassifier(n_jobs=-1))
params["randomforestclassifier__max_depth"] = stats.randint(5, 50)
Expand Down Expand Up @@ -418,16 +421,26 @@ def search_sklearn_pipelines(
model_type, additional_features=additional_features
)
start_time = time.time()
search = RandomizedSearchCV(
pipe,
params,
scoring="average_precision",
n_iter=100,
cv=4,
n_jobs=-2,
refit=True,
verbose=1,
)
if model_type == "svm":
search = GridSearchCV(
pipe,
params,
scoring="average_precision",
cv=4,
refit=True,
verbose=1,
)
else:
search = RandomizedSearchCV(
pipe,
params,
scoring="average_precision",
n_iter=100,
cv=4,
n_jobs=-2,
refit=True,
verbose=1,
)
search.fit(X_train, Y_train)
models.append(search.best_estimator_)
training_time = round(time.time() - start_time, 0)
Expand Down
72 changes: 36 additions & 36 deletions pxtextmining/pipelines/multilabel_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -333,42 +333,42 @@ def run_bert_pipeline(


if __name__ == "__main__":
run_svc_pipeline(
additional_features=False,
target=minor_cats,
path="test_multilabel/0906threshold/svc_noq",
include_analysis=True,
custom_threshold=True,
)
run_svc_pipeline(
additional_features=True,
target=minor_cats,
path="test_multilabel/0906threshold/svc",
include_analysis=True,
custom_threshold=True,
)
run_sklearn_pipeline(
additional_features=True,
target=minor_cats,
models_to_try=["xgb", "knn"],
path="test_multilabel/0906threshold/xgb",
include_analysis=True,
custom_threshold=True,
)
run_bert_pipeline(
additional_features=True,
path="test_multilabel/0906threshold/bert",
target=minor_cats,
include_analysis=True,
custom_threshold=True,
)
run_bert_pipeline(
additional_features=False,
path="test_multilabel/0906threshold/bert_noq",
target=minor_cats,
include_analysis=True,
custom_threshold=True,
)
# run_svc_pipeline(
# additional_features=False,
# target=minor_cats,
# path="test_multilabel/0906threshold/svc_noq",
# include_analysis=True,
# custom_threshold=True,
# )
# run_svc_pipeline(
# additional_features=True,
# target=minor_cats,
# path="test_multilabel/0906threshold/svc",
# include_analysis=True,
# custom_threshold=True,
# )
# run_sklearn_pipeline(
# additional_features=True,
# target=minor_cats,
# models_to_try=["xgb", "knn"],
# path="test_multilabel/0906threshold/xgb",
# include_analysis=True,
# custom_threshold=True,
# )
# run_bert_pipeline(
# additional_features=True,
# path="test_multilabel/0906threshold/bert",
# target=minor_cats,
# include_analysis=True,
# custom_threshold=True,
# )
# run_bert_pipeline(
# additional_features=False,
# path="test_multilabel/0906threshold/bert_noq",
# target=minor_cats,
# include_analysis=True,
# custom_threshold=True,
# )
run_sklearn_pipeline(
additional_features=True,
target=minor_cats,
Expand Down
10 changes: 4 additions & 6 deletions pxtextmining/pipelines/sentiment_pipeline.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
import random
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.utils import to_categorical
Expand All @@ -12,17 +10,17 @@
)
from pxtextmining.factories.factory_model_performance import get_multiclass_metrics
from pxtextmining.factories.factory_pipeline import (
search_sklearn_pipelines,
create_bert_model,
create_bert_model_additional_features,
search_sklearn_pipelines,
train_bert_model,
create_bert_model,
)
from pxtextmining.factories.factory_write_results import (
write_multilabel_models_and_metrics,
)
from pxtextmining.params import dataset

random_state = random.randint(1, 999)
random_state = 75


def run_sentiment_pipeline(
Expand Down Expand Up @@ -134,5 +132,5 @@ def run_sentiment_bert_pipeline(
if __name__ == "__main__":
# run_sentiment_pipeline(additional_features=False)
run_sentiment_bert_pipeline(
additional_features=True, path="test_multilabel/sentiment_bert"
additional_features=True, path="test_multilabel/230908_sentiment_bert"
)
14 changes: 10 additions & 4 deletions tests/test_factory_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,9 +67,13 @@ def test_create_sklearn_pipeline(model_type, tokenizer, additional_features):

@pytest.mark.parametrize("target", ["sentiment", None])
@patch("pxtextmining.factories.factory_pipeline.RandomizedSearchCV")
def test_search_sklearn_pipelines(mock_search, target, grab_test_X_additional_feats):
@patch("pxtextmining.factories.factory_pipeline.GridSearchCV")
def test_search_sklearn_pipelines(
mock_gridsearch, mock_randomsearch, target, grab_test_X_additional_feats
):
mock_instance = MagicMock()
mock_search.return_value = mock_instance
mock_gridsearch.return_value = mock_instance
mock_randomsearch.return_value = mock_instance
models_to_try = ["svm"]
X_train = grab_test_X_additional_feats
Y_train = np.array(
Expand Down Expand Up @@ -97,11 +101,13 @@ def test_search_sklearn_pipelines(mock_search, target, grab_test_X_additional_fe

@pytest.mark.parametrize("target", ["sentiment", None])
@patch("pxtextmining.factories.factory_pipeline.RandomizedSearchCV")
@patch("pxtextmining.factories.factory_pipeline.GridSearchCV")
def test_search_sklearn_pipelines_no_feats(
mock_search, target, grab_test_X_additional_feats
mock_gridsearch, mock_randomsearch, target, grab_test_X_additional_feats
):
mock_instance = MagicMock()
mock_search.return_value = mock_instance
mock_gridsearch.return_value = mock_instance
mock_randomsearch.return_value = mock_instance
models_to_try = ["svm"]
X_train = grab_test_X_additional_feats["FFT answer"]
Y_train = np.array(
Expand Down
2 changes: 1 addition & 1 deletion tests/test_model_performance.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,7 @@ def test_accuracy_per_class():


def test_parse_metrics_file():
metrics_file = "current_best_multilabel/bert_sentiment.txt"
metrics_file = "current_best_model/sentiment/bert_sentiment.txt"
labels = ["very positive", "positive", "neutral", "negative", "very negative"]
metrics_df = factory_model_performance.parse_metrics_file(metrics_file, labels)
assert metrics_df.shape == (5, 5)
Expand Down

0 comments on commit a4b7758

Please sign in to comment.