# model_training.py

import json
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import cross_val_predict, KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import label_binarize
from sklearn.metrics import confusion_matrix, classification_report, precision_recall_curve, average_precision_score
import matplotlib.pyplot as plt
import seaborn as sns
from plot_3d_graph import network_3d_plotly
import numpy as np
from tqdm import tqdm
import spacy
import hashlib
import time
import os
import shutil


def update_dataset(mode: str = "tfidf") -> None:
    """Join tracks with their album-cover captions and write the combined dataset.

    Reads ``data/cover_captions.json`` and ``data/finished_dataset.json``, pairs
    every track with each caption whose ``album`` equals the track's
    ``metadata.album``, converts the caption according to ``mode`` and writes the
    list of ``{'moods', 'metadata', 'caption'}`` records to
    ``data/combined_dataset_{mode}.json``.

    Args:
        mode: How the caption text is represented in the output.
            ``"tfidf"`` stores a lemmatised, lower-cased string with stop words
            and punctuation removed. ``"embeddings"`` stores the path of a
            ``.npy`` file holding the mean spaCy vector of the content tokens.
            ``"transformer"`` stores the path of a ``.npy`` file holding the mean
            of the transformer's last hidden layer. The two vector modes wipe
            and recreate their output directory under ``vectors/`` first.

    Raises:
        ValueError: If ``mode`` is not one of the three supported values. This
            is checked before any file is read or any directory is cleared.
    """
    if mode not in ("tfidf", "embeddings", "transformer"):
        raise ValueError("Invalid mode. Use 'tfidf', 'embeddings', or 'transformer'.")

    # Prompt text that is stripped from every stored caption before processing.
    prompt_prefix = "Describe the image in great detail: "

    def clear_directory(directory_path):
        """Delete ``directory_path`` with all its contents and recreate it empty."""
        if os.path.exists(directory_path):
            shutil.rmtree(directory_path)
        os.makedirs(directory_path, exist_ok=True)

    def save_vector(caption, vector, directory):
        """Save ``vector`` under a name built from the caption hash and the current time."""
        hex_dig = hashlib.sha256(caption.encode()).hexdigest()
        filename = f"{directory}/{hex_dig}_{int(time.time())}.npy"
        np.save(filename, vector)
        return filename

    def caption_tfidf(caption):
        """Return the caption as space-joined lower-cased lemmas without stop words/punctuation."""
        doc = nlp(caption)
        return " ".join(
            token.lemma_.lower() for token in doc if not token.is_stop and not token.is_punct
        )

    def caption_embeddings(caption):
        """Average the vectors of the caption's content lemmas and return the saved file path."""
        vectors = [
            nlp(token.lemma_).vector
            for token in nlp(caption)
            if not token.is_stop and not token.is_punct and token.has_vector
        ]
        if vectors:
            document_vector = np.mean(vectors, axis=0)
        else:
            # No usable token: fall back to an all-zero vector of the model's width.
            document_vector = np.zeros((nlp.vocab.vectors_length,))
        return save_vector(caption, document_vector, "vectors/embeddings")

    def caption_transformers(caption):
        """Average the transformer's last hidden layer over axis 0 and return the saved file path."""
        doc = nlp(caption)
        # Last layer is used for the most abstract representation.
        token_vectors = doc._.trf_data.last_hidden_layer_state.data
        return save_vector(caption, token_vectors.mean(axis=0), "vectors/transformers")

    with open("data/cover_captions.json", encoding="utf-8") as f:
        captions = json.load(f)
    with open("data/finished_dataset.json", encoding="utf-8") as f:
        data = json.load(f)

    if mode == "tfidf":
        nlp = spacy.load("en_core_web_md")
        process_caption = caption_tfidf
    elif mode == "embeddings":
        clear_directory("vectors/embeddings")
        nlp = spacy.load('en_core_web_md')
        process_caption = caption_embeddings
    else:  # "transformer" -- mode was validated at the top of the function
        clear_directory("vectors/transformers")
        nlp = spacy.load("en_core_web_trf")
        process_caption = caption_transformers

    # Index captions by album once so each track is a dictionary lookup instead
    # of a scan over every caption. Insertion order keeps the original pairing order.
    captions_by_album = {}
    for c in captions:
        captions_by_album.setdefault(c['album'], []).append(c)

    # Tracks of one album share the same caption text; run the NLP pipeline (and
    # write the vector file) once per distinct caption and reuse the result.
    processed_by_text = {}
    combined_data = []
    for d in tqdm(data):
        for c in captions_by_album.get(d['metadata']['album'], ()):
            raw_caption = c['processed'].replace(prompt_prefix, "")
            if raw_caption not in processed_by_text:
                processed_by_text[raw_caption] = process_caption(raw_caption)
            combined_data.append({
                'moods': d['moods'],
                'metadata': d['metadata'],
                'caption': processed_by_text[raw_caption],
            })

    with open(f"data/combined_dataset_{mode}.json", "w", encoding="utf-8") as f:
        json.dump(combined_data, f)


def _build_classifier(model):
    """Return the untrained classifier selected by ``model`` ('rf' or 'svm')."""
    if model == "rf":
        return RandomForestClassifier(n_estimators=100, random_state=42)
    if model == "svm":
        # probability=True is required for the predict_proba-based curves.
        return SVC(probability=True, random_state=42)
    raise ValueError("Invalid model. Use 'rf' or 'svm'.")


def _load_features(data, vector_type):
    """Build the feature matrix and subgenre label array for the given vector type."""
    y = np.array([item['metadata']['subgenre_name'] for item in data])
    if vector_type == "tfidf":
        # NOTE(review): the vectorizer is fitted on all captions before the
        # cross-validation split, so IDF statistics are shared across folds.
        X = TfidfVectorizer().fit_transform([item['caption'] for item in data])
    else:
        # For the vector modes 'caption' holds the path of a saved .npy vector.
        X = np.array([np.load(item['caption']) for item in data])
    return X, y


def _symmetric_confusion_ratio(conf_matrix: np.ndarray) -> np.ndarray:
    """Return the symmetric confusion ratio between every pair of classes.

    Entry (i, j) is the mean of ``conf[i, j] / conf[i, i]`` and
    ``conf[j, i] / conf[j, j]``. It is 0 when either class has no true
    positives, and the diagonal is set to 1.
    """
    true_positives = np.diag(conf_matrix).astype(float)
    has_hits = true_positives > 0
    # Divisions by a zero diagonal are masked out by np.where below.
    with np.errstate(divide="ignore", invalid="ignore"):
        row_ratio = conf_matrix / true_positives[:, np.newaxis]
        symmetric = (row_ratio + row_ratio.T) / 2
    ratio = np.where(np.outer(has_hits, has_hits), symmetric, 0.0)
    np.fill_diagonal(ratio, 1.0)
    return ratio


def _ratio_annotations(ratio_matrix: np.ndarray) -> np.ndarray:
    """Return heatmap cell labels: blank diagonal, '0' for zero, else two decimals without leading zero."""
    annotations = np.empty(ratio_matrix.shape, dtype=object)
    for (i, j), value in np.ndenumerate(ratio_matrix):
        if i == j:
            # Blank by position, so an off-diagonal ratio of exactly 1 stays visible.
            annotations[i, j] = ""
        elif value == 0:
            annotations[i, j] = "0"
        else:
            annotations[i, j] = f"{value:.2f}".lstrip('0')
    return annotations


def _aggregate_by_genre(conf_matrix, class_labels, genre_mapping):
    """Sum a subgenre confusion matrix into top-level genres.

    Returns ``(top_genres, top_level_conf_matrix)`` where ``top_genres`` lists
    the genres in order of first appearance among ``class_labels``.
    """
    genre_of_label = [genre_mapping[label] for label in class_labels]
    top_genres = list(dict.fromkeys(genre_of_label))
    position = {genre: i for i, genre in enumerate(top_genres)}
    genre_index = [position[genre] for genre in genre_of_label]

    top_level = np.zeros((len(top_genres), len(top_genres)), dtype=int)
    for i, top_i in enumerate(genre_index):
        for j, top_j in enumerate(genre_index):
            top_level[top_i, top_j] += conf_matrix[i, j]
    return top_genres, top_level


def _plot_roc_and_pr_curves(clf, X, y, y_pred, cv, model, vector_type):
    """Save one-vs-rest ROC and precision-recall curves under ``results/``.

    Class probabilities come from a second cross-validated run using
    ``predict_proba`` with the same ``cv`` splitter.
    """
    classes = np.unique(np.concatenate((y, y_pred)))
    # NOTE(review): label_binarize yields a single column for exactly two
    # classes, which would not line up with predict_proba -- confirm that the
    # dataset always has more than two subgenres.
    y_bin = label_binarize(y, classes=np.unique(y))
    n_classes = y_bin.shape[1]
    y_pred_probs = cross_val_predict(clf, X, y, cv=cv, method='predict_proba')

    sns.set_theme()
    sns.set_context("paper")

    # 24 distinct colours: all 20 of tab20b plus the first 4 sample points of tab20c.
    sample_points = np.linspace(0, 1, 20)
    color_palette = np.vstack((
        plt.get_cmap('tab20b')(sample_points),
        plt.get_cmap('tab20c')(sample_points[:4]),
    ))
    sns.set_palette(color_palette)
    n_colors = len(color_palette)

    fpr, tpr, roc_auc = {}, {}, {}
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_bin[:, i], y_pred_probs[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    plt.figure(figsize=(9, 9))
    for i, label in zip(range(n_classes), classes):
        # Reuse colours past the palette length instead of dropping the curve.
        plt.plot(fpr[i], tpr[i], color=color_palette[i % n_colors], lw=2,
                 label='{0} (area = {1:0.2f})'.format(label, roc_auc[i]))
    plt.plot([0, 1], [0, 1], 'k--', lw=2)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('')
    plt.legend(loc="lower right")
    plt.savefig(f"results/{model}_roc_curve_{vector_type}.pdf")
    # Close (not just clear) the figure so it is released before the next one opens.
    plt.close()

    precision, recall, average_precision = {}, {}, {}
    for i in range(n_classes):
        precision[i], recall[i], _ = precision_recall_curve(y_bin[:, i], y_pred_probs[:, i])
        average_precision[i] = average_precision_score(y_bin[:, i], y_pred_probs[:, i])

    plt.figure(figsize=(9, 9))
    for i, label in zip(range(n_classes), classes):
        plt.plot(recall[i], precision[i], color=color_palette[i % n_colors], lw=2,
                 label='{0} (AP = {1:0.2f})'.format(label, average_precision[i]))
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('')
    plt.legend(loc="upper right", fontsize='small')
    plt.savefig(f"results/{model}_precision_recall_curve_{vector_type}.pdf")
    plt.close()


def train_model(vector_type: str = "tfidf", model: str = "rf") -> None:
    """Cross-validate a subgenre classifier and write its reports and plots.

    Loads ``data/combined_dataset_{vector_type}.json``, builds features, runs
    5-fold shuffled cross-validation and writes to ``results/``: ROC and
    precision-recall curves, subgenre and genre classification reports (JSON),
    and four heatmaps (subgenre/genre confusion and ratio matrices). The
    subgenre confusion matrix is also saved to ``data/confusion_matrix.npy``
    and passed to ``network_3d_plotly``. Subgenres are mapped to genres with
    ``data/genre_mapping.json``.

    Args:
        vector_type: ``"tfidf"``, ``"embeddings"`` or ``"transformer"``.
        model: ``"rf"`` (random forest) or ``"svm"``.

    Raises:
        ValueError: If ``vector_type`` or ``model`` is not supported. Both are
            checked before any file is read.
    """
    if vector_type not in ("tfidf", "embeddings", "transformer"):
        raise ValueError("Invalid vector type. Use 'tfidf', 'embeddings', or 'transformer'.")
    if model not in ("rf", "svm"):
        raise ValueError("Invalid model. Use 'rf' or 'svm'.")

    with open(f"data/combined_dataset_{vector_type}.json", encoding="utf-8") as f:
        data = json.load(f)

    X, y = _load_features(data, vector_type)
    clf = _build_classifier(model)
    cv = KFold(n_splits=5, shuffle=True, random_state=42)

    # Out-of-fold label predictions for every sample.
    y_pred = cross_val_predict(clf, X, y, cv=cv)

    os.makedirs("results", exist_ok=True)
    _plot_roc_and_pr_curves(clf, X, y, y_pred, cv, model, vector_type)

    conf_matrix = confusion_matrix(y, y_pred)
    np.save('data/confusion_matrix.npy', conf_matrix)
    ratio_matrix = _symmetric_confusion_ratio(conf_matrix)
    annot_matrix = _ratio_annotations(ratio_matrix)

    report = classification_report(y, y_pred, output_dict=True)
    with open(f"results/{model}_report_subgenre_{vector_type}.json", "w", encoding="utf-8") as f:
        json.dump(report, f, indent=4, sort_keys=True, ensure_ascii=False)

    with open("data/genre_mapping.json", encoding="utf-8") as f:
        genre_mapping = json.load(f)

    # Sorted unique labels, computed from the same label set as conf_matrix.
    class_labels = np.unique(np.concatenate((y, y_pred)))
    top_genres, top_level_conf_matrix = _aggregate_by_genre(conf_matrix, class_labels, genre_mapping)
    ratio_genre_matrix = _symmetric_confusion_ratio(top_level_conf_matrix)
    annot_genre_matrix = _ratio_annotations(ratio_genre_matrix)

    network_3d_plotly(conf_matrix, genre_mapping, class_labels, vector_type, model)

    # Genre-level report: map every true/predicted subgenre to its genre.
    y_top = pd.Series(y).map(genre_mapping)
    y_pred_top = pd.Series(y_pred).map(genre_mapping)
    report = classification_report(y_top, y_pred_top, output_dict=True)
    with open(f"results/{model}_report_genre_{vector_type}.json", "w", encoding="utf-8") as f:
        json.dump(report, f, indent=4, sort_keys=True, ensure_ascii=False)

    subgenre_adjust = (0.16, 0.95, 0.95, 0.22)
    genre_adjust = (0.125, 0.9, 0.9, 0.1)
    # "title" is descriptive only; the figures are saved with an empty title.
    # "adjust" is (left, right, top, bottom) for subplots_adjust.
    matrices = [
        {
            "data": conf_matrix,
            "title": "Subgenre Confusion Matrix",
            "file_name": f"results/{model}_subgenre_confusion_matrix_{vector_type}.pdf",
            "fmt": "d",
            "annotations": True,
            "labels": class_labels,
            "adjust": subgenre_adjust,
        },
        {
            "data": ratio_matrix,
            "title": "Subgenre Ratio Matrix",
            "file_name": f"results/{model}_subgenre_ratio_matrix_{vector_type}.pdf",
            "fmt": "",
            "annotations": annot_matrix,
            "labels": class_labels,
            "adjust": subgenre_adjust,
        },
        {
            "data": top_level_conf_matrix,
            "title": "Genre Confusion Matrix",
            "file_name": f"results/{model}_genre_confusion_matrix_{vector_type}.pdf",
            "fmt": "d",
            "annotations": True,
            "labels": top_genres,
            "adjust": genre_adjust,
        },
        {
            "data": ratio_genre_matrix,
            "title": "Genre Ratio Matrix",
            "file_name": f"results/{model}_genre_ratio_matrix_{vector_type}.pdf",
            "fmt": "",
            "annotations": annot_genre_matrix,
            "labels": top_genres,
            "adjust": genre_adjust,
        },
    ]
    for m in matrices:
        left, right, top, bottom = m["adjust"]
        plt.figure(figsize=(10, 7))
        sns.heatmap(m["data"], annot=m["annotations"], fmt=m["fmt"], cmap="Blues",
                    xticklabels=m["labels"], yticklabels=m["labels"])
        plt.title("")
        plt.xlabel('Predicted Labels')
        plt.ylabel('True Labels')
        plt.subplots_adjust(left=left, right=right, top=top, bottom=bottom)
        plt.savefig(m["file_name"])
        plt.close()


# Experiment grid: every classifier is trained on every caption representation.
# NOTE: this runs whenever the module is executed or imported (no __main__ guard).
models = ["rf", "svm"]
modes = ["tfidf", "embeddings", "transformer"]
bar_length = len(models) * len(modes)

# Classifier-major order: all vector types for "rf" first, then for "svm".
experiment_grid = [(mod, vec) for mod in models for vec in modes]

with tqdm(total=bar_length) as pbar:
    for mod, vec in experiment_grid:
        train_model(vector_type=vec, model=mod)
        pbar.update(1)