From 54a79cab38e862f9f96467178b538851488b406a Mon Sep 17 00:00:00 2001
From: Hinome <57831472+RealHinome@users.noreply.github.com>
Date: Thu, 10 Aug 2023 22:18:56 +0200
Subject: [PATCH] Add ML notebook

---
 models/HTML_Verificator.ipynb | 396 ++++++++++++++++++++++++++++++++++
 1 file changed, 396 insertions(+)
 create mode 100644 models/HTML_Verificator.ipynb

diff --git a/models/HTML_Verificator.ipynb b/models/HTML_Verificator.ipynb
new file mode 100644
index 0000000..f319a63
--- /dev/null
+++ b/models/HTML_Verificator.ipynb
@@ -0,0 +1,396 @@
+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "provenance": []
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "source": [
+        "# Phishing website detection based on HTML content\n",
+        "> We use a basic text classifier (RNN) to determine whether a website is a phishing site or a legitimate one.\n"
+      ],
+      "metadata": {
+        "id": "TK3Htc334KFg"
+      }
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "Import required modules"
+      ],
+      "metadata": {
+        "id": "UVkHdQ1m-XcS"
+      }
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 2,
+      "metadata": {
+        "id": "neTAE4jjyO7y"
+      },
+      "outputs": [],
+      "source": [
+        "import numpy as np\n",
+        "\n",
+        "import tensorflow_datasets as tfds\n",
+        "import tensorflow as tf\n",
+        "\n",
+        "tf.get_logger().setLevel('ERROR')"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "Set basic variables"
+      ],
+      "metadata": {
+        "id": "9jQSROvu_JeE"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "AUTOTUNE = tf.data.AUTOTUNE\n",
+        "batch_size, seed = 64, 42\n",
+        "tf.random.set_seed(seed)  # seed TF's global RNG too, so weight init and dropout are seeded, not only the data split"
+      ],
+      "metadata": {
+        "id": "ehg828Gb_I-2"
+      },
+      "execution_count": 3,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "Set dataset"
+      ],
+      "metadata": {
+        "id": "RGIa2byQ-fyg"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "train_dataset = tf.keras.utils.text_dataset_from_directory(\n",
+        "    'drive/MyDrive/is_ai',\n",
+        "    batch_size=batch_size,\n",
+        "    validation_split=0.2,\n",
+        "    subset='training',\n",
+        "    seed=seed)\n",
+        "\n",
+        "class_names = train_dataset.class_names  # grab before wrapping; the cached/prefetched dataset does not expose it\n",
+        "train_dataset = train_ds = train_dataset.cache().prefetch(buffer_size=AUTOTUNE)  # later cells read train_dataset\n",
+        "\n",
+        "test_dataset = tf.keras.utils.text_dataset_from_directory(\n",
+        "    'drive/MyDrive/is_ai',\n",
+        "    batch_size=batch_size,\n",
+        "    validation_split=0.2,\n",
+        "    subset='validation',\n",
+        "    seed=seed)\n",
+        "\n",
+        "test_dataset = test_dataset.cache().prefetch(buffer_size=AUTOTUNE)"
+      ],
+      "metadata": {
+        "id": "vCu3Mlth-StS"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "for texts, labels in train_dataset.take(1):  # pull one batch and show its first two samples\n",
+        "  print('text: ', texts.numpy()[:2])\n",
+        "  print('label: ', labels.numpy()[:2])"
+      ],
+      "metadata": {
+        "id": "54KG28zBACtD"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "Create the text encoder"
+      ],
+      "metadata": {
+        "id": "NpEl6ELfAtaM"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "VOCAB_SIZE = 1000  # passed to TextVectorization as max_tokens\n",
+        "encoder = tf.keras.layers.TextVectorization(max_tokens=VOCAB_SIZE)\n",
+        "# Build the vocabulary from the training texts only; the labels are dropped.\n",
+        "encoder.adapt(train_dataset.map(lambda text, _label: text))"
+      ],
+      "metadata": {
+        "id": "XZ9M9KRLAs1R"
+      },
+      "execution_count": 6,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "Create model"
+      ],
+      "metadata": {
+        "id": "4isCe5e-A8gE"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Layer stack assembled incrementally: text -> token ids -> embeddings -> BiLSTM -> dense head.\n",
+        "model = tf.keras.Sequential()\n",
+        "model.add(encoder)\n",
+        "model.add(tf.keras.layers.Embedding(\n",
+        "    input_dim=len(encoder.get_vocabulary()),\n",
+        "    output_dim=64,\n",
+        "    # Use masking to handle the variable sequence lengths\n",
+        "    mask_zero=True))\n",
+        "model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)))\n",
+        "model.add(tf.keras.layers.Dense(64, activation='relu'))\n",
+        "model.add(tf.keras.layers.Dense(1))  # single output unit, no activation"
+      ],
+      "metadata": {
+        "id": "YbUo_DGDA64b"
+      },
+      "execution_count": 7,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "Compile model to start training"
+      ],
+      "metadata": {
+        "id": "EsyuV54LBIgj"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# The model outputs logits, so accuracy must threshold at 0.0 (the default 0.5 is meant for probabilities).\n",
+        "model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True), optimizer=tf.keras.optimizers.Adam(1e-4),\n",
+        "              metrics=[tf.keras.metrics.BinaryAccuracy(threshold=0.0, name='accuracy')])"
+      ],
+      "metadata": {
+        "id": "3r6kKxy2BGTo"
+      },
+      "execution_count": 8,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "Train model"
+      ],
+      "metadata": {
+        "id": "17cC9GQCBMoT"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# No validation_steps: run validation over the whole held-out split at the end of every epoch.\n",
+        "history = model.fit(train_dataset, epochs=10,\n",
+        "                    validation_data=test_dataset)"
+      ],
+      "metadata": {
+        "id": "XJmqmLuNBN_B"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "test_loss, test_acc = model.evaluate(test_dataset)\n",
+        "\n",
+        "print('Test Loss:', test_loss)\n",
+        "print('Test Accuracy:', test_acc)"
+      ],
+      "metadata": {
+        "id": "_dcLkaISB80u"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "Build a second model with two stacked LSTM layers"
+      ],
+      "metadata": {
+        "id": "InWdPNuHCDtN"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "model = tf.keras.Sequential([\n",
+        "    encoder,  # reuses the vectorizer adapted on the training texts\n",
+        "    tf.keras.layers.Embedding(input_dim=len(encoder.get_vocabulary()), output_dim=64, mask_zero=True),\n",
+        "    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),  # full sequence feeds the next LSTM\n",
+        "    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),\n",
+        "    tf.keras.layers.Dense(64, activation='relu'),\n",
+        "    tf.keras.layers.Dropout(0.5),\n",
+        "    tf.keras.layers.Dense(1)  # single output unit, no activation\n",
+        "])"
+      ],
+      "metadata": {
+        "id": "X3iysmQyCDJS"
+      },
+      "execution_count": 11,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# The model outputs logits, so accuracy must threshold at 0.0 (the default 0.5 is meant for probabilities).\n",
+        "model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True), optimizer=tf.keras.optimizers.Adam(1e-4),\n",
+        "              metrics=[tf.keras.metrics.BinaryAccuracy(threshold=0.0, name='accuracy')])"
+      ],
+      "metadata": {
+        "id": "RmFPpid7CFSO"
+      },
+      "execution_count": 12,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# No validation_steps: run validation over the whole held-out split at the end of every epoch.\n",
+        "history = model.fit(train_dataset, epochs=10,\n",
+        "                    validation_data=test_dataset)"
+      ],
+      "metadata": {
+        "id": "O7ca3jIACGXw"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "test_loss, test_acc = model.evaluate(test_dataset)\n",
+        "\n",
+        "print('Test Loss:', test_loss)\n",
+        "print('Test Accuracy:', test_acc)"
+      ],
+      "metadata": {
+        "id": "TZGpbPgeClPg"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "Use model"
+      ],
+      "metadata": {
+        "id": "5Dzi4Au7CtdZ"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "text_text = \"\"  # stays an empty string if the dataset yields no batch\n",
+        "\n",
+        "for batch_texts, _ in train_dataset.take(1):\n",
+        "    text_text = batch_texts[:1]  # one-element slice of a training batch, used as the demo input below"
+      ],
+      "metadata": {
+        "id": "JaVwb1DVFegW"
+      },
+      "execution_count": 32,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "predictions = model.predict(text_text)  # raw logits: the final Dense(1) has no activation\n",
+        "print(tf.sigmoid(predictions).numpy())  # sigmoid maps each logit to the probability of class index 1"
+      ],
+      "metadata": {
+        "id": "DkqLwoyBCqfp"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "model.save(\"phishing\")"
+      ],
+      "metadata": {
+        "id": "cBT4oyUXDI5l"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "Reload the saved model and use it"
+      ],
+      "metadata": {
+        "id": "AyuilBw8ESAX"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "new_model = tf.keras.models.load_model(\"phishing\")"
+      ],
+      "metadata": {
+        "id": "AjxUyjudE1Ju"
+      },
+      "execution_count": 21,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "predictions = new_model.predict(text_text)  # raw logits, same as the model that was saved\n",
+        "print(tf.sigmoid(predictions).numpy())  # sigmoid maps each logit to the probability of class index 1"
+      ],
+      "metadata": {
+        "id": "oIm9OPBGERbv"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "test_loss, test_acc = new_model.evaluate(test_dataset)\n",
+        "\n",
+        "print('Test Loss:', test_loss)\n",
+        "print('Test Accuracy:', test_acc)"
+      ],
+      "metadata": {
+        "id": "SWaWQJpOEyIG"
+      },
+      "execution_count": null,
+      "outputs": []
+    }
+  ]
+}
\ No newline at end of file