From 54a79cab38e862f9f96467178b538851488b406a Mon Sep 17 00:00:00 2001
From: Hinome <57831472+RealHinome@users.noreply.github.com>
Date: Thu, 10 Aug 2023 22:18:56 +0200
Subject: [PATCH] Add ML notebook

---
 models/HTML_Verificator.ipynb | 427 ++++++++++++++++++++++++++++++++++
 1 file changed, 427 insertions(+)
 create mode 100644 models/HTML_Verificator.ipynb

diff --git a/models/HTML_Verificator.ipynb b/models/HTML_Verificator.ipynb
new file mode 100644
index 0000000..f319a63
--- /dev/null
+++ b/models/HTML_Verificator.ipynb
@@ -0,0 +1,427 @@
+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "provenance": []
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "source": [
+        "# Phishing website detection based on HTML content\n",
+        "> We use a basic text classifier (an RNN built from bidirectional LSTM layers) to determine whether a website is a phishing one or a legitimate one.\n",
+        "\n",
+        "Run the cells top to bottom on a fresh kernel; nothing below depends on out-of-order execution."
+      ],
+      "metadata": {
+        "id": "TK3Htc334KFg"
+      }
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "## Import required modules"
+      ],
+      "metadata": {
+        "id": "UVkHdQ1m-XcS"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "import numpy as np\n",
+        "\n",
+        "import tensorflow_datasets as tfds\n",
+        "import tensorflow as tf\n",
+        "\n",
+        "tf.get_logger().setLevel('ERROR')"
+      ],
+      "metadata": {
+        "id": "neTAE4jjyO7y"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "## Set basic variables\n",
+        "\n",
+        "`DATA_DIR` must contain one sub-directory per class, which is the layout `text_dataset_from_directory` reads. The default is a relative path that looks like a mounted Google Drive folder: mount the drive first or point `DATA_DIR` at your own copy of the data."
+      ],
+      "metadata": {
+        "id": "9jQSROvu_JeE"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "AUTOTUNE = tf.data.AUTOTUNE\n",
+        "\n",
+        "DATA_DIR = 'drive/MyDrive/is_ai'\n",
+        "MODEL_DIR = 'phishing'\n",
+        "\n",
+        "BATCH_SIZE = 64\n",
+        "SEED = 42\n",
+        "VALIDATION_SPLIT = 0.2\n",
+        "VOCAB_SIZE = 1000\n",
+        "EPOCHS = 10\n",
+        "LEARNING_RATE = 1e-4\n",
+        "\n",
+        "# Seed Python, NumPy and TensorFlow as well, so weight initialisation is\n",
+        "# repeatable and not only the train/validation split.\n",
+        "tf.keras.utils.set_random_seed(SEED)"
+      ],
+      "metadata": {
+        "id": "ehg828Gb_I-2"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "## Set dataset"
+      ],
+      "metadata": {
+        "id": "RGIa2byQ-fyg"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "def load_split(subset):\n",
+        "    \"\"\"Load one side ('training' or 'validation') of the split of DATA_DIR.\n",
+        "\n",
+        "    Every call shares SEED and VALIDATION_SPLIT, which is what keeps the two\n",
+        "    subsets disjoint.\n",
+        "    \"\"\"\n",
+        "    return tf.keras.utils.text_dataset_from_directory(\n",
+        "        DATA_DIR,\n",
+        "        batch_size=BATCH_SIZE,\n",
+        "        validation_split=VALIDATION_SPLIT,\n",
+        "        subset=subset,\n",
+        "        seed=SEED)"
+      ],
+      "metadata": {
+        "id": "hV0dSplitFn1"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "train_dataset = load_split('training')\n",
+        "\n",
+        "# class_names is only available on the dataset returned by the loader, so\n",
+        "# read it before cache()/prefetch() wrap the dataset.\n",
+        "class_names = train_dataset.class_names\n",
+        "assert len(class_names) == 2, f'expected exactly 2 classes, found {class_names}'\n",
+        "\n",
+        "train_ds = train_dataset.cache().prefetch(buffer_size=AUTOTUNE)\n",
+        "test_dataset = load_split('validation').cache().prefetch(buffer_size=AUTOTUNE)"
+      ],
+      "metadata": {
+        "id": "vCu3Mlth-StS"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Peek at the raw (un-cached) pipeline so the partial read does not touch the cache.\n",
+        "for example, label in train_dataset.take(1):\n",
+        "    print('text: ', example.numpy()[:2])\n",
+        "    print('label: ', label.numpy()[:2])"
+      ],
+      "metadata": {
+        "id": "54KG28zBACtD"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "## Create the text encoder"
+      ],
+      "metadata": {
+        "id": "NpEl6ELfAtaM"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "encoder = tf.keras.layers.TextVectorization(max_tokens=VOCAB_SIZE)\n",
+        "# Build the vocabulary from the training texts only (labels are dropped).\n",
+        "encoder.adapt(train_ds.map(lambda text, label: text))"
+      ],
+      "metadata": {
+        "id": "XZ9M9KRLAs1R"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "## Training helpers\n",
+        "\n",
+        "Both models end in `Dense(1)` without an activation, so they output logits: the loss uses `from_logits=True`, accuracy is thresholded at `0.0`, and predictions go through a sigmoid before being read as probabilities of `class_names[1]`."
+      ],
+      "metadata": {
+        "id": "k2TrainHelpr"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "def compile_and_fit(model):\n",
+        "    \"\"\"Compile `model` for binary classification on logits and train it.\n",
+        "\n",
+        "    Returns the History object produced by `model.fit`.\n",
+        "    \"\"\"\n",
+        "    model.compile(\n",
+        "        loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),\n",
+        "        optimizer=tf.keras.optimizers.Adam(LEARNING_RATE),\n",
+        "        # The outputs are logits, so the decision boundary is 0.0; the\n",
+        "        # 'accuracy' shortcut would compare the raw logits against 0.5.\n",
+        "        metrics=[tf.keras.metrics.BinaryAccuracy(threshold=0.0, name='accuracy')])\n",
+        "    # No validation_steps: every epoch validates on the whole validation set,\n",
+        "    # which also lets the cached dataset be read to completion.\n",
+        "    return model.fit(train_ds, epochs=EPOCHS, validation_data=test_dataset)\n",
+        "\n",
+        "\n",
+        "def report_test_metrics(model):\n",
+        "    \"\"\"Evaluate `model` on test_dataset; print and return (loss, accuracy).\"\"\"\n",
+        "    test_loss, test_acc = model.evaluate(test_dataset)\n",
+        "    print('Test Loss:', test_loss)\n",
+        "    print('Test Accuracy:', test_acc)\n",
+        "    return test_loss, test_acc\n",
+        "\n",
+        "\n",
+        "def predict_probabilities(model, texts):\n",
+        "    \"\"\"Return the sigmoid of the model's logits for a batch of texts.\"\"\"\n",
+        "    return tf.sigmoid(model.predict(texts)).numpy()"
+      ],
+      "metadata": {
+        "id": "m7HelperDefs"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "## Create model"
+      ],
+      "metadata": {
+        "id": "4isCe5e-A8gE"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "baseline_model = tf.keras.Sequential([\n",
+        "    encoder,\n",
+        "    tf.keras.layers.Embedding(\n",
+        "        input_dim=len(encoder.get_vocabulary()),\n",
+        "        output_dim=64,\n",
+        "        # Use masking to handle the variable sequence lengths\n",
+        "        mask_zero=True),\n",
+        "    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),\n",
+        "    tf.keras.layers.Dense(64, activation='relu'),\n",
+        "    tf.keras.layers.Dense(1)\n",
+        "])"
+      ],
+      "metadata": {
+        "id": "YbUo_DGDA64b"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "## Compile and train model"
+      ],
+      "metadata": {
+        "id": "17cC9GQCBMoT"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "baseline_history = compile_and_fit(baseline_model)"
+      ],
+      "metadata": {
+        "id": "XJmqmLuNBN_B"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "baseline_loss, baseline_acc = report_test_metrics(baseline_model)"
+      ],
+      "metadata": {
+        "id": "_dcLkaISB80u"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "## Stack second model"
+      ],
+      "metadata": {
+        "id": "InWdPNuHCDtN"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "model = tf.keras.Sequential([\n",
+        "    encoder,\n",
+        "    tf.keras.layers.Embedding(len(encoder.get_vocabulary()), 64, mask_zero=True),\n",
+        "    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),\n",
+        "    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),\n",
+        "    tf.keras.layers.Dense(64, activation='relu'),\n",
+        "    tf.keras.layers.Dropout(0.5),\n",
+        "    tf.keras.layers.Dense(1)\n",
+        "])"
+      ],
+      "metadata": {
+        "id": "X3iysmQyCDJS"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "history = compile_and_fit(model)"
+      ],
+      "metadata": {
+        "id": "O7ca3jIACGXw"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "test_loss, test_acc = report_test_metrics(model)"
+      ],
+      "metadata": {
+        "id": "TZGpbPgeClPg"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "## Use model"
+      ],
+      "metadata": {
+        "id": "5Dzi4Au7CtdZ"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Take a single text from the raw training pipeline as a smoke-test input.\n",
+        "for text, _ in train_dataset.take(1):\n",
+        "    sample_text = text[:1]"
+      ],
+      "metadata": {
+        "id": "JaVwb1DVFegW"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "predictions = predict_probabilities(model, sample_text)\n",
+        "print(predictions)"
+      ],
+      "metadata": {
+        "id": "DkqLwoyBCqfp"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "model.save(MODEL_DIR)"
+      ],
+      "metadata": {
+        "id": "cBT4oyUXDI5l"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "## Reload the saved model"
+      ],
+      "metadata": {
+        "id": "AyuilBw8ESAX"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "new_model = tf.keras.models.load_model(MODEL_DIR)"
+      ],
+      "metadata": {
+        "id": "AjxUyjudE1Ju"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "reloaded_predictions = predict_probabilities(new_model, sample_text)\n",
+        "print(reloaded_predictions)\n",
+        "\n",
+        "# The reloaded model must reproduce the in-memory model's output.\n",
+        "np.testing.assert_allclose(reloaded_predictions, predictions, rtol=1e-4, atol=1e-5)"
+      ],
+      "metadata": {
+        "id": "oIm9OPBGERbv"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "reloaded_loss, reloaded_acc = report_test_metrics(new_model)"
+      ],
+      "metadata": {
+        "id": "SWaWQJpOEyIG"
+      },
+      "execution_count": null,
+      "outputs": []
+    }
+  ]
+}
\ No newline at end of file