This repository has been archived by the owner on Aug 11, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
685c5ee
commit 54a79ca
Showing
1 changed file
with
396 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,396 @@ | ||
{ | ||
"nbformat": 4, | ||
"nbformat_minor": 0, | ||
"metadata": { | ||
"colab": { | ||
"provenance": [] | ||
}, | ||
"kernelspec": { | ||
"name": "python3", | ||
"display_name": "Python 3" | ||
}, | ||
"language_info": { | ||
"name": "python" | ||
} | ||
}, | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"source": [ | ||
"# Phishing website detection based on HTML content\n", | ||
"> We use a basic text classifier (an RNN) to determine whether a website is a phishing site or a legitimate one.\n" | ||
], | ||
"metadata": { | ||
"id": "TK3Htc334KFg" | ||
} | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"source": [ | ||
"Import required modules" | ||
], | ||
"metadata": { | ||
"id": "UVkHdQ1m-XcS" | ||
} | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 2, | ||
"metadata": { | ||
"id": "neTAE4jjyO7y" | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"import numpy as np\n", | ||
"\n", | ||
"import tensorflow_datasets as tfds\n", | ||
"import tensorflow as tf\n", | ||
"\n", | ||
"tf.get_logger().setLevel('ERROR')" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"source": [ | ||
"Set basic variables" | ||
], | ||
"metadata": { | ||
"id": "9jQSROvu_JeE" | ||
} | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"source": [ | ||
"# Let tf.data pick prefetch buffer sizes dynamically.\n", | ||
"AUTOTUNE = tf.data.AUTOTUNE\n", | ||
"# Number of documents per batch.\n", | ||
"batch_size = 64\n", | ||
"# Seed shared by the training/validation split below.\n", | ||
"seed = 42\n", | ||
"\n", | ||
"# Also seed TensorFlow's global RNG so weight initialisation and dropout are\n", | ||
"# more repeatable between runs, not only the dataset split.\n", | ||
"tf.random.set_seed(seed)" | ||
], | ||
"metadata": { | ||
"id": "ehg828Gb_I-2" | ||
}, | ||
"execution_count": 3, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"source": [ | ||
"Load the training and validation datasets" | ||
], | ||
"metadata": { | ||
"id": "RGIa2byQ-fyg" | ||
} | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"source": [ | ||
"# Both subsets are read from the same directory. Passing the same `seed` and\n", | ||
"# `validation_split` to both calls yields an 80/20 split of the files.\n", | ||
"data_dir = 'drive/MyDrive/is_ai'\n", | ||
"\n", | ||
"train_dataset = tf.keras.utils.text_dataset_from_directory(\n", | ||
"    data_dir,\n", | ||
"    batch_size=batch_size,\n", | ||
"    validation_split=0.2,\n", | ||
"    subset='training',\n", | ||
"    seed=seed)\n", | ||
"\n", | ||
"# Read class_names first: the dataset returned by cache()/prefetch() no longer\n", | ||
"# exposes that attribute.\n", | ||
"class_names = train_dataset.class_names\n", | ||
"# Rebind `train_dataset` (the name every later cell uses) so the cached,\n", | ||
"# prefetched pipeline is the one that is actually trained on. Previously the\n", | ||
"# result was stored in `train_ds`, which nothing else referenced.\n", | ||
"train_dataset = train_dataset.cache().prefetch(buffer_size=AUTOTUNE)\n", | ||
"# Alias kept for the name this cell used to define.\n", | ||
"train_ds = train_dataset\n", | ||
"\n", | ||
"test_dataset = tf.keras.utils.text_dataset_from_directory(\n", | ||
"    data_dir,\n", | ||
"    batch_size=batch_size,\n", | ||
"    validation_split=0.2,\n", | ||
"    subset='validation',\n", | ||
"    seed=seed)\n", | ||
"\n", | ||
"test_dataset = test_dataset.cache().prefetch(buffer_size=AUTOTUNE)" | ||
], | ||
"metadata": { | ||
"id": "vCu3Mlth-StS" | ||
}, | ||
"execution_count": null, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"source": [ | ||
"# Show the first two documents and labels of one training batch.\n", | ||
"for text_batch, label_batch in train_dataset.take(1):\n", | ||
"    first_texts = text_batch.numpy()[:2]\n", | ||
"    first_labels = label_batch.numpy()[:2]\n", | ||
"    print('text: ', first_texts)\n", | ||
"    print('label: ', first_labels)" | ||
], | ||
"metadata": { | ||
"id": "54KG28zBACtD" | ||
}, | ||
"execution_count": null, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"source": [ | ||
"Create the text encoder" | ||
], | ||
"metadata": { | ||
"id": "NpEl6ELfAtaM" | ||
} | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"source": [ | ||
"# Upper bound on the vocabulary learned by the encoder.\n", | ||
"VOCAB_SIZE = 1000\n", | ||
"\n", | ||
"# Fit the vocabulary on the training text only; the labels are dropped.\n", | ||
"train_text = train_dataset.map(lambda text, label: text)\n", | ||
"encoder = tf.keras.layers.TextVectorization(max_tokens=VOCAB_SIZE)\n", | ||
"encoder.adapt(train_text)" | ||
], | ||
"metadata": { | ||
"id": "XZ9M9KRLAs1R" | ||
}, | ||
"execution_count": 6, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"source": [ | ||
"Create model" | ||
], | ||
"metadata": { | ||
"id": "4isCe5e-A8gE" | ||
} | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"source": [ | ||
"# Raw text -> token ids -> 64-d embeddings -> bidirectional LSTM -> dense head.\n", | ||
"model = tf.keras.Sequential()\n", | ||
"model.add(encoder)\n", | ||
"model.add(tf.keras.layers.Embedding(\n", | ||
"    input_dim=len(encoder.get_vocabulary()),\n", | ||
"    output_dim=64,\n", | ||
"    # Masking lets the recurrent layer cope with variable sequence lengths.\n", | ||
"    mask_zero=True))\n", | ||
"model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)))\n", | ||
"model.add(tf.keras.layers.Dense(64, activation='relu'))\n", | ||
"# Single output unit without activation: the model emits a raw logit.\n", | ||
"model.add(tf.keras.layers.Dense(1))" | ||
], | ||
"metadata": { | ||
"id": "YbUo_DGDA64b" | ||
}, | ||
"execution_count": 7, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"source": [ | ||
"Compile model to start training" | ||
], | ||
"metadata": { | ||
"id": "EsyuV54LBIgj" | ||
} | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"source": [ | ||
"# The final Dense(1) layer emits raw logits, so the loss uses from_logits=True\n", | ||
"# and accuracy has to threshold at logit 0.0 (i.e. probability 0.5). The plain\n", | ||
"# 'accuracy' string would compare the logits against 0.5 instead. The metric\n", | ||
"# keeps the name 'accuracy' so the history keys are unchanged.\n", | ||
"model.compile(\n", | ||
"    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),\n", | ||
"    optimizer=tf.keras.optimizers.Adam(1e-4),\n", | ||
"    metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.0)])" | ||
], | ||
"metadata": { | ||
"id": "3r6kKxy2BGTo" | ||
}, | ||
"execution_count": 8, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"source": [ | ||
"Train model" | ||
], | ||
"metadata": { | ||
"id": "17cC9GQCBMoT" | ||
} | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"source": [ | ||
"# validation_steps is left at its default so each epoch validates on the whole\n", | ||
"# held-out set. A fixed step count would score an arbitrary slice of it and\n", | ||
"# would only partially read the cached validation dataset.\n", | ||
"history = model.fit(\n", | ||
"    train_dataset,\n", | ||
"    epochs=10,\n", | ||
"    validation_data=test_dataset)" | ||
], | ||
"metadata": { | ||
"id": "XJmqmLuNBN_B" | ||
}, | ||
"execution_count": null, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"source": [ | ||
"# Loss and accuracy of the trained model on the held-out set.\n", | ||
"eval_results = model.evaluate(test_dataset)\n", | ||
"test_loss, test_acc = eval_results\n", | ||
"\n", | ||
"for caption, value in (('Test Loss:', test_loss), ('Test Accuracy:', test_acc)):\n", | ||
"    print(caption, value)" | ||
], | ||
"metadata": { | ||
"id": "_dcLkaISB80u" | ||
}, | ||
"execution_count": null, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"source": [ | ||
"Build a second model with two stacked bidirectional LSTM layers" | ||
], | ||
"metadata": { | ||
"id": "InWdPNuHCDtN" | ||
} | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"source": [ | ||
"# Same pipeline as before, but with two stacked bidirectional LSTM layers and\n", | ||
"# dropout before the output unit.\n", | ||
"model = tf.keras.Sequential()\n", | ||
"model.add(encoder)\n", | ||
"model.add(tf.keras.layers.Embedding(len(encoder.get_vocabulary()), 64, mask_zero=True))\n", | ||
"# return_sequences=True hands the full sequence to the second LSTM layer.\n", | ||
"model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)))\n", | ||
"model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)))\n", | ||
"model.add(tf.keras.layers.Dense(64, activation='relu'))\n", | ||
"model.add(tf.keras.layers.Dropout(0.5))\n", | ||
"# Single output unit without activation: the model emits a raw logit.\n", | ||
"model.add(tf.keras.layers.Dense(1))" | ||
], | ||
"metadata": { | ||
"id": "X3iysmQyCDJS" | ||
}, | ||
"execution_count": 11, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"source": [ | ||
"# The final Dense(1) layer emits raw logits, so the loss uses from_logits=True\n", | ||
"# and accuracy has to threshold at logit 0.0 (i.e. probability 0.5). The plain\n", | ||
"# 'accuracy' string would compare the logits against 0.5 instead. The metric\n", | ||
"# keeps the name 'accuracy' so the history keys are unchanged.\n", | ||
"model.compile(\n", | ||
"    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),\n", | ||
"    optimizer=tf.keras.optimizers.Adam(1e-4),\n", | ||
"    metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.0)])" | ||
], | ||
"metadata": { | ||
"id": "RmFPpid7CFSO" | ||
}, | ||
"execution_count": 12, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"source": [ | ||
"# validation_steps is left at its default so each epoch validates on the whole\n", | ||
"# held-out set. A fixed step count would score an arbitrary slice of it and\n", | ||
"# would only partially read the cached validation dataset.\n", | ||
"history = model.fit(\n", | ||
"    train_dataset,\n", | ||
"    epochs=10,\n", | ||
"    validation_data=test_dataset)" | ||
], | ||
"metadata": { | ||
"id": "O7ca3jIACGXw" | ||
}, | ||
"execution_count": null, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"source": [ | ||
"# Loss and accuracy of the stacked model on the held-out set.\n", | ||
"eval_results = model.evaluate(test_dataset)\n", | ||
"test_loss, test_acc = eval_results\n", | ||
"\n", | ||
"for caption, value in (('Test Loss:', test_loss), ('Test Accuracy:', test_acc)):\n", | ||
"    print(caption, value)" | ||
], | ||
"metadata": { | ||
"id": "TZGpbPgeClPg" | ||
}, | ||
"execution_count": null, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"source": [ | ||
"Use the model: predict on a sample document, then save the model" | ||
], | ||
"metadata": { | ||
"id": "5Dzi4Au7CtdZ" | ||
} | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"source": [ | ||
"# Keep the first document of one training batch (still a batch of size one)\n", | ||
"# as the sample input for the prediction cells below.\n", | ||
"text_text = \"\"\n", | ||
"\n", | ||
"for sample_batch, _sample_labels in train_dataset.take(1):\n", | ||
"    text_text = sample_batch[:1]" | ||
], | ||
"metadata": { | ||
"id": "JaVwb1DVFegW" | ||
}, | ||
"execution_count": 32, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"source": [ | ||
"# The model emits a raw logit, so the sigmoid probability is printed as well.\n", | ||
"# Class 1 is class_names[1] (the labels index into class_names).\n", | ||
"predictions = model.predict(text_text)\n", | ||
"print(predictions)\n", | ||
"print('Probability of class 1:', tf.sigmoid(predictions).numpy())" | ||
], | ||
"metadata": { | ||
"id": "DkqLwoyBCqfp" | ||
}, | ||
"execution_count": null, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"source": [ | ||
"# Persist the trained model under the relative path \"phishing\".\n", | ||
"model.save(filepath=\"phishing\")" | ||
], | ||
"metadata": { | ||
"id": "cBT4oyUXDI5l" | ||
}, | ||
"execution_count": null, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"source": [ | ||
"Reload the saved model and use it" | ||
], | ||
"metadata": { | ||
"id": "AyuilBw8ESAX" | ||
} | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"source": [ | ||
"# Restore the model from the relative path \"phishing\" written by the save cell.\n", | ||
"new_model = tf.keras.models.load_model(filepath=\"phishing\")" | ||
], | ||
"metadata": { | ||
"id": "AjxUyjudE1Ju" | ||
}, | ||
"execution_count": 21, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"source": [ | ||
"# The reloaded model also emits a raw logit, so the sigmoid probability is\n", | ||
"# printed as well. Class 1 is class_names[1] (the labels index into class_names).\n", | ||
"predictions = new_model.predict(text_text)\n", | ||
"print(predictions)\n", | ||
"print('Probability of class 1:', tf.sigmoid(predictions).numpy())" | ||
], | ||
"metadata": { | ||
"id": "oIm9OPBGERbv" | ||
}, | ||
"execution_count": null, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"source": [ | ||
"# Loss and accuracy of the reloaded model on the held-out set.\n", | ||
"eval_results = new_model.evaluate(test_dataset)\n", | ||
"test_loss, test_acc = eval_results\n", | ||
"\n", | ||
"for caption, value in (('Test Loss:', test_loss), ('Test Accuracy:', test_acc)):\n", | ||
"    print(caption, value)" | ||
], | ||
"metadata": { | ||
"id": "SWaWQJpOEyIG" | ||
}, | ||
"execution_count": null, | ||
"outputs": [] | ||
} | ||
] | ||
} |