Skip to content
This repository has been archived by the owner on Aug 11, 2023. It is now read-only.

Commit

Permalink
Add ML notebook
Browse files Browse the repository at this point in the history
  • Loading branch information
RealHinome authored Aug 10, 2023
1 parent 685c5ee commit 54a79ca
Showing 1 changed file with 396 additions and 0 deletions.
396 changes: 396 additions & 0 deletions models/HTML_Verificator.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,396 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"source": [
"# Phishing website detection based on HTML content\n",
"> We use a basic text classifier (a bidirectional LSTM RNN) to determine whether a website is a phishing site or a legitimate one.\n"
],
"metadata": {
"id": "TK3Htc334KFg"
}
},
{
"cell_type": "markdown",
"source": [
"Import required modules"
],
"metadata": {
"id": "UVkHdQ1m-XcS"
}
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"id": "neTAE4jjyO7y"
},
"outputs": [],
"source": [
"import numpy as np\n",
"\n",
"import tensorflow_datasets as tfds\n",
"import tensorflow as tf\n",
"\n",
"tf.get_logger().setLevel('ERROR')"
]
},
{
"cell_type": "markdown",
"source": [
"Set basic variables"
],
"metadata": {
"id": "9jQSROvu_JeE"
}
},
{
"cell_type": "code",
"source": [
"AUTOTUNE = tf.data.AUTOTUNE\n",
"batch_size = 64\n",
"seed = 42"
],
"metadata": {
"id": "ehg828Gb_I-2"
},
"execution_count": 3,
"outputs": []
},
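{
"cell_type": "markdown",
"source": [
"Load the training and validation datasets: an 80/20 split of the same directory, using the same seed for both subsets. The validation subset is stored in `test_dataset`."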
],
"metadata": {
"id": "RGIa2byQ-fyg"
}
},
{
"cell_type": "code",
"source": [
"train_dataset = tf.keras.utils.text_dataset_from_directory(\n",
" 'drive/MyDrive/is_ai',\n",
" batch_size=batch_size,\n",
" validation_split=0.2,\n",
" subset='training',\n",
" seed=seed)\n",
"\n",
"class_names = train_dataset.class_names\n",
"train_ds = train_dataset.cache().prefetch(buffer_size=AUTOTUNE)\n",
"\n",
"test_dataset = tf.keras.utils.text_dataset_from_directory(\n",
" 'drive/MyDrive/is_ai',\n",
" batch_size=batch_size,\n",
" validation_split=0.2,\n",
" subset='validation',\n",
" seed=seed)\n",
"\n",
"test_dataset = test_dataset.cache().prefetch(buffer_size=AUTOTUNE)"
],
"metadata": {
"id": "vCu3Mlth-StS"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"for example, label in train_dataset.take(1):\n",
" print('text: ', example.numpy()[:2])\n",
" print('label: ', label.numpy()[:2])"
],
"metadata": {
"id": "54KG28zBACtD"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"Create the text encoder"
],
"metadata": {
"id": "NpEl6ELfAtaM"
}
},
{
"cell_type": "code",
"source": [
"VOCAB_SIZE = 1000\n",
"encoder = tf.keras.layers.TextVectorization(\n",
" max_tokens=VOCAB_SIZE)\n",
"encoder.adapt(train_dataset.map(lambda text, label: text))"
],
"metadata": {
"id": "XZ9M9KRLAs1R"
},
"execution_count": 6,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"Create model"
],
"metadata": {
"id": "4isCe5e-A8gE"
}
},
{
"cell_type": "code",
"source": [
"model = tf.keras.Sequential([\n",
" encoder,\n",
" tf.keras.layers.Embedding(\n",
" input_dim=len(encoder.get_vocabulary()),\n",
" output_dim=64,\n",
" # Use masking to handle the variable sequence lengths\n",
" mask_zero=True),\n",
" tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),\n",
" tf.keras.layers.Dense(64, activation='relu'),\n",
" tf.keras.layers.Dense(1)\n",
"])"
],
"metadata": {
"id": "YbUo_DGDA64b"
},
"execution_count": 7,
"outputs": []
},
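{
"cell_type": "markdown",
"source": [
"Compile the model before training. The final `Dense(1)` layer has no activation, so the model outputs raw logits and the loss is configured with `from_logits=True`."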
],
"metadata": {
"id": "EsyuV54LBIgj"
}
},
{
"cell_type": "code",
"source": [
"model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),\n",
" optimizer=tf.keras.optimizers.Adam(1e-4),\n",
" metrics=['accuracy'])"
],
"metadata": {
"id": "3r6kKxy2BGTo"
},
"execution_count": 8,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"Train model"
],
"metadata": {
"id": "17cC9GQCBMoT"
}
},
{
"cell_type": "code",
"source": [
"history = model.fit(train_dataset, epochs=10,\n",
" validation_data=test_dataset,\n",
" validation_steps=30)"
],
"metadata": {
"id": "XJmqmLuNBN_B"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"test_loss, test_acc = model.evaluate(test_dataset)\n",
"\n",
"print('Test Loss:', test_loss)\n",
"print('Test Accuracy:', test_acc)"
],
"metadata": {
"id": "_dcLkaISB80u"
},
"execution_count": null,
"outputs": []
},
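{
"cell_type": "markdown",
"source": [
"Build a second model that stacks two bidirectional LSTM layers and adds dropout. It is assigned to the same `model` variable, so it replaces the first model."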
],
"metadata": {
"id": "InWdPNuHCDtN"
}
},
{
"cell_type": "code",
"source": [
"model = tf.keras.Sequential([\n",
" encoder,\n",
" tf.keras.layers.Embedding(len(encoder.get_vocabulary()), 64, mask_zero=True),\n",
" tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),\n",
" tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),\n",
" tf.keras.layers.Dense(64, activation='relu'),\n",
" tf.keras.layers.Dropout(0.5),\n",
" tf.keras.layers.Dense(1)\n",
"])"
],
"metadata": {
"id": "X3iysmQyCDJS"
},
"execution_count": 11,
"outputs": []
},
{
"cell_type": "code",
"source": [
"model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),\n",
" optimizer=tf.keras.optimizers.Adam(1e-4),\n",
" metrics=['accuracy'])"
],
"metadata": {
"id": "RmFPpid7CFSO"
},
"execution_count": 12,
"outputs": []
},
{
"cell_type": "code",
"source": [
"history = model.fit(train_dataset, epochs=10,\n",
" validation_data=test_dataset,\n",
" validation_steps=30)"
],
"metadata": {
"id": "O7ca3jIACGXw"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"test_loss, test_acc = model.evaluate(test_dataset)\n",
"\n",
"print('Test Loss:', test_loss)\n",
"print('Test Accuracy:', test_acc)"
],
"metadata": {
"id": "TZGpbPgeClPg"
},
"execution_count": null,
"outputs": []
},
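{
"cell_type": "markdown",
"source": [
"Use the model: predict on one sample taken from the training set, then save the model. The printed prediction is a raw logit, not a probability."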
],
"metadata": {
"id": "5Dzi4Au7CtdZ"
}
},
{
"cell_type": "code",
"source": [
"text_text = \"\"\n",
"\n",
"for text, _ in train_dataset.take(1):\n",
" text_text = text[:1]"
],
"metadata": {
"id": "JaVwb1DVFegW"
},
"execution_count": 32,
"outputs": []
},
{
"cell_type": "code",
"source": [
"predictions = model.predict(text_text)\n",
"print(predictions)"
],
"metadata": {
"id": "DkqLwoyBCqfp"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"model.save(\"phishing\")"
],
"metadata": {
"id": "cBT4oyUXDI5l"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"Load the saved model, then repeat the prediction and the evaluation with it"
],
"metadata": {
"id": "AyuilBw8ESAX"
}
},
{
"cell_type": "code",
"source": [
"new_model = tf.keras.models.load_model(\"phishing\")"
],
"metadata": {
"id": "AjxUyjudE1Ju"
},
"execution_count": 21,
"outputs": []
},
{
"cell_type": "code",
"source": [
"predictions = new_model.predict(text_text)\n",
"print(predictions)"
],
"metadata": {
"id": "oIm9OPBGERbv"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"test_loss, test_acc = new_model.evaluate(test_dataset)\n",
"\n",
"print('Test Loss:', test_loss)\n",
"print('Test Accuracy:', test_acc)"
],
"metadata": {
"id": "SWaWQJpOEyIG"
},
"execution_count": null,
"outputs": []
}
]
}

0 comments on commit 54a79ca

Please sign in to comment.