diff --git a/.github/workflows/notebooks.yaml b/.github/workflows/notebooks.yaml new file mode 100644 index 00000000..8828321c --- /dev/null +++ b/.github/workflows/notebooks.yaml @@ -0,0 +1,27 @@ +name: jupytext-changes + +on: + push: + # pull_request: + +jobs: + jupytext: + runs-on: ubuntu-latest + steps: + # head_ref is only set on pull_request events; on push it is empty and checkout uses the pushed ref + - uses: actions/checkout@v4 + with: + ref: ${{ github.head_ref }} + - name: Install jupytext + run: | + pip install jupytext + + - name: Sync notebooks with their paired scripts + run: | + jupytext --use-source-timestamp --sync scripts/*.py + + # Auto commit any updated notebook files + - uses: stefanzweifel/git-auto-commit-action@v4 + with: + # This would be more useful if the git hash were referenced? + commit_message: "Auto-commit updated notebooks" diff --git a/.gitignore b/.gitignore index 1950a5b1..9e8fa3c5 100644 --- a/.gitignore +++ b/.gitignore @@ -328,3 +328,4 @@ manuscript !README.md !config.yaml !src +!notebooks diff --git a/notebooks/full.ipynb b/notebooks/full.ipynb new file mode 100644 index 00000000..df33d279 --- /dev/null +++ b/notebooks/full.ipynb @@ -0,0 +1,136 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "1c21f652", + "metadata": {}, + "outputs": [], + "source": [ + "# Import necessary modules\n", + "import bioimage_embed\n", + "import bioimage_embed.config as config\n", + "from hydra.utils import instantiate\n", + "from torchvision import datasets" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "88fb43bf", + "metadata": {}, + "outputs": [], + "source": [ + "# Define input dimensions\n", + "input_dim = [3, 224, 224]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b364758d", + "metadata": {}, + "outputs": [], + "source": [ + "# Use the default augmentation list\n", + "transform = instantiate(config.Transform())\n", + "transform.transform" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b5e1c0e0", + "metadata": {}, +
"outputs": [], + "source": [ + "# Load the CelebA dataset with the specified transformations\n", + "dataset = datasets.CelebA(\n", + " root=\"data/\",\n", + " split=\"train\",\n", + " target_type=\"attr\",\n", + " download=True,\n", + " transform=transform,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "35482694", + "metadata": {}, + "outputs": [], + "source": [ + "# Create a dataloader from the dataset\n", + "dataloader = config.DataLoader(dataset=dataset)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2de56894", + "metadata": {}, + "outputs": [], + "source": [ + "# Instantiate the model with the input dimensions\n", + "model = config.Model(input_dim=input_dim)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dcbe489e", + "metadata": {}, + "outputs": [], + "source": [ + "# Define the recipe for the model\n", + "recipe = config.Recipe(model=\"resnet18_vae\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c8a2be1b", + "metadata": {}, + "outputs": [], + "source": [ + "# Create the configuration object with the recipe, dataloader, and model\n", + "cfg = config.Config(recipe=recipe, dataloader=dataloader, model=model)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "949f9ffb", + "metadata": {}, + "outputs": [], + "source": [ + "# Initialize BioImageEmbed with the configuration\n", + "bie = bioimage_embed.BioImageEmbed(cfg)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "717481bc", + "metadata": {}, + "outputs": [], + "source": [ + "# Train and export the model if this script is run as the main program\n", + "if __name__ == \"__main__\":\n", + " bie.check().train().export(\"model\")\n", + "# lit_model = bie.check().train().get_model()\n", + "# bie.export(\"model\")" + ] + } + ], + "metadata": { + "jupytext": { + "cell_metadata_filter": "-all", + "main_language": "python", + "notebook_metadata_filter": "-all" + } 
+ }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/idr.ipynb b/notebooks/idr.ipynb new file mode 100644 index 00000000..6d5cd0c8 --- /dev/null +++ b/notebooks/idr.ipynb @@ -0,0 +1,29 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "c657c25f", + "metadata": {}, + "source": [ + "root = \"/nfs/ftp/public/databases/IDR/\"" + ] + }, + { + "cell_type": "markdown", + "id": "e8bb1331", + "metadata": {}, + "source": [ + "dataset = datasets.ImageFolder(transform=transform)" + ] + } + ], + "metadata": { + "jupytext": { + "cell_metadata_filter": "-all", + "main_language": "python", + "notebook_metadata_filter": "-all" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/simple.ipynb b/notebooks/simple.ipynb new file mode 100644 index 00000000..81a82c19 --- /dev/null +++ b/notebooks/simple.ipynb @@ -0,0 +1,156 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "ac6bd2ea", + "metadata": {}, + "outputs": [], + "source": [ + "import bioimage_embed\n", + "import bioimage_embed.config as config\n", + "\n", + "# Import necessary modules from bioimage_embed and config.\n", + "# bioimage_embed is likely a library designed for embedding biological images,\n", + "# and config is used to handle configurations." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f28d1d38", + "metadata": {}, + "outputs": [], + "source": [ + "from torchvision.datasets import FakeData\n", + "from hydra.utils import instantiate\n", + "\n", + "# Import FakeData from torchvision.datasets to create a fake dataset,\n", + "# and instantiate from hydra.utils to create instances based on configuration." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e8a6921a", + "metadata": {}, + "outputs": [], + "source": [ + "# We can instantiate a transformation from the default configuration using hydra.\n", + "transform = instantiate(config.Transform())\n", + "\n", + "# Instantiate a transformation using the configuration provided.\n", + "# This will likely include any data augmentation or preprocessing steps defined in the configuration." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c4ab05fb", + "metadata": {}, + "outputs": [], + "source": [ + "# Create a fake dataset with 64 images of size 224x224x3 (3 channels), and 10 classes.\n", + "dataset = FakeData(\n", + " size=64,\n", + " image_size=(3, 224, 224),\n", + " num_classes=10,\n", + " transform=transform,\n", + ")\n", + "\n", + "# Create a fake dataset with 64 images of size 224x224x3 (3 channels), and 10 classes.\n", + "# This dataset will be used to simulate data for testing purposes. The 'transform' argument applies the\n", + "# transformations defined earlier to the dataset.\n", + "\n", + "# NOTE: The 'dataset' must be a PyTorch Dataset object with X (data) and y (labels).\n", + "# If using an unsupervised encoder, set the labels (y) to None; the model will ignore them during training.\n", + "\n", + "# dataset=CelebA(download=True, root=\"/tmp\", split=\"train\")\n", + "\n", + "# The commented-out code suggests an alternative to use the CelebA dataset.\n", + "# It would download the CelebA dataset and use the training split, storing it in the '/tmp' directory." 
+ ] + }, + { + "cell_type": "markdown", + "id": "4f41bc2a", + "metadata": { + "lines_to_next_cell": 0 + }, + "source": [ + "We can declare a recipe and configuration object to train the model.\n", + "I\n", + "\n", + "recipe = config.Recipe(model=\"resnet18_vae\")" + ] + }, + { + "cell_type": "markdown", + "id": "199e26e3", + "metadata": {}, + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "33ba45ad", + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [], + "source": [ + "cfg = config.Config(recipe=recipe, dataset=dataset)\n", + "bie = bioimage_embed.BioImageEmbed(cfg)\n", + "\n", + "# Create a configuration object 'cfg' using the config module, and assign the fake dataset to it.\n", + "# The model is set to \"resnet18_vae\" and the maximum number of epochs for training is set to 100.\n", + "# Instantiate the BioImageEmbed object 'bie' using the configuration." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5cf97080", + "metadata": {}, + "outputs": [], + "source": [ + "def process():\n", + " bie.check()\n", + " bie.train()\n", + " bie.export()\n", + "\n", + "\n", + "# Define a process function that performs three steps:\n", + "# 1. 'check()' to verify the setup or configuration.\n", + "# 2. 'train()' to start training the model.\n", + "# 3. 'export()' to export the trained model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b4fa9482", + "metadata": {}, + "outputs": [], + "source": [ + "# This is the entrypoint for the script and very important if cfg.trainer.num_workers > 0\n", + "if __name__ == \"__main__\":\n", + " process()\n", + "\n", + "# This is the entry point for the script. 
The 'if __name__ == \"__main__\":' statement ensures that the 'process()'\n", + "# function is called only when the script is run directly, not when imported as a module.\n", + "# This is crucial if the 'num_workers' parameter is set in cfg.trainer, as it prevents potential issues\n", + "# with multiprocessing in PyTorch." + ] + } + ], + "metadata": { + "jupytext": { + "cell_metadata_filter": "-all", + "main_language": "python", + "notebook_metadata_filter": "-all" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/scripts/full.py b/scripts/full.py new file mode 100644 index 00000000..a395abb8 --- /dev/null +++ b/scripts/full.py @@ -0,0 +1,52 @@ +# %% +# Import necessary modules +import bioimage_embed +import bioimage_embed.config as config +from hydra.utils import instantiate +from torchvision import datasets + +# %% +# Define input dimensions +input_dim = [3, 224, 224] + +# %% +# Use the default augmentation list +transform = instantiate(config.Transform()) +transform.transform + +# %% +# Load the CelebA dataset with the specified transformations +dataset = datasets.CelebA( + root="data/", + split="train", + target_type="attr", + download=True, + transform=transform, +) + +# %% +# Create a dataloader from the dataset +dataloader = config.DataLoader(dataset=dataset) + +# %% +# Instantiate the model with the input dimensions +model = config.Model(input_dim=input_dim) + +# %% +# Define the recipe for the model +recipe = config.Recipe(model="resnet18_vae") + +# %% +# Create the configuration object with the recipe, dataloader, and model +cfg = config.Config(recipe=recipe, dataloader=dataloader, model=model) + +# %% +# Initialize BioImageEmbed with the configuration +bie = bioimage_embed.BioImageEmbed(cfg) + +# %% +# Train and export the model if this script is run as the main program +if __name__ == "__main__": + bie.check().train().export("model") +# lit_model = bie.check().train().get_model() +# bie.export("model") diff --git a/scripts/idr.py 
b/scripts/idr.py new file mode 100644 index 00000000..d3525289 --- /dev/null +++ b/scripts/idr.py @@ -0,0 +1,5 @@ +# %% [markdown] +# root = "/nfs/ftp/public/databases/IDR/" + +# %% [markdown] +# dataset = datasets.ImageFolder(transform=transform) diff --git a/scripts/simple.py b/scripts/simple.py index af8b53b6..33b9be32 100644 --- a/scripts/simple.py +++ b/scripts/simple.py @@ -14,12 +14,14 @@ # and instantiate from hydra.utils to create instances based on configuration. # %% +# We can instantiate a transformation from the default configuration using hydra. transform = instantiate(config.Transform()) # Instantiate a transformation using the configuration provided. # This will likely include any data augmentation or preprocessing steps defined in the configuration. # %% +# Create a fake dataset with 64 images of size 224x224x3 (3 channels), and 10 classes. dataset = FakeData( size=64, image_size=(3, 224, 224), @@ -39,13 +41,16 @@ # The commented-out code suggests an alternative to use the CelebA dataset. # It would download the CelebA dataset and use the training split, storing it in the '/tmp' directory. +# %% [markdown] +# We can declare a recipe and configuration object to train the model. + +# %% +recipe = config.Recipe(model="resnet18_vae") # %% [markdown] # # %% -cfg = config.Config(dataset=dataset) -cfg.recipe.model = "resnet18_vae" -cfg.recipe.max_epochs = 100 +cfg = config.Config(recipe=recipe, dataset=dataset) bie = bioimage_embed.BioImageEmbed(cfg) # Create a configuration object 'cfg' using the config module, and assign the fake dataset to it.