diff --git a/README.md b/README.md
index c0ac8c60..e69de29b 100644
--- a/README.md
+++ b/README.md
@@ -1,84 +0,0 @@
-# bioimage_embed: Autoencoders for Biological Image Data
-
-bioimage_embed is an all-in-one Python package designed to cater to the needs of computational biologists, data scientists, and researchers working on biological image data. With specialized functions to handle, preprocess, and visualize microscopy datasets, this tool is tailored to streamline the embedding process for biological imagery.
-
-[![Build Status](https://img.shields.io/badge/build-passing-green.svg)](https://github.com/ctr26/bioimage_embed)
-[![Python Version](https://img.shields.io/badge/python-3.7+-blue.svg)](https://github.com/ctr26/bioimage_embed)
-[![License](https://img.shields.io/badge/license-MIT-green.svg)](https://github.com/ctr26/bioimage_embed)
-
----
-
-## Features
-
-- Seamless loading of microscopy datasets, compatible with the BioImage Data Resource and Cell Image Library.
-- Built-in preprocessing functions to ensure your images are primed for encoding.
-- Visual tools to dive deep into the encoding and decoding processes of your autoencoders.
-
----
-
-## Installation
-
-To get started with bioimage_embed, you can install it directly via pip or from the GitHub repository.
-
-### From PyPI:
-
-```bash
-pip install bioimage_embed
-```
-
-### From GitHub:
-
-```bash
-pip install git+https://github.com/ctr26/bioimage_embed
-```
-
----
-
-## Usage
-
-### 1. Basic Installation:
-
-```bash
-pip install -e .
-```
-
-### 2. Command Line Interface (CLI):
-
-To get a list of all commands and functions:
-
-```bash
-bioimage_embed --help
-```
-
-OR
-
-```bash
-bie --help
-```
-
-
-### 3. Developer Installation:
-
-For those intending to contribute or looking for a deeper dive into the codebase, we use `poetry` to manage our dependencies and virtual environments:
-
-```bash
-poetry env use python
-poetry install
-poetry shell
-```
-
----
-
-## Support & Contribution
-
-For any issues, please refer to our [issues page](https://github.com/ctr26/bioimage_embed/issues). Contributions are more than welcome! Please submit pull requests to the master branch.
-
----
-
-## License
-
-bioimage_embed is licensed under the MIT License. Please refer to the [LICENSE](https://github.com/ctr26/bioimage_embed/LICENSE) for more details.
-
----
-
-Happy Embedding! 🧬🔬
diff --git a/pyproject.toml b/pyproject.toml
index b8ee2505..59576056 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -3,7 +3,7 @@
 name = "shape_embed"
 version = "0.0.0"
 description = ""
 authors = ["Craig "]
-packages = [{ include = "bioimage_embed" }]
+packages = [{ include = "shape_embed" }]
 
 [tool.poetry.dependencies]
diff --git a/scripts/idr.py b/scripts/idr.py
deleted file mode 100644
index d3525289..00000000
--- a/scripts/idr.py
+++ /dev/null
@@ -1,5 +0,0 @@
-# %% [markdown]
-# root = "/nfs/ftp/public/databases/IDR/"
-
-# %% [markdown]
-# dataset = datasets.ImageFolder(transform=transform)
diff --git a/scripts/idr/lightning.study.sh b/scripts/idr/lightning.study.sh
deleted file mode 100644
index dac53e82..00000000
--- a/scripts/idr/lightning.study.sh
+++ /dev/null
@@ -1,30 +0,0 @@
-#!/bin/bash -l
-#SBATCH --nodes=3
-#SBATCH --gres=gpu:4
-#SBATCH --ntasks-per-node=1
-#SBATCH --time=0-24:00:00
-#SBATCH --job-name=lightning
-#SBATCH --constraint=a100  # Ensure the job is scheduled on nodes with A100 GPUs
-#SBATCH --mem-per-cpu=2GB
-#SBATCH --cpus-per-task=32
-#SBATCH --output=lightning_%j.out
-set -x
-
-source activate $1
-
-# debugging flags (optional)
-export NCCL_DEBUG=INFO
-export PYTHONFAULTHANDLER=1
-# export NCCL_P2P_DISABLE=1
-# unset LOCAL_RANK
-
-# on your cluster you might need these:
-# set the network interface
-# export NCCL_SOCKET_IFNAME=^docker0,lo
-
-# might need the latest CUDA
-# module load NCCL/2.4.7-1-cuda.10.0
-
-# run script from above
-echo "Starting Lightning training script"
-srun python3 scripts/idr/study.py
\ No newline at end of file
diff --git a/scripts/idr/study.py b/scripts/idr/study.py
deleted file mode 100644
index 1790b1cb..00000000
--- a/scripts/idr/study.py
+++ /dev/null
@@ -1,127 +0,0 @@
-import bioimage_embed
-import bioimage_embed.config as config
-# from ray.tune.integration.pytorch_lightning import (
-#     TuneReportCallback,
-#     TuneReportCheckpointCallback,
-
-# )
-import albumentations as A
-from types import SimpleNamespace
-from ray import tune
-import numpy as np
-from ray.train.torch import TorchTrainer
-from ray.train import ScalingConfig
-from hydra.utils import instantiate
-import ray
-from ray.train.lightning import (
-    RayDDPStrategy,
-    RayLightningEnvironment,
-    RayTrainReportCallback,
-    prepare_trainer,
-)
-import os
-import glob
-from PIL import Image
-from typing import List
-from torch.utils.data import Dataset
-import torch
-from joblib import Memory
-from pydantic.dataclasses import dataclass
-from pytorch_lightning import loggers as pl_loggers
-
-params = {
-    "model": "resnet50_vqvae",
-    # "data": "data",
-    "opt": "adamw",
-    "max_epochs": 1000,
-    "max_steps": -1,
-    "weight_decay":0.0001,
-    "momentum": 0.9,
-    # "sched": "cosine",
-    "epochs": 1000,
-    "lr": 1e-3,
-    "batch_size": 16,
-}
-memory = Memory(location='.', verbose=0)
-
-@memory.cache
-def get_file_list(glob_str):
-    return glob.glob(os.path.join(glob_str), recursive=True)
-
-
-class GlobDataset(Dataset):
-    def __init__(self, glob_str,transform=None):
-        self.file_list = get_file_list(glob_str)
-        self.transform = transform
-
-    def __len__(self):
-        return len(self.file_list)
-
-    def __getitem__(self, idx):
-        if torch.is_tensor(idx):
-            idx = idx.tolist()
-
-        img_name = self.file_list[idx]
-        image = Image.open(img_name)
-        # breakpoint()
-        image = np.array(image)
-        if self.transform:
-            # t = A.Compose([A.ToRGB(),transform, A.RandomCrop(224,224)])
-            t = A.Compose([A.ToRGB(),self.transform])
-            image = t(image=image)
-
-        # breakpoint()
-        # sample = {'image': image, 'path': img_name}
-
-        return image["image"], 0
-
-root_dir = '/nfs/ftp/public/databases/IDR/idr0093-mueller-perturbation/'
-root_dir = '/nfs/research/uhlmann/ctr26/idr/idr0093-mueller-perturbation/'
-
-if __name__ == "__main__":
-    print("training")
-    input_dim = [3, 224, 224]
-
-    # mock_dataset = config.ImageFolderDataset(
-    #     _target_="bioimage_embed.datasets.FakeImageFolder",
-    #     image_size=input_dim,
-    #     num_classes=1,
-    # )
-    # breakpoint()
-    transform = instantiate(config.ATransform())
-    dataset = GlobDataset(root_dir+'**/*.tif*',transform)
-    dataloader = config.DataLoader(dataset=dataset,num_workers=32)
-
-    assert instantiate(dataloader,batch_size=1)
-    assert dataset[0]
-
-    model = config.Model(input_dim=input_dim)
-
-    lit_model = config.LightningModel(
-        _target_="bioimage_embed.lightning.torch.AutoEncoderSupervised",
-        model=model
-    )
-
-    wandb = pl_loggers.WandbLogger(project="idr", name="0093")
-    trainer = config.Trainer(
-        accelerator="auto",
-        devices=1,
-        num_nodes=1,
-        # strategy="ddp",
-        callbacks=[],
-        plugin=[],
-        logger=[wandb],
-    )
-
-    cfg = config.Config(
-        dataloader=dataloader,
-        lit_model=lit_model,
-        trainer=trainer,
-        recipe=config.Recipe(**params),
-    )
-    # breakpoint()
-
-    bie = bioimage_embed.BioImageEmbed(cfg)
-    wandb.watch(bie.icfg.lit_model, log="all")
-
-    bie.train()
-    wandb.finish()
diff --git a/scripts/idr/study.submitit.py b/scripts/idr/study.submitit.py
deleted file mode 100644
index af383f05..00000000
--- a/scripts/idr/study.submitit.py
+++ /dev/null
@@ -1,209 +0,0 @@
-import bioimage_embed
-import bioimage_embed.config as config
-import wandb
-from pytorch_lightning import LightningModule, Trainer
-import albumentations as A
-from types import SimpleNamespace
-from ray import tune
-import numpy as np
-from ray.train.torch import TorchTrainer
-from ray.train import ScalingConfig
-from hydra.utils import instantiate
-import os
-import glob
-from PIL import Image
-from typing import List
-from torch.utils.data import Dataset
-import torch
-from joblib import Memory
-from pydantic.dataclasses import dataclass
-from pytorch_lightning import loggers as pl_loggers
-import submitit
-import os
-import fsspec
-import logging
-import click
-from pytorch_lightning.callbacks import ModelCheckpoint  # Added import
-import random
-from tqdm import tqdm
-
-
-torch.manual_seed(42)
-np.random.seed(42)
-
-NUM_GPUS_PER_NODE = 1
-NUM_NODES = 1
-CPUS_PER_TASK = 8
-
-params = {
-    "model": "resnet50_vqvae",
-    # "data": "data",
-    "opt": "lamb",
-    "latent_dim": 224**2//4,
-    "max_epochs": 1000,
-    "max_steps": -1,
-    "weight_decay": 0.0001,
-    "momentum": 0.9,
-    # "sched": "cosine",
-    "epochs": 1000,
-    "lr": 1e-3,
-    "batch_size": 16,
-}
-memory = Memory(location='.', verbose=0)
-
-@memory.cache
-def get_file_list(glob_str,fs):
-    return fs.glob(glob_str)
-
-@memory.cache
-def get_clean_file_list(glob_str, fs):
-    filelist = get_file_list(glob_str, fs)
-    # Use filter with tqdm
-    valid_files = list(filter(lambda x: check_image(fs,x), tqdm(filelist, desc="Validating images")))
-    return valid_files
-
-
-def collate_fn(batch):
-    # Filter out None values
-    batch = list(filter(lambda x: x[0] is not None, batch))
-    if len(batch) == 0:
-        logging.warning("Batch is empty")
-        return None
-    return torch.utils.data.dataloader.default_collate(batch)
-
-class GlobDataset(Dataset):
-    def __init__(self, glob_str,transform=None,fs=fsspec.filesystem('file')):
-        print("Getting file list, this may take a while")
-        self.file_list = np.random.permutation(get_clean_file_list(glob_str, fs)).tolist()
-
-        print(f"Done getting file list: {len(self.file_list)}")
-        self.transform = transform
-
-    def __len__(self):
-        return len(self.file_list)
-
-    def __getitem__(self, idx):
-        if torch.is_tensor(idx):
-            idx = idx.tolist()
-        img_name = self.file_list[idx]
-        try:
-            image = read_image(fs,img_name)
-            if self.transform:
-                image = self.transform(image=image)["image"]
-            return image,0
-        except:
-            logging.info(f"Could not open {img_name}")
-            breakpoint()
-            return None, 0
-
-
-
-def check_image(fs,img_name):
-    obj = fs.open(img_name,filecache={'cache_storage':'tmp/idr'})
-    with obj as f:
-        try:
-            image = Image.open(f).verify()
-            return True
-        except:
-            return False
-
-def read_image(fs,img_name):
-    obj = fs.open(img_name,filecache={'cache_storage':'tmp/idr'})
-    with obj as f:
-        image = Image.open(f)
-        image = np.array(image)
-    return image
-
-
-root_dir = '/nfs/research/uhlmann/ctr26/idr/idr0093-mueller-perturbation/'
-fs = fsspec.filesystem('file')
-# fs = fsspec.filesystem(
-#     'ftp', host='ftp.ebi.ac.uk',
-#     cache_storage='/tmp/files/')
-root_dir = '/pub/databases/IDR/idr0093-mueller-perturbation/'
-root_dir = "/hps/nobackup/uhlmann/ctr26/idr/nfs/ftp/public/databases/IDR/"
-root_dir += "idr0093-mueller-perturbation/"
-# /nfs/ftp/public/databases/IDR/idr0093-mueller-perturbation/'
-# /nfs/ftp/public/databases/IDR/
-
-def train(num_gpus_per_node=1,num_nodes=1):
-
-    print("training")
-    input_dim = [3, 224, 224]
-
-    # mock_dataset = config.ImageFolderDataset(
-    #     _target_="bioimage_embed.datasets.FakeImageFolder",
-    #     image_size=input_dim,
-    #     num_classes=1,
-    # )
-
-    transform = instantiate(config.ATransform())
-    transform = A.Compose([A.ToRGB(),transform])
-    dataset = GlobDataset(root_dir+'**/*.tif*',transform,fs=fs)
-    # dataset = RandomDataset(32, 64)
-    dataloader = config.DataLoader(dataset=dataset,num_workers=CPUS_PER_TASK-1,collate_fn=collate_fn,shuffle=True,batch_size=params["batch_size"])
-
-    # assert instantiate(dataloader,batch_size=1)
-    # assert dataset[0]
-
-    model = config.Model(input_dim=input_dim)
-
-    lit_model = config.LightningModel(
-        # _target_="bioimage_embed.lightning.torch.AutoEncoderSupervised",
-        model=model
-    )
-    wandb = pl_loggers.WandbLogger(project="idr", name="0093",log_model="all")
-
-
-    trainer = config.Trainer(
-        accelerator="auto",
-        devices=num_gpus_per_node,
-        num_nodes=num_nodes,
-        strategy="ddp",
-        enable_checkpointing=True,
-        callbacks=None,
-        # plugin=[],
-
-        logger=[wandb],
-    )
-
-    cfg = config.Config(
-        dataloader=dataloader,
-        lit_model=lit_model,
-        trainer=trainer,
-        recipe=config.Recipe(**params),
-    )
-    # breakpoint()
-
-    bie = bioimage_embed.BioImageEmbed(cfg)
-    # wandb.watch(bie.icfg.lit_model, log="all")
-    # wandb.run.define_metric("mse/val", summary="best")
-    # wandb.run.define_metric("loss/val.loss", summary="best")
-
-    bie.train()
-    wandb.finish()
-
-@click.command()
-@click.option("--gpus", default=1)
-@click.option("--nodes", default=1)
-def main( gpus, nodes):
-    logdir = "lightning_slurm/"
-    os.makedirs(logdir, exist_ok=True)
-
-    # executor is the submission interface (logs are dumped in the folder)
-    executor = submitit.AutoExecutor(folder=logdir)
-    executor.update_parameters(
-        mem_gb=2 * 32 * 4,  # 2GB per CPU, 32 CPUs per task, 4 tasks per node
-        timeout_min=1440*2,  # 48 hours
-        # slurm_partition="your_partition_name",  # Replace with your partition name
-        gpus_per_node=gpus,
-        tasks_per_node=1,
-        cpus_per_task=CPUS_PER_TASK,
-        nodes=nodes,
-        slurm_constraint="a100",
-    )
-    job = executor.submit(train, gpus, nodes)
-
-if __name__ == "__main__":
-    main()
diff --git a/scripts/simple.py b/scripts/simple.py
deleted file mode 100644
index 33b9be32..00000000
--- a/scripts/simple.py
+++ /dev/null
@@ -1,81 +0,0 @@
-# %%
-import bioimage_embed
-import bioimage_embed.config as config
-
-# Import necessary modules from bioimage_embed and config.
-# bioimage_embed is likely a library designed for embedding biological images,
-# and config is used to handle configurations.
-
-# %%
-from torchvision.datasets import FakeData
-from hydra.utils import instantiate
-
-# Import FakeData from torchvision.datasets to create a fake dataset,
-# and instantiate from hydra.utils to create instances based on configuration.
-
-# %%
-# We can instantiate a transformation from the default configuration using hydra.
-transform = instantiate(config.Transform())
-
-# Instantiate a transformation using the configuration provided.
-# This will likely include any data augmentation or preprocessing steps defined in the configuration.
-
-# %%
-# Create a fake dataset with 64 images of size 224x224x3 (3 channels), and 10 classes.
-dataset = FakeData(
-    size=64,
-    image_size=(3, 224, 224),
-    num_classes=10,
-    transform=transform,
-)
-
-# Create a fake dataset with 64 images of size 224x224x3 (3 channels), and 10 classes.
-# This dataset will be used to simulate data for testing purposes. The 'transform' argument applies the
-# transformations defined earlier to the dataset.
-
-# NOTE: The 'dataset' must be a PyTorch Dataset object with X (data) and y (labels).
-# If using an unsupervised encoder, set the labels (y) to None; the model will ignore them during training.
-
-# dataset=CelebA(download=True, root="/tmp", split="train")
-
-# The commented-out code suggests an alternative to use the CelebA dataset.
-# It would download the CelebA dataset and use the training split, storing it in the '/tmp' directory.
-
-# %% [markdown]
-# We can declare a recipe and configuration object to train the model.
-# I
-#
-# recipe = config.Recipe(model="resnet18_vae")
-# %% [markdown]
-#
-
-# %%
-cfg = config.Config(recipe=recipe, dataset=dataset)
-bie = bioimage_embed.BioImageEmbed(cfg)
-
-# Create a configuration object 'cfg' using the config module, and assign the fake dataset to it.
-# The model is set to "resnet18_vae" and the maximum number of epochs for training is set to 100.
-# Instantiate the BioImageEmbed object 'bie' using the configuration.
-
-
-# %%
-def process():
-    bie.check()
-    bie.train()
-    bie.export()
-
-
-# Define a process function that performs three steps:
-# 1. 'check()' to verify the setup or configuration.
-# 2. 'train()' to start training the model.
-# 3. 'export()' to export the trained model.
-
-# %%
-# This is the entrypoint for the script and very important if cfg.trainer.num_workers > 0
-if __name__ == "__main__":
-    process()
-
-# This is the entry point for the script. The 'if __name__ == "__main__":' statement ensures that the 'process()'
-# function is called only when the script is run directly, not when imported as a module.
-# This is crucial if the 'num_workers' parameter is set in cfg.trainer, as it prevents potential issues
-# with multiprocessing in PyTorch.
diff --git a/shape_embed/__init__.py b/shape_embed/__init__.py
index 0e1a0d5a..8865c28d 100644
--- a/shape_embed/__init__.py
+++ b/shape_embed/__init__.py
@@ -1,26 +1,8 @@
 import torch
 
 torch.cuda.empty_cache()
 
-# from . import models, lightning, cli, export, config
-from .lightning import AESupervised, AEUnsupervised, AE, AutoEncoderSupervised, AutoEncoderUnsupervised, AutoEncoder
-# TODO: Fix this import as it currently produces too many warnings
-from .models import ModelFactory, create_model
-from .bie import BioImageEmbed
-from .config import Config
-from . import augmentations
 import logging
 
 logging.captureWarnings(True)
-__all__ = [
-    "AESupervised",
-    "AutoEncoderUnsupervised",
-    "AEUnsupervised",
-    "AutoEncoderSupervised",
-    "AutoEncoder"
-    "AE"
-    "BioImageEmbed",
-    "Config",
-    "augmentations",
-]