diff --git a/.gitignore b/.gitignore index 37edb8664..74437381d 100644 --- a/.gitignore +++ b/.gitignore @@ -171,3 +171,69 @@ multirun/ # SLURM slurm-*.out + +# Ignore all data, model checkpoint, and mlruns files +modulus/examples/cfd/darcy_fno/data/ +modulus/examples/cfd/darcy_fno/checkpoint/ +modulus/examples/cfd/darcy_fno/mlruns/ +modulus/examples/**/*.pt + +############################################################################## +# Ignore all data, checkpoint, and mlruns directories under both: +# - modulus/examples/ +# - examples/ +############################################################################## +modulus/examples/**/data/ +modulus/examples/**/checkpoint/ +modulus/examples/**/mlruns/ +examples/**/data/ +examples/**/checkpoint/ +examples/**/mlruns/ + +############################################################################## +# Ignore all .pt files under: +# - modulus/examples/ +# - examples/ +############################################################################## +modulus/examples/**/*.pt +examples/**/*.pt + +############################################################################## +# EXCEPTIONS: Do NOT ignore files that start with "example_*", +# "final", or any specifically whitelisted items like "data/example_small_batch.pt" +############################################################################## + +# 1) Don't ignore .pt files starting with "example_" in either path +!modulus/examples/**/example_*.pt +!examples/**/example_*.pt + +# 2) Specifically allow data/example_small_batch.pt in the repo root if needed +!data/example_small_batch.pt + +# 3) Don't ignore anything named "example*" or "final*" under mlruns in either path +!modulus/examples/**/mlruns/**/example* +!modulus/examples/**/mlruns/**/final* +!examples/**/mlruns/**/example* +!examples/**/mlruns/**/final* + +# 4) Allow "checkpoints/example_final.ckpt" if that's in the repo root or other path +!checkpoints/example_final.ckpt + +# 5) More fine-grained "example*" or "final*" exceptions under data/ and checkpoint/ +!modulus/examples/**/data/**/example* +!modulus/examples/**/data/**/example/* +!modulus/examples/**/data/**/final* +!modulus/examples/**/data/**/final/* +!modulus/examples/**/checkpoint/**/example* +!modulus/examples/**/checkpoint/**/example/* +!modulus/examples/**/checkpoint/**/final* +!modulus/examples/**/checkpoint/**/final* + +!examples/**/data/**/example* +!examples/**/data/**/example/* +!examples/**/data/**/final* +!examples/**/data/**/final/* +!examples/**/checkpoint/**/example* +!examples/**/checkpoint/**/example/* +!examples/**/checkpoint/**/final* +!examples/**/checkpoint/**/final* diff --git a/README.md b/README.md index ab968faa8..b49dfeb03 100644 --- a/README.md +++ b/README.md @@ -1,367 +1,10 @@ -# NVIDIA Modulus +# Modulus Fork by YGMaerz - -[![Project Status: Active - The project has reached a stable, usable state and is being actively developed.](https://www.repostatus.org/badges/latest/active.svg)](https://www.repostatus.org/#active) -[![GitHub](https://img.shields.io/github/license/NVIDIA/modulus)](https://github.com/NVIDIA/modulus/blob/master/LICENSE.txt) -[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) - -[**Getting Started**](#getting-started) -| [**Install guide**](#installation) -| [**Contributing Guidelines**](#contributing-to-modulus) -| [**Resources**](#resources) -| [**Communication**](#communication) -| [**License**](#license) +This fork of 
[NVIDIA/modulus](https://github.com/NVIDIA/modulus) adds a **Darcy Flow** +PDE pipeline with **AutoML** and **Active Learning** under: +`examples/cfd/darcy_autoML_active_learning/`. -## What is Modulus? - -NVIDIA Modulus is an open-source deep-learning framework for building, training, and fine-tuning -deep learning models using state-of-the-art SciML methods for AI4science and engineering. - -Modulus provides utilities and optimized pipelines to develop AI models that combine -physics knowledge with data, enabling real-time predictions. - -Whether you are exploring the use of Neural operators, GNNs, or transformers or are -interested in Physics-informed Neural Networks or a hybrid approach in between, Modulus -provides you with an optimized stack that will enable you to train your models at scale. - - -


- - - - -- [More About Modulus](#more-about-modulus) - - [Scalable GPU-optimized training Library](#scalable-gpu-optimized-training-library) - - [A suite of Physics-Informed ML Models](#a-suite-of-physics-informed-ml-models) - - [Seamless PyTorch Integration](#seamless-pytorch-integration) - - [Easy Customization and Extension](#easy-customization-and-extension) - - [AI4Science Library](#ai4science-library) - - [Domain Specific Packages](#domain-specific-packages) -- [Who is contributing to Modulus](#who-is-using-and-contributing-to-modulus) -- [Why use Modulus](#why-are-they-using-modulus) -- [Getting Started](#getting-started) -- [Resources](#resources) -- [Installation](#installation) -- [Contributing](#contributing-to-modulus) -- [Communication](#communication) -- [License](#license) - - - -## More About Modulus - -At a granular level, Modulus provides a library of a few key components: - - -Component | Description | ----- | --- | -[**modules.models**](https://docs.nvidia.com/deeplearning/modulus/modulus-core/api/modulus.models.html) | A collection of optimized, customizable, and easy-to-use models such as Fourier Neural Operators, Graph Neural Networks, and many more| -[**modulus.datapipes**](https://docs.nvidia.com/deeplearning/modulus/modulus-core/api/modulus.datapipes.html) | A data pipeline and data loader library, including benchmark datapipes, weather daptapipes, and graph datapipes| -[**modulus.distributed**](https://docs.nvidia.com/deeplearning/modulus/modulus-core/api/modulus.distributed.html) | A distributed computing library build on top of `torch.distributed` to enable parallel training with just a few steps| -[**modulus.sym.geometry**](https://docs.nvidia.com/deeplearning/modulus/modulus-sym/user_guide/features/csg_and_tessellated_module.html) | A library to handle geometry for DL training using the Constructive Solid Geometry modeling and CAD files in STL format.| -[**modulus.sym.eq**](https://docs.nvidia.com/deeplearning/modulus/modulus-sym/user_guide/features/nodes.html) | A library to use PDEs in your DL training with several implementations of commonly observed equations and easy ways for customization.| - - -For a complete list, refer to the Modulus API documentation for -[Modulus Core](https://docs.nvidia.com/deeplearning/modulus/modulus-core/index.html) and -[Modulus Sym](https://docs.nvidia.com/deeplearning/modulus/modulus-sym/api/api_index.html). - -Usually, Modulus is used either as: - -- A complementary tool to Pytorch when exploring AI for SciML and AI4Science applications. -- A deep learning research platform that provides scale and optimal performance on -NVIDIA GPUs. - -Elaborating Further: - -### Scalable GPU-optimized training Library - -Modulus provides a highly optimized and scalable training library for maximizing the -power of NVIDIA GPUs. -[Distributed computing](https://docs.nvidia.com/deeplearning/modulus/modulus-core/api/modulus.distributed.html) -utilities allow for efficient scaling from a single GPU to multi-node GPU clusters with -a few lines of code, ensuring that large-scale. -physics-informed machine learning (ML) models can be trained quickly and effectively. -The framework includes support for advanced. 
-[optimization utilities](https://docs.nvidia.com/deeplearning/modulus/modulus-core/api/modulus.utils.html#module-modulus.utils.capture), -[tailor made datapipes](https://docs.nvidia.com/deeplearning/modulus/modulus-core/api/modulus.datapipes.html), -[validation utilities](https://github.com/NVIDIA/modulus-sym/tree/main/modulus/sym/eq) -to enhance the end to end training speed. - -### A suite of Physics Informed ML Models - -Modulus offers a comprehensive library of state-of-the-art models specifically designed -for physics-ML applications. -The [Model Zoo](https://docs.nvidia.com/deeplearning/modulus/modulus-core/api/modulus.models.html#model-zoo) -includes generalizable model architectures such as -[Fourier Neural Operators (FNOs)](modulus/models/fno), -[DeepONet](https://docs.nvidia.com/deeplearning/modulus/modulus-sym/user_guide/neural_operators/deeponet.html), -[Physics-Informed Neural Networks (PINNs)](https://docs.nvidia.com/deeplearning/modulus/modulus-sym/user_guide/foundational/1d_wave_equation.html), -[Graph Neural Networks (GNNs)](modulus/models/gnn_layers), -and generative AI models like [Diffusion Models](modulus/models/diffusion) -as well as domain-specific models such as [Deep Learning Weather Prediction (DLWP)](modulus/models/dlwp) -and [Super Resolution Network (SrNN)](modulus/models/srrn) among others. -These models are optimized for various physics domains, such as computational fluid -dynamics, structural mechanics, and electromagnetics. Users can download, customize, and -build upon these models to suit their specific needs, significantly reducing the time -required to develop high-fidelity simulations. - -### Seamless PyTorch Integration - -Modulus is built on top of PyTorch, providing a familiar and user-friendly experience -for those already proficient with PyTorch. -This includes a simple Python interface and modular design, making it easy to use -Modulus with existing PyTorch workflows. -Users can leverage the extensive PyTorch ecosystem, including its libraries and tools -while benefiting from Modulus's specialized capabilities for physics-ML. This seamless -integration ensures users can quickly adopt Modulus without a steep learning curve. - -For more information, refer [Converting PyTorch Models to Modulus Models](https://docs.nvidia.com/deeplearning/modulus/modulus-core/api/modulus.models.html#converting-pytorch-models-to-modulus-models) - -### Easy Customization and Extension - -Modulus is designed to be highly extensible, allowing users to add new functionality -with minimal effort. The framework provides Pythonic APIs for -defining new physics models, geometries, and constraints, making it easy to extend its -capabilities to new use cases. -The adaptability of Modulus is further enhanced by key features such as -[ONNX support](https://docs.nvidia.com/deeplearning/modulus/modulus-core/api/modulus.deploy.html) -for flexible model deployment, -robust [logging utilities](https://docs.nvidia.com/deeplearning/modulus/modulus-core/api/modulus.launch.logging.html) -for streamlined error handling, -and efficient -[checkpointing](https://docs.nvidia.com/deeplearning/modulus/modulus-core/api/modulus.launch.utils.html#module-modulus.launch.utils.checkpoint) -to simplify model loading and saving. - -This extensibility ensures that Modulus can adapt to the evolving needs of researchers -and engineers, facilitating the development of innovative solutions in the field of physics-ML. 
- -Detailed information on features and capabilities can be found in the [Modulus documentation](https://docs.nvidia.com/modulus/index.html#core). - -[Reference samples](examples/README.md) cover a broad spectrum of physics-constrained -and data-driven -workflows to suit the diversity of use cases in the science and engineering disciplines. - -> [!TIP] -> Have questions about how Modulus can assist you? Try our [Experimental] chatbot, -> [Modulus Guide](https://chatgpt.com/g/g-PXrBv20SC-modulus-guide), for answers. - -### Hello world - -You can start using Modulus in your PyTorch code as simple as shown here: - -```python -python ->>> import torch ->>> from modulus.models.mlp.fully_connected import FullyConnected ->>> model = FullyConnected(in_features=32, out_features=64) ->>> input = torch.randn(128, 32) ->>> output = model(input) ->>> output.shape -torch.Size([128, 64]) -``` - -### AI4Science Library - -- [Modulus Symbolic](https://github.com/NVIDIA/modulus-sym): This repository of - algorithms and utilities allows SciML researchers and developers to physics inform model - training and model validation. It also provides a higher level abstraction - for domain experts that is native to science and engineering. - -#### Domain Specific Packages - -The following are packages dedicated for domain experts of specific communities catering -to their unique exploration needs. - -- [Earth-2 Studio](https://github.com/NVIDIA/earth2studio): Open source project - to enable climate researchers and scientists to explore and experiment with - AI models for weather and climate. - -#### Research packages - -The following are research packages that get packaged into Modulus once they are stable. - -- [Modulus Makani](https://github.com/NVIDIA/modulus-makani): Experimental library - designed to enable the research and development of machine-learning based weather and - climate models. -- [Earth2 Grid](https://github.com/NVlabs/earth2grid): Experimental library with - utilities for working geographic data defined on various grids. -- [Earth-2 MIP](https://github.com/NVIDIA/earth2mip): Experimental library with - utilities for model intercomparison for weather and climate models. - -## Who is using and contributing to Modulus - -Modulus is an open source project and gets contributions from researchers in the SciML and -AI4science fields. While Modulus team works on optimizing the underlying SW stack, the -community collaborates and contributes model architectures, datasets, and reference -applications so we can innovate in the pursuit of developing generalizable model -architectures and algorithms. - -Some latest examples of community contributors are [HP Labs 3D Printing team](https://developer.nvidia.com/blog/spotlight-hp-3d-printing-and-nvidia-modulus-collaborate-on-open-source-manufacturing-digital-twin/), -[Stanford Cardiovascular research team](https://developer.nvidia.com/blog/enabling-greater-patient-specific-cardiovascular-care-with-ai-surrogates/), -[UIUC team](https://github.com/NVIDIA/modulus/tree/main/examples/cfd/mhd_pino), -[CMU team](https://github.com/NVIDIA/modulus/tree/main/examples/generative/diffusion) etc. - -Latest examples of research teams using Modulus are -[ORNL team](https://arxiv.org/abs/2404.05768), -[TU Munich CFD team](https://www.nvidia.com/en-us/on-demand/session/gtc24-s62237/) etc. - -Please navigate to this page for a complete list of research work leveraging Modulus. -For a list of enterprises using Modulus refer [here](https://developer.nvidia.com/modulus). 
- -Using Modulus and interested in showcasing your work on -[NVIDIA Blogs](https://developer.nvidia.com/blog/category/simulation-modeling-design/)? -Fill out this [proposal form](https://forms.gle/XsBdWp3ji67yZAUF7) and we will get back -to you! - -## Why are they using Modulus - -Here are some of the key benefits of Modulus for SciML model development: - - - | | ----|---|---| -|SciML Benchmarking and validation|Ease of using generalized SciML recipes with heterogenous datasets |Out of the box performance and scalability -|Modulus enables researchers to benchmark their AI model against proven architectures for standard benchmark problems with detailed domain-specific validation criteria.|Modulus enables researchers to pick from SOTA SciML architectures and use built-in data pipelines for their use case.| Modulus provides out-of-the-box performant training pipelines including optimized ETL pipelines for heterogrneous engineering and scientific datasets and out of the box scaling across multi-GPU and multi-node GPUs. - - -See what your peer SciML researchers are saying about Modulus (Coming soon). - -## Getting started - -The following resources will help you in learning how to use Modulus. The best way is to -start with a reference sample and then update it for your own use case. - -- [Using Modulus with your PyTorch model](https://docs.nvidia.com/deeplearning/modulus/modulus-core/tutorials/simple_training_example.html#using-custom-models-in-modulus) -- [Using Modulus built-in models](https://docs.nvidia.com/deeplearning/modulus/modulus-core/tutorials/simple_training_example.html#using-built-in-models) -- [Getting started Guide](https://docs.nvidia.com/deeplearning/modulus/getting-started/index.html) -- [Reference Samples](https://github.com/NVIDIA/modulus/blob/main/examples/README.md) -- [User guide Documentation](https://docs.nvidia.com/deeplearning/modulus/modulus-core/index.html) - -## Resources - -- [Getting started Webinar](https://www.nvidia.com/en-us/on-demand/session/gtc24-dlit61460/?playlistId=playList-bd07f4dc-1397-4783-a959-65cec79aa985) -- [AI4Science Modulus Bootcamp](https://github.com/openhackathons-org/End-to-End-AI-for-Science) -- [Modulus Pretrained models](https://catalog.ngc.nvidia.com/models?filters=&orderBy=scoreDESC&query=Modulus&page=&pageSize=) -- [Modulus Datasets and Supplementary materials](https://catalog.ngc.nvidia.com/resources?filters=&orderBy=scoreDESC&query=Modulus&page=&pageSize=) -- [Self-paced Modulus DLI training](https://learn.nvidia.com/courses/course-detail?course_id=course-v1:DLI+S-OV-04+V1) -- [Deep Learnning for Science and Engineering Lecture Series with Modulus](https://www.nvidia.com/en-us/on-demand/deep-learning-for-science-and-engineering/) - - [Modulus: purpose and usage](https://www.nvidia.com/en-us/on-demand/session/dliteachingkit-setk5002/) -- [Video Tutorials](https://www.nvidia.com/en-us/on-demand/search/?facet.mimetype[]=event%20session&layout=list&page=1&q=modulus&sort=relevance&sortDir=desc) - -## Installation - -### PyPi - -The recommended method for installing the latest version of Modulus is using PyPi: - -```Bash -pip install nvidia-modulus -``` - -The installation can be verified by running the hello world example as demonstrated [here](#hello-world). - -#### Optional dependencies - -Modulus has many optional dependencies that are used in specific components. -When using pip, all dependencies used in Modulus can be installed with -`pip install nvidia-modulus[all]`. 
If you are developing Modulus, developer dependencies -can be installed using `pip install nvidia-modulus[dev]`. Otherwise, additional dependencies -can be installed on a case by case basis. Detailed information on installing the -optional dependencies can be found in the -[Getting Started Guide](https://docs.nvidia.com/deeplearning/modulus/getting-started/index.html). - -### NVCR Container - -The recommended Modulus docker image can be pulled from the -[NVIDIA Container Registry](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/modulus/containers/modulus) -(refer to the NGC registry for the latest tag): - -```Bash -docker pull nvcr.io/nvidia/modulus/modulus:24.09 -``` - -Inside the container, you can clone the Modulus git repositories and get started with the -examples. The below command shows the instructions to launch the modulus container and run -examples from this repo. - -```bash -docker run --shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864 --runtime nvidia \ ---rm -it nvcr.io/nvidia/modulus/modulus:24.09 bash -git clone https://github.com/NVIDIA/modulus.git -cd modulus/examples/cfd/darcy_fno/ -pip install warp-lang # install NVIDIA Warp to run the darcy example -python train_fno_darcy.py -``` - -For enterprise supported NVAIE container, refer [Modulus Secured Feature Branch](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/modulus/containers/modulus-sfb) - -## From Source - -### Package - -For a local build of the Modulus Python package from source use: - -```Bash -git clone git@github.com:NVIDIA/modulus.git && cd modulus - -pip install --upgrade pip -pip install . -``` - -### Source Container - -To build Modulus docker image: - -```bash -docker build -t modulus:deploy \ - --build-arg TARGETPLATFORM=linux/amd64 --target deploy -f Dockerfile . -``` - -Alternatively, you can run `make container-deploy` - -To build CI image: - -```bash -docker build -t modulus:ci \ - --build-arg TARGETPLATFORM=linux/amd64 --target ci -f Dockerfile . -``` - -Alternatively, you can run `make container-ci`. - -Currently, only `linux/amd64` and `linux/arm64` platforms are supported. If using -`linux/arm64`, some dependencies like `warp-lang` might not install correctly. - -## Contributing to Modulus - -Modulus is an open source collaboration and its success is rooted in community -contribution to further the field of Physics-ML. Thank you for contributing to the -project so others can build on top of your contribution. - -For guidance on contributing to Modulus, please refer to the -[contributing guidelines](CONTRIBUTING.md). - -## Cite Modulus - -If Modulus helped your research and you would like to cite it, please refer to the [guidelines](https://github.com/NVIDIA/modulus/blob/main/CITATION.cff) - -## Communication - -- Github Discussions: Discuss new architectures, implementations, Physics-ML research, etc. -- GitHub Issues: Bug reports, feature requests, install issues, etc. -- Modulus Forum: The [Modulus Forum](https://forums.developer.nvidia.com/c/physics-simulation/modulus-physics-ml-model-framework) -hosts an audience of new to moderate-level users and developers for general chat, online -discussions, collaboration, etc. - -## Feedback - -Want to suggest some improvements to Modulus? Use our feedback form -[here](https://docs.google.com/forms/d/e/1FAIpQLSfX4zZ0Lp7MMxzi3xqvzX4IQDdWbkNh5H_a_clzIhclE2oSBQ/viewform?usp=sf_link). - -## License - -Modulus is provided under the Apache License 2.0, please see [LICENSE.txt](./LICENSE.txt) -for full license text. 
Enterprise SLA, support and preview access are available -under NVAIE. +See the [README in that subfolder](examples/cfd/darcy_autoML_active_learning/README.md) +for details, or the +[GETTING_STARTED.md](examples/cfd/darcy_autoML_active_learning/GETTING_STARTED.md) +which covers environment setup. diff --git a/examples/cfd/darcy_autoML_active_learning/GETTING_STARTED.md b/examples/cfd/darcy_autoML_active_learning/GETTING_STARTED.md new file mode 100644 index 000000000..f05c771e3 --- /dev/null +++ b/examples/cfd/darcy_autoML_active_learning/GETTING_STARTED.md @@ -0,0 +1,691 @@ +# Getting Started (getting_started.md) + +## Table of Contents + +- [1. Introduction & Context](#1-introduction--context) + *Describes the overall purpose of this guide, including prerequisites and disclaimers.* + +- [2. Set Up Git](#2-set-up-git) + *Covers configuring user info, managing SSH keys, and setting up SSH for GitHub.* + - [2.1 Configure Git User Info](#21-configure-git-user-info) + *Set your global Git username and email.* + - [2.2 Manage SSH Keys](#22-manage-ssh-keys) + *Generate a new SSH key or add an existing one to this machine.* + - [Option A: Create a New SSH Key](#option-a-create-a-new-ssh-key) + *Step-by-step commands for generating a new ed25519 key pair.* + - [Option B: Add an Existing SSH Key](#option-b-add-an-existing-ssh-key) + *Copy and configure an existing SSH key pair on this machine.* + - [2.3 Configure SSH for GitHub](#23-configure-ssh-for-github) + *Modify your `~/.ssh/config` so GitHub traffic uses your SSH key.* + +- [3. Set Up the Project](#3-set-up-the-project) + *Create a project folder and clone the necessary repositories.* + +- [4. Set Up the Machine](#4-set-up-the-machine) + *Install essential system components: NVIDIA drivers, Docker, and the NVIDIA Container Toolkit.* + - [4.1 Install NVIDIA Driver (Online)](#41-install-nvidia-driver-online) + *Use APT to install and verify the NVIDIA 525 driver.* + - [4.2 Install Docker](#42-install-docker) + *Remove old Docker packages, add Docker’s official repository, and enable non-root Docker usage.* + - [4.3 Install NVIDIA Container Toolkit](#43-install-nvidia-container-toolkit) + *Install `nvidia-docker2` and other toolkit components for GPU support in Docker.* + +- [5. Install & Run NVIDIA Modulus (24.12)](#5-install--run-nvidia-modulus-2412) + *Pull the Modulus container, launch it, and install Python dependencies.* + - [5.1 Pull Modulus Docker Image](#51-pull-modulus-docker-image) + *Download the Modulus 24.12 container from NVIDIA’s registry.* + - [5.2 Launch the Modulus Container](#52-launch-the-modulus-container) + *Run the container with GPU access, volume mounts, and environment variables.* + - [5.3 Install Python Requirements](#53-install-python-requirements) + *Install any project-specific dependencies inside the container.* + - [5.4 Exit Container](#54-exit-container) + *Properly leave the container.* + +- [6. 
Restarting the Container & Running Jupyter Notebook](#6-restarting-the-container--running-jupyter-notebook)
+  *Bring containers back online and run Jupyter for interactive development.*
+  - [6.1 Restart & Attach to the Container](#61-restart--attach-to-the-container)
+    *How to start and enter an existing container.*
+  - [6.2 (Optional) Launch Jupyter Notebook](#62-optional-launch-jupyter-notebook)
+    *Start Jupyter within the container and access it via your browser.*
+
+- [Appendix: Offline Installation (Air-Gapped Workflow)](#appendix-offline-installation-air-gapped-workflow)
+  *General outline for installing everything offline, e.g., in air-gapped environments.*
+
+
+## 1. Introduction & Context
+
+This guide describes how to set up a machine for projects involving **NVIDIA Modulus (version 24.12)** on **Ubuntu 22.04 LTS**. It covers:
+- Setting up Git (including configuring user info and managing SSH keys).
+- Installing NVIDIA drivers, Docker, and the NVIDIA Container Toolkit.
+- Pulling and running the Modulus 24.12 container.
+- Installing Python dependencies and running Jupyter inside the container.
+
+> **Disclaimer**:
+> - These steps have been tested for **online** installation only.
+> - An **offline (air-gapped)** approach is possible but not thoroughly tested. See the **Appendix** at the end of this guide for a suggested offline workflow.
+
+---
+
+## 2. Set Up Git
+
+### 2.1 Configure Git User Info
+
+Before cloning repositories, ensure Git is configured with your user details:
+
+```bash
+git config --global user.email "user@example.com"
+git config --global user.name "YGMaerz"
+```
+
+### 2.2 Manage SSH Keys
+
+There are two main approaches: **creating a new key** or **adding an existing key** to this machine.
+
+#### **Option A: Create a New SSH Key**
+
+1. Generate an **ed25519** key pair:
+   ```bash
+   ssh-keygen -t ed25519 -C "user@example.com" -q -N "" -f ~/.ssh/id_ed25519_key
+   ```
+2. Retrieve the **public** key:
+   ```bash
+   cat ~/.ssh/id_ed25519_key.pub
+   ```
+3. Add this public key to your Git provider (e.g., GitHub).
+4. (Optional) Verify permissions:
+   ```bash
+   chmod 700 ~/.ssh
+   chmod 600 ~/.ssh/id_ed25519_key
+   chmod 644 ~/.ssh/id_ed25519_key.pub
+   chmod 644 ~/.ssh/known_hosts # if you have known_hosts entries
+   ```
+
+#### **Option B: Add an Existing SSH Key**
+If you already have a key pair, simply copy it into `~/.ssh/` on your new machine and ensure correct permissions as above.
+
+### 2.3 Configure SSH for GitHub
+Add or update your `~/.ssh/config` (creating the file if it doesn’t exist):
+
+```bash
+cat <<EOF >> ~/.ssh/config
+
+Host github.com
+  AddKeysToAgent yes
+  IdentityFile ~/.ssh/id_ed25519_key
+EOF
+```
+
+Then set file permissions:
+```bash
+chmod 700 ~/.ssh
+chmod 600 ~/.ssh/config
+```
+
+---
+
+## 3. Set Up the Project
+
+1. **Create a project folder** (if you haven’t already):
+   ```bash
+   cd
+   mkdir project
+   ```
+2. **Clone the repositories**:
+   ```bash
+   ssh-keyscan github.com >> ~/.ssh/known_hosts
+   cd project
+   git clone git@github.com:NVIDIA/modulus.git
+   git clone git@github.com:YGMaerz/modulus-dls-api.git
+   cd ..
+   ```
+
+---
+
+## 4. Set Up the Machine
+
+### 4.1 Install NVIDIA Driver (Online)
+
+Install the NVIDIA 525 driver on Ubuntu 22.04:
+
+```bash
+sudo apt-get update
+sudo apt-get install -y nvidia-driver-525
+sudo reboot
+```
+
+After reboot, verify the driver is installed:
+
+```bash
+nvidia-smi
+```
+
+---
+
+### 4.2 Install Docker
+
+1. 
**Remove pre-installed Docker packages** (if any): + ```bash + for pkg in docker.io docker-doc docker-compose docker-compose-v2 podman-docker containerd runc; do + sudo apt-get remove -y $pkg + done + ``` + +2. **Add Docker’s official GPG key & repository**: + ```bash + sudo apt-get update + sudo apt-get install -y ca-certificates curl + + # Create a directory for the Docker key if it doesn't exist + sudo install -m 0755 -d /etc/apt/keyrings + sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg \ + -o /etc/apt/keyrings/docker.asc + sudo chmod a+r /etc/apt/keyrings/docker.asc + + # Add Docker repo to APT sources + echo \ + "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] \ + https://download.docker.com/linux/ubuntu \ + $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \ + sudo tee /etc/apt/sources.list.d/docker.list > /dev/null + ``` + +3. **Install Docker packages**: + ```bash + sudo apt-get update + sudo apt-get install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin + ``` + +4. **Enable non-root Docker usage**: + ```bash + sudo usermod -aG docker $USER + sudo reboot + ``` + +5. **Verify Docker installation**: + ```bash + sudo docker run hello-world + ``` + +--- + +### 4.3 Install NVIDIA Container Toolkit + +1. **Add the NVIDIA Container Toolkit Repository**: + + ```bash + curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey \ + | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg + + curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list \ + | sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' \ + | sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list + + sudo apt-get update + sudo apt-get install -y nvidia-container-toolkit + ``` + +2. **(Optional) Install `nvidia-docker2`**: + ```bash + distribution=$(. /etc/os-release; echo $ID$VERSION_ID) + curl -s -L https://nvidia.github.io/libnvidia-container/gpgkey | sudo apt-key add - + curl -s -L https://nvidia.github.io/libnvidia-container/$distribution/libnvidia-container.list \ + | sudo tee /etc/apt/sources.list.d/nvidia-container-runtime.list + + sudo apt-get update + sudo apt-get install -y nvidia-docker2 + sudo systemctl restart docker + ``` + +--- + +## 5. Install & Run NVIDIA Modulus (24.12) + +### 5.1 Pull Modulus Docker Image + +```bash +docker pull nvcr.io/nvidia/modulus/modulus:24.12 +``` + +### 5.2 Launch the Modulus Container + +1. **Navigate** to the project folder: + + ```bash + cd project/modulus-dls-api/ + ``` + +2. **Run the container** (interactive mode and published port 8888): + + ```bash + docker run --gpus all \ + --shm-size=1g \ + --ulimit memlock=-1 \ + --ulimit stack=67108864 \ + -v "${PWD}:/workspace" \ + -e PROJECT_ROOT="/workspace" \ + --name my_modulus_container \ + -it \ + -p 8888:8888 \ + nvcr.io/nvidia/modulus/modulus:24.12 bash + ``` + +3. **Configure the container**: + Inside the container, set the `PROJECT_ROOT` environment variable: + ```bash + # export PROJECT_ROOT="$HOME/project/modulus-dls-api" + echo 'export PROJECT_ROOT="$HOME/project/modulus-dls-api"' >> ~/.bashrc + source ~/.bashrc + ``` + This will allow the notebooks to find the project root. + +### 5.3 Install Python Requirements +Inside the container, install any project packages incl. 
project-specific dependencies:
+```bash
+# pip install -r examples/cfd/darcy_autoML_active_learning/requirements.txt
+cd examples/cfd/darcy_autoML_active_learning
+pip install -e .
+```
+
+### 5.4 Exit Container
+- **Leave** the container:
+  ```bash
+  exit
+  ```
+
+---
+
+## 6. Restarting the Container & Running Jupyter Notebook
+
+### 6.1 Restart & Attach to the Container
+If you exited your running container, you can easily restart and reattach to it:
+```bash
+docker start my_modulus_container
+docker exec -it my_modulus_container bash
+```
+
+### 6.2 (Optional) Launch Jupyter Notebook
+1. **Start Jupyter** inside the container:
+   ```bash
+   cd project/modulus-dls-api/examples/cfd/darcy_autoML_active_learning # optional
+   jupyter notebook --ip=0.0.0.0 --port=8888 --allow-root --no-browser
+   ```
+
+2. **Access Jupyter** on your host:
+   - Ensure you **published** port `8888` when first running the container (using `-p 8888:8888`).
+   - On your host machine, open `http://localhost:8888` in a browser.
+   - You’ll see the Jupyter Notebook interface, allowing you to create and run notebooks within the container environment.
+
+---
+
+# Appendix: Offline Installation (Air-Gapped Workflow)
+
+> **Disclaimer**: This approach is **not fully tested**. Adjust filenames/versions to your environment.
+
+## 1. Prepare on an Online Machine
+
+### 1.1 Gather Required Packages
+
+1. **Ubuntu packages**
+   You’ll need `.deb` packages for:
+   - **NVIDIA Driver** (e.g., 525) if it’s not already installed on your offline machine.
+   - **Docker** (Docker CE, CLI, containerd, etc.).
+   - **NVIDIA Container Toolkit** (e.g., `nvidia-docker2`, `nvidia-container-toolkit`).
+
+   > **Tip**: On the online machine, configure the same Ubuntu version (22.04) repositories, then do:
+   > ```bash
+   > apt-get update
+   > apt-get download <package-names>
+   > ```
+   > This will download `.deb` files locally instead of installing them. You can also check official repositories or use apt-cacher methods.
+
+2. **NVIDIA Modulus Docker Image**
+   - Pull the image on the online machine:
+   ```bash
+   docker pull nvcr.io/nvidia/modulus/modulus:24.12
+   ```
+   - **Save** the image to a file:
+   ```bash
+   docker save -o modulus_24.12.tar nvcr.io/nvidia/modulus/modulus:24.12
+   ```
+   - Later, you’ll transfer `modulus_24.12.tar` to your offline machine and load it into Docker.
+
+3. 
**Git Repositories or Source Code** + - If your offline machine won’t have direct GitHub access, **clone** or **archive** the repositories on the online machine. + ```bash + git clone git@github.com:NVIDIA/modulus.git + git clone git@github.com:YGMaerz/modulus-dls-api.git + ``` + - You can also compress them: + ```bash + tar -czf modulus.tar.gz modulus + tar -czf modulus-dls-api.tar.gz modulus-dls-api + ``` + +4. **Python Dependencies** + - If the project has a `requirements.txt` file, you can **download wheels** using: + ```bash + pip download -r requirements.txt -d ./offline_wheels + ``` + - Transfer the entire `offline_wheels` folder to the offline machine. + +### 1.2 Transfer Files to Offline Machine + +1. **Copy everything** (the `.deb` packages, `.tar` Docker images, zipped repositories, Python wheels, etc.) onto removable media (USB drive, external HDD). +2. **Move** them to your offline machine. + +--- + +## **2. Install on the Offline Machine** + +Once you have all required files on the offline machine, follow these steps: + +### 2.1 Install Ubuntu Packages from `.deb` Files + +1. **NVIDIA Driver** + - If your system doesn’t already have the correct driver installed, install the `.deb` package(s) you downloaded: + ```bash + sudo dpkg -i nvidia-driver-525_*.deb + ``` + - Reboot to load the new driver: + ```bash + sudo reboot + ``` + - Verify: + ```bash + nvidia-smi + ``` + +2. **Docker Engine & Dependencies** + - Remove any existing Docker packages (optional but recommended): + ```bash + for pkg in docker.io docker-doc docker-compose docker-compose-v2 \ + podman-docker containerd runc; do + sudo apt-get remove -y $pkg + done + ``` + - Install `.deb` packages for Docker (e.g., `docker-ce`, `docker-ce-cli`, `containerd.io`, `docker-compose-plugin`, etc.): + ```bash + sudo dpkg -i docker-ce_*.deb + sudo dpkg -i docker-ce-cli_*.deb + sudo dpkg -i containerd.io_*.deb + sudo dpkg -i docker-buildx-plugin_*.deb + sudo dpkg -i docker-compose-plugin_*.deb + ``` + - (Optional) If there are dependency issues, run: + ```bash + sudo apt-get install -f + ``` + - Add your user to the `docker` group and reboot or re-login: + ```bash + sudo usermod -aG docker $USER + sudo reboot + ``` + - Test Docker: + ```bash + sudo docker run hello-world + ``` + +3. **NVIDIA Container Toolkit** + - Install `.deb` packages for `nvidia-docker2`, `nvidia-container-toolkit`, or relevant `.deb` files. + ```bash + sudo dpkg -i nvidia-container-toolkit_*.deb + sudo dpkg -i nvidia-docker2_*.deb + ``` + - Restart Docker so it picks up the new runtime: + ```bash + sudo systemctl restart docker + ``` + +### 2.2 Load the NVIDIA Modulus Docker Image + +1. **Load from Saved Tar** + ```bash + docker load -i modulus_24.12.tar + ``` + This imports the image `nvcr.io/nvidia/modulus/modulus:24.12` into your local Docker registry. + +2. **Verify** + ```bash + docker images + ``` + You should see `nvcr.io/nvidia/modulus/modulus:24.12` in the list. + +### 2.3 Prepare Git Repos / Project Files + +If you transferred the repos as `.tar.gz` archives: +```bash +tar -xzf modulus.tar.gz +tar -xzf modulus-dls-api.tar.gz +``` +Place them into your desired `project/` directory. + +- If you’re using **SSH keys** on the offline machine, ensure you have your `~/.ssh` directory set up with the right permissions: + ```bash + chmod 700 ~/.ssh + chmod 600 ~/.ssh/id_ed25519_key + chmod 644 ~/.ssh/id_ed25519_key.pub + chmod 644 ~/.ssh/known_hosts + ``` + +### 2.4 Install Python Dependencies + +If your project requires Python packages: +1. 
**Navigate** to the project directory (e.g., `modulus-dls-api`). +2. **Install** the packages from local wheels: + ```bash + pip install --no-index --find-links=./offline_wheels -r requirements.txt + ``` + - `--no-index`: prevents pip from trying to reach PyPI. + - `--find-links=./offline_wheels`: points pip to your local folder of wheels. + +--- + +## 3. Run & Verify + +1. **Start the Modulus Container** + ```bash + cd project/modulus-dls-api/ + docker run --gpus all \ + --shm-size=1g \ + --ulimit memlock=-1 \ + --ulimit stack=67108864 \ + -v "${PWD}:/workspace" \ + --name my_modulus_container \ + -it \ + nvcr.io/nvidia/modulus/modulus:24.12 bash + ``` + > **Note**: If you plan to run Jupyter inside the container and need to access it from your host, consider adding `-p 8888:8888`. + +2. **Install Additional Python Requirements** (if not done offline in step 2.4): + ```bash + pip install -r examples/cfd/darcy_autoML_active_learning/requirements.txt + ``` + +3. **Run Jupyter** (if desired): + ```bash + jupyter notebook --ip=0.0.0.0 --port=8888 --allow-root --no-browser + ``` + - Access it via `http://localhost:8888` on your host machine if you published port 8888 in the `docker run` command. + +4. **Stop / Restart Container** + - **Stop** container from inside: + ```bash + exit + ``` + - **Restart**: + ```bash + docker start my_modulus_container + docker exec -it my_modulus_container bash + ``` + +--- + +## 4. Final Notes + +- **File Hashes**: For better security, you may want to verify checksums (e.g., `sha256sum`) of the transferred `.deb` packages, Docker images, and archives. +- **Permissions**: Always confirm SSH folder and file permissions: + ```bash + chmod 700 ~/.ssh + chmod 600 ~/.ssh/id_ed25519_key + chmod 644 ~/.ssh/id_ed25519_key.pub + chmod 644 ~/.ssh/known_hosts + ``` +- **Dependencies**: If you encounter **dependency issues** while installing `.deb` packages, run: + ```bash + sudo apt-get install -f + ``` + or manually install the missing dependencies you also downloaded. +- **Updates**: If you need to update or install new packages, repeat the offline download/transfer process with the updated packages. +- **Modulus Versions**: This example references Modulus version **24.12**. Adjust if you need a different version. + +--- + +This **Appendix** should help guide you through setting up your environment **offline**. Make sure you have all required components downloaded and transferred before starting, and verify installation steps carefully at each stage. diff --git a/examples/cfd/darcy_autoML_active_learning/README.md b/examples/cfd/darcy_autoML_active_learning/README.md new file mode 100644 index 000000000..3888357e1 --- /dev/null +++ b/examples/cfd/darcy_autoML_active_learning/README.md @@ -0,0 +1,96 @@ +# Scalable PDE Surrogate Modeling: Darcy Flow with FNO/AFNO + AutoML & Active Learning + +## Prerequisites +For environment or Docker instructions, please consult **[GETTING_STARTED.md](./GETTING_STARTED.md)**. 
+After completing those steps, you can launch the Modulus container and run the notebooks by: +```bash +# If container isn't running, start it +docker start my_modulus_container +# Attach to it +docker exec -it my_modulus_container bash +# Move into the example folder +cd examples/cfd/darcy_autoML_active_learning +# Launch Jupyter (published on port 8888) +jupyter notebook --ip=0.0.0.0 --port=8888 --allow-root --no-browser +``` +Then open your browser at http://localhost:8888 (with the token link printed in the console), navigate to notebooks/darcy_autoML.ipynb or darcy_active_learning.ipynb. + +## Introduction +This repository demonstrates a **multi-faceted Physics-AI pipeline** for **Darcy Flow** PDE surrogate modeling. It features: +1. **A full data→model pipeline** for Darcy Flow surrogates (FNO/AFNO). +2. **AutoML** hyperparameter tuning (via Optuna or similar). +3. **Offline Active Learning** using MC-Dropout to identify high-uncertainty PDE samples. + +While Darcy Flow serves as our **example PDE**, the underlying **architecture** (modular \`src/\` code, notebooks, MLFlow integration) is designed to scale to **broader PDE problems** in engineering or scientific HPC workflows. + +## Notebooks Overview +1. **Notebook 1:** [darcy_autoML.ipynb](./notebooks/darcy_autoML.ipynb) + - **Introduction & Vision**: Explains the rationale for PDE-based surrogate modeling, plus how we can unify Darcy Flow, neural operators (FNO/AFNO), and an AutoML approach. + - **Data Generation & Loading**: Either synthetically produce Darcy Flow fields or load them from `.pt` files. + - **Surrogate Model Definition**: Demonstrates constructing a **FNOWithDropout** or an AFNO operator from the `src/models/` folder. + - **Hyperparameter Tuning (AutoML)**: Shows how to systematically search over PDE operator hyperparameters (modes, width, depth, etc.) for optimal results. + - **Training Execution**: A configurable training loop (`src/ModelTrainingLoop.py`) logs metrics (optionally to MLFlow), and can handle HPC or local usage. + - **Performance Visualization**: Minimal or extended visualization (train/val losses, PDE predictions). + +2. **Notebook 2:** [darcy_active_learning.ipynb](./notebooks/darcy_active_learning.ipynb) + - **Offline Active Learning**: Builds on the **trained PDE surrogate** from Notebook 1. + - **MC-Dropout** for Uncertainty: Multiple forward passes yield mean & variance for each PDE input. + - **Selecting Top-K**: Identifies which PDE fields are most “uncertain,” potentially requiring additional HPC solver runs or partial retraining. + - **Saving**: Optionally store the top-K uncertain samples in `.pt` format for further data augmentation. + +> **Note**: If you’d like more details on environment setup, Docker usage, or how to run these notebooks in a local vs. HPC scenario, see **[GETTING_STARTED.md](./GETTING_STARTED.md)**. + +## Repository Structure + +``` +darcy_autoML_active_learning/ +├─ notebooks/ +│ ├─ darcy_autoML.ipynb # Notebook 1: Surrogate + AutoML +│ ├─ darcy_active_learning.ipynb # Notebook 2: Offline AL with MC-Dropout +├─ src/ +│ ├─ darcy_automl_active_learning/ +│ │ └─ data_loading.py +│ ├─ models/ +│ │ └─ fno_with_dropout.py +│ │ ├─ model_factory.py +│ │ ├─ ModelTrainingLoop.py +│ ├─ automl +│ ├─ automl.py +│ ├─ AL/ +│ │ ├─ mc_dropout_estimator.py +│ │ └─ offline_al_demo.py (optional) +│ └─ visualization.py +├─ config/ +│ └─ config.yaml +├─ ... 
+└─ GETTING_STARTED.md +└─ requirements.txt +└─ pyproject.toml +``` + +### Key Highlights +- **Data & Surrogate**: Illustrates PDE data ingestion (e.g., uniform grids, Darcy2D) and operator-based networks (FNO, AFNO). +- **AutoML**: Uses a flexible search approach to optimize hyperparameters (learning rate, modes, etc.), easily extended to HPC or multi-GPU usage. +- **Active Learning**: Demonstrates a straightforward **offline** approach with MC-Dropout to rank PDE samples by uncertainty. +- **MLFlow**: Optionally logs training & tuning metrics. You can run `mlflow ui -p 2458` to visualize them in a local browser, or set up SSH port-forwarding on HPC. + +### Using MLFlow (Optional) +If you enable MLFlow logging (e.g., in `config.yaml` or directly in the notebook cells), you can **monitor training** or AL runs in real time: + +1. **Local**: + ```bash + mlflow ui -p 2458 + ``` + Then open [http://127.0.0.1:2458](http://127.0.0.1:2458). +2. **Remote HPC**: + - SSH with `-L 8080:127.0.0.1:8080` + - On remote: `mlflow ui --host 0.0.0.0 --port 8080` + - Local browser → `localhost:8080` + +See the examples in both notebooks for how to integrate MLFlow calls. + +### Next Steps & Vision +Both notebooks emphasize **scalability**: +- Additional PDEs (e.g., subgrid turbulence, multi-physics PDEs) can use the same pipeline with minor changes in data loading & operator choice. +- HPC synergy is achievable via distributed data generation or multi-GPU neural operator training. +- The entire approach can be integrated into a larger **Physics-AI** solution, combining an **ontology-based** data engine, advanced model selection, and real-time HPC embeddings. diff --git a/examples/cfd/darcy_autoML_active_learning/__init__.py b/examples/cfd/darcy_autoML_active_learning/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/examples/cfd/darcy_autoML_active_learning/config/config.yaml b/examples/cfd/darcy_autoML_active_learning/config/config.yaml new file mode 100644 index 000000000..2ecefb5cb --- /dev/null +++ b/examples/cfd/darcy_autoML_active_learning/config/config.yaml @@ -0,0 +1,13 @@ +normaliser: + permeability: + mean: 1.0 + std_dev: 0.5 + darcy: + mean: 0.1 + std_dev: 0.05 + +training: + resolution: 64 + batch_size: 4 + max_pseudo_epochs: 10 + pseudo_epoch_sample_size: 512 diff --git a/examples/cfd/darcy_autoML_active_learning/docs/locating_paths.md b/examples/cfd/darcy_autoML_active_learning/docs/locating_paths.md new file mode 100644 index 000000000..fa706a4e1 --- /dev/null +++ b/examples/cfd/darcy_autoML_active_learning/docs/locating_paths.md @@ -0,0 +1,228 @@ +# **Locating the `darcy_autoML_active_learning` Directory: Detailed Scenarios** + +## Context +This document provides an overview of how we locate the `darcy_autoML_active_learning` directory and its subfolders across different environments (Docker vs. local usage). It describes our scenario-based approach in `path_utils.py` so the paths remain consistent regardless of how or where the code is run. + +We have a Python-based project that includes a folder named `darcy_autoML_active_learning`. Within this folder, we need to locate: + +1. **`darcy_project_root`** (the `darcy_autoML_active_learning` directory itself) +2. **`config_file`** (typically `darcy_project_root/config/config.yaml`) +3. **`data_dir`** (`darcy_project_root/data`) +4. **`results_dir`** (`darcy_project_root/results`) + +Additionally, we maintain a **`repo_root`** concept, which often represents the **root** of the entire repository (`modulus-dls-api/`). 
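+
+For orientation, the relationship between these paths can be sketched as follows. This is only an illustration (the hypothetical snippet below is not the actual `path_utils.py` logic, which is scenario-based as described later); it assumes the snippet lives in `src/darcy_automl_active_learning/` and simply derives the other locations from `darcy_project_root`:
+
+```python
+from pathlib import Path
+
+# Assumption: resolve darcy_project_root from this file's own location,
+# i.e. <darcy_project_root>/src/darcy_automl_active_learning/<this_file>.py
+darcy_project_root = Path(__file__).resolve().parents[2]
+
+# The remaining project paths are fixed subpaths of darcy_project_root.
+config_file = darcy_project_root / "config" / "config.yaml"
+data_dir = darcy_project_root / "data"
+results_dir = darcy_project_root / "results"
+
+# repo_root: the top-level modulus-dls-api/ checkout, three levels above
+# examples/cfd/darcy_autoML_active_learning/.
+repo_root = darcy_project_root.parents[2]
+```
+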
+ +We face variations in environment (Docker vs. non-Docker), variations in how Docker is configured (workspace pointing directly to `modulus-dls-api/` or one directory above it), and whether or not `PROJECT_ROOT` is set. Our goal is to consistently identify the correct paths in all scenarios. + +--- + +## Overview + +In our `modulus-dls-api` repository, we rely on a subfolder named `darcy_autoML_active_learning` for crucial project components: + +- **`darcy_project_root`**: The `darcy_autoML_active_learning` directory itself. +- **`config_file`**: Stored under `darcy_project_root/config/config.yaml`. +- **`data_dir`**: `darcy_project_root/data`. +- **`results_dir`**: `darcy_project_root/results`. + +Additionally, there is a concept of **`repo_root`**, which often represents the top-level repository directory (e.g., `modulus-dls-api/`). We must unify how these paths are identified under a variety of conditions, including: + +- Docker vs. local environments. +- Different Docker workspace directories (`modulus-dls-api/` vs. one level above). +- Presence or absence of the `PROJECT_ROOT` environment variable in Docker. +- Local Jupyter notebooks started in various directories. + +--- + +## **1. Project Structure (Illustrative)** + +A simplified view of the repository might look like this: + +``` +modulus-dls-api/ +├─ examples/ +│ └─ cfd/ +│ └─ darcy_autoML_active_learning/ +│ ├─ config/ +│ │ └─ config.yaml +│ ├─ data/ +│ ├─ results/ +│ ├─ notebooks/ +│ │ └─ darcy_active_learning.ipynb +│ └─ src/ +│ └─ darcy_automl_active_learning/ +│ └─ path_utils.py +└─ ... +``` + +--- + +## **2. General Requirements** + +1. **Docker Usage** + - We may (or may not) have `PROJECT_ROOT` set as an environment variable. + - The “workspace” (the Docker working directory) could be either exactly `modulus-dls-api/` or one directory above it, such as `/home/user/`. + - When the workspace is **exactly** `modulus-dls-api/`, we want to treat it as `"."`. + +2. **Local (Non-Docker) Usage** + - We do **not** rely on `PROJECT_ROOT`. + - We assume the Jupyter Notebook server can be launched from anywhere (the top-level repo directory or deeper inside). + - We do **not** trust the current working directory to always be stable. Instead, we rely on obtaining the absolute path of a Python file (like `path_utils.py`) and navigating from there. + +3. **Desired Path Forms** + - In Docker (when the workspace is the repo root), we prefer to treat that directory as `"."` so that subdirectories appear as `"./examples"`, `"./data"`, etc. + - In local usage, we also prefer relative paths if possible—but we’ll figure them out by code that references `path_utils.py`. + +--- + +## **3. Enumerated Scenarios** + +Below are **eight** scenarios, reflecting Docker vs. non-Docker, plus the presence or absence of `PROJECT_ROOT`, plus the two workspace configurations in Docker. + +### **Docker: Workspace = `modulus-dls-api/`** + +1. **Scenario A1**: Docker, **workspace** = `modulus-dls-api/`, **`PROJECT_ROOT` is set** +2. **Scenario A2**: Docker, **workspace** = `modulus-dls-api/`, **`PROJECT_ROOT` is not set** + +### **Docker: Workspace = one directory above** (e.g. `/home/user/`) + +3. **Scenario B1**: Docker, **workspace** = one level above `modulus-dls-api/`, **`PROJECT_ROOT` is set** +4. **Scenario B2**: Docker, **workspace** = one level above `modulus-dls-api/`, **`PROJECT_ROOT` is not set** + +### **Local Usage (No Docker)** + +Here, we assume `PROJECT_ROOT` is **never** set. + +5. 
**Scenario C1**: Local usage, Jupyter is started in `~/project/modulus-dls-api/` +6. **Scenario C2**: Local usage, Jupyter is started in `~/project/modulus-dls-api/examples/cfd/darcy_autoML_active_learning/` + - We might further vary how many directories we are above or below the top-level. For simplicity, we just illustrate these two. + +*(You may or may not need to further expand local usage scenarios, but these are the main ones we foresee.)* + +--- + +## **4. Desired Paths in Each Scenario** + +Our code must reliably return the following **five** paths: + +1. **`repo_root`** + - Often `"."` if the Docker workspace matches the top-level repo or if the user is already in `modulus-dls-api/`. + - Could be `"./modulus-dls-api"` if the workspace is one directory above. + +2. **`darcy_project_root`** + - Typically `repo_root/examples/cfd/darcy_autoML_active_learning`. + +3. **`config_file`** + - Typically `darcy_project_root/config/config.yaml`. + +4. **`data_dir`** + - `darcy_project_root/data`. + +5. **`results_dir`** + - `darcy_project_root/results`. + +Regardless of environment, these paths should always point to the correct directories/files for the `darcy_autoML_active_learning` project. + +This section contains **one table per scenario**, listing how each path (`repo_root`, `darcy_project_root`, `config_file`, `data_dir`, `results_dir`) should look in code. + +### **Scenario A1**: Docker, Workspace = `modulus-dls-api/`, `PROJECT_ROOT` **is set** + +Even though `PROJECT_ROOT` is set, we consider the workspace (`modulus-dls-api/`) as `"."`. Therefore, **we want**: + +| **Path Variable** | **Desired Value** | +|-----------------------|-----------------------------------------------------------------------------------------------| +| `repo_root` | `.` | +| `darcy_project_root` | `./examples/cfd/darcy_autoML_active_learning` | +| `config_file` | `./examples/cfd/darcy_autoML_active_learning/config/config.yaml` | +| `data_dir` | `./examples/cfd/darcy_autoML_active_learning/data` | +| `results_dir` | `./examples/cfd/darcy_autoML_active_learning/results` | + +*(Note: If you **do** want the code to reflect the environment variable’s absolute path, you’d see something like `/workspace/modulus-dls-api`. But you explicitly stated you prefer `.`. This implies code that normalizes or collapses the absolute path to `"."` if it matches the workspace.)* + +### **Scenario A2**: Docker, Workspace = `modulus-dls-api/`, `PROJECT_ROOT` **is not set** + +Now there is no environment variable. The code sees it’s in `modulus-dls-api/`. We want: + +| **Path Variable** | **Desired Value** | +|-----------------------|-----------------------------------------------------------------------------------------------| +| `repo_root` | `.` | +| `darcy_project_root` | `./examples/cfd/darcy_autoML_active_learning` | +| `config_file` | `./examples/cfd/darcy_autoML_active_learning/config/config.yaml` | +| `data_dir` | `./examples/cfd/darcy_autoML_active_learning/data` | +| `results_dir` | `./examples/cfd/darcy_autoML_active_learning/results` | + +### **Scenario B1**: Docker, Workspace = one level above (e.g., `/home/user/`), `PROJECT_ROOT` **is set** + +Now, `PROJECT_ROOT` might be `"/home/user/project/modulus-dls-api"` or similar. The code can detect that path, but how do we **want** the final variables to look? + +Assume we want them to be **relative** to `project/modulus-dls-api/`, but still displayed in a “nice” manner. 
Possibly: + +| **Path Variable** | **Desired Value** | +|-----------------------|-----------------------------------------------------------------------------------------------| +| `repo_root` | `./project/modulus-dls-api` (or maybe still `.` if we want to pretend we’re in `modulus-dls-api`) | +| `darcy_project_root` | `./project/modulus-dls-api/examples/cfd/darcy_autoML_active_learning` | +| `config_file` | `./project/modulus-dls-api/examples/cfd/darcy_autoML_active_learning/config/config.yaml` | +| `data_dir` | `./project/modulus-dls-api/examples/cfd/darcy_autoML_active_learning/data` | +| `results_dir` | `./project/modulus-dls-api/examples/cfd/darcy_autoML_active_learning/results` | + +If you actually want to **collapse** it to `.` meaning `repo_root` is literally `.` from the perspective of Docker, that implies the code detects `"/home/user"` as the workspace but sees `PROJECT_ROOT = "/home/user/project/modulus-dls-api"` and then normalizes it. It’s up to the final design. + +### **Scenario B2**: Docker, Workspace = one level above, `PROJECT_ROOT` **is not set** + +No environment variable is set, but the user is currently at `/home/user/`. If we rely on the fallback “current working directory is `.`,” then: + +| **Path Variable** | **Desired Value** | +|-----------------------|------------------------------------------------------------------------------------------------------------------| +| `repo_root` | `.` (which is `/home/user` in reality) | +| `darcy_project_root` | (Potentially) `./modulus-dls-api/examples/cfd/darcy_autoML_active_learning` | +| `config_file` | `./modulus-dls-api/examples/cfd/darcy_autoML_active_learning/config/config.yaml` | +| `data_dir` | `./modulus-dls-api/examples/cfd/darcy_autoML_active_learning/data` | +| `results_dir` | `./modulus-dls-api/examples/cfd/darcy_autoML_active_learning/results` | + +*(We might prefer an error in this scenario if we think it’s invalid for the user to be in a directory above the repo without `PROJECT_ROOT`. Or we might let it proceed with a relative path that includes `modulus-dls-api/` as a subfolder. This is part of the final design to be discussed.)* + +### **Scenario C1**: Local, Jupyter started in `~/project/modulus-dls-api/` (no Docker, no `PROJECT_ROOT`) + +We do **not** rely on environment variables. We discover `.` is the top-level repo. We prefer: + +| **Path Variable** | **Desired Value** | +|-----------------------|-----------------------------------------------------------------------------------------------| +| `repo_root` | `.` | +| `darcy_project_root` | `./examples/cfd/darcy_autoML_active_learning` | +| `config_file` | `./examples/cfd/darcy_autoML_active_learning/config/config.yaml` | +| `data_dir` | `./examples/cfd/darcy_autoML_active_learning/data` | +| `results_dir` | `./examples/cfd/darcy_autoML_active_learning/results` | + +### **Scenario C2**: Local, Jupyter started in `~/project/modulus-dls-api/examples/cfd/darcy_autoML_active_learning/` + +We are already inside `darcy_autoML_active_learning`. 
We might choose: + +| **Path Variable** | **Desired Value** | +|-----------------------|-----------------------------------| +| `repo_root` | `..` (meaning “one directory up”) or maybe you want to call it `../../..` if you define the top-level differently | +| `darcy_project_root` | `.` (since we are already in `darcy_autoML_active_learning`) | +| `config_file` | `./config/config.yaml` | +| `data_dir` | `./data` | +| `results_dir` | `./results` | + +*(This depends on whether you define “repo root” as the top-level `modulus-dls-api/` or if you define `darcy_autoML_active_learning` itself as a “root.” This is part of the final design decision. If you do consider `modulus-dls-api` the real root, then `repo_root` might be `../../..`. If you consider the `darcy_autoML_active_learning` folder to be the root of the sub-project, then `.` is your root. Either approach can work.)* + +--- + +## 5. Implementation Approach + +We use a **scenario-based** method in `path_utils.py`: + +1. **Identify Scenario**: + - `identify_scenario()` checks whether we are in Docker, whether `PROJECT_ROOT` is set, and (optionally) the Docker “workspace” location. It returns a code like `"A1"` or `"B2"`. + +2. **Scenario-Specific Functions**: + - For each recognized scenario (e.g., `get_paths_for_A1()`), we specify exactly how `repo_root` is determined and how subpaths (`darcy_project_root`, `config_file`, etc.) are built. + +3. **Public Entry Point**: + - `get_paths()` calls `identify_scenario()` and dispatches to the matching function. It returns a tuple (or dictionary) containing `(repo_root, darcy_project_root, config_file, data_dir, results_dir)`. + +4. **Fallback / Not Implemented**: + - If a scenario is not yet implemented, we raise `NotImplementedError`. + +This structure keeps code straightforward: each scenario is in its own function, and the `identify_scenario()` function is small and focused. \ No newline at end of file diff --git a/examples/cfd/darcy_autoML_active_learning/docs/python_package_setup.md b/examples/cfd/darcy_autoML_active_learning/docs/python_package_setup.md new file mode 100644 index 000000000..86b9e7f91 --- /dev/null +++ b/examples/cfd/darcy_autoML_active_learning/docs/python_package_setup.md @@ -0,0 +1,167 @@ +# Darcy AutoML & Active Learning: Python Package Setup + +This document explains how to set up and install your **Darcy autoML & active learning** code as a **pip-installable Python package**, ensuring smooth imports in both Docker and local environments. + +--- + +## 1. Overview + +Historically, the code for Darcy PDE, FNO-based autoML, and active learning was mixed into notebooks with relative imports or `sys.path` hacks. We now transition to a **proper Python package** located under `src/`. This allows: + +- **Clean imports** in notebooks: `import darcy_automl_active_learning` +- **Editable installs** (`pip install -e .`), so code changes reflect immediately +- Compatibility with both **Docker** (via your `GETTING_STARTED.md` steps) and **local** usage + +--- + +## 2. Directory Structure + +Below is an example folder layout under `modulus-dls-api/examples/cfd/darcy_autoML_active_learning/`: + +``` +darcy_autoML_active_learning/ +├─ notebooks/ +│ ├─ darcy_autoML.ipynb +│ ├─ darcy_active_learning.ipynb +├─ src/ +│ └─ darcy_automl_active_learning/ +│ ├─ __init__.py +│ ├─ AutoMLCandidateModelSelection.py +│ ├─ data_desc_logic.py +│ ├─ ModelTrainingLoop.py +│ ├─ ... +├─ config/ +│ └─ config.yaml +├─ pyproject.toml +├─ requirements.txt +└─ README.md +``` + +### Key Points + +1. 
**Source code** goes in `src/darcy_automl_active_learning/`. +2. **`pyproject.toml`** defines the package name, version, and dependencies. +3. **Notebooks** now do imports like `from darcy_automl_active_learning import data_desc_logic` instead of relative path fiddling. + +--- + +## 3. Minimal `pyproject.toml` + +Below is an illustrative example: + +```toml +[build-system] +requires = ["setuptools>=60.2.0", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "darcy-automl-active-learning" +version = "0.1.0" +description = "Darcy PDE example with FNO-based autoML and active learning" +authors = [{ name = "YourName" }] +readme = "README.md" +license = { text = "Apache-2.0" } +dependencies = [ + "optuna==4.1.0", + "mlflow>=2.1.1", + "tqdm>=4.66.5" +] +# or any other core Python deps + +[tool.setuptools.packages.find] +where = ["src"] +``` + +#### Explanation + +- **`where = ["src"]`**: Tells setuptools to find packages under `src/`. +- **`dependencies = [...]`**: List your core Python dependencies (originally in `requirements.txt`). You can keep them pinned or flexible. +- **`name = "darcy-automl-active-learning"`**: The package’s name as PyPI would see it (not strictly used locally, but important to identify the package). + +--- + +## 4. Installing the Package + +### 4.1 Docker or Local + +From `examples/cfd/darcy_autoML_active_learning/`, run: + +```bash +pip install -e . +``` + +**`-e .`** (editable mode) installs a link to your local `src/darcy_automl_active_learning` code. If you edit `.py` files, those changes appear the next time you run or reload code—no need to reinstall. + +### 4.2 Verification + +```python +import darcy_automl_active_learning +print(darcy_automl_active_learning.__version__) +``` + +If the import and version print work, your environment is properly set up. + +No more `ModuleNotFoundError` or `sys.path` tweaks! + +--- + +## 5. Usage in Jupyter Notebooks + +Once installed, your notebooks can simply do: + +```python +from darcy_automl_active_learning.AutoMLCandidateModelSelection import ( + automl_candidate_model_selection, + save_candidate_models +) + +candidates = automl_candidate_model_selection(...) +``` + +Or: + +```python +from darcy_automl_active_learning import ModelTrainingLoop + +ModelTrainingLoop.run_modulus_training_loop(cfg, model, ...) +``` + +No local path hacks needed—Python sees your package like any other installed module. + +--- + +## 6. HPC / Docker Workflow + +1. **Mount or clone** your repository so Docker or HPC sees the `darcy_autoML_active_learning` folder. +2. **`pip install -e .`** from that folder. +3. **Run** your notebooks or scripts. They do `import darcy_automl_active_learning.*` without issues. + +If using Docker, you can integrate this step into your `Dockerfile` or do it manually after starting the container. + +--- + +## 7. Rationale & Benefits + +1. **Cleaner Imports**: No need for `sys.path.append(...)`. +2. **Editable Installs**: Continuous development—edits in `.py` files reflect immediately. +3. **Package Organization**: Encourages splitting large notebooks into reusable modules (training loops, data logic, etc.). +4. **Easier Distribution**: You could upload this package to a private PyPI or share a wheel if needed. + +--- + +## 8. Common Issues / FAQ + +### **ModuleNotFoundError** + +- **Cause**: You forgot to run `pip install -e .`, or your directory name/package name changed. +- **Fix**: Verify `pyproject.toml`’s `[project] name = ...` and `where = ["src"]`, then reinstall with `-e`. 
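+
+If the import still fails, or seems to resolve to the wrong place, a quick diagnostic helps (a minimal sketch; the only names it assumes are the package and distribution names from the `pyproject.toml` above):
+
+```python
+# Show where Python resolves the package from; for an editable install this
+# should point into .../darcy_autoML_active_learning/src/darcy_automl_active_learning/.
+import darcy_automl_active_learning
+print(darcy_automl_active_learning.__file__)
+
+# The installed distribution version comes from pyproject.toml, even if the
+# package itself does not define __version__.
+from importlib.metadata import version
+print(version("darcy-automl-active-learning"))
+```
+
+Run this in the same environment (and Jupyter kernel) in which you ran `pip install -e .`.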
+ +### **Edits Not Reflecting** + +- **Cause**: You installed normally (not in editable mode), or Jupyter is caching. +- **Fix**: Ensure you used `-e .`. In Jupyter, you can `%load_ext autoreload` and `%autoreload 2` for dynamic reloading of modules. + +### **Dependency Conflicts** + +- **Cause**: You pinned dependencies in `pyproject.toml` that conflict with an existing environment. +- **Fix**: Either loosen version pins, or manage them in a separate environment. For advanced usage, consider tools like `poetry` or `conda`. diff --git a/examples/cfd/darcy_autoML_active_learning/notebooks/darcy_active_learning.ipynb b/examples/cfd/darcy_autoML_active_learning/notebooks/darcy_active_learning.ipynb new file mode 100644 index 000000000..b46862e4d --- /dev/null +++ b/examples/cfd/darcy_autoML_active_learning/notebooks/darcy_active_learning.ipynb @@ -0,0 +1,208 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Introduction and Context\n", + "This notebook extends our previous **Darcy Flow FNO** prototype by illustrating an **offline active learning** (AL) workflow. We assume we have a **trained PDE surrogate**—in particular, a dropout-enabled model (e.g., `FNOWithDropout`). The essence of **offline AL** is to identify “where the model is most uncertain” and select those PDE inputs for additional simulation, partial retraining, or deeper inspection. We’ll use **MC-Dropout** to measure the variance of multiple stochastic forward passes, guiding us to the PDE fields that are likely the hardest or “least understood” by the current model." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Table of Contents\n", + "1. [Introduction and Context](#introduction-and--context) \n", + " - Explains how **offline active learning** builds upon the **trained PDE surrogate** from the first notebook. \n", + " - Summarizes the concept of using **MC-Dropout** to identify uncertain PDE samples.\n", + "\n", + "2. [AL_01 Setup & Configuration](#al_01-setup-and-configuration) \n", + " - Loads `config.yaml` or Hydra configuration for AL (checkpoint paths, dropout rate, num of MC passes, etc.). \n", + " - Imports necessary libraries (`torch`, `Darcy2D`, `MCDropoutEstimator`, etc.).\n", + "\n", + "3. [AL_02 Model Loading](#al_02-model-loading) \n", + " - Constructs the **dropout-enabled** operator (e.g., `FNOWithDropout`) with matching hyperparams from the prior notebook. \n", + " - Loads the trained checkpoint (`.pt`), sets device (`cpu` or `cuda`).\n", + "\n", + "4. [AL_03 Candidate Data Generation](#al_03-candidate-data-generation) \n", + " - Illustrates generating a batch of PDE inputs (e.g., via `Darcy2D`) for the AL procedure. \n", + " - Optionally visualizes or logs the shape/stats of these candidate fields.\n", + "\n", + "5. [AL_04 MC-Dropout Uncertainty Estimation](#al_04-mc-dropout-uncertainty-estimation) \n", + " - Explains how **MCDropoutEstimator** runs multiple stochastic forward passes. \n", + " - Computes mean and variance across passes for each candidate PDE input, storing them in `mean_pred` and `var_pred`.\n", + "\n", + "6. [AL_05 Selecting Top-K Uncertain Samples](#al_05-selecting-top-k-uncertain-samples) \n", + " - Aggregates uncertainty (e.g., averaging `var_pred` across spatial dims). \n", + " - Picks the top-K uncertain PDE samples, logs or prints them for clarity.\n", + "\n", + "7. 
[AL_06 Save & (Optionally) Retrain](#al_06-save--optionally-retrain) \n", + " - Creates a `.pt` dictionary with the chosen PDE inputs (and predictions) for potential partial retraining or HPC PDE solves. \n", + " - Mentions how we’d add these new PDE fields back into the main dataset.\n", + "\n", + "8. [AL_07 Optional Repeated AL Loop](#al_07-optional-repeated-al-loop) \n", + " - Summarizes how we could iterate the procedure: generate → pick top-K → add to dataset → retrain → repeat. \n", + " - More relevant for a “live” or multi-step AL scenario.\n", + "\n", + "10. [Conclusion & Next Steps](#conclusion--next-steps) \n", + " - Recaps the **offline AL** approach for Darcy Flow. \n", + " - Hints at advanced usage: HPC parallel generation, repeated AL loops, or integration with the first notebook’s AutoML pipeline.\n", + "\n", + "> **Reference**: For data generation, model building, or hyperparameter tuning steps, see our **[Main Notebook](darcy_autoML.ipynb)**. This AL notebook specifically focuses on uncertainty-driven sample selection once a dropout-based PDE surrogate is trained." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### AL_01 Setup and Configuration\n", + "In this section, we **load our AL-specific configuration** (e.g., checkpoint paths, dropout rate, number of Monte Carlo passes, and the top-K selection size). We also import essential dependencies such as PyTorch, the Darcy PDE generator, and our custom `MCDropoutEstimator`. This step ensures all parameters are aligned with the environment used in the previous notebook." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "source": [ + "### AL_02 Model Loading\n", + "Here, we **construct the dropout-based PDE surrogate** (e.g., `FNOWithDropout`) with the same hyperparameters used in training, then **load the trained checkpoint**. We switch the model to `.eval()` mode but keep dropout layers “active” for the MC passes. This stage verifies the model’s integrity and device setup before proceeding to uncertainty estimation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "source": [ + "### AL_03 Candidate Data Generation\n", + "Next, we demonstrate how to **generate a batch of PDE inputs** (e.g., using the `Darcy2D` data loader) for the AL procedure. We pull a single batch (or more, if needed) of permeability fields, which serve as our “candidates.” Optionally, we may visualize or summarize these input tensors (e.g., shape checks) to confirm correctness." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "source": [ + "### AL_04 MC-Dropout Uncertainty Estimation\n", + "This section details the **core of the AL process**—**MC-Dropout**. Using `MCDropoutEstimator`, we run multiple forward passes on each candidate PDE input, capturing **mean** and **variance** predictions for each sample. This reveals how “confident” or “uncertain” the surrogate is about each PDE field. 
The resulting arrays `mean_pred` and `var_pred` contain our per-sample signals for subsequent top-K selection." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "source": [ + "### AL_05 Selecting Top-K Uncertain Samples\n", + "We now **aggregate** the voxel-wise variances (e.g., average them spatially) to get a single scalar “uncertainty” per candidate sample. We then **pick the top-K** PDE fields where the model is least certain—those presumably requiring additional solver runs or retraining data. This section prints or logs the top uncertain items, helping you see which PDE inputs are outliers or otherwise uncharted territory for the current surrogate." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "source": [ + "### AL_06 Save & (Optionally) Retrain\n", + "Once we have the top-K uncertain samples, we can **save them** into a `.pt` dictionary (including their input fields, mean predictions, and variance maps). In a subsequent step—possibly in the first notebook’s pipeline or HPC environment—these PDE samples could be **added** to the training set for partial retraining. This ensures future surrogates become more robust in precisely the scenarios that cause uncertainty today." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "source": [ + "### AL_07 Optional Repeated AL Loop\n", + "Here, we highlight how to **cycle** the offline AL approach for multiple rounds—**generate → pick top-K → retrain → repeat**. Although this notebook focuses on a single demonstration pass, more advanced workflows might iterate until the PDE surrogate’s uncertainty (and validation error) converge to acceptable levels. This approach can be integrated with HPC or a live PDE solver to refine the surrogate in real-time.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "source": [ + "### Conclusion & Next Steps\n", + "Finally, we **recap** how offline AL, powered by **MC-Dropout**, helps prioritize PDE samples for further simulation or partial retraining. With minimal overhead, you can systematically hone the surrogate where it struggles most. Going forward, you might incorporate HPC parallel data generation, or weave AL into the **[Main Notebook](darcy_autoML.ipynb)** pipeline for a cohesive PDE–AI ecosystem. This ensures an ever-improving operator model, driven by uncertainty-driven sample additions." 
+ ] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/cfd/darcy_autoML_active_learning/notebooks/darcy_autoML.ipynb b/examples/cfd/darcy_autoML_active_learning/notebooks/darcy_autoML.ipynb new file mode 100644 index 000000000..8abd611cf --- /dev/null +++ b/examples/cfd/darcy_autoML_active_learning/notebooks/darcy_autoML.ipynb @@ -0,0 +1,1614 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Introduction & Vision\n", + "Welcome to this **prototype notebook**, which illustrates a **foundational workflow** for **Physics-AI** modeling, using **Darcy Flow** (and an **FNO** approach) as a **concrete example**. The goals of this notebook are **multi-faceted**:\n", + "\n", + "1. **Showcase a workflow** for **Physics-AI training** with **Darcy Flow** data, emphasizing how easily we can integrate **FNO** models in **NVIDIA Modulus** or similar frameworks. \n", + "2. **Demonstrate a general-purpose AutoML approach**—one that systematically searches for optimal **hyperparameters** (learning rate, channels, modes, etc.) across *any* PDE or neural operator. \n", + "3. **Preview Active Learning** as a complementary strategy to guide data acquisition based on model **uncertainty**. (In this notebook, I’ll outline how to do it; the *actual active learning code* is placed in a second notebook, [`darcy_active_learning.ipynb`](darcy_active_learning.ipynb).)\n", + "\n", + "Beyond these **current technical** accomplishments, this notebook also hints at **inspirational** next steps—extending from a **single PDE** toward a broad **Physics-AI solution** that includes:\n", + "\n", + "#### I. Technical (Software Engineering)\n", + "- **Ontology-based data transformations**: A structured, automated pipeline for bridging different PDE data shapes (mesh ↔ grid ↔ point cloud). It reduces manual conversions and helps unify HPC outputs (e.g., AMReX) with ML-ready arrays. \n", + "- **Ontology engine**: A framework that **detects** dataset geometry (e.g., uniform grid vs. unstructured mesh) and picks the right operator or transformation step. This paves the way for “one-click” PDE model building, especially when integrated with AutoML. \n", + "- **AutoML for candidate selection**: Not just hyperparameters, but also *which* neural operator (FNO, AFNO, WNO, PINNs, etc.) is best for a given domain geometry or PDE. This accelerates experimentation by automatically ranking model architectures. \n", + "- **Advanced workflow, pipelines, and training**: Encompasses **HPC synergy** (ingesting large or partially refined HPC data), orchestrated pipelines (e.g., Kedro, Airflow), and **accelerated PDE surrogate training** (multi-GPU, distributed). Together, these let us efficiently handle time-evolving PDE snapshots and large-scale parameter sweeps in real engineering environments.\n", + "\n", + "#### II. Model & Physics Content\n", + "- **Models: WNO, NUFNO, etc.**:\n", + " Beyond standard FNO, options like **Wavelet Neural Operator (WNO)** capture sharper local features via wavelet transforms, while **Non-Uniform FNO (NUFNO)** accommodates partial refinements or semi-structured domains. 
These advanced architectures can improve accuracy without a complete shift to graph-based methods.\n", + "\n", + "- **Customized operator designs**:\n", + " Domain-specific enhancements—e.g., a plasma-tailored FNO or specialized boundary treatments—boost performance on PDEs with unique constraints (sharp separatrices, anisotropy). This ensures surrogates match real-world physics more precisely than generic operators.\n", + "\n", + "- **Ensembles / partial refinement & local closures**:\n", + " In HPC settings with variable mesh refinement or subgrid phenomena, a hybrid approach (e.g., FNO + DiffusionNet) can handle global PDE patterns while focusing local operators on high-resolution patches. This preserves large-scale coverage and detail where it matters most.\n", + "\n", + "- **Multi-scale patterns** \n", + " Many PDEs combine broad wave modes with fine-edged phenomena (e.g., subgrid turbulence). Leveraging wavelet-based or ensemble architectures means each scale can be tackled effectively—ensuring no critical features get lost.\n", + "\n", + "- **Multi-physics regimes**:\n", + " Real engineering tasks often blend multiple physics (e.g., fluid–structure interaction, electromagnetic–thermal coupling). By composing or extending neural operators for each sub-physics domain, we can solve coupled PDE sets under one pipeline.\n", + "\n", + "- **Physics-Informed Loss (added to current physics-informed architecture)**:\n", + " Incorporating PDE constraints directly into the training objective ensures surrogates adhere to known physics. This is invaluable for **inverse problem solving** (where data can be sparse) and for overall stability/robustness when extrapolating to new parameter regimes.\n", + "\n", + "#### III. Downstream Applications\n", + "- **Inverse Problem Solving**:\n", + " Quickly invert PDE relationships to find **which input conditions** yield a desired output (e.g., *some configuration for a target outcome value*). This drastically reduces design-cycle times compared to iterative HPC solves.\n", + "\n", + "- **Optimization**:\n", + " Plug surrogates into parametric optimization loops (shape optimization, operational parameter tuning). The surrogate’s fast inference replaces expensive HPC calls at each iteration, speeding up design exploration.\n", + "\n", + "- **Deployment, HPC workflows, and MLOps**:\n", + " Once the model is trained, seamlessly **deploy** it alongside HPC codes for real-time PDE updates, controlling or monitoring processes. MLOps features (monitoring, versioning) ensure reliability, traceability, and easy model updates in production or research HPC clusters.\n", + "\n", + "I’m calling it a **“kick-off”** project because, even though it’s built around Darcy Flow and FNO, the underlying design can **readily scale**—both in terms of PDE complexity (multi-scale turbulence, advanced HPC data) and in terms of **workflow** (AutoML, HPC integration, interactive active learning, etc.). By adopting these modular components, we set the stage for a future in which **Physics-AI** model development becomes more automated, adaptable, and robust—serving a wide range of scientific and engineering challenges." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Table of Contents\n", + "1. 
[00_Generate_Data](#00_generate_data) \n", + " - [00_01 Darcy Flow Simulation + Data Descriptor Creation](#00_01-darcy-flow-simulation--data-descriptor-creation) \n", + " - Demonstrates **generating synthetic Darcy flow data** (via `Darcy2D`) and creating a **data descriptor**. This lays the groundwork for PDE data ingestion, transformations, and future AutoML usage.\n", + "\n", + "2. [01_Build_Surrogate_Model](#01_build_surrogate_model) \n", + " - [01_00 AutoMLCandidateModelSelection](#01_00-automl-candidate-model-selection)\n", + " - **Motivation**: Showcases how we determine which PDE surrogate model(s) might work best given a dataset descriptor. \n", + " - **01_00_01 Data Descriptor & Model Registry Initialization** \n", + " - Loads the generated data descriptor, creates or loads a `ModelRegistry` with metadata for multiple models (FNO, AFNO, DiffusionNet, etc.). \n", + " - **01_00_02 Candidate Model Selection & Validation** \n", + " - Validates descriptor, applies selection logic (e.g., simple rules or advanced ranking), and saves the selected model(s) for downstream pipeline steps.\n", + "\n", + " - [01_01 Data Loading and (Optional) Data Transformation](#01_01-data-loading-and-optional-data-transformation) \n", + " - Covers loading raw `.pt` files or synthetic data, plus transformations like normalization or boundary labeling. \n", + " - [01_01_01 LoadRawData](#01_01_01-loadrawdata) \n", + " - Shows how `.pt` data is read, with minimal Exploratory Data Analysis (EDA). \n", + " - [01_01_03 TransformRawData](#01_01_03-transformrawdata) \n", + " - Applies any coordinate expansions, normalization, or shape fixes. \n", + " - [01_01_04 Preprocessing](#01_01_04-preprocessing) \n", + " - Optional steps for data quality checks or outlier removal. \n", + " - [01_01_05 FeaturePreparation](#01_01_05-featurepreparation) \n", + " - Final feature engineering, e.g., boundary-channel additions.\n", + "\n", + " - [01_02 Model Definition](#01_02-model-definition) \n", + " - **Implements** the PDE surrogate networks (e.g., `FNOWithDropout`, AFNO). Explains class architecture and relevant config fields.\n", + "\n", + " - [01_03 Model Factory](#01_03-model-factory) \n", + " - Demonstrates a single function `get_model(cfg)` that returns a chosen operator based on `model_name` in the config.\n", + "\n", + " - [01_04 Configuring Hyperparameters](#01_04-configuring-hyperparameters) \n", + " - Discusses reading or overriding hyperparams (Fourier modes, widths, learning rate, etc.) from `config.yaml`. Also references HPC or local usage.\n", + "\n", + " - [01_05 Model Training Loop](#01_05-model-training-loop) \n", + " - Outlines the core training logic: optimizer, loss, epoch iteration, logging (potentially with MLFlow).\n", + "\n", + " - [01_06 Model Training Execution](#01_06-model-training-execution) \n", + " - **Brings it together**: builds a `model`, obtains a `dataloader`, and runs the main training loop. For example:\n", + " ```python\n", + " model = get_model(cfg)\n", + " train_loader = get_darcy_data_loader(cfg)\n", + " final_val_loss = run_modulus_training_loop(cfg, model, train_loader)\n", + " ```\n", + " - Presents how we might do single-run or multi-model iteration.\n", + "\n", + " - [01_07 AutoML and Hyperparameter Tuning](#01_07-automl-and-hyperparameter-tuning) \n", + " - Demonstrates **Optuna** or similar libraries for PDE hyperparameter search (e.g., `modes`, `width`, `depth`, `lr`). Also covers multi-model tuning (FNO vs. 
AFNO).\n", + "\n", + " - [01_08 Visualizing Performance and Results](#01_08-visualizing-performance-and-results) \n", + " - Shows how to **plot** training/validation curves or produce PDE field comparisons (predicted vs. ground truth). Possibly lists best trials from AutoML.\n", + "\n", + "3. [Offline Active Learning (Short Overview)](#offline-active-learning-short-overview)\n", + " - **Note**: Active Learning steps (MC-Dropout for uncertain PDE samples) are covered in a separate notebook. We only summarize here. \n", + " - [Active Learning Notebook](darcy_active_learning.ipynb) — The second file demonstrates:\n", + " 1. Loading a **dropout-enabled** operator,\n", + " 2. Running multiple forward passes for uncertainty,\n", + " 3. Selecting top-K uncertain PDE inputs,\n", + " 4. (Optionally) saving them for partial retraining or HPC PDE solves.\n", + "\n", + "> *If you only need to see the AL approach, jump directly to [Active Learning Notebook](darcy_active_learning.ipynb).* This first notebook focuses on data generation, model building, and AutoML. " + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The autoreload extension is already loaded. To reload it, use:\n", + " %reload_ext autoreload\n" + ] + } + ], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [], + "source": [ + "import logging\n", + "logging.basicConfig(level=logging.WARNING) # logging.basicConfig(level=logging.DEBUG)" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "from omegaconf import OmegaConf\n", + "from darcy_automl_active_learning.path_utils import get_paths\n", + "\n", + "repo_root_path, darcy_project_root_path, config_file_path, data_dir_path, results_dir_path = get_paths()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 00_01 Darcy Flow Simulation & Data Descriptor Creation\n", + "\n", + "In this section, we will **generate synthetic Darcy flow data** using `Darcy2D` (part of the NVIDIA Modulus datapipe utilities) and **document** the resulting dataset via a standardized **data descriptor**. The data descriptor follows our **physics-AI data taxonomy**, ensuring we capture crucial fields like dimensionality, geometry type, uniformity, and so forth.\n", + "\n", + "1. **Generate Data** \n", + " - We will create multiple `.pt` files containing the Darcy field samples (`permeability` and `darcy`), using the `Darcy2D` datapipe for 2D PDE generation. \n", + " - These files are placed in the `data/00_Generate_Data/` folder.\n", + "\n", + "2. **Load Configuration** \n", + " - We leverage a `config.yaml` that provides hyperparameters for data generation: resolution, batch size, normalizer values, etc.\n", + "\n", + "3. **Create a Data Descriptor** \n", + " - After data generation, we write out a JSON file (e.g., `data_desc.json`) describing the dataset’s structure. This descriptor conforms to our **Core Taxonomy & Ontology** for PDE data. For a 2D uniform grid, we specify fields like `\"dimension\": 2`, `\"geometry_type\": \"grid\"`, `\"uniform\": true`, and the channel layout." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[GenerateData] Initializing Darcy2D data loader...\n", + "[GenerateData] Generating and saving 20 batch files to 'examples/cfd/darcy_autoML_active_learning/data/00_Generate_Data' ...\n", + "Module modulus.datapipes.benchmarks.kernels.initialization 0f5b36a load on device 'cuda:0' took 48.97 ms (cached)\n", + "Module modulus.datapipes.benchmarks.kernels.utils 27df179 load on device 'cuda:0' took 301.55 ms (cached)\n", + "Module modulus.datapipes.benchmarks.kernels.finite_difference d7632e6 load on device 'cuda:0' took 154.55 ms (cached)\n" + ] + } + ], + "source": [ + "import os\n", + "import json\n", + "import torch\n", + "from omegaconf import OmegaConf\n", + "\n", + "# If you have Modulus installed:\n", + "try:\n", + " from modulus.datapipes.benchmarks.darcy import Darcy2D\n", + "except ImportError:\n", + " Darcy2D = None\n", + " print(\"[Warning] 'modulus.datapipes.benchmarks.darcy' not found. Please ensure NVIDIA Modulus is installed.\")\n", + "\n", + "SKIP_EXISTING = True # True -> won't overwrite existing .pt files\n", + "\n", + "# 1) Load config from 'config_file' (path to your config.yaml)\n", + "if not os.path.exists(config_file_path):\n", + " print(f\"[GenerateData] config.yaml not found at {config_file_path}; using fallback defaults.\")\n", + " cfg = OmegaConf.create({\n", + " \"normaliser\": {\n", + " \"permeability\": {\"mean\": 1.0, \"std_dev\": 0.5},\n", + " \"darcy\": {\"mean\": 0.1, \"std_dev\": 0.05}\n", + " },\n", + " \"training\": {\n", + " \"resolution\": 64,\n", + " \"batch_size\": 4,\n", + " \"max_pseudo_epochs\": 10,\n", + " \"pseudo_epoch_sample_size\": 512\n", + " }\n", + " })\n", + "else:\n", + " cfg = OmegaConf.load(config_file_path)\n", + "\n", + "# 2) Construct normaliser info if config.yaml has mean/std for 'permeability' / 'darcy'\n", + "norm_cfg = cfg.normaliser\n", + "normaliser = {\n", + " \"permeability\": (norm_cfg.permeability.mean, norm_cfg.permeability.std_dev),\n", + " \"darcy\": (norm_cfg.darcy.mean, norm_cfg.darcy.std_dev),\n", + "}\n", + "\n", + "# 3) Prepare an output directory under data_dir\n", + "data_path = os.path.join(data_dir_path, \"00_Generate_Data\")\n", + "os.makedirs(data_path, exist_ok=True)\n", + "\n", + "use_cuda_if_avail = True\n", + "device = \"cuda\" if (use_cuda_if_avail and torch.cuda.is_available()) else \"cpu\"\n", + "\n", + "# 4) Try to instantiate Darcy2D (if Modulus is installed)\n", + "if Darcy2D is None:\n", + " print(\"[GenerateData] Darcy2D not available. 
Skipping PDE generation or using fallback.\")\n", + "else:\n", + " print(\"[GenerateData] Initializing Darcy2D data loader...\")\n", + " dataloader = Darcy2D(\n", + " resolution=cfg.training.resolution,\n", + " batch_size=cfg.training.batch_size,\n", + " normaliser=normaliser,\n", + " device=device\n", + " )\n", + "\n", + " # 5) Determine how many batches to save\n", + " num_batches_to_save = cfg.training.batch_size * 5\n", + " print(f\"[GenerateData] Generating and saving {num_batches_to_save} batch files to '{data_path}' ...\")\n", + "\n", + " for i in range(num_batches_to_save):\n", + " save_path = os.path.join(data_path, f\"darcy_batch_{i}.pt\")\n", + " if SKIP_EXISTING and os.path.exists(save_path):\n", + " print(f\"Skipping batch {i} -> file {save_path} already exists (SKIP_EXISTING=True).\")\n", + " continue\n", + "\n", + " batch = next(iter(dataloader))\n", + " torch.save(batch, save_path)\n", + " perm_shape = batch[\"permeability\"].shape\n", + " darcy_shape = batch[\"darcy\"].shape\n", + " print(f\"Saved batch {i} -> permeability {perm_shape}, darcy {darcy_shape}\")\n", + "\n", + " print(\"[GenerateData] Data generation complete!\")\n", + "\n", + "# 6) Create a comprehensive data descriptor (2D uniform grid, 2 channels, etc.)\n", + "data_descriptor = {\n", + " \"descriptor_name\": \"Darcy2D_Uniform_2Ch\",\n", + " \"data_structure\": {\n", + " \"dimension\": 2,\n", + " \"geometry_type\": \"grid\",\n", + " \"uniform\": True,\n", + " \"representation\": {\n", + " \"array_layout\": \"[N, H, W, C]\",\n", + " \"coordinate_mapping\": \"implicit uniform\",\n", + " \"coordinate_bounds\": {\n", + " \"x_min\": 0.0,\n", + " \"x_max\": 1.0,\n", + " \"y_min\": 0.0,\n", + " \"y_max\": 1.0\n", + " }\n", + " },\n", + " \"is_transient\": False,\n", + " \"boundary\": False,\n", + " \"boundary_info\": None,\n", + " \"cell_type\": None,\n", + " \"decimation\": False,\n", + " \"decimation_level\": None,\n", + " \"channels\": 2,\n", + " \"time_steps\": None,\n", + " \"adjacency\": False\n", + " },\n", + " \"metadata\": {\n", + " \"pde_type\": \"Darcy\",\n", + " \"num_samples\": cfg.training.batch_size * 5,\n", + " \"description\": (\n", + " \"2D Darcy dataset with 2 channels (e.g. 'permeability' and 'darcy'), \"\n", + " \"generated via Modulus Darcy2D with uniform grid.\"\n", + " ),\n", + " \"source_script\": \"00_Generate_Data.ipynb\",\n", + " \"file_pattern\": \"darcy_batch_*.pt\",\n", + " \"notes\": \"Each .pt file stores a dict with keys 'permeability' & 'darcy'.\"\n", + " }\n", + "}\n", + "\n", + "# 7) Save the descriptor to JSON in the same directory\n", + "data_desc_path = os.path.join(data_path, \"data_desc.json\")\n", + "with open(data_desc_path, \"w\") as f:\n", + " json.dump(data_descriptor, f, indent=2)\n", + "\n", + "print(f\"[GenerateData] Wrote data descriptor to: {data_desc_path}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 01_00 AutoML Candidate Model Selection\n", + "\n", + "Machine learning pipelines often involve a **model selection** phase, where we decide which model or models best fit a given dataset based on **data compatibility**, **user constraints**, and **desired outcomes** (e.g., accuracy vs. speed). In PDE-based workflows, this can become **challenging** because each **neural operator** or **graph-based surrogate** may expect different **geometry types** (mesh vs. grid), different channels or dimensionality, and so on. 
\n", + "\n", + "In this section, we demonstrate a **simple** version of an **AutoML** or **candidate model selection** pipeline. We show how to:\n", + "\n", + "1. **Load** the data descriptor generated from the previous step (`00_Generate_Data`). \n", + "2. **Initialize** a `ModelRegistry` containing metadata for our six candidate models (e.g., FNO, AFNO, DiffusionNet). \n", + "3. **Create** a `CandidateModelSelector` or logic component that can pick which model(s) to recommend. \n", + "4. **Validate** that the dataset’s descriptor is coherent and meets minimal requirements. \n", + "5. **Select** candidate models (for demonstration, we pick FNO or whichever is compatible). \n", + "6. **Retrieve** the target data structure each candidate model expects. \n", + "7. **Save** the results—so future steps can build on them for data transformations, training pipelines, or hyperparameter tuning." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 01_00_01 Data Descriptor & Model Registry Initialization\n", + "\n", + "In this **first subsection**, we perform the **setup** needed to run the model selection routine:\n", + "\n", + "- **Load** the data descriptor from `data/00_Generate_Data/data_desc.json`, confirming it has the required **fields** (e.g., `dimension`, `geometry_type`, etc.). \n", + "- **Instantiate** our `ModelRegistry`, which knows about each candidate model’s **“accepted formats”**—like “2D uniform grid” for FNO or “3D unstructured mesh” for DiffusionNet. This registry can be extended later to incorporate **hyperparameters**, **training code references**, or **performance** metadata. \n", + "- (Optionally) **Preview** or **print** the loaded descriptor to ensure correctness.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loading data descriptor from 'examples/cfd/darcy_autoML_active_learning/data/00_Generate_Data/data_desc.json'...\n", + "[AutoML] Data descriptor loaded successfully. Domain Data Structures:\n", + " - 'descriptor_name': Darcy2D_Uniform_2Ch\n", + "[AutoML] ModelRegistry initialized. Available models:\n", + " - AFNO\n", + " - DiffusionNet\n", + " - FNOWithDropout\n", + " - FNO\n", + " - GraphCast\n", + " - NuFNO\n", + " - WNO\n", + "[AutoML] Data descriptor is valid.\n", + "[AutoML] Selected candidates: [('FNOWithDropout', 'candidate0'), ('AFNO', 'candidate1')]\n", + "[AutoML] For model 'FNOWithDropout' (key='candidate0'), the required data structure is:\n", + "[{'dimension': [2, 3], 'geometry_type': ['grid'], 'representations': [{'representation_name': 'uniform_grid', 'uniform': True, 'is_voxel_grid': False, 'is_transient_supported': False, 'channels_min': 1, 'channels_max': None, 'boundary_required': False, 'mesh_type': None, 'notes': 'Same shape requirements as vanilla FNO (e.g. 
[N, C, H, W] for 2D).'}]}]\n", + "[AutoML] For model 'AFNO' (key='candidate1'), the required data structure is:\n", + "[{'dimension': [2, 3], 'geometry_type': ['grid'], 'representations': [{'representation_name': 'uniform_grid', 'uniform': True, 'is_voxel_grid': False, 'channels_min': 1, 'channels_max': None, 'boundary_required': False, 'is_transient_supported': True, 'notes': 'Similar data layout to FNO: [N, C, H, W] or [N, C, D, H, W].'}]}]\n", + "[CandidateModelSelector] Saved 2 candidates to: examples/cfd/darcy_autoML_active_learning/data/01_00_AutoMLCandidateModelSelection/chosen_candidates.json\n", + "[AutoMLCandidateModelSelection] Results saved to: examples/cfd/darcy_autoML_active_learning/data/01_00_AutoMLCandidateModelSelection/chosen_candidates.json\n" + ] + } + ], + "source": [ + "import os\n", + "import json\n", + "\n", + "from darcy_automl_active_learning.model_registry.model_registry import ModelRegistry\n", + "from darcy_automl_active_learning.model_selection.candidate_selector import CandidateModelSelector\n", + "from darcy_automl_active_learning.model_selection.selection_strategies import SimpleSelectionStrategy\n", + "# If you have a data descriptor utility:\n", + "# from darcy_automl_active_learning.data_descriptor.data_descriptor_utils import load_data_descriptor\n", + "\n", + "# 1) Identify the path to the data descriptor\n", + "data_desc_path = f\"{data_dir_path}/00_Generate_Data/data_desc.json\"\n", + "print(f\"Loading data descriptor from '{data_desc_path}'...\")\n", + "\n", + "# Optionally, load the descriptor directly if you want to do some pre-check:\n", + "with open(data_desc_path, \"r\") as f:\n", + " data_desc = json.load(f)\n", + "print(\"[AutoML] Data descriptor loaded successfully. Domain Data Structures:\")\n", + "print(f\" - 'descriptor_name': {data_desc.get('descriptor_name')}\")\n", + "\n", + "# 2) Initialize the ModelRegistry (defaults to the 6-7 candidate models)\n", + "registry = ModelRegistry()\n", + "print(\"[AutoML] ModelRegistry initialized. Available models:\")\n", + "for m in registry.list_models():\n", + " print(f\" - {m}\")\n", + "\n", + "# 3) Create our CandidateModelSelector using a strategy\n", + "# Here, we use a \"SimpleSelectionStrategy\" that always picks FNO, just for demonstration.\n", + "strategy = SimpleSelectionStrategy()\n", + "candidate_selector = CandidateModelSelector(\n", + " model_registry=registry,\n", + " selection_strategy=strategy\n", + ")\n", + "\n", + "# 4) Validate the data descriptor\n", + "# (If you have a custom load/validate function, call it here.\n", + "# Or you can rely on candidate_selector internally.)\n", + "is_valid = candidate_selector.validate_data_descriptor(data_desc_path)\n", + "if not is_valid:\n", + " raise ValueError(\"[AutoML] Data descriptor validation failed. 
Please fix the descriptor.\")\n", + "\n", + "print(\"[AutoML] Data descriptor is valid.\")\n", + "\n", + "# 5) Perform candidate model selection\n", + "selected_candidates = candidate_selector.automl_candidate_model_selection(data_desc_path)\n", + "print(\"[AutoML] Selected candidates:\", selected_candidates)\n", + "\n", + "# 6) For each selected model, retrieve the required data structure\n", + "for (model_name, candidate_key) in selected_candidates:\n", + " data_struct = candidate_selector.get_required_data_structure(model_name)\n", + " print(f\"[AutoML] For model '{model_name}' (key='{candidate_key}'), \"\n", + " f\"the required data structure is:\\n{data_struct}\")\n", + "\n", + "# 7) Save the chosen candidates to JSON so that later steps know which model(s) we plan to train\n", + "output_folder = f\"{data_dir_path}/01_00_AutoMLCandidateModelSelection\"\n", + "os.makedirs(output_folder, exist_ok=True)\n", + "save_path = candidate_selector.save_candidate_models(selected_candidates, output_folder)\n", + "print(f\"[AutoMLCandidateModelSelection] Results saved to: {save_path}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 01_00_02 Candidate Model Selection & Validation\n", + "\n", + "In this **second subsection**, we apply the actual **selection** routine:\n", + "\n", + "1. **Validate** the data descriptor, ensuring it meets minimal fields (e.g. `dimension`, `geometry_type`) and that it’s consistent with the PDE problem.\n", + "2. **Apply** the selection logic, which might be as simple as “Pick FNO if the data is 2D–3D uniform,” or as complex as an **algorithm** that ranks models by performance, speed, or hyperparameter constraints.\n", + "3. For each **recommended** model, we fetch the **target data structure** (e.g. `{\"dimension\": [2,3], \"uniform\": true, ...}`) so we know what transformations might be needed.\n", + "4. **Store** or **save** the chosen candidates (and any relevant metadata) so we can retrieve them later when performing data transformations, training, or hyperparameter tuning." + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Candidate: candidate0] Model: FNOWithDropout\n", + " - Required data structure: [{'dimension': [2, 3], 'geometry_type': ['grid'], 'representations': [{'representation_name': 'uniform_grid', 'uniform': True, 'is_voxel_grid': False, 'is_transient_supported': False, 'channels_min': 1, 'channels_max': None, 'boundary_required': False, 'mesh_type': None, 'notes': 'Same shape requirements as vanilla FNO (e.g. 
[N, C, H, W] for 2D).'}]}]\n", + " - Proposed transformation plan: {'model_name': 'FNOWithDropout', 'stages': [{'stage_name': '01_01_01_LoadRawData', 'transform_ops': [{'method': 'copy_only', 'params': {'source_folder': '00_Generate_Data', 'dest_folder': '01_01_LoadRawData', 'subfolder_source': 'candidate0', 'subfolder_dest': 'candidate0', 'data_dir_path': 'examples/cfd/darcy_autoML_active_learning/data'}}]}, {'stage_name': '01_01_03_TransformRawData', 'transform_ops': [{'method': 'copy_only', 'params': {'source_folder': '01_01_LoadRawData', 'dest_folder': '01_01_03_TransformRawData', 'subfolder_source': 'candidate0', 'subfolder_dest': 'candidate0', 'data_dir_path': 'examples/cfd/darcy_autoML_active_learning/data'}}]}, {'stage_name': '01_01_04_Preprocessing', 'transform_ops': [{'method': 'copy_only', 'params': {'source_folder': '01_01_03_TransformRawData', 'dest_folder': '01_01_04_Preprocessing', 'subfolder_source': 'candidate0', 'subfolder_dest': 'candidate0', 'data_dir_path': 'examples/cfd/darcy_autoML_active_learning/data'}}]}, {'stage_name': '01_01_05_FeaturePreparation', 'transform_ops': [{'method': 'copy_only', 'params': {'source_folder': '01_01_04_Preprocessing', 'dest_folder': '01_01_05_FeaturePreparation', 'subfolder_source': 'candidate0', 'subfolder_dest': 'candidate0', 'data_dir_path': 'examples/cfd/darcy_autoML_active_learning/data'}}]}]} \n", + "\n", + "[Candidate: candidate1] Model: AFNO\n", + " - Required data structure: [{'dimension': [2, 3], 'geometry_type': ['grid'], 'representations': [{'representation_name': 'uniform_grid', 'uniform': True, 'is_voxel_grid': False, 'channels_min': 1, 'channels_max': None, 'boundary_required': False, 'is_transient_supported': True, 'notes': 'Similar data layout to FNO: [N, C, H, W] or [N, C, D, H, W].'}]}]\n", + " - Proposed transformation plan: {'model_name': 'AFNO', 'stages': [{'stage_name': '01_01_01_LoadRawData', 'transform_ops': [{'method': 'copy_only', 'params': {'source_folder': '00_Generate_Data', 'dest_folder': '01_01_LoadRawData', 'subfolder_source': 'candidate1', 'subfolder_dest': 'candidate1', 'data_dir_path': 'examples/cfd/darcy_autoML_active_learning/data'}}]}, {'stage_name': '01_01_03_TransformRawData', 'transform_ops': [{'method': 'copy_only', 'params': {'source_folder': '01_01_LoadRawData', 'dest_folder': '01_01_03_TransformRawData', 'subfolder_source': 'candidate1', 'subfolder_dest': 'candidate1', 'data_dir_path': 'examples/cfd/darcy_autoML_active_learning/data'}}]}, {'stage_name': '01_01_04_Preprocessing', 'transform_ops': [{'method': 'copy_only', 'params': {'source_folder': '01_01_03_TransformRawData', 'dest_folder': '01_01_04_Preprocessing', 'subfolder_source': 'candidate1', 'subfolder_dest': 'candidate1', 'data_dir_path': 'examples/cfd/darcy_autoML_active_learning/data'}}]}, {'stage_name': '01_01_05_FeaturePreparation', 'transform_ops': [{'method': 'copy_only', 'params': {'source_folder': '01_01_04_Preprocessing', 'dest_folder': '01_01_05_FeaturePreparation', 'subfolder_source': 'candidate1', 'subfolder_dest': 'candidate1', 'data_dir_path': 'examples/cfd/darcy_autoML_active_learning/data'}}]}]} \n", + "\n", + "[Info] Transformation plans saved to examples/cfd/darcy_autoML_active_learning/data/01_01_DataTransformationPlan/transformation_plans.json.\n" + ] + } + ], + "source": [ + "import os\n", + "import json\n", + "\n", + "from darcy_automl_active_learning.model_selection.candidate_selector import CandidateModelSelector\n", + "from darcy_automl_active_learning.ontology.ontology_engine import OntologyEngine\n", + 
"\n", + "########################\n", + "# 0) READ INPUTS\n", + "########################\n", + "\n", + "# Load the chosen candidates from the previous step (01_00 AutoMLCandidateModelSelection).\n", + "candidates_path = os.path.join(data_dir_path, \"01_00_AutoMLCandidateModelSelection\", \"chosen_candidates.json\")\n", + "with open(candidates_path, \"r\") as f:\n", + " selected_candidates = json.load(f)\n", + "# Example: selected_candidates = [[\"FNO\", \"candidate0\"], [\"DiffusionNet\", \"candidate1\"], ...]\n", + "\n", + "# Load the data descriptor (the \"source\" data structure).\n", + "data_desc_path = os.path.join(data_dir_path, \"00_Generate_Data\", \"data_desc.json\")\n", + "with open(data_desc_path, \"r\") as f:\n", + " data_desc = json.load(f)\n", + "\n", + "########################\n", + "# 1) INITIALIZE CLASSES\n", + "########################\n", + "ontology_engine = OntologyEngine()\n", + "candidate_selector = CandidateModelSelector(\n", + " model_registry=registry,\n", + " selection_strategy=strategy\n", + ")\n", + "\n", + "########################\n", + "# 2) GENERATE TRANSFORMATION PLANS\n", + "########################\n", + "\n", + "all_candidates_plans = {}\n", + "\n", + "for (model_name, candidate_key) in selected_candidates:\n", + " # Query the model registry/selector to get the \"desired/required data structure\"\n", + " target_data_struct = candidate_selector.get_required_data_structure(model_name)\n", + " \n", + " # Ask the ontology engine what transformations are needed\n", + " transformation_plan = ontology_engine.suggest_transformations(\n", + " source_data_desc=data_desc[\"data_structure\"],\n", + " target_data_requirements=target_data_struct,\n", + " model_name=model_name,\n", + " candidate_key=candidate_key,\n", + " data_dir_path=data_dir_path\n", + " )\n", + "\n", + " print(f\"[Candidate: {candidate_key}] Model: {model_name}\")\n", + " print(\" - Required data structure:\", target_data_struct)\n", + " print(\" - Proposed transformation plan:\", transformation_plan, \"\\n\")\n", + " \n", + " # Store the plan in a dictionary for future steps\n", + " all_candidates_plans[candidate_key] = {\n", + " \"model_name\": model_name,\n", + " \"plan\": transformation_plan\n", + " }\n", + "\n", + "# Optionally, save these plans to JSON so other notebook sections can load them\n", + "plans_output_path = os.path.join(data_dir_path, \"01_01_DataTransformationPlan\", \"transformation_plans.json\")\n", + "os.makedirs(os.path.dirname(plans_output_path), exist_ok=True)\n", + "\n", + "with open(plans_output_path, \"w\") as f:\n", + " json.dump(all_candidates_plans, f, indent=2)\n", + "\n", + "print(f\"[Info] Transformation plans saved to {plans_output_path}.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "source": [ + "### 01_01 Data Loading and (Optional) Data Transformation\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 01_01_01 LoadRawData\n", + "\n", + "In this subsection, we handle the **initial import** of raw `.pt` files generated during our **“00_Generate_Data”** step. These files typically contain PDE fields (e.g., **`permeability`**, **`darcy`**) that we’ll eventually feed into one or more candidate models. Specifically, we aim to:\n", + "\n", + "1. **Copy** the `.pt` files from `data/00_Generate_Data/` into a new folder, `data/01_01_LoadRawData/`. \n", + "2. 
**Preserve** the existing data descriptor (`data_desc.json`), ensuring we maintain consistent metadata on dimensions, geometry types, channels, etc. \n", + "3. Optionally perform minimal **exploratory data analysis (EDA)**—for instance, loading a sample `.pt` file and checking array shapes or key names.\n", + "\n", + "Why a separate **LoadRawData** step? By isolating this phase, we keep our pipeline **modular**: each section (loading, transforming, preprocessing, feature engineering) has its own folder and minimal concerns. This structure scales well to more complex PDE workflows or HPC environments, where multiple transformations or domain-specific checks might be added later.\n", + "\n", + "We’ll also demonstrate how we can integrate **transformation plans**—particularly the “01_01_01_LoadRawData” stage from our `transformation_plans.json`—to orchestrate these copy/EDA operations consistently. In a real production setting, you might expand this step to include **additional** data integrity checks (e.g., verifying file counts, ensuring no missing `.pt` or descriptor file), or HPC scheduling logic. For now, our **copy** operation and light EDA illustrate how to set a clear foundation for downstream tasks." + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[LoadRawData] Loaded transformation plans from examples/cfd/darcy_autoML_active_learning/data/01_01_DataTransformationPlan/transformation_plans.json.\n", + "Candidate keys found: ['candidate0', 'candidate1']\n", + "\n", + "[LoadRawData] Processing candidate 'candidate0' for model 'FNOWithDropout'\n", + " -> Invoking 'copy_only' with params: {'source_folder': '00_Generate_Data', 'dest_folder': '01_01_LoadRawData', 'subfolder_source': 'candidate0', 'subfolder_dest': 'candidate0', 'data_dir_path': 'examples/cfd/darcy_autoML_active_learning/data'}\n", + "[OntologyTransformationEngine] COPY_ONLY done: examples/cfd/darcy_autoML_active_learning/data/00_Generate_Data/candidate0 -> examples/cfd/darcy_autoML_active_learning/data/01_01_LoadRawData/candidate0\n", + "\n", + "[LoadRawData] Processing candidate 'candidate1' for model 'AFNO'\n", + " -> Invoking 'copy_only' with params: {'source_folder': '00_Generate_Data', 'dest_folder': '01_01_LoadRawData', 'subfolder_source': 'candidate1', 'subfolder_dest': 'candidate1', 'data_dir_path': 'examples/cfd/darcy_autoML_active_learning/data'}\n", + "[OntologyTransformationEngine] COPY_ONLY done: examples/cfd/darcy_autoML_active_learning/data/00_Generate_Data/candidate1 -> examples/cfd/darcy_autoML_active_learning/data/01_01_LoadRawData/candidate1\n", + "\n", + "[LoadRawData] All candidates processed for stage '01_01_01_LoadRawData'.\n" + ] + } + ], + "source": [ + "import os\n", + "import json\n", + "from darcy_automl_active_learning.ontology.ontology_transformation_engine import OntologyTransformationEngine\n", + "\n", + "# 1) Path to the transformation plans (produced by OntologyEngine in an earlier step).\n", + "plans_json_path = os.path.join(data_dir_path, \"01_01_DataTransformationPlan\", \"transformation_plans.json\")\n", + "\n", + "if not os.path.exists(plans_json_path):\n", + " raise FileNotFoundError(f\"[LoadRawData] Cannot find transformation plans at {plans_json_path}\")\n", + "\n", + "# 2) Load the transformation plans\n", + "with open(plans_json_path, \"r\") as f:\n", + " all_candidates_plans = json.load(f)\n", + "\n", + "print(f\"[LoadRawData] Loaded transformation plans from 
{plans_json_path}.\")\n", + "print(\"Candidate keys found:\", list(all_candidates_plans.keys()))\n", + "\n", + "# Example structure of all_candidates_plans (dictionary):\n", + "# {\n", + "# \"candidate0\": { \"model_name\": \"FNOWithDropout\", \"plan\": {...} },\n", + "# \"candidate1\": { \"model_name\": \"AFNO\", \"plan\": {...} },\n", + "# ...\n", + "# }\n", + "\n", + "# 3) Instantiate our transformation engine\n", + "trans_engine = OntologyTransformationEngine()\n", + "\n", + "# 4) We'll iterate through each candidate, find the stage \"01_01_01_LoadRawData\",\n", + "# and execute its transform_ops in order.\n", + "for candidate_key, plan_info in all_candidates_plans.items():\n", + " # plan_info might look like:\n", + " # {\n", + " # \"model_name\": \"FNOWithDropout\",\n", + " # \"plan\": {\n", + " # \"model_name\": \"FNOWithDropout\",\n", + " # \"stages\": [\n", + " # { \n", + " # \"stage_name\": \"01_01_01_LoadRawData\",\n", + " # \"transform_ops\": [ { \"method\": \"copy_only\", \"params\": {...} }, ... ]\n", + " # },\n", + " # ...\n", + " # ]\n", + " # }\n", + " # }\n", + " model_name = plan_info[\"model_name\"]\n", + " plan_dict = plan_info[\"plan\"]\n", + "\n", + " print(f\"\\n[LoadRawData] Processing candidate '{candidate_key}' for model '{model_name}'\")\n", + "\n", + " # 4a) Retrieve the \"stages\" from plan_dict\n", + " stages = plan_dict.get(\"stages\", [])\n", + " # 4b) Filter for the stage we want -> \"01_01_01_LoadRawData\"\n", + " loadraw_stage = next(\n", + " (st for st in stages if st.get(\"stage_name\") == \"01_01_01_LoadRawData\"),\n", + " None\n", + " )\n", + "\n", + " if loadraw_stage is None:\n", + " print(f\" -> No '01_01_01_LoadRawData' stage found for candidate '{candidate_key}'. Skipping.\")\n", + " continue\n", + "\n", + " # 4c) Execute each transform_op\n", + " transform_ops = loadraw_stage.get(\"transform_ops\", [])\n", + " for op in transform_ops:\n", + " method_name = op[\"method\"]\n", + " params = op[\"params\"]\n", + "\n", + " print(f\" -> Invoking '{method_name}' with params: {params}\")\n", + " # We'll dispatch to the transformation engine methods\n", + " if hasattr(trans_engine, method_name):\n", + " method = getattr(trans_engine, method_name)\n", + " method(**params) # e.g. copy_only(source_folder, dest_folder)\n", + " else:\n", + " print(f\" -> [Warning] Transformation method '{method_name}' not found. Skipped.\")\n", + "\n", + "print(\"\\n[LoadRawData] All candidates processed for stage '01_01_01_LoadRawData'.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "source": [ + "#### 01_01_03 TransformRawData\n", + "\n", + "In this section, we apply (or simulate) **data transformations** needed by each **candidate model**. Recall that the previous step selected one or more target architectures (e.g., `\"FNO\"`, `\"AFNO\"`, `\"DiffusionNet\"`) and assigned them labels (`\"candidate0\"`, `\"candidate1\"`, etc.). Here, each candidate’s data flows from **`01_01_LoadRawData`** into a **new** subfolder—for instance, `data/01_01_03_TransformRawData/candidate0/`.\n", + "\n", + "1. **Copy** the `.pt` files from the “LoadRawData” folder. \n", + "2. **Transform** them per model requirements (if necessary).\n", + "\n", + "> **Why Transform?** \n", + "> Different models impose distinct constraints on the dataset’s **geometry**, **resolution**, or **channels**. 
From an HPC or PDE perspective, transformations ensure the raw data aligns with each model’s assumptions—such as **spectral operators** needing uniform spacing or **mesh-based operators** expecting unstructured vertex/face data.\n", + "\n", + "> **Possible Transformations** might include:\n", + "> - **TRANSFORM_MESH_TO_GRID**: \n", + "> Convert unstructured mesh data to a uniform grid—necessary if you want to feed a domain with irregular elements into a **Fourier** or **wavelet** operator that performs global transforms along regular axes. This can involve interpolation or resampling of original nodal values. \n", + ">\n", + "> - **TRANSFORM_DECIMATE_MESH**: \n", + "> Downsample or reduce vertex count for large HPC-generated meshes. This is often needed if memory constraints or real-time performance requires smaller data sets. Decimation should preserve key PDE features or boundaries without losing critical geometry detail. \n", + ">\n", + "> - **TRANSFORM_REGRID_DATA**: \n", + "> Change resolution from, say, 128×128 to 64×64, matching a model’s input dimension or training memory budget. This is especially relevant for **FNO/AFNO** if your PDE solver originally output very high resolution. \n", + ">\n", + "> - **TRANSFORM_ADD_BOUNDARY_CHANNEL**: \n", + "> Insert an extra channel labeling boundary indices, inlet/outlet regions, or domain interfaces. Many PDE surrogates benefit from explicitly differentiating boundary conditions. \n", + ">\n", + "> - **TRANSFORM_COORDINATE_MAPPING**: \n", + "> Adjust coordinate references (e.g., non-uniform → uniform) or embed extra coordinate fields (e.g., adding `(x, y)` grids as input channels). Useful for **PINNs** or operator-learning methods that rely on positional encodings. \n", + ">\n", + "> - **TRANSFORM_NORMALIZE_TENSORS**: \n", + "> Scale PDE fields to a standard range or distribution (e.g., zero mean, unit variance). This can stabilize training by preventing large differences in scales across multiple PDE variables (e.g., velocity vs. pressure). \n", + ">\n", + "> - **TRANSFORM_TIME_SUBSAMPLING** (if transient data): \n", + "> Select or downsample time steps from a high-frequency simulation if your surrogate only needs coarse temporal resolution.\n", + "\n", + "> **Minimal or Custom** \n", + "> Sometimes, no transformation is needed if the dataset already matches the model’s expected shape (e.g., a 2D uniform grid for FNO). In other scenarios—especially bridging drastically different data formats—transforms can be **extensive** (e.g., partial **voxelization** or complex manifold parameterization for unstructured surfaces).\n", + "\n", + "**Implementation Outline** \n", + "1. **Identify** the selected models and their subfolders (e.g., `\"candidate0\"` for FNO). \n", + "2. **Gather** relevant transformations from a “transformation plan” (possibly stored in JSON or a Python object). \n", + "3. **Apply** the transformations in sequence to each `.pt` file (or geometry file) from the previous step:\n", + " - Each step modifies shapes, channels, geometry format, or resolution. \n", + " - If no transform is required, the script simply copies the data. \n", + "4. **Save** the result in `01_01_03_TransformRawData/candidateX/` with an **updated `data_desc.json`** if the geometry or channels changed. 
That descriptor now reflects the new data layout (e.g., from `dimension=3, geometry_type=\"mesh\"` to `dimension=2, geometry_type=\"grid\"`).\n", + "\n", + "By isolating each candidate’s transformed data, we keep the pipeline modular, ensuring that subsequent **Preprocessing** or **FeaturePreparation** steps can be tailored per model. For demonstration, we’ll parse a JSON file listing our chosen candidates (e.g., `[[\"FNO\", \"candidate0\"], ...]`) and apply a minimal transform (or copying) to confirm the pipeline structure. In a production scenario, you might incorporate advanced geometry libraries (e.g., PyVista, VTK) or PDE-aware boundary labeling at this stage, especially in HPC contexts where domain complexity is high." + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[TransformRawData] Loaded transformation plans from examples/cfd/darcy_autoML_active_learning/data/01_01_DataTransformationPlan/transformation_plans.json.\n", + "Candidate keys found: ['candidate0', 'candidate1']\n", + "\n", + "[TransformRawData] Processing candidate 'candidate0' for model 'FNOWithDropout'\n", + " -> Invoking 'copy_only' with params: {'source_folder': '01_01_LoadRawData', 'dest_folder': '01_01_03_TransformRawData', 'subfolder_source': 'candidate0', 'subfolder_dest': 'candidate0', 'data_dir_path': 'examples/cfd/darcy_autoML_active_learning/data'}\n", + "[OntologyTransformationEngine] COPY_ONLY done: examples/cfd/darcy_autoML_active_learning/data/01_01_LoadRawData/candidate0 -> examples/cfd/darcy_autoML_active_learning/data/01_01_03_TransformRawData/candidate0\n", + "\n", + "[TransformRawData] Processing candidate 'candidate1' for model 'AFNO'\n", + " -> Invoking 'copy_only' with params: {'source_folder': '01_01_LoadRawData', 'dest_folder': '01_01_03_TransformRawData', 'subfolder_source': 'candidate1', 'subfolder_dest': 'candidate1', 'data_dir_path': 'examples/cfd/darcy_autoML_active_learning/data'}\n", + "[OntologyTransformationEngine] COPY_ONLY done: examples/cfd/darcy_autoML_active_learning/data/01_01_LoadRawData/candidate1 -> examples/cfd/darcy_autoML_active_learning/data/01_01_03_TransformRawData/candidate1\n", + "\n", + "[TransformRawData] All candidates processed for stage '01_01_03_TransformRawData'.\n" + ] + } + ], + "source": [ + "import os\n", + "import json\n", + "from darcy_automl_active_learning.ontology.ontology_transformation_engine import OntologyTransformationEngine\n", + "\n", + "# 1) Path to the transformation plans (produced by OntologyEngine in an earlier step).\n", + "plans_json_path = os.path.join(data_dir_path, \"01_01_DataTransformationPlan\", \"transformation_plans.json\")\n", + "\n", + "if not os.path.exists(plans_json_path):\n", + " raise FileNotFoundError(f\"[TransformRawData] Cannot find transformation plans at {plans_json_path}\")\n", + "\n", + "# 2) Load the transformation plans\n", + "with open(plans_json_path, \"r\") as f:\n", + " all_candidates_plans = json.load(f)\n", + "\n", + "print(f\"[TransformRawData] Loaded transformation plans from {plans_json_path}.\")\n", + "print(\"Candidate keys found:\", list(all_candidates_plans.keys()))\n", + "\n", + "# Example structure of all_candidates_plans (dictionary):\n", + "# {\n", + "# \"candidate0\": { \"model_name\": \"FNOWithDropout\", \"plan\": {...} },\n", + "# \"candidate1\": { \"model_name\": \"AFNO\", \"plan\": {...} },\n", + "# ...\n", + "# }\n", + "\n", + "# 3) Instantiate our transformation engine\n", + 
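"# Note: the transformation engine exposes one Python method per op (e.g.\n",
+    "# 'copy_only'); each op's 'method' string in the plan is resolved on the\n",
+    "# engine via getattr() and called with its 'params' dict further below.\n", + 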
"trans_engine = OntologyTransformationEngine()\n", + "\n", + "# 4) We'll iterate through each candidate, find the stage \"01_01_03_TransformRawData\",\n", + "# and execute its transform_ops in order.\n", + "for candidate_key, plan_info in all_candidates_plans.items():\n", + " # plan_info might look like:\n", + " # {\n", + " # \"model_name\": \"FNOWithDropout\",\n", + " # \"plan\": {\n", + " # \"model_name\": \"FNOWithDropout\",\n", + " # \"stages\": [\n", + " # {\n", + " # \"stage_name\": \"01_01_01_LoadRawData\",\n", + " # \"transform_ops\": [...]\n", + " # },\n", + " # {\n", + " # \"stage_name\": \"01_01_03_TransformRawData\",\n", + " # \"transform_ops\": [...]\n", + " # },\n", + " # ...\n", + " # ]\n", + " # }\n", + " # }\n", + " model_name = plan_info[\"model_name\"]\n", + " plan_dict = plan_info[\"plan\"]\n", + "\n", + " print(f\"\\n[TransformRawData] Processing candidate '{candidate_key}' for model '{model_name}'\")\n", + "\n", + " # 4a) Retrieve the list of stages from plan_dict\n", + " stages = plan_dict.get(\"stages\", [])\n", + "\n", + " # 4b) Look for the stage named \"01_01_03_TransformRawData\"\n", + " transformraw_stage = next(\n", + " (st for st in stages if st.get(\"stage_name\") == \"01_01_03_TransformRawData\"),\n", + " None\n", + " )\n", + "\n", + " if transformraw_stage is None:\n", + " print(f\" -> No '01_01_03_TransformRawData' stage found for candidate '{candidate_key}'. Skipping.\")\n", + " continue\n", + "\n", + " # 4c) Execute each transform_op in that stage\n", + " transform_ops = transformraw_stage.get(\"transform_ops\", [])\n", + " for op in transform_ops:\n", + " method_name = op[\"method\"]\n", + " params = op[\"params\"]\n", + "\n", + " # Log the operation\n", + " print(f\" -> Invoking '{method_name}' with params: {params}\")\n", + "\n", + " # Dispatch to the transformation engine methods\n", + " if hasattr(trans_engine, method_name):\n", + " method = getattr(trans_engine, method_name)\n", + " method(**params) # e.g., copy_only(source_folder, dest_folder)\n", + " else:\n", + " print(f\" -> [Warning] Transformation method '{method_name}' not found. Skipped.\")\n", + "\n", + "print(\"\\n[TransformRawData] All candidates processed for stage '01_01_03_TransformRawData'.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "source": [ + "#### 01_01_04 Preprocessing\n", + "\n", + "Even after **data transformations** (e.g., re-gridding or mesh decimation), **real-world PDE workflows** frequently require **additional** refinement before final training. This **preprocessing** ensures the dataset is consistently formatted, free of corruption, and enriched with any domain-specific metadata. Common operations might include:\n", + "\n", + "- **Geometry Augmentation** \n", + " Performing random translations, rotations, or domain cropping to enhance model robustness and generalization. \n", + "\n", + "- **Cleaning & Filtering** \n", + " - **`PREPROC_REMOVE_OUTLIERS`**: Identifying and removing aberrant data points (e.g., extremely large velocities or pressures that arise from solver instabilities). \n", + " - **`PREPROC_DETECT_REPLACE_NANS`**: Automatically detecting `NaN` or infinite values and replacing them with defaults (e.g., zeros) or discarding those samples. \n", + " - **`PREPROC_FILTER_INCOMPLETE_SAMPLES`**: Skipping data entries where certain channels or geometry components are missing (e.g., partial PDE fields). 
\n", + "\n", + "- **Domain-Specific Preprocessing** \n", + " - **`PREPROC_LOG_STATS`**: Logging basic statistics (mean, std, min/max) per channel or boundary region for QA/QC. \n", + " - **`PREPROC_ADD_BOUNDARY_LABELS`**: Adding specialized boundary or interface labels, if not handled in the transform phase. \n", + " - **`PREPROC_ADD_CUSTOM_COORDS`**: Incorporating advanced parameterizations (e.g., polar/spherical coordinates for circular or spherical domains). \n", + " - **`PREPROC_MULTI_PHYSICS_COMBINE`**: Merging multiple PDE fields (e.g., fluid + thermal data) into a unified feature map.\n", + "\n", + "---\n", + "\n", + "In our **prototype** pipeline, we **simplify** preprocessing to minimal or **no** extra modifications. We essentially:\n", + "\n", + "1. **Copy** each candidate’s transformed data files into a **`01_01_04_Preprocessing`** folder. \n", + "2. **Optionally** perform consistency checks (ensuring each `.pt` file has the expected dimensions, verifying boundary channels exist if required, etc.).\n", + "\n", + "> **Why keep this step separate?** \n", + "> Preprocessing is distinct from core transformations because it can be **highly domain-specific** and may evolve over time. For instance, advanced HPC or industrial PDE pipelines might integrate strict validation rules (e.g., confirming mesh connectivity, verifying boundary compliance).\n", + "\n", + "> **Extending Preprocessing** \n", + "> In a **production** setup, you might expand this step to automate the following:\n", + "> - **`PREPROC_REMOVE_OUTLIERS`** and **`PREPROC_DETECT_REPLACE_NANS`** to ensure data integrity. \n", + "> - **`PREPROC_LOG_STATS`** to capture summarizing metrics in a QA/QC log. \n", + "> - **`PREPROC_ADD_BOUNDARY_LABELS`** to incorporate more sophisticated geometry masks. \n", + "> - **Integration** with anomaly detection networks to flag suspicious samples or **retain** high-value domain extremes.\n", + "\n", + "For now, our function (`do_preprocessing_for_candidates`) remains a **placeholder** indicating where these operations would occur. Future versions can expand domain-specific logic as needed, ensuring each candidate’s data is **validated**, **cleaned**, and **augmented** prior to **FeaturePreparation** or final model training." 
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[Preprocessing] Loaded transformation plans from 
examples/cfd/darcy_autoML_active_learning/data/01_01_DataTransformationPlan/transformation_plans.json.\n", + "Candidate keys found: ['candidate0']\n", + "\n", + "[Preprocessing] Processing candidate 'candidate0' for model 'FNOWithDropout'\n", + " -> No '01_01_04_Preprocessing' stage found for candidate 'candidate0'. Skipping.\n", + "\n", + "[Preprocessing] All candidates processed for stage '01_01_04_Preprocessing'.\n" + ] + } + ], + "source": [ + "import os\n", + "import json\n", + "from darcy_automl_active_learning.ontology.ontology_transformation_engine import OntologyTransformationEngine\n", + "\n", + "# 1) Path to the transformation plans\n", + "plans_json_path = os.path.join(data_dir_path, \"01_01_DataTransformationPlan\", \"transformation_plans.json\")\n", + "if not os.path.exists(plans_json_path):\n", + " raise FileNotFoundError(f\"[Preprocessing] Cannot find transformation plans at {plans_json_path}\")\n", + "\n", + "# 2) Load the transformation plans\n", + "with open(plans_json_path, \"r\") as f:\n", + " all_candidates_plans = json.load(f)\n", + "\n", + "print(f\"[Preprocessing] Loaded transformation plans from {plans_json_path}.\")\n", + "print(\"Candidate keys found:\", list(all_candidates_plans.keys()))\n", + "\n", + "# 3) Instantiate or reuse the transformation engine\n", + "trans_engine = OntologyTransformationEngine()\n", + "\n", + "# We'll assume the source folder is \"01_01_03_TransformRawData\"\n", + "source_root = os.path.join(data_dir_path, \"01_01_03_TransformRawData\")\n", + "\n", + "# 4) For each candidate, find the stage \"01_01_04_Preprocessing\" and execute\n", + "for candidate_key, plan_info in all_candidates_plans.items():\n", + "\n", + " model_name = plan_info[\"model_name\"]\n", + " plan_dict = plan_info[\"plan\"]\n", + "\n", + " print(f\"\\n[Preprocessing] Processing candidate '{candidate_key}' for model '{model_name}'\")\n", + "\n", + " # 4a) Retrieve the list of stages\n", + " stages = plan_dict.get(\"stages\", [])\n", + "\n", + " # 4b) Look for \"01_01_04_Preprocessing\"\n", + " preprocessing_stage = next(\n", + " (st for st in stages if st.get(\"stage_name\") == \"01_01_04_Preprocessing\"),\n", + " None\n", + " )\n", + "\n", + " if preprocessing_stage is None:\n", + " print(f\" -> No '01_01_04_Preprocessing' stage found for candidate '{candidate_key}'. Skipping.\")\n", + " continue\n", + "\n", + " # 4c) Retrieve transform_ops\n", + " transform_ops = preprocessing_stage.get(\"transform_ops\", [])\n", + " if not transform_ops:\n", + " print(f\" -> '01_01_04_Preprocessing' has no transform_ops for '{candidate_key}'. Skipping.\")\n", + " continue\n", + "\n", + " # 4d) Create the destination folder\n", + " dest_folder = os.path.join(data_dir_path, \"01_01_04_Preprocessing\", candidate_key)\n", + " os.makedirs(dest_folder, exist_ok=True)\n", + "\n", + " # 4e) Execute each transform method\n", + " for op in transform_ops:\n", + " method_name = op[\"method\"]\n", + " params = op[\"params\"]\n", + "\n", + " # Override the source/dest for clarity (common pattern in your pipeline)\n", + " params[\"source_folder\"] = os.path.join(source_root, candidate_key)\n", + " params[\"dest_folder\"] = dest_folder\n", + "\n", + " print(f\" -> Invoking '{method_name}' with params: {params}\")\n", + "\n", + " if hasattr(trans_engine, method_name):\n", + " method = getattr(trans_engine, method_name)\n", + " method(**params)\n", + " else:\n", + " print(f\" -> [Warning] Method '{method_name}' not found in transformation engine. 
Skipped.\")\n", + "\n", + " print(f\" -> Finished preprocessing for candidate '{candidate_key}'.\")\n", + "\n", + "print(\"\\n[Preprocessing] All candidates processed for stage '01_01_04_Preprocessing'.\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "source": [ + "#### 01_01_05 FeaturePreparation\n", + "\n", + "Even with **preprocessing** in place, many **PDE workflows** can benefit from **feature engineering** to give models the best possible representation of the domain. Such feature engineering often targets **input** channels or **auxiliary** data that helps the model learn PDE patterns more effectively. Typical operations may include:\n", + "\n", + "- **Boundary Channel Additions** \n", + " - **`FEATURE_ADD_BOUNDARY_MASK`**: Creating a channel that flags boundary nodes or cells (e.g., 1 at boundary, 0 in the interior). This clarifies region distinctions for the model. \n", + " - **`FEATURE_MESH_ADJ_INFO`**: For mesh-based PDEs, encoding adjacency or connectivity in a way that the model can leverage more directly.\n", + "\n", + "- **Coordinate Expansions** \n", + " - **`FEATURE_ADD_COORDS`**: Injecting explicit \\((x, y)\\) or \\((x, y, z)\\) coordinates into each data sample if they’re not already included. \n", + " - **`FEATURE_TRANSFORM_COORDS`**: Converting from Cartesian to polar/spherical coordinates for certain domains or PDE problems.\n", + "\n", + "- **Channel Rearrangements & Combinations** \n", + " - **`FEATURE_STACK_INPUTS`**: Stacking multiple PDE fields (e.g., temperature + velocity) into a single input tensor. \n", + " - **`FEATURE_SPLIT_FIELDS`**: Splitting one multi-channel input into separate sub-tensors for specialized architectures.\n", + "\n", + "- **Scaling or Normalizing Fields** \n", + " - **`FEATURE_SCALE_CHANNELS`**: Applying scaling or normalization to each channel (e.g., min–max scaling or standard deviation normalization) after domain-specific preprocessing. \n", + " - **`FEATURE_LOG_TRANSFORM`**: Sometimes used for PDE variables that span multiple magnitudes (e.g., exponential growth in wave amplitude or flow velocity).\n", + "\n", + "- **Noise Injection & Data Augmentation** \n", + " - **`FEATURE_ADD_NOISE`**: Introducing mild noise for regularization or simulating measurement uncertainty in sensor-based PDE data. \n", + " - **`FEATURE_AUGMENT_GEOMETRY`**: Additional geometric transformations (e.g., flips, slight domain perturbations) that specifically enhance feature diversity.\n", + "\n", + "---\n", + "\n", + "**Prototype Implementation** \n", + "In our current pipeline, we keep **FeaturePreparation** **minimal**—doing little more than **copying** the data to a new folder. However, this step represents a natural **extension point** for domain-specific feature engineering. We envision a function `prepare_features_for_candidates(...)` in **`src/feature_engineering.py`** that could eventually:\n", + "\n", + "1. **Verify** the presence of core PDE channels (e.g., `permeability`, `pressure`, `velocity`). \n", + "2. **Combine** or **split** channels as needed for a given architecture (e.g., wavelet vs. graph-based). \n", + "3. **Inject** boundary masks or coordinate arrays if a model demands explicit domain context.\n", + "\n", + "By **decoupling** this from the earlier **Preprocessing** (which focuses on data cleaning and consistency), we ensure that model-specific or domain-specific feature engineering can **evolve** independently. 
Over time, additional transformations (like **`FEATURE_ADD_COORDS`**, **`FEATURE_SPLIT_FIELDS`**, or **`FEATURE_LOG_TRANSFORM`**) can be integrated without disrupting the rest of the pipeline. As a result, each candidate model can have precisely the **feature representation** it needs to learn effectively from the PDE data." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[FeaturePreparation] Loaded transformation plans from examples/cfd/darcy_autoML_active_learning/data/01_01_DataTransformationPlan/transformation_plans.json.\n", + "Candidate keys found: ['candidate0']\n", + "\n", + "[FeaturePreparation] Processing candidate 'candidate0' for model 'FNOWithDropout'\n", + " -> No '01_01_05_FeaturePreparation' stage found for candidate 'candidate0'. Skipping.\n", + "\n", + "[FeaturePreparation] All candidates processed for stage '01_01_05_FeaturePreparation'.\n" + ] + } + ], + "source": [ + "import os\n", + "import json\n", + "from darcy_automl_active_learning.ontology.ontology_transformation_engine import OntologyTransformationEngine\n", + "\n", + "# 1) Path to the transformation plans\n", + "plans_json_path = os.path.join(data_dir_path, \"01_01_DataTransformationPlan\", \"transformation_plans.json\")\n", + "if not os.path.exists(plans_json_path):\n", + " raise FileNotFoundError(f\"[FeaturePreparation] Cannot find transformation plans at {plans_json_path}\")\n", + "\n", + "# 2) Load the transformation plans\n", + "with open(plans_json_path, \"r\") as f:\n", + " all_candidates_plans = json.load(f)\n", + "\n", + "print(f\"[FeaturePreparation] Loaded transformation plans from {plans_json_path}.\")\n", + "print(\"Candidate keys found:\", list(all_candidates_plans.keys()))\n", + "\n", + "# 3) Instantiate (or reuse) the transformation engine\n", + "trans_engine = OntologyTransformationEngine()\n", + "\n", + "# We'll assume the source folder is \"01_01_04_Preprocessing\"\n", + "source_root = os.path.join(data_dir_path, \"01_01_04_Preprocessing\")\n", + "\n", + "# 4) For each candidate, look for the stage \"01_01_05_FeaturePreparation\" and execute\n", + "for candidate_key, plan_info in all_candidates_plans.items():\n", + " # plan_info typically looks like:\n", + " # {\n", + " # \"model_name\": \"FNOWithDropout\",\n", + " # \"plan\": {\n", + " # \"model_name\": \"FNOWithDropout\",\n", + " # \"stages\": [\n", + " # {\n", + " # \"stage_name\": \"01_01_01_LoadRawData\",\n", + " # \"transform_ops\": [...]\n", + " # },\n", + " # {\n", + " # \"stage_name\": \"01_01_05_FeaturePreparation\",\n", + " # \"transform_ops\": [...]\n", + " # },\n", + " # ...\n", + " # ]\n", + " # }\n", + " # }\n", + "\n", + " model_name = plan_info[\"model_name\"]\n", + " plan_dict = plan_info[\"plan\"]\n", + "\n", + " print(f\"\\n[FeaturePreparation] Processing candidate '{candidate_key}' for model '{model_name}'\")\n", + "\n", + " # 4a) Retrieve the list of stages\n", + " stages = plan_dict.get(\"stages\", [])\n", + "\n", + " # 4b) Find the \"01_01_05_FeaturePreparation\" stage\n", + " featureprep_stage = next(\n", + " (st for st in stages if st.get(\"stage_name\") == \"01_01_05_FeaturePreparation\"),\n", + " None\n", + " )\n", + "\n", + " if featureprep_stage is None:\n", + " print(f\" -> No '01_01_05_FeaturePreparation' stage found for candidate '{candidate_key}'. 
Skipping.\")\n", + " continue\n", + "\n", + " # 4c) Retrieve transform_ops\n", + " transform_ops = featureprep_stage.get(\"transform_ops\", [])\n", + " if not transform_ops:\n", + " print(f\" -> '01_01_05_FeaturePreparation' has no transform_ops for '{candidate_key}'. Skipping.\")\n", + " continue\n", + "\n", + " # 4d) Create the destination folder\n", + " dest_folder = os.path.join(data_dir_path, \"01_01_05_FeaturePreparation\", candidate_key)\n", + " os.makedirs(dest_folder, exist_ok=True)\n", + "\n", + " # 4e) Execute each transformation operation\n", + " for op in transform_ops:\n", + " method_name = op[\"method\"]\n", + " params = op[\"params\"]\n", + "\n", + " # Override the source/dest for clarity\n", + " params[\"source_folder\"] = os.path.join(source_root, candidate_key)\n", + " params[\"dest_folder\"] = dest_folder\n", + "\n", + " print(f\" -> Invoking '{method_name}' with params: {params}\")\n", + "\n", + " if hasattr(trans_engine, method_name):\n", + " method = getattr(trans_engine, method_name)\n", + " method(**params)\n", + " else:\n", + " print(f\" -> [Warning] Method '{method_name}' not found in transform engine. Skipped.\")\n", + "\n", + " print(f\" -> Finished feature preparation for candidate '{candidate_key}'.\")\n", + "\n", + "print(\"\\n[FeaturePreparation] All candidates processed for stage '01_01_05_FeaturePreparation'.\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "source": [ + "### Conclusion: Data Pipeline Ready\n", + "\n", + "We have successfully completed an **end-to-end data pipeline** for our Darcy Flow (or more generally PDE-based) project, incorporating:\n", + "\n", + "1. **Data Descriptor & Model Compatibility**\n", + " - Defined a **comprehensive data descriptor** (`data_desc.json`) capturing dimension, geometry type, uniformity, and more.\n", + " - Used a `ModelRegistry` and a simple `AutoMLCandidateModelSelection` to verify which candidate models (e.g., FNO, AFNO) can directly consume our dataset, or if transformations are required.\n", + "\n", + "2. **Raw Data Loading (`01_01_LoadRawData`)**\n", + " - Copied raw `.pt` files from `data/00_Generate_Data` to `data/01_01_LoadRawData`.\n", + " - Preserved the data descriptor for consistency.\n", + " - Performed minimal Exploratory Data Analysis (EDA) to confirm file integrity and shapes (e.g., checking `\"permeability\"`, `\"darcy\"`).\n", + "\n", + "3. **Transforming Raw Data (`01_01_03_TransformRawData`)**\n", + " - For each **candidate model** (e.g., `candidate0`, `candidate1`), created a dedicated subfolder in `data/01_03_TransformRawData`.\n", + " - Demonstrated how to handle **typical PDE transformations**:\n", + " - **`TRANSFORM_MESH_TO_GRID`**: Converting an unstructured mesh to a uniform grid if required by a spectral-based operator.\n", + " - **`TRANSFORM_DECIMATE_MESH`**: Reducing mesh complexity for memory or performance constraints.\n", + " - **`TRANSFORM_REGRID_DATA`**: Adjusting resolution or coordinate spacing to match model expectations.\n", + " - **`TRANSFORM_APPLY_BC_AUGMENTATION`**: Incorporating boundary-condition channels (if not added earlier).\n", + " - **`TRANSFORM_NORMALIZE`**: Standardizing or normalizing PDE fields (e.g., substract mean, divide by std).\n", + " - Kept transformations minimal in this prototype, but laid out the structure for more sophisticated re-gridding or domain modifications if needed.\n", + "\n", + "4. 
**Preprocessing (`01_01_04_Preprocessing`)**\n", + " - Introduced a **placeholder** for additional PDE data modifications, including:\n", + " - **`PREPROC_GEOMETRY_AUGMENT`**: Random rotations, domain cropping, or flips.\n", + " - **`PREPROC_REMOVE_OUTLIERS`**: Filtering extreme or invalid values.\n", + " - **`PREPROC_DETECT_REPLACE_NANS`**: Handling missing or corrupted data points.\n", + " - **`PREPROC_FILTER_INCOMPLETE_SAMPLES`**: Removing partial or malformed data entries.\n", + " - Copied the transformed data for each candidate into `data/01_04_Preprocessing`, ensuring any domain- or application-specific cleaning can be done here.\n", + "\n", + "5. **Feature Preparation (`01_01_05_FeaturePreparation`)**\n", + " - Final stage of data engineering before **model training**, covering potential:\n", + " - **Boundary Mask Channels** (e.g., `FEATURE_ADD_BOUNDARY_MASK`).\n", + " - **Coordinate Expansions** (`FEATURE_ADD_COORDS`), if needed for operator-based PDE solvers.\n", + " - **Channel Stacking** (`FEATURE_STACK_INPUTS`) or **Splitting** (`FEATURE_SPLIT_FIELDS`) to reorganize PDE fields.\n", + " - **Scaling** or **Augmentation** for the final inputs (e.g., `FEATURE_SCALE_CHANNELS` or `FEATURE_ADD_NOISE`).\n", + " - Copied or updated files under `data/01_05_FeaturePreparation`, providing a flexible hook for advanced PDE-specific feature engineering.\n", + "\n", + "**Outcome & Next Steps** \n", + "All data are now **cleaned**, **transformed**, and **feature-engineered** in a structured manner, ready for **surrogate model training** or **AutoML** hyperparameter tuning. Our project’s data folders now look like:\n", + "\n", + "```\n", + "data/\n", + " ├─ 00_Generate_Data/\n", + " │ └─ data_desc.json\n", + " ├─ 01_00_AutoMLCandidateModelSelection/\n", + " │ └─ chosen_candidates.json\n", + " ├─ 01_01_LoadRawData/\n", + " ├─ 01_03_TransformRawData/\n", + " │ ├─ candidate0/\n", + " │ └─ candidate1/\n", + " ├─ 01_04_Preprocessing/\n", + " │ ├─ candidate0/\n", + " │ └─ candidate1/\n", + " └─ 01_05_FeaturePreparation/\n", + " ├─ candidate0/\n", + " └─ candidate1/\n", + "```\n", + "\n", + "With the **data pipeline** complete, we can move on to **model definition**, **training**, and (optionally) **AutoML** tasks such as hyperparameter optimization or multi-model experimentation." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "source": [ + "### 01_02 Model Definition\n", + "\n", + "In this section, we introduce our primary PDE surrogate model definitions. We focus on two main variants:\n", + "\n", + "1. **FNOWithDropout** – A custom subclass of Modulus’s Fourier Neural Operator (FNO) that injects dropout. This allows us to do Monte Carlo Dropout–based uncertainty estimation or simply add a regularization mechanism.\n", + "2. **AFNO** – NVIDIA Modulus’s Adaptive Fourier Neural Operator, which uses an adaptive frequency gating approach for improved spectral flexibility.\n", + "\n", + "Both surrogates rely on hyperparameter definitions stored in our `config.yaml` under `cfg.arch.fno.*` or `cfg.arch.afno.*`. By default, we’ll pull settings like `in_channels`, `out_channels`, `latent_channels`, `drop` (dropout rate), and so on directly from `config.yaml`. 
You can override these values in the notebook if needed—just edit the `cfg` object before creating the models.\n", + "\n", + "We’ll keep the actual model classes (and any helper functions) in `src/models.py` (or sub-files like `fno_dropout.py`, `afno.py`), each thoroughly documented with docstrings. Then, in the next cells, we’ll show how to use these classes in conjunction with the config fields." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "source": [ + "### 01_03 Model Factory\n", + "\n", + "This section focuses on **merging our user configuration** (especially the field `cfg.model_name`) with the model definitions created in “01_02 Model Definition.” By doing so, we can **automate** which PDE surrogate to build—be it an FNO-based model, AFNO, or a future extension (like a PINN or DiffusionNet). \n", + "\n", + "**Why a Factory?** It lets us keep a **single** entry point (`get_model(cfg)`), which reads the relevant parameters (`cfg.arch.fno.*`, `cfg.arch.afno.*`, etc.) and returns the correct PyTorch module. This modular approach also makes it straightforward to **add** new model variants (e.g., a different neural operator) without changing the notebook workflow. \n", + "\n", + "In the following steps, we’ll:\n", + "1. Create a new file, `model_factory.py`, that defines `get_model(cfg)` (with docstrings).\n", + "2. Demonstrate how we **import** and **use** this factory function in the notebook.\n", + "3. Confirm it works by instantiating a model and optionally running a quick shape check.\n", + "\n", + "This pattern helps maintain a **clean separation** between model definitions and the logic that decides **which** model to instantiate—making the pipeline easier to scale and adapt for new PDE surrogates." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "source": [ + "### 01_04 Configuring Hyperparameters\n", + "\n", + "In this section, we outline how to configure the hyperparameters for our PDE surrogate models. \n", + "Recall that we store default values (like `epochs`, `learning_rate`, `batch_size`, etc.) in our\n", + "[`config.yaml`](./config.yaml). \n", + "\n", + "For instance, here are a few default hyperparameters you might see in that file:\n", + "\n", + "| Hyperparameter | Default Value | Description / Notes |\n", + "|---------------------|--------------|-----------------------------------------------------|\n", + "| `training.epochs` | 10 | Number of training epochs |\n", + "| `training.lr` | 1e-3 | Initial learning rate for the optimizer |\n", + "| `training.batch_size` | 16 | Mini-batch size for training loops |\n", + "| `arch.fno.num_fno_modes` | 12 | Number of Fourier modes (FNO-specific) |\n", + "| `arch.afno.drop` | 0.1 | Dropout rate for AFNO gating (AFNO-specific) |\n", + "\n", + "**Overriding Hyperparams Locally** \n", + "You can update these hyperparameters within the notebook before training or tuning. 
For example:\n", + "```python\n", + "cfg.training.lr = 5e-4\n", + "cfg.training.epochs = 30\n", + "print(\"Updated training config:\", cfg.training)\n", + "```\n", + "\n", + "**Using MLFlow** \n", + "We also demonstrate how to log hyperparameters to MLFlow, so each run’s configuration is \n", + "stored alongside its metrics and artifacts. In a typical flow, you might do:\n", + "\n", + "```python\n", + "import mlflow\n", + "\n", + "mlflow.start_run(run_name=\"Experiment_FNO\")\n", + "# log hyperparams\n", + "log_hyperparams_mlflow(cfg)\n", + "\n", + "# proceed with training...\n", + "mlflow.end_run()\n", + "```\n", + "\n", + "In subsequent cells, we’ll show how to integrate these hyperparameters into the training loop, \n", + "as well as how to override them for AutoML or HPC use cases if you wish. \n", + "This approach ensures a **reproducible** pipeline—where each run can be traced back \n", + "to its exact configuration and settings." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "source": [ + "### 01_05 Model Training Loop\n", + "\n", + "In this section, we implement a **generic PDE training loop** that references our **configuration parameters** (like epochs, learning rate, batch size, etc.) from `config.yaml`. This training loop can be used for:\n", + "\n", + "- **Single-Run Training**: Train a single model with a chosen set of hyperparameters (e.g., an FNO or AFNO).\n", + "- **Multi-Run/AutoML** scenarios: Called multiple times with different hyperparameter overrides for hyperparameter tuning (we’ll see this usage in a later section).\n", + "\n", + "We incorporate:\n", + "- **Progress Bars** with `tqdm`, to get live feedback on training progress (especially helpful in notebooks).\n", + "- **MLFlow Logging** (optional), so each epoch’s train and validation loss is recorded for future analysis.\n", + "- **Device Handling** (CPU vs. GPU via a `device` parameter).\n", + "\n", + "If you’re running on **HPC or distributed** environments, you may want to disable the tqdm progress bars (for performance/logging reasons) and/or integrate distributed managers from Modulus or PyTorch. We’ll point out where those hooks go, but keep them minimal for this prototype.\n", + "\n", + "Below, we’ll demonstrate how to use our training loop, pass in a config object, and see the relevant progress bar and MLFlow logs." 
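+    ,"\n",
+    "The following minimal sketch makes those moving parts concrete. It is illustrative only: the `train_model` name, the `(x, y)` batch structure, and the bare Adam/MSE setup are assumptions; the project's own loop (checkpointing, distributed hooks, etc.) may differ.\n",
+    "\n",
+    "```python\n",
+    "import torch\n",
+    "from tqdm import tqdm\n",
+    "\n",
+    "def train_model(model, train_loader, val_loader, cfg, device=\"cuda\", use_mlflow=False):\n",
+    "    # Hyperparameters are read from cfg.training.* as described in 01_04.\n",
+    "    model = model.to(device)\n",
+    "    optimizer = torch.optim.Adam(model.parameters(), lr=cfg.training.lr)\n",
+    "    loss_fn = torch.nn.MSELoss()\n",
+    "    for epoch in range(cfg.training.epochs):\n",
+    "        model.train()\n",
+    "        running = 0.0\n",
+    "        for x, y in tqdm(train_loader, desc=f\"Epoch {epoch + 1}/{cfg.training.epochs}\"):\n",
+    "            x, y = x.to(device), y.to(device)\n",
+    "            optimizer.zero_grad()\n",
+    "            loss = loss_fn(model(x), y)\n",
+    "            loss.backward()\n",
+    "            optimizer.step()\n",
+    "            running += loss.item()\n",
+    "        train_loss = running / max(len(train_loader), 1)\n",
+    "        model.eval()\n",
+    "        with torch.no_grad():\n",
+    "            val_loss = sum(loss_fn(model(x.to(device)), y.to(device)).item()\n",
+    "                           for x, y in val_loader) / max(len(val_loader), 1)\n",
+    "        if use_mlflow:\n",
+    "            import mlflow\n",
+    "            mlflow.log_metrics({\"train_loss\": train_loss, \"val_loss\": val_loss}, step=epoch)\n",
+    "    return model\n",
+    "```"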
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "source": [ + "### 01_06 Model Training Execution\n", + "\n", + "In this section, we bring together all of the moving parts from our pipeline:\n", + "- **Data pipeline**: The raw data has been generated, transformed, and preprocessed in the earlier steps.\n", + "- **Model factory**: We can instantiate our chosen model (e.g., FNO or AFNO) using the config-based logic from “01_03 Model Factory.”\n", + "- **Hyperparameter settings**: From “01_04 Configuring Hyperparameters,” we have default (or overridden) values for epochs, learning rate, batch size, and so on.\n", + "- **Training loop**: As defined in “01_05 Model Training Loop,” which handles epochs, mini-batches, loss calculation, optional validation, and more.\n", + "\n", + "By **combining** these steps, we now present a **user-facing script or function** (`execute_training` or similar) that performs the **end-to-end** training process:\n", + "1. **Pull** the final data loader(s), \n", + "2. **Create** or load the model, \n", + "3. **Train** using our training loop, \n", + "4. **Track** progress in a notebook progress bar (using `tqdm` by default), \n", + "5. **Log** metrics to MLFlow (if desired), \n", + "6. **Save** checkpoints according to the user’s preference (final, best, or every epoch).\n", + "\n", + "We’ll also briefly show how to adjust or disable certain features for HPC usage—such as turning off the progress bar or hooking in distributed training if needed. The remainder of this section walks through a Python function and example usage in the notebook to carry out this consolidated training flow." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "source": [ + "### 01_07 AutoML and Hyperparameter Tuning\n", + "In the previous section (“01_06 Model Training Execution”), we demonstrated how to train our PDE surrogate (FNO or AFNO) with a chosen set of hyperparameters—either from our default `config.yaml` or via simple overrides. Now, we turn to a more **systematic** approach: **hyperparameter tuning** or **AutoML**.\n", + "\n", + "Here, we’ll leverage a search method (grid, random, or Bayesian—commonly **Optuna** in Python) to explore the hyperparameter space. Our `config.yaml` already contains default parameter values and additional fields (under `cfg.automl`) specifying **ranges** (e.g., Fourier modes from 8 to 20, learning rate from 1e-4 to 5e-3, etc.). \n", + "\n", + "**MLFlow Logging** \n", + "Just as in our normal training, we’ll integrate MLFlow to log each hyperparam trial’s configuration and final metrics. By doing so, we can easily compare many trials in a single, consolidated UI. \n", + "\n", + "**Progress Bars** \n", + "For each trial, we can still rely on our PDE training loop’s `tqdm` progress bar—although for a large number of trials, it might be practical to reduce the training epochs or batch sizes to speed up each run.\n", + "\n", + "---\n", + "\n", + "**Key Points in This Section**\n", + "1. **Hyperparameter Range Setup** \n", + " We confirm or update the `config.yaml` sub-tree (`cfg.automl`) that defines the search space for FNO (e.g. `modes`, `width`, `depth`, etc.) 
and, if relevant, for AFNO (`drop`, `gating_strength`, etc.).\n", + "\n", + "2. **AutoML Logic** \n", + " We’ll create or review a new file, `src/automl.py`, which contains code to parse those search ranges and define an **Optuna objective** function.\n", + "\n", + "3. **Partial vs. Full Training** \n", + " In each trial, we might do a reduced set of epochs or data to expedite the search. Once the best params are found, we’ll do a **full** retraining using the discovered configuration.\n", + "\n", + "4. **MLFlow** \n", + " We’ll log each trial’s hyperparams and final validation metrics under separate nested runs, so you can open MLFlow and compare them.\n", + "\n", + "By the end of this section, you’ll have seen how to run multiple hyperparam search trials—**automatically** adjusting FNO or AFNO parameters—before picking the best discovered setup for a final training pass." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "source": [ + "### 01_08 Visualizing Performance and Results\n", + "\n", + "After training our PDE surrogate models (and possibly using AutoML to tune hyperparameters),\n", + "we now want to **examine** how well they perform. In this section, we will:\n", + "\n", + "1. **Load** the training/validation metrics from our logs (or MLFlow, if enabled).\n", + "2. **Plot** these metrics (e.g., loss curves over epochs).\n", + "3. **Compare** model predictions to ground-truth solutions for a few test samples—especially\n", + " valuable in Darcy flow, where we can visualize the predicted pressure fields vs. the true\n", + " solution.\n", + "4. **Summarize** errors (e.g., MSE, absolute difference) across a set of test samples to\n", + " get a sense of overall accuracy, variance, and potential failure cases.\n", + "\n", + "We rely on the utility functions we placed in **`src/visualization.py`**:\n", + "\n", + "- `plot_train_val_loss(...)`: For plotting training/validation loss curves.\n", + "- `plot_prediction_comparison(...)`: Side-by-side visualization of **input** (permeability),\n", + " **predicted** (pressure), **ground truth** (pressure), and a simple **error map**.\n", + "- `plot_error_distribution(...)`: Quick histogram or boxplot of errors across many samples.\n", + "- `summarize_metrics_table(...)`: A small table summarizing results from multiple runs.\n", + "\n", + "Finally, we’ll also **load** a saved model checkpoint (if we have one) or pick a final/best-epoch\n", + "checkpoint to run inference on sample PDE inputs. By the end, we should have a clear picture\n", + "of how our model is performing and any areas for improvement." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "source": [ + "#### Concluding the Visualization & Pipeline\n", + "We’ve now completed a full pass through our PDE surrogate pipeline—from data preparation, \n", + "model definition, and hyperparameter tuning, to final training and results visualization.\n", + "\n", + "- **Final Observations**: \n", + " - For instance, using FNO with `modes=12` and `width=64` yielded approximately **X%** relative error on the test set. 
\n", + " - The predicted Darcy fields show close alignment with the ground truth solutions, as seen in our 2D plots.\n", + "\n", + "- **HPC Readiness**: \n", + " - If you plan to run larger resolutions or more epochs, the same notebook logic can scale to HPC environments. \n", + " - You may disable the progress bar or use a distributed manager (e.g., `DistributedManager` in Modulus) to parallelize training.\n", + "\n", + "- **Advanced Features**: \n", + " - In real-world scenarios, consider adding PDE constraints, subgrid modeling, or multi-objective optimization if the use-case demands more advanced physics fidelity.\n", + " - **Active Learning** can be integrated to select new PDE samples, especially if generating or simulating data is expensive.\n", + "\n", + "- **MLFlow or Other Logs**: \n", + " - If you recorded metrics in MLFlow, open the MLFlow UI (or your logging interface) to view interactive charts, parameter comparisons, and artifacts (e.g., model checkpoints, images).\n", + "\n", + "**Next Steps**:\n", + "1. **Refine the Model**: Increase epochs, tweak hyperparameters further, or incorporate additional PDE constraints.\n", + "2. **Deploy or Save** the pipeline: Convert your final model to an inference engine or HPC environment.\n", + "3. **Explore** expansions like deeper AFNO gating, multi-physics PDE coupling, or more advanced domain transformations.\n", + "\n", + "With these steps, you have a **functioning** pipeline that can be adapted for **larger HPC** usage, \n", + "more sophisticated PDE tasks, or integrated with **AutoML** strategies to systematically refine hyperparameters." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/cfd/darcy_autoML_active_learning/notebooks/env_utils.py b/examples/cfd/darcy_autoML_active_learning/notebooks/env_utils.py new file mode 100644 index 000000000..e69de29bb diff --git a/examples/cfd/darcy_autoML_active_learning/pyproject.toml b/examples/cfd/darcy_autoML_active_learning/pyproject.toml new file mode 100644 index 000000000..37491c7ea --- /dev/null +++ b/examples/cfd/darcy_autoML_active_learning/pyproject.toml @@ -0,0 +1,28 @@ +[build-system] +requires = ["setuptools>=60.2.0", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "darcy_automl_active_learning" +version = "0.0.1" +description = "Darcy PDE example with FNO-based autoML and active learning" +authors = [ + { name = "Georg Maerz" } +] +readme = "README.md" +license = { text = "Apache-2.0" } + +dependencies = [ + "optuna==4.1.0", + "mlflow>=2.1.1", + "tqdm>=4.66.5" +] + +[project.urls] +"Source Code" = "https://github.com/YGMaerz/modulus-dls-api" +"Author GitHub" = "https://github.com/YGMaerz" +"Author X" = "https://x.com/YGMaerz" + +[tool.setuptools.packages.find] +# Tells setuptools to look under src/ for your package. 
+where = ["src"] diff --git a/examples/cfd/darcy_autoML_active_learning/requirements.txt b/examples/cfd/darcy_autoML_active_learning/requirements.txt new file mode 100644 index 000000000..5a43dc158 --- /dev/null +++ b/examples/cfd/darcy_autoML_active_learning/requirements.txt @@ -0,0 +1,3 @@ +mlflow>=2.1.1 +optuna==4.1.0 +tqdm==4.66.5 \ No newline at end of file diff --git a/examples/cfd/darcy_autoML_active_learning/src/darcy_automl_active_learning/AutoMLCandidateModelSelection.py b/examples/cfd/darcy_autoML_active_learning/src/darcy_automl_active_learning/AutoMLCandidateModelSelection.py new file mode 100644 index 000000000..02563f900 --- /dev/null +++ b/examples/cfd/darcy_autoML_active_learning/src/darcy_automl_active_learning/AutoMLCandidateModelSelection.py @@ -0,0 +1,96 @@ +# File: src/AutoMLCandidateModelSelection.py + +import json +import os +from typing import List, Tuple, Dict, Any + +from .data_desc_logic import load_data_descriptor, check_data_model_compatibility + + +def automl_candidate_model_selection( + data_desc_path: str, + model_descriptors: List[Dict[str, Any]], +) -> List[Tuple[str, str]]: + """ + Loads a PDE dataset descriptor from disk, then checks each model descriptor + against that dataset for compatibility. If compatible, the model is added + to a list of candidate models. + + Args: + data_desc_path (str): + Path to the JSON file describing the PDE dataset + (e.g., dimension, geometry_type, uniform, channels, etc.). + model_descriptors (List[Dict[str, Any]]): + A list of dictionaries, each describing a model's accepted_formats. + Example model descriptor structure: + { + "model_name": "FNO", + "accepted_formats": [ + { + "dimension": [2, 3], + "geometry_type": "grid", + "uniform": True, + "channels_min": 1 + }, + ... + ] + } + + Returns: + List[Tuple[str, str]]: + A list of (model_name, candidate_id) tuples for each compatible model. + For example: [("FNO", "candidate0"), ("AFNO", "candidate1")]. + """ + # 1) Load the dataset descriptor + data_desc = load_data_descriptor(data_desc_path) + + # 2) We'll store chosen models in a list + chosen_candidates = [] + candidate_counter = 0 + + for model_desc in model_descriptors: + model_name = model_desc.get("model_name", "UnknownModel") + + # 3) Check compatibility with the PDE data + is_compatible = check_data_model_compatibility(data_desc, model_desc) + + if is_compatible: + cand_id = f"candidate{candidate_counter}" + chosen_candidates.append((model_name, cand_id)) + candidate_counter += 1 + + return chosen_candidates + + +def save_candidate_models( + candidates: List[Tuple[str, str]], + output_folder: str, + filename: str = "candidate_models.json", +) -> str: + """ + Saves the selected candidate models to a JSON file. + + Args: + candidates (List[Tuple[str, str]]): + A list of (model_name, candidate_id) pairs produced by + automl_candidate_model_selection(...). + output_folder (str): + Path to the folder where the JSON file should be written. + filename (str, optional): + Name of the output JSON file. Default is "candidate_models.json". + + Returns: + str: + The full path to the JSON file that was written. 
+ """ + # 1) Ensure the output folder exists + os.makedirs(output_folder, exist_ok=True) + + # 2) Construct the output file path + json_path = os.path.join(output_folder, filename) + + # 3) Save the candidates list as JSON + with open(json_path, "w") as f: + json.dump(candidates, f, indent=2) + + return json_path diff --git a/examples/cfd/darcy_autoML_active_learning/src/darcy_automl_active_learning/__init__.py b/examples/cfd/darcy_autoML_active_learning/src/darcy_automl_active_learning/__init__.py new file mode 100644 index 000000000..4df812562 --- /dev/null +++ b/examples/cfd/darcy_autoML_active_learning/src/darcy_automl_active_learning/__init__.py @@ -0,0 +1,4 @@ +""" +This module contains the core functionality for AutoML and data descriptor logic +for the Darcy Flow example. +""" \ No newline at end of file diff --git a/examples/cfd/darcy_autoML_active_learning/src/darcy_automl_active_learning/data_desc_logic.py b/examples/cfd/darcy_autoML_active_learning/src/darcy_automl_active_learning/data_desc_logic.py new file mode 100644 index 000000000..62011fd56 --- /dev/null +++ b/examples/cfd/darcy_autoML_active_learning/src/darcy_automl_active_learning/data_desc_logic.py @@ -0,0 +1,219 @@ +""" +src/data_desc_logic.py + +This module provides PDE data descriptor loading and model compatibility checks. +It defines: + 1) A constant list of minimal required fields for the data descriptor. + 2) A function `load_data_descriptor(desc_path)` that loads a descriptor from a + JSON file and verifies these required fields under "data_structure". + 3) A function `check_data_model_compatibility(data_desc, model_desc)` that + checks whether the dataset descriptor is compatible with at least one of + the model's accepted data formats (as specified in its descriptor). + +No testing or demo code is included here. Usage examples and tests +should reside in separate modules or notebooks. + +Typical usage: + from data_desc_logic import ( + DATA_DESCRIPTOR_REQUIRED_FIELDS, + load_data_descriptor, + check_data_model_compatibility + ) +""" + +import os +import json + +# 1) Minimal required fields for data descriptor +DATA_DESCRIPTOR_REQUIRED_FIELDS = [ + "dimension", + "geometry_type", + "uniform", + "representation", + "is_transient", + "boundary", + "cell_type", + "decimation", + "channels", + # decimation_level is optional + # coordinate_mapping is optional +] + +def load_data_descriptor(desc_path: str) -> dict: + """ + Load a PDE dataset descriptor from a JSON file and do a basic check + for required fields. + + The descriptor is typically something like: + { + "descriptor_name": "Darcy2D_Uniform_1Ch", + "data_structure": { + "dimension": 2, + "geometry_type": "grid", + "uniform": true, + "representation": {...}, + "is_transient": false, + "boundary": false, + "cell_type": null, + "decimation": false, + "channels": 1 + # Optional: "decimation_level", "coordinate_mapping", etc. + } + } + + Parameters + ---------- + desc_path : str + Path to the JSON descriptor file. + + Returns + ------- + dict + A Python dictionary with the loaded descriptor. It must have a + "data_structure" sub-dict containing the minimal required fields. + + Raises + ------ + FileNotFoundError + If no file is found at the given path. + ValueError + If the descriptor is missing the "data_structure" key or + if one of the minimal required fields is missing in that sub-dict. + + Notes + ----- + - This function does not check geometry or channel compatibility with + any particular model. For that, see `check_data_model_compatibility`. 
+ - The validated dictionary is returned so you can pass it to other + pipeline steps. + """ + if not os.path.isfile(desc_path): + raise FileNotFoundError(f"No descriptor file found at {desc_path}") + + with open(desc_path, "r") as f: + data_desc = json.load(f) + + # The top-level dict must have a 'data_structure' sub-dict + if "data_structure" not in data_desc: + raise ValueError("Missing top-level key 'data_structure' in the JSON descriptor.") + + ds = data_desc["data_structure"] + for field in DATA_DESCRIPTOR_REQUIRED_FIELDS: + if field not in ds: + raise ValueError( + f"Data descriptor is missing required field '{field}' in 'data_structure'." + ) + + return data_desc + + +def check_data_model_compatibility(data_desc: dict, model_desc: dict) -> bool: + """ + Checks whether a given dataset descriptor is compatible with at least one + of the accepted data formats specified by the model descriptor. + + The dataset descriptor is assumed to have been loaded and validated + by `load_data_descriptor`. The model descriptor is a Python dictionary + that typically includes a "model_name" and an "accepted_formats" list, + each format being a dictionary that describes permissible data structure + attributes (dimension, geometry_type, uniform, channels_min, etc.). + + If the data descriptor satisfies all requirements of at least one + accepted format in the model descriptor, this function returns True. + Otherwise, it returns False. + + Parameters + ---------- + data_desc : dict + A Python dictionary representing the dataset descriptor, as + loaded by `load_data_descriptor(...)`. Must contain a + "data_structure" sub-dictionary with fields like "dimension", + "geometry_type", etc. + + model_desc : dict + A Python dictionary describing the model's accepted data formats + under a key "accepted_formats". For example:: + + model_desc = { + "model_name": "FNO", + "accepted_formats": [ + { + "dimension": [2, 3], + "geometry_type": "grid", + "uniform": True, + "channels_min": 1 + } + # ... possibly more accepted formats + ] + } + + Returns + ------- + bool + True if the dataset descriptor is compatible with at least one + of the model's accepted data formats; False otherwise. + + Raises + ------ + KeyError + If the model descriptor lacks an "accepted_formats" key, or if + expected sub-keys are missing within those formats. + + Notes + ----- + - Each format in `model_desc["accepted_formats"]` is compared against + the dataset descriptor's "data_structure" fields: + * dimension -> must be in the accepted list (e.g. [2, 3]) + * geometry_type -> must match exactly + * uniform -> must match exactly + * channels_min -> ensures data_struct["channels"] >= channels_min + Additional constraints can be added as needed. + - This function returns True immediately upon finding the first format + that matches all constraints. Otherwise, it returns False. + """ + ds = data_desc["data_structure"] + if "accepted_formats" not in model_desc: + raise KeyError( + "Model descriptor must contain an 'accepted_formats' key defining supported formats." 
+ ) + + accepted_formats = model_desc["accepted_formats"] + if not isinstance(accepted_formats, list): + raise ValueError( + f"'accepted_formats' must be a list in model descriptor; got {type(accepted_formats)}" + ) + + # Loop over each accepted format + for fmt in accepted_formats: + # dimension check + if "dimension" in fmt: + valid_dims = fmt["dimension"] + if isinstance(valid_dims, list): + if ds["dimension"] not in valid_dims: + continue + else: + # If dimension is not a list, we assume exact match required + if ds["dimension"] != valid_dims: + continue + + # geometry_type check + if "geometry_type" in fmt: + if ds["geometry_type"] != fmt["geometry_type"]: + continue + + # uniform check + if "uniform" in fmt: + if ds["uniform"] != fmt["uniform"]: + continue + + # channels_min check + channels_min = fmt.get("channels_min") + if channels_min is not None: + if ds["channels"] < channels_min: + continue + + # If we get here, all constraints match for this format + return True + + # None of the accepted formats matched + return False diff --git a/examples/cfd/darcy_autoML_active_learning/src/darcy_automl_active_learning/env_utils.py b/examples/cfd/darcy_autoML_active_learning/src/darcy_automl_active_learning/env_utils.py new file mode 100644 index 000000000..cc6073217 --- /dev/null +++ b/examples/cfd/darcy_autoML_active_learning/src/darcy_automl_active_learning/env_utils.py @@ -0,0 +1,40 @@ +import os +import socket + +def is_docker_env_via_dockerenv(): + return os.path.exists('/.dockerenv') + +def is_docker_env_via_cgroup(): + try: + with open('/proc/1/cgroup', 'rt') as f: + for line in f: + if 'docker' in line or 'kubepods' in line: + return True + except Exception: + pass + return False + +def is_docker_env_via_env_vars(): + docker_env_vars = ['DOCKER', 'CONTAINER', 'KUBERNETES_SERVICE_HOST'] + for var in docker_env_vars: + if os.getenv(var) is not None: + return True + return False + +def is_docker_env_via_hostname(): + hostname = socket.gethostname() + return '.' in hostname # Example heuristic + +def is_running_in_docker(): + return ( + is_docker_env_via_dockerenv() or + is_docker_env_via_cgroup() or + is_docker_env_via_env_vars() or + is_docker_env_via_hostname() + ) + +if __name__ == "__main__": + if is_running_in_docker(): + print("Running inside a Docker container.") + else: + print("Not running inside a Docker container.") diff --git a/examples/cfd/darcy_autoML_active_learning/src/darcy_automl_active_learning/model_registry/__init__.py b/examples/cfd/darcy_autoML_active_learning/src/darcy_automl_active_learning/model_registry/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/examples/cfd/darcy_autoML_active_learning/src/darcy_automl_active_learning/model_registry/descriptors/AFNO_descriptor.json b/examples/cfd/darcy_autoML_active_learning/src/darcy_automl_active_learning/model_registry/descriptors/AFNO_descriptor.json new file mode 100644 index 000000000..4e1248d00 --- /dev/null +++ b/examples/cfd/darcy_autoML_active_learning/src/darcy_automl_active_learning/model_registry/descriptors/AFNO_descriptor.json @@ -0,0 +1,150 @@ +{ + "model_name": "AFNO", + "model_family": "FourierOperator", + "model_version": "1.2", + "description": "Adaptive Fourier Neural Operator for PDE surrogates on uniform 2D/3D grids. 
Learns frequency weighting to handle multi-scale features more flexibly than standard FNO.", + + "implementation": { + "base_class": "modulus.models.afno.AFNO", + "source_repository": "https://github.com/NVIDIA/modulus", + "framework_version": "NVIDIA Modulus 23.09, PyTorch 2.0", + "requirements": [ + "torch>=2.0", + "modulus>=23.09" + ], + "notes": "AFNO is an advanced variant of FNO that adaptively reweighs Fourier modes, which can be beneficial for localized or multi-scale phenomena." + }, + + "accepted_formats": [ + { + "dimension": [2, 3], + "geometry_type": ["grid"], + "representations": [ + { + "representation_name": "uniform_grid", + "uniform": true, + "is_voxel_grid": false, + "channels_min": 1, + "channels_max": null, + "boundary_required": false, + "is_transient_supported": true, + "notes": "Similar data layout to FNO: [N, C, H, W] or [N, C, D, H, W]." + } + ] + } + ], + + "hyperparams_schema": [ + { + "name": "drop", + "type": "float", + "default": 0.0, + "min": 0.0, + "max": 0.5, + "description": "Dropout rate (if implemented) or drop_rate inside the AFNO blocks." + }, + { + "name": "gating_strength", + "type": "float", + "default": 0.3, + "min": 0.0, + "max": 1.0, + "description": "Scaling factor to modulate adaptive frequency gating. Not always standard in baseline AFNO but used in some variants." + }, + { + "name": "latent_channels", + "type": "int", + "default": 256, + "min": 32, + "max": 1024, + "description": "Dimensionality of the hidden representation (embed_dim)." + }, + { + "name": "afno_layers", + "type": "int", + "default": 4, + "min": 1, + "max": 12, + "description": "Number of AFNO (Fourier) layers." + }, + { + "name": "num_blocks", + "type": "int", + "default": 16, + "min": 4, + "max": 64, + "description": "Number of transform blocks within the AFNO stack." + }, + { + "name": "sparsity_threshold", + "type": "float", + "default": 0.01, + "min": 0.0, + "max": 0.1, + "description": "Threshold below which frequency coefficients are considered negligible." + }, + { + "name": "hard_thresholding_fraction", + "type": "float", + "default": 1.0, + "min": 0.0, + "max": 1.0, + "description": "Fraction of top frequency modes to keep or remove if using a hard-threshold scheme." + }, + { + "name": "dimension", + "type": "int", + "default": 2, + "choices": [2, 3], + "description": "Spatial dimension (2D or 3D)." + } + ], + + "default_hyperparams": { + "optimizer": "Adam", + "learning_rate": 0.0005, + "batch_size": 4, + "epochs": 400, + "architecture": { + "drop": 0.0, + "gating_strength": 0.3, + "latent_channels": 256, + "afno_layers": 4, + "num_blocks": 16, + "sparsity_threshold": 0.01, + "hard_thresholding_fraction": 1.0, + "dimension": 2 + }, + "regularization": { + "weight_decay": 1e-6 + } + }, + + "constraints": { + "max_resolution": 512, + "gpu_memory_requirements_gb": 12, + "multi_gpu_supported": true, + "distributed_training_supported": true, + "notes": "Adaptive frequency weighting can be slightly more expensive than plain FNO. 3D AFNO especially memory-hungry." + }, + + "metadata": { + "authors": ["NVIDIA Modulus Team"], + "paper_references": [ + "AFNO: Adaptive Fourier Neural Operator (arXiv/preprint)", + "NVIDIA Modulus Docs" + ], + "license": "Proprietary or Apache/MIT", + "citation": "If using AFNO, cite the original FNO paper + the AFNO extension." + }, + + "usage_patterns": { + "best_for": "PDEs with varying frequency content or more localized structures. Adapts across scales via gating.", + "limitations": "Still requires uniform grid. 
Gains in 3D might be offset by heavy memory usage if resolution is large.", + "recommended_practices": [ + "Increase gating_strength for strongly localized PDE features, lower it for smoother PDE fields.", + "Tune num_blocks and latent_channels carefully to balance accuracy vs. GPU usage.", + "Consider wavelet-based or graph-based operators if domain geometry is non-uniform or unstructured." + ] + } + } diff --git a/examples/cfd/darcy_autoML_active_learning/src/darcy_automl_active_learning/model_registry/descriptors/DiffusionNet_descriptor.json b/examples/cfd/darcy_autoML_active_learning/src/darcy_automl_active_learning/model_registry/descriptors/DiffusionNet_descriptor.json new file mode 100644 index 000000000..29946fd3c --- /dev/null +++ b/examples/cfd/darcy_autoML_active_learning/src/darcy_automl_active_learning/model_registry/descriptors/DiffusionNet_descriptor.json @@ -0,0 +1,125 @@ +{ + "model_name": "DiffusionNet", + "model_family": "GraphOperator", + "model_version": "1.0", + "description": "A graph-like or manifold-based operator that applies diffusion kernels on unstructured meshes. Ideal for PDE problems defined on arbitrary geometry (surfaces or volumes).", + + "implementation": { + "base_class": "my_project.models.diffusionnet.DiffusionNet", + "source_repository": "https://github.com/SomeOrg/diffusionnet", + "framework_version": "PyTorch 2.0", + "requirements": [ + "torch>=2.0", + "numpy>=1.21.0", + "scipy>=1.7.0" + ], + "notes": "Leverages adjacency structures on unstructured meshes to propagate features via diffusion layers. Useful for surfaces or volumes in 2D/3D." + }, + + "accepted_formats": [ + { + "dimension": [2, 3], + "geometry_type": ["mesh"], + "representations": [ + { + "representation_name": "unstructured_mesh", + "uniform": false, + "is_voxel_grid": false, + "is_transient_supported": true, + "channels_min": 1, + "channels_max": null, + "boundary_required": false, + "mesh_type": "surface_or_volume", + "adjacency_required": true, + "notes": "Data typically includes vertices V, faces F, adjacency lists, plus PDE state channels. For PDE boundary conditions, boundary flags or pinned nodes are recommended." + } + ] + } + ], + + "hyperparams_schema": [ + { + "name": "hidden_dim", + "type": "int", + "default": 128, + "min": 16, + "max": 512, + "description": "Dimensionality of the feature embeddings (graph diffusion layers)." + }, + { + "name": "num_layers", + "type": "int", + "default": 4, + "min": 1, + "max": 16, + "description": "Number of stacked diffusion layers or GNN blocks." + }, + { + "name": "diffusion_steps", + "type": "int", + "default": 1, + "min": 1, + "max": 5, + "description": "Number of local diffusion iterations in each layer. Larger = broader receptive field but heavier compute." + }, + { + "name": "drop", + "type": "float", + "default": 0.0, + "min": 0.0, + "max": 0.5, + "description": "Dropout probability on node features, can help regularize graph training." + }, + { + "name": "activation", + "type": "string", + "default": "relu", + "possible_values": ["relu", "leaky_relu", "gelu"], + "description": "Non-linear activation used between diffusion layers." 
+ } + ], + + "default_hyperparams": { + "optimizer": "Adam", + "learning_rate": 0.0003, + "batch_size": 2, + "epochs": 250, + "architecture": { + "hidden_dim": 128, + "num_layers": 4, + "diffusion_steps": 1, + "drop": 0.0, + "activation": "relu" + }, + "regularization": { + "weight_decay": 1e-6 + } + }, + + "constraints": { + "max_vertices": 500000, + "gpu_memory_requirements_gb": 12, + "multi_gpu_supported": false, + "distributed_training_supported": false, + "notes": "Large meshes can be memory-intensive. For PDE boundary conditions, boundary nodes must be flagged or pinned. 3D volumetric meshes require more adjacency overhead vs. 2D surfaces." + }, + + "metadata": { + "authors": ["Your Lab/Team", "DiffusionNet Paper Authors"], + "paper_references": [ + "DiffusionNet: A robust architecture for learning on manifolds and meshes", + "arXiv or Journal references" + ], + "license": "Apache 2.0 or similar" + }, + + "usage_patterns": { + "best_for": "Unstructured PDE domains, shape analysis, manifold PDE surrogacy (e.g., fluid flow over complex surfaces).", + "limitations": "Requires adjacency and consistent mesh connectivity. High memory usage if #faces or #vertices is large.", + "recommended_practices": [ + "Provide adjacency or precomputed Laplacian for each mesh if possible—speeds up message passing.", + "Use boundary or pinned-node logic for PDE boundary conditions (can store them in node features or a separate boundary mask).", + "Check mesh quality and ensure no degenerate faces/vertices for stable training." + ] + } +} diff --git a/examples/cfd/darcy_autoML_active_learning/src/darcy_automl_active_learning/model_registry/descriptors/FNOWithDropout_descriptor.json b/examples/cfd/darcy_autoML_active_learning/src/darcy_automl_active_learning/model_registry/descriptors/FNOWithDropout_descriptor.json new file mode 100644 index 000000000..26b437393 --- /dev/null +++ b/examples/cfd/darcy_autoML_active_learning/src/darcy_automl_active_learning/model_registry/descriptors/FNOWithDropout_descriptor.json @@ -0,0 +1,116 @@ +{ + "model_name": "FNOWithDropout", + "model_family": "FourierOperator", + "model_version": "1.0", + "description": "An extended Fourier Neural Operator with an nn.Dropout layer, enabling approximate Bayesian uncertainty for active learning.", + "implementation": { + "base_class": "darcy_automl_active_learning.models.fno_with_dropout.FNOWithDropout", + "source_repository": "https://github.com/YourOrg/YourRepo", + "framework_version": "NVIDIA Modulus 23.09, PyTorch 2.0+", + "requirements": [ + "torch>=2.0", + "modulus>=23.09" + ], + "notes": "This class subclasses Modulus FNO to insert dropout after the forward pass." + }, + + "accepted_formats": [ + { + "dimension": [2, 3], + "geometry_type": ["grid"], + "representations": [ + { + "representation_name": "uniform_grid", + "uniform": true, + "is_voxel_grid": false, + "is_transient_supported": false, + "channels_min": 1, + "channels_max": null, + "boundary_required": false, + "mesh_type": null, + "notes": "Same shape requirements as vanilla FNO (e.g. [N, C, H, W] for 2D)." + } + ] + } + ], + + "hyperparams_schema": [ + { + "name": "dimension", + "type": "int", + "default": 2, + "description": "Spatial dimension (2 or 3)." + }, + { + "name": "in_channels", + "type": "int", + "default": 1, + "description": "Number of input channels." + }, + { + "name": "out_channels", + "type": "int", + "default": 1, + "description": "Number of output channels." 
+ }, + { + "name": "latent_channels", + "type": "int", + "default": 64, + "description": "Width (hidden dimensionality) of the FNO layers." + }, + { + "name": "num_fno_layers", + "type": "int", + "default": 4, + "description": "Depth of FNO in terms of repeated spectral layers." + }, + { + "name": "num_fno_modes", + "type": "int", + "default": 16, + "description": "Number of Fourier modes to keep along each spatial axis." + }, + { + "name": "padding", + "type": "int", + "default": 9, + "description": "Zero-padding for spectral convolutions." + }, + { + "name": "drop", + "type": "float", + "default": 0.1, + "min": 0.0, + "max": 0.7, + "description": "Dropout probability for approximate Bayesian inference." + } + ], + + "default_hyperparams": { + "optimizer": "Adam", + "learning_rate": 0.001, + "architecture": { + "in_channels": 1, + "out_channels": 1, + "dimension": 2, + "latent_channels": 64, + "num_fno_layers": 4, + "num_fno_modes": 16, + "padding": 9, + "drop": 0.1 + } + }, + + "constraints": { + "max_resolution": 256, + "gpu_memory_requirements_gb": 8, + "multi_gpu_supported": true, + "notes": "Dropout slightly increases memory usage and may slow training." + }, + + "metadata": { + "authors": ["YGMaerz"] + } + } + \ No newline at end of file diff --git a/examples/cfd/darcy_autoML_active_learning/src/darcy_automl_active_learning/model_registry/descriptors/FNO_descriptor.json b/examples/cfd/darcy_autoML_active_learning/src/darcy_automl_active_learning/model_registry/descriptors/FNO_descriptor.json new file mode 100644 index 000000000..2af6a291f --- /dev/null +++ b/examples/cfd/darcy_autoML_active_learning/src/darcy_automl_active_learning/model_registry/descriptors/FNO_descriptor.json @@ -0,0 +1,132 @@ +{ + "model_name": "FNO", + "model_family": "FourierOperator", + "model_version": "1.2", + "description": "Fourier Neural Operator for learning PDE surrogates on uniform 2D or 3D grids. Uses global FFT-based layers to capture long-range dependencies.", + + "implementation": { + "base_class": "modulus.models.fno.FNO", + "source_repository": "https://github.com/NVIDIA/modulus", + "framework_version": "NVIDIA Modulus 23.09, PyTorch 2.0", + "requirements": [ + "torch>=2.0", + "modulus>=23.09" + ], + "notes": "Vanilla FNO from NVIDIA Modulus. Does global spectral transforms in each dimension. 3D usage may be memory-intensive." + }, + + "accepted_formats": [ + { + "dimension": [2, 3], + "geometry_type": ["grid"], + "representations": [ + { + "representation_name": "uniform_grid", + "uniform": true, + "is_voxel_grid": false, + "channels_min": 1, + "channels_max": null, + "boundary_required": false, + "is_transient_supported": true, + "notes": "Typical shape: [N, C, H, W] or [N, C, D, H, W]. If PDE boundaries matter, user can add a boundary mask channel, but not mandatory." + } + ] + } + ], + + "hyperparams_schema": [ + { + "name": "drop", + "type": "float", + "default": 0.1, + "min": 0.0, + "max": 0.5, + "description": "Dropout rate applied after FNO layers." + }, + { + "name": "fno_layers", + "type": "int", + "default": 4, + "min": 1, + "max": 12, + "description": "Number of FNO blocks/layers." + }, + { + "name": "num_fno_modes", + "type": "int", + "default": 16, + "min": 8, + "max": 64, + "description": "Number of Fourier modes in each spatial dimension." + }, + { + "name": "latent_channels", + "type": "int", + "default": 64, + "min": 16, + "max": 256, + "description": "Hidden channel dimension for the FNO layers." 
+ }, + { + "name": "dimension", + "type": "int", + "default": 2, + "choices": [2, 3], + "description": "Spatial dimension for the FNO (2D or 3D)." + }, + { + "name": "padding", + "type": "int", + "default": 9, + "min": 0, + "max": 20, + "description": "Padding in real space before FFT." + } + ], + + "default_hyperparams": { + "optimizer": "Adam", + "learning_rate": 0.0005, + "batch_size": 4, + "epochs": 400, + "architecture": { + "drop": 0.1, + "fno_layers": 4, + "num_fno_modes": 16, + "latent_channels": 64, + "dimension": 2, + "padding": 9 + }, + "regularization": { + "weight_decay": 1e-6 + } + }, + + "constraints": { + "max_resolution": 256, + "gpu_memory_requirements_gb": 12, + "multi_gpu_supported": true, + "distributed_training_supported": true, + "notes": "3D FNO can require large memory. Ensure you have enough GPU for high resolution." + }, + + "metadata": { + "authors": ["NVIDIA Modulus Team"], + "paper_references": [ + "Z. Li et al., Fourier Neural Operator for Parametric PDEs (ICLR 2021)", + "NVIDIA Modulus Docs: https://docs.nvidia.com/deeplearning/modulus" + ], + "license": "Proprietary or Apache/MIT", + "citation": "If you use FNO, please cite the original FNO paper + NVIDIA Modulus." + }, + + "usage_patterns": { + "best_for": "Parametric PDE families on uniform grids. Good for large-scale 2D or moderate 3D PDEs.", + "limitations": "Does not explicitly handle unstructured meshes or highly non-uniform grids. For that, see NuFNO or graph-based methods.", + "recommended_practices": [ + "Use a moderate number of Fourier modes (16–32) for 2D problems.", + "Consider dropout if data is limited or prone to overfitting.", + "For 3D, watch out for GPU memory usage and scale up gradually." + ] + } + } \ No newline at end of file diff --git a/examples/cfd/darcy_autoML_active_learning/src/darcy_automl_active_learning/model_registry/descriptors/GraphCast_descriptor.json b/examples/cfd/darcy_autoML_active_learning/src/darcy_automl_active_learning/model_registry/descriptors/GraphCast_descriptor.json new file mode 100644 index 000000000..1cfb0b09c --- /dev/null +++ b/examples/cfd/darcy_autoML_active_learning/src/darcy_automl_active_learning/model_registry/descriptors/GraphCast_descriptor.json @@ -0,0 +1,123 @@ +{ + "model_name": "GraphCast", + "model_family": "GraphOperator", + "model_version": "1.0", + "description": "Large-scale GNN approach designed for climate/weather PDE data on unstructured or geodesic meshes. Incorporates message passing and temporal updates to handle planet-scale forecasting or similar PDE-based phenomena.", + + "implementation": { + "base_class": "my_project.models.graphcast.GraphCastModel", + "source_repository": "https://github.com/deepmind/graphcast // or your forked/modified version", + "framework_version": "PyTorch Geometric 2.x, PyTorch 2.0", + "requirements": [ + "torch>=2.0", + "torch_geometric>=2.3", + "numpy>=1.20" + ], + "notes": "Original GraphCast targets global weather data on a geodesic sphere. Requires adjacency and node-edge features." + }, + + "accepted_formats": [ + { + "dimension": [2, 3], + "geometry_type": ["mesh"], + "representations": [ + { + "representation_name": "spherical_geodesic", + "uniform": false, + "is_voxel_grid": false, + "channels_min": 1, + "channels_max": null, + "boundary_required": false, + "is_transient_supported": true, + "adjacency_required": true, + "notes": "Typically stores node coords on a sphere plus edge connectivity. Also works for other unstructured or mesh-based domains." 
+ } + ] + } + ], + + "hyperparams_schema": [ + { + "name": "hidden_dim", + "type": "int", + "default": 256, + "min": 32, + "max": 1024, + "description": "Dimensionality of node (and possibly edge) embeddings in the GNN." + }, + { + "name": "num_layers", + "type": "int", + "default": 4, + "min": 1, + "max": 16, + "description": "Number of graph message-passing layers or blocks." + }, + { + "name": "drop", + "type": "float", + "default": 0.1, + "min": 0.0, + "max": 0.5, + "description": "Dropout rate applied in GNN layers to mitigate overfitting." + }, + { + "name": "temporal_updates", + "type": "bool", + "default": true, + "description": "Whether GraphCast includes a recurrent or sequential update step for time-evolving PDE data." + }, + { + "name": "aggregator_type", + "type": "string", + "default": "mean", + "choices": ["mean", "sum", "max", "attention"], + "description": "Aggregation function used in message passing." + } + ], + + "default_hyperparams": { + "optimizer": "Adam", + "learning_rate": 0.0003, + "batch_size": 2, + "epochs": 300, + "architecture": { + "hidden_dim": 256, + "num_layers": 4, + "drop": 0.1, + "temporal_updates": true, + "aggregator_type": "mean" + }, + "regularization": { + "weight_decay": 1e-5 + } + }, + + "constraints": { + "max_resolution": null, + "gpu_memory_requirements_gb": 16, + "multi_gpu_supported": true, + "distributed_training_supported": true, + "notes": "Huge spherical meshes (e.g., 1M+ nodes) can require multi-GPU setups. GraphCast typically used for large-scale climate PDEs or high-res geodesic grids." + }, + + "metadata": { + "authors": ["DeepMind team", "Collaborators"], + "paper_references": [ + "GraphCast: Learning skillful weather forecasting with global Earth observation data (DeepMind 2022)", + "https://arxiv.org/abs/2212.12794" + ], + "license": "Proprietary or Apache/MIT", + "citation": "If using GraphCast or derivative, cite the original paper and repository." + }, + + "usage_patterns": { + "best_for": "Global or regional PDE data on spherical geodesic meshes (like climate and weather). Also generalizable to other unstructured meshes.", + "limitations": "Must provide adjacency or edge index. Memory-heavy at scale. Data must be partitioned or batched effectively for large node counts.", + "recommended_practices": [ + "Ensure node-level features (like lat/lon, elevation) are included as part of the input channels.", + "Use aggregator='attention' if your PDE problem benefits from attention-based local weighting.", + "Check GPU memory usage carefully if dealing with >1 million nodes." + ] + } +} diff --git a/examples/cfd/darcy_autoML_active_learning/src/darcy_automl_active_learning/model_registry/descriptors/NuFNO_descriptor.json b/examples/cfd/darcy_autoML_active_learning/src/darcy_automl_active_learning/model_registry/descriptors/NuFNO_descriptor.json new file mode 100644 index 000000000..47f7f6161 --- /dev/null +++ b/examples/cfd/darcy_autoML_active_learning/src/darcy_automl_active_learning/model_registry/descriptors/NuFNO_descriptor.json @@ -0,0 +1,136 @@ +{ + "model_name": "NuFNO", + "model_family": "FourierOperator", + "model_version": "1.0", + "description": "Non-Uniform Fourier Neural Operator. 
Extends standard FNO to handle partially non-uniform meshes or coordinate-encoded grids, often by encoding spatial coordinates explicitly.", + + "implementation": { + "base_class": "my_project.models.nufno.NuFNO", + "source_repository": "https://github.com/YourOrg/nufno // or reference the relevant library/repo", + "framework_version": "PyTorch 2.0", + "requirements": [ + "torch>=2.0", + "modulus>=23.09", + "numpy>=1.21.0" + ], + "notes": "Coordinates are typically learned or embedded to handle partial non-uniformity. Still uses Fourier layers but on re-parameterized input domain." + }, + + "accepted_formats": [ + { + "dimension": [2, 3], + "geometry_type": ["grid"], + "representations": [ + { + "representation_name": "non_uniform_grid", + "uniform": false, + "is_voxel_grid": false, + "is_transient_supported": true, + "channels_min": 1, + "channels_max": null, + "boundary_required": false, + "mesh_type": null, + "notes": "NuFNO applies FFT-like operations on a coordinate-encoded domain. Input shape might be [N, (H*W), channels] or [N, H, W, channels] with coordinate embeddings." + } + ] + } + ], + + "hyperparams_schema": [ + { + "name": "coord_encoding_dim", + "type": "int", + "default": 2, + "min": 1, + "max": 3, + "description": "Dimension of the coordinate embedding for partial non-uniform domain. For 2D PDEs, typical is 2." + }, + { + "name": "embed_layer_width", + "type": "int", + "default": 64, + "min": 16, + "max": 1024, + "description": "Width of the MLP or embedding layer that transforms raw (x,y) coordinates into a higher-dimensional latent vector." + }, + { + "name": "latent_channels", + "type": "int", + "default": 64, + "min": 16, + "max": 512, + "description": "Number of channels in the Fourier transform layers after coordinate embedding." + }, + { + "name": "num_layers", + "type": "int", + "default": 4, + "min": 1, + "max": 12, + "description": "Depth of the NuFNO stack—i.e., how many consecutive Fourier layers are used after coordinate encoding." + }, + { + "name": "num_modes", + "type": "int", + "default": 16, + "min": 1, + "max": 32, + "description": "Number of Fourier modes or frequency components retained in each layer." + }, + { + "name": "drop", + "type": "float", + "default": 0.0, + "min": 0.0, + "max": 0.5, + "description": "Dropout probability for the main layers (similar to FNO)." + } + ], + + "default_hyperparams": { + "optimizer": "AdamW", + "learning_rate": 0.0005, + "batch_size": 4, + "epochs": 300, + "architecture": { + "coord_encoding_dim": 2, + "embed_layer_width": 64, + "latent_channels": 64, + "num_layers": 4, + "num_modes": 16, + "drop": 0.0 + }, + "regularization": { + "weight_decay": 1e-6 + } + }, + + "constraints": { + "max_resolution": 512, + "gpu_memory_requirements_gb": 10, + "multi_gpu_supported": false, + "distributed_training_supported": false, + "notes": "Non-uniform expansions can be memory-heavy for large or highly irregular domains. Typically suitable for moderate PDE sizes." + }, + + "metadata": { + "authors": ["Research Lab X", "Your Team", "Collaborators"], + "paper_references": [ + "NuFNO: Non-Uniform Fourier Neural Operator Paper/Preprint", + "NVIDIA Modulus or other partial references if integrated" + ], + "license": "MIT or Similar", + "citation": "Please cite the NuFNO paper if you use this model in research." + }, + + "usage_patterns": { + "best_for": "PDE problems with partial non-uniform grids where coordinate encoding can approximate the domain geometry.", + "limitations": "Still not fully unstructured (like a mesh). 
Typically requires a 'grid-like' connectivity, albeit non-uniform spacing. 3D usage can be large memory.", + "recommended_practices": [ + "Embed the (x, y) coordinates via a small MLP or sinusoidal encoding before applying the Fourier layers.", + "Tune num_modes carefully for partially irregular domains. Too many modes can lead to overfitting or high compute cost.", + "Inspect coordinate coverage—NuFNO still expects fairly regular domain coverage, not purely random point clouds." + ] + } + } + \ No newline at end of file diff --git a/examples/cfd/darcy_autoML_active_learning/src/darcy_automl_active_learning/model_registry/descriptors/WNO_descriptor.json b/examples/cfd/darcy_autoML_active_learning/src/darcy_automl_active_learning/model_registry/descriptors/WNO_descriptor.json new file mode 100644 index 000000000..b1d2650e2 --- /dev/null +++ b/examples/cfd/darcy_autoML_active_learning/src/darcy_automl_active_learning/model_registry/descriptors/WNO_descriptor.json @@ -0,0 +1,125 @@ +{ + "model_name": "WNO", + "model_family": "WaveletOperator", + "model_version": "1.0", + "description": "Wavelet Neural Operator for multi-scale PDE fields. Replaces global FFT with wavelet transforms to capture localized features. Typically used on uniform 2D/3D grids.", + + "implementation": { + "base_class": "my_project.models.wno.WaveletNeuralOperator", + "source_repository": "https://github.com/YourOrg/wno // or reference any wavelet-based PDE operator repo", + "framework_version": "PyTorch 2.0", + "requirements": [ + "torch>=2.0", + "pywavelets>=1.4.0" + ], + "notes": "Assumes data is in uniform grid layout so wavelet transforms can be applied along each spatial dimension." + }, + + "accepted_formats": [ + { + "dimension": [2, 3], + "geometry_type": ["grid"], + "representations": [ + { + "representation_name": "uniform_grid", + "uniform": true, + "is_voxel_grid": false, + "is_transient_supported": true, + "channels_min": 1, + "channels_max": null, + "boundary_required": false, + "mesh_type": null, + "notes": "Wavelet transform performed along spatial dimensions. Data shape typically [N, H, W, C] or [N, D, H, W, C]." + } + ] + } + ], + + "hyperparams_schema": [ + { + "name": "wavelet_type", + "type": "string", + "default": "haar", + "choices": ["haar", "db2", "db4", "sym4", "coif1"], + "description": "Wavelet basis for decomposition (Haar, Daubechies, Symlets, Coiflets, etc.)." + }, + { + "name": "levels", + "type": "int", + "default": 3, + "min": 1, + "max": 6, + "description": "Number of wavelet decomposition levels. Higher levels capture more coarse features but increase computation/memory." + }, + { + "name": "hidden_channels", + "type": "int", + "default": 64, + "min": 16, + "max": 512, + "description": "Dimension of the internal channels after wavelet transforms. Similar to 'width' in FNO contexts." + }, + { + "name": "num_wno_layers", + "type": "int", + "default": 4, + "min": 1, + "max": 10, + "description": "Depth of wavelet operator layers. Each layer typically does wavelet decomposition, filtering, inverse wavelet transform." + }, + { + "name": "drop", + "type": "float", + "default": 0.0, + "min": 0.0, + "max": 0.5, + "description": "Dropout rate for wavelet operator layers, if used to reduce overfitting." 
+ } + ], + + "default_hyperparams": { + "optimizer": "Adam", + "learning_rate": 0.0003, + "batch_size": 4, + "epochs": 300, + "architecture": { + "wavelet_type": "haar", + "levels": 3, + "hidden_channels": 64, + "num_wno_layers": 4, + "drop": 0.0 + }, + "regularization": { + "weight_decay": 1e-5 + } + }, + + "constraints": { + "max_resolution": 512, + "gpu_memory_requirements_gb": 8, + "multi_gpu_supported": false, + "distributed_training_supported": false, + "notes": "3D wavelet transforms can be memory-intensive. Typically used for 2D PDE tasks, but 3D usage possible with large GPU memory." + }, + + "metadata": { + "authors": ["Research Lab X", "Your Team", "Contributors"], + "paper_references": [ + "Wavelet Neural Operator (WNO) Paper/Preprint", + "PyTorch-based wavelet transform references" + ], + "license": "MIT or Proprietary", + "citation": "Please cite the WNO paper if you use this model in research." + }, + + "usage_patterns": { + "best_for": "Multi-scale PDE problems with local features (shocks, boundary layers) that benefit from wavelet decomposition.", + "limitations": "Requires uniform grid data for straightforward wavelet transforms. Non-uniform grids need re-sampling or specialized wavelet schemes.", + "recommended_practices": [ + "Choose wavelet_type carefully; Haar is simplest, Daubechies can capture smoother transitions.", + "Experiment with levels=3 or 4 for moderate PDE complexity; deeper wavelet stacks can slow training.", + "Monitor GPU memory usage if applying 3D wavelet transforms with high resolution." + ] + } + } + \ No newline at end of file diff --git a/examples/cfd/darcy_autoML_active_learning/src/darcy_automl_active_learning/model_registry/model_registry.py b/examples/cfd/darcy_autoML_active_learning/src/darcy_automl_active_learning/model_registry/model_registry.py new file mode 100644 index 000000000..c9e3f81c8 --- /dev/null +++ b/examples/cfd/darcy_autoML_active_learning/src/darcy_automl_active_learning/model_registry/model_registry.py @@ -0,0 +1,143 @@ +# File: src/darcy_automl_active_learning/model_registry/model_registry.py + +import os +import json +import logging +from typing import Dict, Any, Optional, List + +logger = logging.getLogger(__name__) + +class ModelRegistry: + """ + The ModelRegistry manages a collection of model descriptors, + each specifying important metadata (base_class, accepted_formats, + default_hyperparams, HPC constraints, etc.). + + If no 'descriptors_file' is specified, the registry automatically + scans a default folder (DEFAULT_DESCRIPTOR_DIR) for .json files, + each expected to contain exactly one model descriptor. The + loaded descriptors can then be queried or retrieved by name. + """ + + DEFAULT_DESCRIPTOR_DIR = os.path.join( + os.path.dirname(__file__), # current folder: model_registry + "descriptors" # subfolder + ) + + def __init__(self, descriptors_file: Optional[str] = None): + """ + If descriptors_file is provided, loads that single JSON file. + Otherwise, scans DEFAULT_DESCRIPTOR_DIR for all *.json files + and loads each as a model descriptor. + + :param descriptors_file: Path to a single JSON file containing model descriptor(s), + or None to load from /model_registry/descriptors/ by default. 
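+
+        Example (illustrative; assumes the bundled descriptor folder is present):
+
+            registry = ModelRegistry()  # scans DEFAULT_DESCRIPTOR_DIR
+            if registry.model_exists("FNO"):
+                fno_desc = registry.get_descriptor("FNO")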
+ """ + self._descriptors: Dict[str, Dict[str, Any]] = {} + + if descriptors_file: + # If user explicitly provided a single file + self.load_descriptors(descriptors_file) + else: + # Otherwise, load from the default descriptors folder + if os.path.isdir(self.DEFAULT_DESCRIPTOR_DIR): + self.load_all_descriptors_in_folder(self.DEFAULT_DESCRIPTOR_DIR) + else: + # Optionally raise or just log a warning + logger.debug( + "[ModelRegistry] WARNING: No descriptors_file provided and default folder " + f"'{self.DEFAULT_DESCRIPTOR_DIR}' not found. No models loaded." + ) + + def load_descriptors(self, file_path: str) -> None: + if not os.path.isfile(file_path): + raise FileNotFoundError(f"[ModelRegistry] No file found at {file_path}") + + with open(file_path, "r", encoding="utf-8") as f: + data = json.load(f) + + if "model_name" in data: + model_key = data["model_name"] + self._descriptors[model_key] = data + logger.debug( + "[ModelRegistry] Loaded descriptor for '%s' from %s", + model_key, file_path + ) + else: + raise ValueError(f"[ModelRegistry] JSON at {file_path} missing 'model_name'.") + + def load_all_descriptors_in_folder(self, folder_path: str) -> None: + """ + Scans a folder for .json files, calling load_descriptors(...) on each. + + :param folder_path: path to a directory containing .json descriptor files + """ + json_files = [ + f for f in os.listdir(folder_path) + if f.endswith(".json") and os.path.isfile(os.path.join(folder_path, f)) + ] + + if not json_files: + logger.debug( + "[ModelRegistry] WARNING: No .json files found in %s. No models loaded.", + folder_path + ) + return + + for json_fname in sorted(json_files): + file_path = os.path.join(folder_path, json_fname) + try: + self.load_descriptors(file_path) + except Exception as e: + logger.debug( + "[ModelRegistry] ERROR: Could not load %s: %s", + json_fname, e + ) + + def get_descriptor(self, model_name: str) -> Dict[str, Any]: + """ + Retrieve the descriptor dictionary for a given model name. + + :param model_name: The model name key (e.g., "FNO", "AFNO", "DiffusionNet"). + :return: The descriptor dictionary (e.g., with keys: "description", "base_class", etc.). + :raises KeyError: If the model_name is not found in the registry. + """ + if model_name not in self._descriptors: + raise KeyError(f"[ModelRegistry] Model '{model_name}' not found in registry.") + return self._descriptors[model_name] + + def get_all_descriptors(self) -> Dict[str, Dict[str, Any]]: + """ + Return a dictionary of all loaded model descriptors. + Keys are model names, values are the descriptor dictionaries. + + :return: { "FNO": {...}, "AFNO": {...}, ... } + """ + return self._descriptors + + def register_model_descriptor(self, descriptor: Dict[str, Any]) -> None: + """ + Allows adding a new model descriptor at runtime. + + :param descriptor: A dictionary with at least a "model_name" key. + """ + if "model_name" not in descriptor: + raise ValueError("[ModelRegistry] Descriptor must contain 'model_name' field.") + model_key = descriptor["model_name"] + self._descriptors[model_key] = descriptor + logger.debug("[ModelRegistry] Registered new descriptor for model '%s'.", model_key) + + def model_exists(self, model_name: str) -> bool: + """ + Quick check to see if a given model_name is in the registry. + + :param model_name: e.g., "FNO", "AFNO", "GraphCast", etc. + :return: True if the registry contains it, else False. + """ + return model_name in self._descriptors + + def list_models(self) -> List[str]: + """ + Return a list of all model names in the registry. 
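+
+        Example (illustrative, given the bundled descriptor files):
+            registry.list_models()  # e.g. ["AFNO", "DiffusionNet", "FNO", ...]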
+ """ + return list(self._descriptors.keys()) diff --git a/examples/cfd/darcy_autoML_active_learning/src/darcy_automl_active_learning/model_selection/__init__.py b/examples/cfd/darcy_autoML_active_learning/src/darcy_automl_active_learning/model_selection/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/examples/cfd/darcy_autoML_active_learning/src/darcy_automl_active_learning/model_selection/candidate_selector.py b/examples/cfd/darcy_autoML_active_learning/src/darcy_automl_active_learning/model_selection/candidate_selector.py new file mode 100644 index 000000000..1fcdc6fb6 --- /dev/null +++ b/examples/cfd/darcy_autoML_active_learning/src/darcy_automl_active_learning/model_selection/candidate_selector.py @@ -0,0 +1,107 @@ +# src/darcy_automl_active_learning/model_selection/candidate_selector.py + +import os +import json +from typing import List, Tuple +from darcy_automl_active_learning.model_registry.model_registry import ModelRegistry +from .selection_strategies import BaseSelectionStrategy + +class CandidateModelSelector: + """ + CandidateModelSelector orchestrates: + 1) Validation of a data descriptor + 2) AutoML-based candidate model selection (per chosen strategy) + 3) Retrieval of required data structures for each model + """ + + def __init__(self, model_registry: ModelRegistry, selection_strategy: BaseSelectionStrategy): + """ + Args: + model_registry : Instance of ModelRegistry + selection_strategy : A strategy implementing how to pick candidate models + """ + self.model_registry = model_registry + self.selection_strategy = selection_strategy + + def validate_data_descriptor(self, data_desc_path: str) -> bool: + """ + Basic placeholder for data descriptor validation. + In a real scenario, you'd parse the JSON, check required fields, etc. + + Returns: + bool : True if descriptor is valid, else False + """ + if not os.path.isfile(data_desc_path): + print(f"[CandidateModelSelector] Data descriptor not found: {data_desc_path}") + return False + + with open(data_desc_path, "r") as f: + data_desc = json.load(f) + + # Example logic: must have "data_structure" key + if "data_structure" not in data_desc: + print("[CandidateModelSelector] 'data_structure' key missing in descriptor.") + return False + + # Additional checks if desired... + return True + + def automl_candidate_model_selection(self, data_desc_path: str) -> List[Tuple[str, str]]: + """ + Invokes the selection strategy to pick suitable models. + + Args: + data_desc_path (str): path to the data descriptor JSON + + Returns: + List[Tuple[str, str]]: e.g. [("FNO", "candidate0"), ("AFNO", "candidate1")] + """ + with open(data_desc_path, "r") as f: + data_desc = json.load(f) + + # Let the strategy do the work + selected_candidates = self.selection_strategy.select_candidates( + data_desc=data_desc, + model_registry=self.model_registry + ) + + return selected_candidates + + def get_required_data_structure(self, model_name: str): + """ + Retrieve the data structure requirements from the model descriptor + (the 'accepted_formats' or something similar). 
+ + Args: + model_name (str): Name of the model, must exist in the registry + + Returns: + dict or list: The portion of the descriptor describing required input format + """ + descriptor = self.model_registry.get_descriptor(model_name) + + # Typically, you'd parse descriptor["accepted_formats"] or similar + return descriptor.get("accepted_formats", []) + + def save_candidate_models(self, selected_candidates: List[Tuple[str, str]], output_folder: str) -> str: + """ + Saves the chosen models (along with their candidate keys) to a JSON file. + + Args: + selected_candidates: e.g. [("FNO","candidate0"), ("AFNO","candidate1")] + output_folder: location to store the JSON file + + Returns: + str : path to the saved JSON + """ + os.makedirs(output_folder, exist_ok=True) + output_path = os.path.join(output_folder, "chosen_candidates.json") + + # Convert to a JSON-friendly structure + # e.g. [ ["FNO","candidate0"], ["DiffusionNet","candidate1"] ] + with open(output_path, "w") as f: + json.dump(selected_candidates, f, indent=2) + + print(f"[CandidateModelSelector] Saved {len(selected_candidates)} candidates to: {output_path}") + return output_path + \ No newline at end of file diff --git a/examples/cfd/darcy_autoML_active_learning/src/darcy_automl_active_learning/model_selection/selection_strategies.py b/examples/cfd/darcy_autoML_active_learning/src/darcy_automl_active_learning/model_selection/selection_strategies.py new file mode 100644 index 000000000..9e1997542 --- /dev/null +++ b/examples/cfd/darcy_autoML_active_learning/src/darcy_automl_active_learning/model_selection/selection_strategies.py @@ -0,0 +1,40 @@ +# src/darcy_automl_active_learning/model_selection/selection_strategies.py + +from abc import ABC, abstractmethod +from typing import List, Tuple, Any + +class BaseSelectionStrategy(ABC): + """ + Base class for different model selection strategies. + """ + + @abstractmethod + def select_candidates(self, data_desc: dict, model_registry: Any) -> List[Tuple[str, str]]: + """ + Return a list of (model_name, candidate_key) pairs based on data_desc. + + Args: + data_desc (dict): The dataset descriptor loaded from JSON + model_registry (ModelRegistry or a generic type): + Provides access to model descriptors + + Returns: + List[Tuple[str,str]]: e.g. [("FNO","candidate0"), ("AFNO","candidate1")] + """ + pass + + +class SimpleSelectionStrategy(BaseSelectionStrategy): + """ + An example strategy that always picks FNO, or picks a small set of models + regardless of data. Extend as needed for real logic. + """ + + def select_candidates(self, data_desc: dict, model_registry: Any) -> List[Tuple[str, str]]: + # A trivial approach: always select "FNO" as candidate0 + # Note: you could parse data_desc["data_structure"] to check dimension, etc. 
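+
+        # Illustrative sketch (not part of the shipped strategy): a data-aware
+        # variant might filter the registry by dimension, assuming each
+        # descriptor exposes "accepted_formats" as in the bundled JSON files:
+        #
+        #     ds = data_desc["data_structure"]
+        #     selected = []
+        #     for i, (name, desc) in enumerate(model_registry.get_all_descriptors().items()):
+        #         fmts = desc.get("accepted_formats", [])
+        #         if any(ds["dimension"] in f.get("dimension", []) for f in fmts):
+        #             selected.append((name, f"candidate{i}"))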
+ + # Possibly do: if dimension=2 and geometry=grid => pick FNO, else pick DiffusionNet + # For now, a simple example: + selected = [("FNOWithDropout", "candidate0"),("AFNO", "candidate1")] + return selected diff --git a/examples/cfd/darcy_autoML_active_learning/src/darcy_automl_active_learning/models/__init__.py b/examples/cfd/darcy_autoML_active_learning/src/darcy_automl_active_learning/models/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/examples/cfd/darcy_autoML_active_learning/src/darcy_automl_active_learning/models/fno_with_dropout.py b/examples/cfd/darcy_autoML_active_learning/src/darcy_automl_active_learning/models/fno_with_dropout.py new file mode 100644 index 000000000..6881f5d6a --- /dev/null +++ b/examples/cfd/darcy_autoML_active_learning/src/darcy_automl_active_learning/models/fno_with_dropout.py @@ -0,0 +1,74 @@ +""" +/models/fno_with_dropout.py + +This file provides PDE surrogate models referencing config fields such as: + cfg.arch.fno.* and cfg.arch.afno.*. + +Classes: + FNOWithDropout: Subclass of Modulus FNO that adds a dropout layer + (Optional) AFNO or a wrapper for AFNO if needed. +""" + +import torch +import torch.nn as nn +from modulus.models.fno import FNO +from modulus.models.afno import AFNO + +class FNOWithDropout(FNO): + """ + A subclass of Modulus's FNO to include a dropout layer, referencing cfg.arch.fno.*. + + Typical config usage: + cfg.arch.fno.in_channels: int + cfg.arch.fno.out_channels: int + cfg.arch.fno.dimension: int (usually 2 or 3) + cfg.arch.fno.latent_channels: int (width of the hidden representation) + cfg.arch.fno.num_fno_layers: int (depth) + cfg.arch.fno.num_fno_modes: int (Fourier modes) + cfg.arch.fno.padding: int + cfg.arch.fno.drop: float (dropout probability, e.g. 0.1) + + Example: + model = FNOWithDropout( + drop=cfg.arch.fno.drop, + in_channels=cfg.arch.fno.in_channels, + out_channels=cfg.arch.fno.out_channels, + dimension=cfg.arch.fno.dimension, + latent_channels=cfg.arch.fno.latent_channels, + num_fno_layers=cfg.arch.fno.fno_layers, + num_fno_modes=cfg.arch.fno.num_fno_modes, + padding=cfg.arch.fno.padding, + ) + """ + + def __init__(self, drop=0.1, *args, **kwargs): + """ + Initialize the dropout-enabled FNO. + + Args: + drop (float): Dropout probability (default=0.1). + *args, **kwargs: Passed through to the base FNO constructor + (e.g., in_channels, out_channels, dimension, etc.). + """ + super().__init__(*args, **kwargs) + self.drop = drop + # Insert a dropout layer after the base FNO forward pass: + self.dropout_layer = nn.Dropout(p=self.drop) + + def forward(self, x): + """ + Forward pass. Calls the parent FNO forward, then applies dropout. + + Args: + x (torch.Tensor): + Input of shape [batch_size, in_channels, ...] + E.g. for 2D Darcy, [B, 1, H, W]. + + Returns: + (torch.Tensor): Output of shape [batch_size, out_channels, ...]. 
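+
+        Example (illustrative Monte Carlo dropout sketch; 'x' is an assumed
+        input batch, and dropout must stay active, e.g. via model.train(),
+        for repeated passes to be stochastic):
+
+            model.train()
+            preds = torch.stack([model(x) for _ in range(32)])  # x: [B, 1, H, W]
+            mean, std = preds.mean(dim=0), preds.std(dim=0)
+            # 'std' can serve as an uncertainty signal for active learning.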
+ """ + # 1) Standard FNO forward pass + out = super().forward(x) + # 2) Apply dropout + out = self.dropout_layer(out) + return out diff --git a/examples/cfd/darcy_autoML_active_learning/src/darcy_automl_active_learning/ontology/__init__.py b/examples/cfd/darcy_autoML_active_learning/src/darcy_automl_active_learning/ontology/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/examples/cfd/darcy_autoML_active_learning/src/darcy_automl_active_learning/ontology/ontology_engine.py b/examples/cfd/darcy_autoML_active_learning/src/darcy_automl_active_learning/ontology/ontology_engine.py new file mode 100644 index 000000000..de6c70ed4 --- /dev/null +++ b/examples/cfd/darcy_autoML_active_learning/src/darcy_automl_active_learning/ontology/ontology_engine.py @@ -0,0 +1,275 @@ +import os +from typing import Dict, Any + +class OntologyEngine: + """ + A mock 'OntologyEngine' that returns a transformation plan for exactly one candidate, + based on model_name ("FNO", "FNOWithDropout", or "AFNO") and candidate_key (e.g. "candidate0"). + + It embeds data_dir_path in each transform_op's params, along with optional subfolder_source + and subfolder_dest for stages 3, 4, and 5. If model_name is unrecognized, raises NotImplementedError. + """ + + def __init__(self): + pass + + def suggest_transformations( + self, + source_data_desc: Dict[str, Any], + target_data_requirements: Dict[str, Any], + model_name: str, + candidate_key: str, + data_dir_path: str = "data" + ) -> Dict[str, Any]: + """ + Build and return a transformation plan *for one candidate* (one model_name + candidate_key). + + Args: + source_data_desc: The PDE data descriptor (e.g., data_desc["data_structure"]). + target_data_requirements: E.g., from candidate_selector.get_required_data_structure(model_name). + model_name: "FNO", "FNOWithDropout", or "AFNO". + candidate_key: A unique identifier, e.g. "candidate0". + data_dir_path: Base directory path (string); embedded in each transform_op. + + Returns: + A dictionary with shape: + { + "model_name": "", + "stages": [ + { + "stage_name": "01_01_01_LoadRawData", + "transform_ops": [ + { + "method": "copy_only", + "params": { + "source_folder": "...", + "dest_folder": "...", + "subfolder_source": "...", (optional) + "subfolder_dest": "...", (optional) + "data_dir_path": "..." (string) + } + }, + ... + ] + }, + ... + ] + } + + Raises: + NotImplementedError: If model_name is not one of ("FNO", "FNOWithDropout", "AFNO"). 
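+
+        Example (illustrative call; 'data_desc' and 'required_formats' are
+        assumed to come from the data descriptor and candidate selector):
+
+            engine = OntologyEngine()
+            plan = engine.suggest_transformations(
+                source_data_desc=data_desc["data_structure"],
+                target_data_requirements=required_formats,
+                model_name="FNO",
+                candidate_key="candidate0",
+                data_dir_path="data",
+            )
+            # plan["stages"][0]["stage_name"] == "01_01_01_LoadRawData"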
+ """ + + # Convert to string (avoid JSON serialization problems if it's a Path) + data_dir_path = str(data_dir_path) + + # Hard-coded plan for "FNO" + plan_for_fno = { + "model_name": "FNO", + "stages": [ + { + "stage_name": "01_01_01_LoadRawData", + "transform_ops": [ + { + "method": "copy_only", + "params": { + "source_folder": "00_Generate_Data", + "dest_folder": "01_01_LoadRawData", + "subfolder_dest": candidate_key, + "data_dir_path": data_dir_path + } + } + ] + }, + { + "stage_name": "01_01_03_TransformRawData", + "transform_ops": [ + { + "method": "copy_only", + "params": { + "source_folder": "01_01_LoadRawData", + "dest_folder": "01_01_03_TransformRawData", + "subfolder_source": candidate_key, + "subfolder_dest": candidate_key, + "data_dir_path": data_dir_path + } + } + ] + }, + { + "stage_name": "01_01_04_Preprocessing", + "transform_ops": [ + { + "method": "copy_only", + "params": { + "source_folder": "01_01_03_TransformRawData", + "dest_folder": "01_01_04_Preprocessing", + "subfolder_source": candidate_key, + "subfolder_dest": candidate_key, + "data_dir_path": data_dir_path + } + } + ] + }, + { + "stage_name": "01_01_05_FeaturePreparation", + "transform_ops": [ + { + "method": "copy_only", + "params": { + "source_folder": "01_01_04_Preprocessing", + "dest_folder": "01_01_05_FeaturePreparation", + "subfolder_source": candidate_key, + "subfolder_dest": candidate_key, + "data_dir_path": data_dir_path + } + } + ] + } + ] + } + + # Hard-coded plan for "FNOWithDropout" (same stages, different top-level model_name) + plan_for_fno_with_dropout = { + "model_name": "FNOWithDropout", + "stages": [ + { + "stage_name": "01_01_01_LoadRawData", + "transform_ops": [ + { + "method": "copy_only", + "params": { + "source_folder": "00_Generate_Data", + "dest_folder": "01_01_LoadRawData", + "subfolder_dest": candidate_key, + "data_dir_path": data_dir_path + } + } + ] + }, + { + "stage_name": "01_01_03_TransformRawData", + "transform_ops": [ + { + "method": "copy_only", + "params": { + "source_folder": "01_01_LoadRawData", + "dest_folder": "01_01_03_TransformRawData", + "subfolder_source": candidate_key, + "subfolder_dest": candidate_key, + "data_dir_path": data_dir_path + } + } + ] + }, + { + "stage_name": "01_01_04_Preprocessing", + "transform_ops": [ + { + "method": "copy_only", + "params": { + "source_folder": "01_01_03_TransformRawData", + "dest_folder": "01_01_04_Preprocessing", + "subfolder_source": candidate_key, + "subfolder_dest": candidate_key, + "data_dir_path": data_dir_path + } + } + ] + }, + { + "stage_name": "01_01_05_FeaturePreparation", + "transform_ops": [ + { + "method": "copy_only", + "params": { + "source_folder": "01_01_04_Preprocessing", + "dest_folder": "01_01_05_FeaturePreparation", + "subfolder_source": candidate_key, + "subfolder_dest": candidate_key, + "data_dir_path": data_dir_path + } + } + ] + } + ] + } + + # Hard-coded plan for "AFNO" + plan_for_afno = { + "model_name": "AFNO", + "stages": [ + { + "stage_name": "01_01_01_LoadRawData", + "transform_ops": [ + { + "method": "copy_only", + "params": { + "source_folder": "00_Generate_Data", + "dest_folder": "01_01_LoadRawData", + "subfolder_dest": candidate_key, + "data_dir_path": data_dir_path + } + } + ] + }, + { + "stage_name": "01_01_03_TransformRawData", + "transform_ops": [ + { + "method": "copy_only", + "params": { + "source_folder": "01_01_LoadRawData", + "dest_folder": "01_01_03_TransformRawData", + "subfolder_source": candidate_key, + "subfolder_dest": candidate_key, + "data_dir_path": data_dir_path + } + } + ] 
+ }, + { + "stage_name": "01_01_04_Preprocessing", + "transform_ops": [ + { + "method": "copy_only", + "params": { + "source_folder": "01_01_03_TransformRawData", + "dest_folder": "01_01_04_Preprocessing", + "subfolder_source": candidate_key, + "subfolder_dest": candidate_key, + "data_dir_path": data_dir_path + } + } + ] + }, + { + "stage_name": "01_01_05_FeaturePreparation", + "transform_ops": [ + { + "method": "copy_only", + "params": { + "source_folder": "01_01_04_Preprocessing", + "dest_folder": "01_01_05_FeaturePreparation", + "subfolder_source": candidate_key, + "subfolder_dest": candidate_key, + "data_dir_path": data_dir_path + } + } + ] + } + ] + } + + # Decide which plan to return based on model_name + if model_name == "FNO": + return plan_for_fno + elif model_name == "FNOWithDropout": + return plan_for_fno_with_dropout + elif model_name == "AFNO": + return plan_for_afno + else: + raise NotImplementedError( + f"[OntologyEngine] Model '{model_name}' is not implemented. " + "Available options: ['FNO', 'FNOWithDropout', 'AFNO']." + ) diff --git a/examples/cfd/darcy_autoML_active_learning/src/darcy_automl_active_learning/ontology/ontology_transformation_engine.py b/examples/cfd/darcy_autoML_active_learning/src/darcy_automl_active_learning/ontology/ontology_transformation_engine.py new file mode 100644 index 000000000..a8ca13318 --- /dev/null +++ b/examples/cfd/darcy_autoML_active_learning/src/darcy_automl_active_learning/ontology/ontology_transformation_engine.py @@ -0,0 +1,238 @@ +import os +import shutil +import glob +from pathlib import Path +from typing import Optional + +class OntologyTransformationEngine: + """ + The OntologyTransformationEngine provides a collection of methods (transformations) + to modify PDE data sets (e.g., .pt files, mesh files) in ways that align with + different candidate model requirements. The 'OntologyEngine' or 'transformation plan' + can direct which of these methods to call for each pipeline stage and candidate. + + Example usage: + engine = OntologyTransformationEngine() + # Suppose a plan stage has: + # { + # "method": "copy_only", + # "params": { + # "source_folder": "01_01_LoadRawData", + # "dest_folder": "01_01_03_TransformRawData", + # "subfolder_source": "candidate0", + # "subfolder_dest": "candidate0", + # "data_dir_path": "examples/cfd/darcy_autoML_active_learning/data" + # } + # } + # This will copy from /01_01_LoadRawData/candidate0 + # into /01_01_03_TransformRawData/candidate0 + """ + + def __init__(self): + """ + Initialize any resources or configurations needed by the engine. + """ + pass + + # ------------------------------------------------------------------------ + # 1) COPY_ONLY + # ------------------------------------------------------------------------ + def copy_only(self, + source_folder: str, + dest_folder: str, + data_dir_path: Optional[str] = None, + subfolder_source: Optional[str] = None, + subfolder_dest: Optional[str] = None, + **kwargs) -> None: + """ + Copies all relevant data files (e.g., .pt, .json) from source_folder to dest_folder. + + If data_dir_path is provided, both source_folder and dest_folder will be treated as + relative to that base path. If subfolder_source or subfolder_dest is provided, those + subfolders are appended to the final source or destination path, respectively. + + :param source_folder: Directory containing the files to copy (relative or absolute). + :param dest_folder: Directory where files should be placed (relative or absolute). + :param data_dir_path: Optional base directory path. 
+        :param data_dir_path: Optional base directory path. If provided, source/dest paths
+                              are joined relative to this path. If None, the paths are used
+                              as given.
+        :param subfolder_source: (Optional) Additional subfolder appended to source_folder.
+        :param subfolder_dest: (Optional) Additional subfolder appended to dest_folder.
+        :param kwargs: Placeholder for any unused params from the JSON plan
+                       (so we don't raise unexpected-arg errors).
+        """
+        # 1) Resolve base paths
+        if data_dir_path is not None:
+            base_path = Path(data_dir_path)
+
+            # If source_folder is not absolute, prepend data_dir_path
+            sf_path = Path(source_folder)
+            if not sf_path.is_absolute():
+                sf_path = base_path / sf_path
+
+            # If dest_folder is not absolute, prepend data_dir_path
+            df_path = Path(dest_folder)
+            if not df_path.is_absolute():
+                df_path = base_path / df_path
+        else:
+            # Use the folders exactly as provided
+            sf_path = Path(source_folder)
+            df_path = Path(dest_folder)
+
+        # 2) Append subfolders if provided
+        if subfolder_source:
+            sf_path = sf_path / subfolder_source
+        if subfolder_dest:
+            df_path = df_path / subfolder_dest
+
+        # 3) Create the destination directory if it doesn't exist
+        os.makedirs(df_path, exist_ok=True)
+
+        # 4) Example: copy all .pt files and data_desc.json if it exists
+        patterns = ["*.pt", "data_desc.json"]
+        for pattern in patterns:
+            for file_path in sf_path.glob(pattern):
+                fname = file_path.name
+                dest_path = df_path / fname
+                shutil.copy2(file_path, dest_path)
+
+        print(f"[OntologyTransformationEngine] COPY_ONLY done: {sf_path} -> {df_path}")
+
+    # ------------------------------------------------------------------------
+    # 2) TRANSFORM_MESH_TO_GRID (placeholder)
+    # ------------------------------------------------------------------------
+    def transform_mesh_to_grid(self, source_folder: str,
+                               dest_folder: str,
+                               interpolation_method: str = "linear",
+                               target_resolution: int = 64,
+                               **kwargs) -> None:
+        """
+        Converts unstructured mesh data into a uniform grid format. Typically involves
+        an interpolation step from mesh vertices/cells onto a regular lattice.
+        """
+        print(f"[OntologyTransformationEngine] Placeholder: transform_mesh_to_grid() with "
+              f"{interpolation_method=}, {target_resolution=}, source={source_folder}, dst={dest_folder}")
+
+    # ------------------------------------------------------------------------
+    # 3) TRANSFORM_DECIMATE_MESH (placeholder)
+    # ------------------------------------------------------------------------
+    def transform_decimate_mesh(self, source_folder: str,
+                                dest_folder: str,
+                                decimation_ratio: float = 0.5,
+                                **kwargs) -> None:
+        """
+        Reduces the number of vertices/faces in a mesh to lower resolution.
+        """
+        print(f"[OntologyTransformationEngine] Placeholder: transform_decimate_mesh() with "
+              f"{decimation_ratio=}, source={source_folder}, dst={dest_folder}")
+
+    # ------------------------------------------------------------------------
+    # 4) TRANSFORM_REGRID_DATA (placeholder)
+    # ------------------------------------------------------------------------
+    def transform_regrid_data(self, source_folder: str,
+                              dest_folder: str,
+                              new_resolution: int,
+                              **kwargs) -> None:
+        """
+        Changes the resolution of grid data (e.g. from 128x128 to 64x64).
+        """
+        print(f"[OntologyTransformationEngine] Placeholder: transform_regrid_data() with "
+              f"{new_resolution=}, source={source_folder}, dst={dest_folder}")
+
+    # ------------------------------------------------------------------------
+    # 5) TRANSFORM_ADD_BOUNDARY_CHANNEL (placeholder)
+    # ------------------------------------------------------------------------
+    def transform_add_boundary_channel(self, source_folder: str,
+                                       dest_folder: str,
+                                       boundary_label: str = "boundary_mask",
+                                       **kwargs) -> None:
+        """
+        Inserts an extra channel marking domain boundaries, inlets, outlets, etc.
+        """
+        print(f"[OntologyTransformationEngine] Placeholder: transform_add_boundary_channel() "
+              f"with {boundary_label=}, source={source_folder}, dst={dest_folder}")
+
+    # ------------------------------------------------------------------------
+    # 6) TRANSFORM_COORDINATE_MAPPING (placeholder)
+    # ------------------------------------------------------------------------
+    def transform_coordinate_mapping(self, source_folder: str,
+                                     dest_folder: str,
+                                     mapping_type: str = "implicit uniform",
+                                     **kwargs) -> None:
+        """
+        Adjusts coordinate references or embeds coordinate arrays for PDE fields.
+        E.g., convert from (i, j) indices to explicit (x, y), or from Cartesian to polar coords.
+        """
+        print(f"[OntologyTransformationEngine] Placeholder: transform_coordinate_mapping() "
+              f"with {mapping_type=}, source={source_folder}, dst={dest_folder}")
+
+    # ------------------------------------------------------------------------
+    # 7) TRANSFORM_NORMALIZE_TENSORS (placeholder)
+    # ------------------------------------------------------------------------
+    def transform_normalize_tensors(self, source_folder: str,
+                                    dest_folder: str,
+                                    normalization_type: str = "zscore",
+                                    **kwargs) -> None:
+        """
+        Scales or normalizes PDE tensor fields (e.g., zero-mean, unit-variance).
+        """
+        print(f"[OntologyTransformationEngine] Placeholder: transform_normalize_tensors() "
+              f"with {normalization_type=}, source={source_folder}, dst={dest_folder}")
+
+    # ------------------------------------------------------------------------
+    # 8) TRANSFORM_TIME_SUBSAMPLING (placeholder)
+    # ------------------------------------------------------------------------
+    def transform_time_subsampling(self, source_folder: str,
+                                   dest_folder: str,
+                                   step: int = 2,
+                                   **kwargs) -> None:
+        """
+        Subsamples time frames from a transient dataset, e.g. keep every 2nd or 5th time step.
+        """
+        print(f"[OntologyTransformationEngine] Placeholder: transform_time_subsampling() "
+              f"with step={step}, source={source_folder}, dst={dest_folder}")
+
+    # ------------------------------------------------------------------------
+    # 9) Additional transformations (placeholders)
+    # ------------------------------------------------------------------------
+    def transform_remove_outliers(self, source_folder: str,
+                                  dest_folder: str,
+                                  z_threshold: float = 3.0,
+                                  **kwargs) -> None:
+        """
+        Removes or clips outlier values in PDE fields if they exceed a certain
+        statistical threshold (e.g. z-score > 3).
+        """
+        print(f"[OntologyTransformationEngine] Placeholder: transform_remove_outliers() "
+              f"with z_threshold={z_threshold}, source={source_folder}, dst={dest_folder}")
+
+    def transform_detect_replace_nans(self, source_folder: str,
+                                      dest_folder: str,
+                                      replacement_value: float = 0.0,
+                                      **kwargs) -> None:
+        """
+        Detects NaNs or infinite values and replaces them with a given default.
+ """ + print(f"[OntologyTransformationEngine] Placeholder: transform_detect_replace_nans() " + f"with replacement_value={replacement_value}, source={source_folder}, dst={dest_folder}") + + def transform_log_stats(self, source_folder: str, + dest_folder: str, + **kwargs) -> None: + """ + Logs basic statistics (mean, std, min, max) per PDE channel for QA/QC. + Could write to a local text file or log to console. + """ + print(f"[OntologyTransformationEngine] Placeholder: transform_log_stats(), " + f"source={source_folder}, dst={dest_folder}") + + def transform_multi_physics_combine(self, source_folder: str, + dest_folder: str, + fields_to_combine=None, + **kwargs) -> None: + """ + Merges data from multiple PDE fields (e.g., fluid + thermal) into a single file + or a new set of channels if needed. + """ + print(f"[OntologyTransformationEngine] Placeholder: transform_multi_physics_combine() " + f"with fields_to_combine={fields_to_combine}, source={source_folder}, dst={dest_folder}") diff --git a/examples/cfd/darcy_autoML_active_learning/src/darcy_automl_active_learning/path_utils.py b/examples/cfd/darcy_autoML_active_learning/src/darcy_automl_active_learning/path_utils.py new file mode 100644 index 000000000..4f6002547 --- /dev/null +++ b/examples/cfd/darcy_autoML_active_learning/src/darcy_automl_active_learning/path_utils.py @@ -0,0 +1,187 @@ +# File: src/path_utils.py + +import os +from pathlib import Path +from typing import Optional +import logging +from .env_utils import is_running_in_docker + +import logging +logger = logging.getLogger(__name__) + +def get_absolute_path(base_path: str, subpath: Optional[str] = None) -> str: + """ + Given a base path and an optional subpath/filename, returns an absolute path. + + If base_path is already absolute, we trust it. + If it's relative, we resolve it relative to the project root or cwd + (you can define the policy below). + + Args: + base_path (str): + The root directory or base path from your config, + e.g. "/workspace/examples/cfd/darcy_autoML_active_learning/data" + subpath (str, optional): + A sub-directory or filename to join onto base_path. + e.g. "00_Generate_Data/data_desc.json" + + Returns: + Absolute path (str). + """ + # Convert to a Path object + base = Path(base_path) + + # If subpath is provided, join it + if subpath: + full_path = base / subpath + else: + full_path = base + + # Now resolve to absolute + # If the user intentionally put an absolute path in the config, + # Path(...) / subpath will remain absolute. This .resolve() + # ensures we remove any "." or "..". + return str(full_path.resolve()) + + +import os +from pathlib import Path + +def get_repo_root(): + """ + Determines the repository root directory based on the environment. + + Priority: + 1) PROJECT_ROOT environment variable (if valid directory). + - If the path starts with '/root', remove the '/root' prefix. + 2) Fallback based on current working directory (no longer going up one level). + + Returns: + Path: The path to the repository root. 
+ """ + repo_root_env = os.environ.get("PROJECT_ROOT") # e.g., "/root/project/modulus-dls-api" in Docker + if repo_root_env: + repo_root = Path(repo_root_env).resolve() + if repo_root.is_absolute() and len(repo_root.parts) > 1 and repo_root.parts[1] == "root": + # Reconstruct the path without '/root' + adjusted_repo_root = Path(*repo_root.parts[2:]).resolve() + # Prepend '/' to keep it absolute + adjusted_repo_root = Path("/").joinpath(adjusted_repo_root) + repo_root = adjusted_repo_root + return repo_root + else: + current_path = Path.cwd().resolve() + # Use the current path itself, rather than going up a level + repo_root = current_path + return repo_root + + +def get_required_paths(repo_root: Path): + """ + Generates all required paths based on the repository root. + + Args: + repo_root (Path): The path to the repository root. + + Returns: + Paths: An instance of the Paths data class containing all required paths. + """ + darcy_project_root = repo_root / "examples" / "cfd" / "darcy_autoML_active_learning" + config_file = darcy_project_root / "config" / "config.yaml" + data_dir = darcy_project_root / "data" + results_dir = darcy_project_root / "results" + + # Optional: Validate paths + if not darcy_project_root.is_dir(): + logging.warning(f"Darcy project root '{darcy_project_root}' does not exist.") + if not config_file.is_file(): + logging.warning(f"Config file '{config_file}' does not exist.") + if not data_dir.is_dir(): + logging.warning(f"Data directory '{data_dir}' does not exist.") + if not results_dir.is_dir(): + logging.warning(f"Results directory '{results_dir}' does not exist.") + + return repo_root, darcy_project_root, config_file, data_dir, results_dir + +def identify_scenario(): + """ + Determines which scenario we are in, based on: + - Are we in Docker? (is_running_in_docker()) + - Is PROJECT_ROOT set? + + For now, we ONLY handle Scenario A1: + - A1 = Docker, workspace = `modulus-dls-api/`, PROJECT_ROOT is set. + + Any other scenario raises NotImplementedError. + """ + # Check if we're in Docker + in_docker = is_running_in_docker() + + # Check for PROJECT_ROOT + project_root_env = os.environ.get("PROJECT_ROOT", None) + + if in_docker: + if project_root_env: + return "A1" + else: + raise NotImplementedError("Docker scenario with no PROJECT_ROOT is not yet implemented.") + else: + raise NotImplementedError("Local (non-Docker) scenario is not yet implemented.") + +def get_paths_for_A1(): + """ + Scenario A1: Docker, workspace = `modulus-dls-api/`, PROJECT_ROOT is set. + + For simplicity, we assume: + - The user wants 'repo_root' to be '.' + - All subpaths are relative from '.' + + Returns: + A tuple: (repo_root, darcy_project_root, config_file, data_dir, results_dir) + """ + repo_root = Path(".") + + darcy_project_root = repo_root / "examples" / "cfd" / "darcy_autoML_active_learning" + config_file = darcy_project_root / "config" / "config.yaml" + data_dir = darcy_project_root / "data" + results_dir = darcy_project_root / "results" + + # Log all these paths at DEBUG level + logger.debug(f"Scenario A1 - repo_root: {repo_root}") + logger.debug(f"Scenario A1 - darcy_project_root: {darcy_project_root}") + logger.debug(f"Scenario A1 - config_file: {config_file}") + logger.debug(f"Scenario A1 - data_dir: {data_dir}") + logger.debug(f"Scenario A1 - results_dir: {results_dir}") + + return (repo_root, darcy_project_root, config_file, data_dir, results_dir) + +def get_paths(): + """ + Public entry point for obtaining all required paths. + + 1. Identify scenario (A1, A2, B1, etc.) + 2. 
+    2. Delegate to the appropriate function.
+
+    For now, we only handle A1.
+    """
+    scenario = identify_scenario()
+
+    if scenario == "A1":
+        return get_paths_for_A1()
+    else:
+        raise NotImplementedError(f"Scenario '{scenario}' is not yet implemented.")
+
+if __name__ == "__main__":
+    # Set up logging to show DEBUG messages for this module.
+    logging.basicConfig(level=logging.DEBUG)
+
+    logger.info("Running path_utils as a script. Logging level set to DEBUG for demonstration.")
+
+    # For demonstration, let's call get_paths() directly
+    try:
+        paths = get_paths()
+        logger.info("Paths returned successfully:")
+        for p in paths:
+            logger.info(f"  {p}")
+    except NotImplementedError as exc:
+        logger.error(f"Not implemented: {exc}")
\ No newline at end of file
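
The plan returned by `ontology_engine.py` and the `OntologyTransformationEngine` are wired together outside this diff. A minimal driver sketch is shown below; it assumes the plan dict shape produced above, and the function name `run_transformation_plan` is invented for illustration, not part of the patch.

```python
# Illustrative sketch only -- not part of the patch above.
from darcy_automl_active_learning.ontology.ontology_transformation_engine import (
    OntologyTransformationEngine,
)


def run_transformation_plan(plan: dict) -> None:
    """Dispatch every transform_op of every stage to the engine method of the same name."""
    engine = OntologyTransformationEngine()
    for stage in plan["stages"]:
        for op in stage["transform_ops"]:
            method = getattr(engine, op["method"], None)
            if method is None:
                raise NotImplementedError(
                    f"Transformation '{op['method']}' is not implemented "
                    f"for stage '{stage['stage_name']}'."
                )
            # Every engine method accepts **kwargs, so extra plan params are tolerated.
            method(**op["params"])
```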
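
`path_utils.get_paths()` currently supports only Scenario A1 (running inside Docker with `PROJECT_ROOT` set). A rough usage sketch under those assumptions follows; the `data_desc.json` location is taken from the docstring example in `get_absolute_path` and is an assumption about the data layout.

```python
# Illustrative usage sketch, not part of the patch. Requires the A1 preconditions:
# running in Docker with the PROJECT_ROOT environment variable set.
import logging

from darcy_automl_active_learning.path_utils import get_absolute_path, get_paths

logging.basicConfig(level=logging.INFO)

repo_root, darcy_project_root, config_file, data_dir, results_dir = get_paths()

# Resolve a file below the data directory to an absolute path.
data_desc = get_absolute_path(str(data_dir), "00_Generate_Data/data_desc.json")
print(f"Config file: {config_file}")
print(f"Data description: {data_desc}")
```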
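
Because all current plans use `copy_only`, it may not be obvious how the placeholder transformations would slot into the same schema. The hypothetical stage entry below illustrates this; the parameter values are invented and `transform_regrid_data` is still a placeholder in this patch.

```python
# Hypothetical plan stage -- illustrative only, not returned by ontology_engine.py.
regrid_stage = {
    "stage_name": "01_01_03_TransformRawData",
    "transform_ops": [
        {
            "method": "transform_regrid_data",
            "params": {
                "source_folder": "01_01_LoadRawData",
                "dest_folder": "01_01_03_TransformRawData",
                # Unknown params are absorbed by **kwargs in the engine methods.
                "subfolder_source": "candidate0",
                "subfolder_dest": "candidate0",
                "new_resolution": 64,
            },
        }
    ],
}
```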