diff --git a/config/test_train.yaml b/config/test_train.yaml
new file mode 100644
index 0000000..996d076
--- /dev/null
+++ b/config/test_train.yaml
@@ -0,0 +1,32 @@
+# Training params
+use_cuda: true
+batch_size: 16
+num_workers: 0
+lr_init: 1.0e-3
+lr_decay_rate: 0.1
+lr_decay_steps: 500
+training_steps: 500
+warmup_steps: 1
+# Evaluation
+loss_report_step: 100
+save_model_step: 100
+eval_step: 100
+rollout_steps: 10
+run_validate: true
+num_eval_rollout: 2
+save_video: false
+# Dataset
+data_path: "test_datasets/mujoco_moviA_500"
+test_data_path: "test_datasets/mujoco_moviA_500.npz"
+data_config:
+  noise_std: 3.0e-5
+  connectivity_radius: 0.01
+  input_seq_length: 3
+# Logging
+logging_folder: "log_test"
+log_level: "info"
+# continue_log_from: "2024-08-29-17:05"
+# Simulator params
+latent_dim: 128
+message_passing_steps: 10
+mlp_layers: 2
diff --git a/config/train.json b/config/train.json
deleted file mode 100644
index 8ac40b3..0000000
--- a/config/train.json
+++ /dev/null
@@ -1,26 +0,0 @@
-{
-    "test_data_path": "datasets/mujoco_moviA_100000.npz",
-    "data_path": "datasets/mujoco_moviA_1000000.npz",
-    "use_cuda": true,
-    "batch_size": 32,
-    "num_workers": 16,
-    "data_config": {
-        "noise_std": 3e-5,
-        "connectivity_radius": 0.01,
-        "input_seq_length": 3
-    },
-    "logging_folder": "log",
-    "log_level": "info",
-    "lr_init": 1e-3,
-    "lr_decay_rate": 0.1,
-    "lr_decay_steps": 1e6,
-    "loss_report_step": 1000,
-    "save_model_step": 1000,
-    "eval_step": 1000,
-    "rollout_steps": 50,
-    "run_validate": true,
-    "num_eval_rollout": 10,
-    "save_video": true,
-    "training_steps": 1e6,
-    "warmup_steps": 50
-}
diff --git a/config/train.yaml b/config/train.yaml
new file mode 100644
index 0000000..494fd21
--- /dev/null
+++ b/config/train.yaml
@@ -0,0 +1,35 @@
+# Training params
+use_cuda: true
+batch_size: 64
+num_workers: 16
+lr_init: 1.0e-3
+lr_decay_rate: 0.1
+lr_decay_steps: 1.0e+6
+training_steps: 1.0e+6
+warmup_steps: 5
+# Evaluation
+loss_report_step: 2000
+save_model_step: 2000
+eval_step: 2000
+rollout_steps: 50
+run_validate: true
+num_eval_rollout: 10
+save_video: true
+# Dataset
+data_path: "datasets/mujoco_moviA_1000000.npz"
+test_data_path: "datasets/mujoco_moviA_10000.npz"
+data_config:
+  noise_std: 3.0e-5
+  connectivity_radius: 0.01
+  input_seq_length: 3
+# Logging
+logging_folder: "log"
+log_level: "info"
+# continue_log_from: "2024-08-29-17:05"
+# Resume training
+# model_file: "log/sim-pc/models/weights_itr_482000.ckpt"
+# train_state_file: "log/sim-pc/models/train_state_itr_482000.ckpt"
+# Simulator params
+latent_dim: 128
+message_passing_steps: 10
+mlp_layers: 2
diff --git a/fignet/data_loader.py b/fignet/data_loader.py
index 2c2367b..294e116 100644
--- a/fignet/data_loader.py
+++ b/fignet/data_loader.py
@@ -256,7 +256,7 @@ def _get_sample(self, idx):
             return graph
         else:
             if os.path.exists(self._file_list[idx]):
-                return self._load_graph(self._file_list[idx])
+                return self._transform(self._load_graph(self._file_list[idx]))
             else:
                 raise FileNotFoundError
diff --git a/fignet/logger.py b/fignet/logger.py
index 238704c..0fc625b 100644
--- a/fignet/logger.py
+++ b/fignet/logger.py
@@ -33,7 +33,7 @@ class Logger:
     def __init__(self, config):
         self.config = config
         time_str = datetime.datetime.fromtimestamp(time.time()).strftime(
-            "%Y%m%d%H%M"
+            "%Y-%m-%d-%H:%M"
         )
         self.tb_prefix = ""
         if config.get("continue_log_from") is None:
diff --git a/fignet/trainer.py b/fignet/trainer.py
index 07dee0f..fbf75cb 100644
--- a/fignet/trainer.py
+++ b/fignet/trainer.py
@@ -27,6 +27,7 @@
 import torch.utils
 import torch.utils.data
 import tqdm
+import yaml
 from torchvision import transforms as T
 
 from fignet.data_loader import MujocoDataset, ToTensor, collate_fn
@@ -99,10 +100,10 @@ def __init__(
         # Optimization params
         self._lr_init = config["lr_init"]
         self._lr_decay_rate = config["lr_decay_rate"]
-        self._lr_decay_steps = config["lr_decay_steps"]
-        self._loss_report_step = config["loss_report_step"]
-        self._save_model_step = config["save_model_step"]
-        self._eval_step = config["eval_step"]
+        self._lr_decay_steps = int(config["lr_decay_steps"])
+        self._loss_report_step = int(config["loss_report_step"])
+        self._save_model_step = int(config["save_model_step"])
+        self._eval_step = int(config["eval_step"])
         self._optimizer = torch.optim.Adam(
             self._sim.parameters(), lr=self._lr_init
         )
@@ -415,3 +416,37 @@ def validate(self, step: int):
         self._logger.tb.add_scalar(
             "val/onestep_rotation_error", np.mean(onestep_r_errors), step
         )
+
+
+def create_trainer(config_file: str):
+    with open(os.path.join(os.getcwd(), config_file)) as f:
+        if config_file.endswith("yaml"):
+            config = yaml.safe_load(f)
+        elif config_file.endswith("json"):
+            print("Warning: json config support will be removed soon")
+            import json
+
+            config = json.load(f)
+        else:
+            raise TypeError("Unsupported config file type")
+
+    logger = Logger(config)
+    if torch.cuda.is_available() and config.get("use_cuda", True):
+        device = torch.device("cuda")
+        logger.print("Using GPU")
+    else:
+        device = torch.device("cpu")
+        logger.print("Using CPU")
+
+    sim = LearnedSimulator(
+        mesh_dimensions=3,
+        latent_dim=config.get("latent_dim", 128),
+        nmessage_passing_steps=config.get("message_passing_steps", 10),
+        nmlp_layers=config.get("mlp_layers", 2),
+        input_seq_length=config["data_config"]["input_seq_length"],
+        mlp_hidden_dim=config.get("latent_dim", 128),
+        device=device,
+    )
+    trainer = Trainer(sim=sim, logger=logger, config=config, device=device)
+
+    return trainer
diff --git a/readme.md b/readme.md
index 6751c71..c178d10 100644
--- a/readme.md
+++ b/readme.md
@@ -132,7 +132,10 @@ through the wheels we built if you are using python3.8
 
 ```bash
-# Install pre-compiled binary through pip if you are using python3.8
+# Install pre-compiled binaries through pip if you are using python3.8; try upgrading your pip first
+
+# pip install --upgrade pip
+
 pip install https://cloud.dfki.de/owncloud/index.php/s/F9EwmwWkSW8pzfL/download/eigenpy-3.8.0-0-cp38-cp38-manylinux_2_31_x86_64.whl
 pip install https://cloud.dfki.de/owncloud/index.php/s/Tb4baydBiRP6iN2/download/hpp_fcl-2.4.5-3-cp38-cp38-manylinux_2_31_x86_64.whl
@@ -145,6 +148,9 @@ git clone https://github.com/jongyaoY/fignet
 cd fignet
 pip install -r requirements.txt
 pip install .
+
+# Set up robosuite
+python -m robosuite.scripts.setup_macros
 ```
 
 ## How to train
@@ -160,7 +166,7 @@ You can pre-compute the graphs from the raw dataset beforehand so that the train
 (only the training dataset).
 
 ```bash
-python scripts/preprocess_data.py --data_path=[path_to_dataset/train_dataset_name.npz] --num_workers=[default to 1]
+python scripts/preprocess_data.py --data_path=[path_to_dataset/train_dataset_name.npz] --num_workers=[default to 1] --config_file=config/train.yaml
 ```
 
 This process takes around 8 hours with `num_workers=8`, and will create a
@@ -180,7 +186,7 @@ dataset (npz file) or the folder containing pre-computed graphs, while the test
 dataset should be a npz file. Also adapt `batch_size` and `num_workers`
 accordingly.
 ```bash
-python scripts/train.py --config_file=config/train.json
+python scripts/train.py --config_file=config/train.yaml
 ```
 
 ### 3. Generate animation
@@ -227,3 +233,16 @@ interaction graph networks." arXiv preprint arXiv:2212.03574 (2022).
 
 ## License
 [MIT License](LICENSE)
+
+## Known issues
+
+### Preprocessing script with `num_workers > 0` raises the following error
+
+> RuntimeError: received 0 items of ancdata
+
+Adding the following line to [preprocess_data.py](scripts/preprocess_data.py)
+should solve the problem (see [here](https://discuss.pytorch.org/t/runtimeerror-received-0-items-of-ancdata/4999/4)).
+
+```python
+torch.multiprocessing.set_sharing_strategy('file_system')
+```
diff --git a/scripts/preprocess_data.py b/scripts/preprocess_data.py
index 1e5e885..e18477b 100644
--- a/scripts/preprocess_data.py
+++ b/scripts/preprocess_data.py
@@ -21,7 +21,6 @@
 # SOFTWARE.
 
 import argparse
-import json
 import os
 import pickle
 import sys
@@ -30,6 +29,7 @@
 import torch
 import torchvision.transforms as T
 import tqdm
+import yaml
 
 from fignet.data_loader import MujocoDataset, ToTensor
@@ -70,7 +70,7 @@ def save_graph(graph, graph_i, save_path):
 if __name__ == "__main__":
     try:
         with open(os.path.join(os.getcwd(), args.config_file)) as f:
-            config = json.load(f)
+            config = yaml.safe_load(f)
     except FileNotFoundError as e:
         print(e)
         sys.exit()
diff --git a/scripts/train.py b/scripts/train.py
index 8ea27f0..29c8510 100644
--- a/scripts/train.py
+++ b/scripts/train.py
@@ -21,42 +21,13 @@
 # SOFTWARE.
 
 import argparse
-import json
-import os
-
-import torch
-
-from fignet.logger import Logger
-from fignet.simulator import LearnedSimulator
-from fignet.trainer import Trainer
+from fignet.trainer import create_trainer
 
 parser = argparse.ArgumentParser()
-parser.add_argument(
-    "--config_file", required=False, default="config/train.json"
-)
+parser.add_argument("--config_file", required=True)
 args = parser.parse_args()
 
 if __name__ == "__main__":
-
-    latent_dim = 128
-
-    with open(os.path.join(os.getcwd(), args.config_file)) as f:
-        config = json.load(f)
-    logger = Logger(config)
-    if torch.cuda.is_available() and config.get("use_cuda", True):
-        device = torch.device("cuda")
-        logger.print("Using GPU")
-    else:
-        device = torch.device("cpu")
-        logger.print("Using CPU")
-    sim = LearnedSimulator(
-        mesh_dimensions=3,
-        latent_dim=latent_dim,
-        nmessage_passing_steps=10,
-        nmlp_layers=2,
-        input_seq_length=config["data_config"]["input_seq_length"],
-        mlp_hidden_dim=latent_dim,
-        device=device,
-    )
-    trainer = Trainer(sim=sim, logger=logger, config=config, device=device)
+    trainer = create_trainer(config_file=args.config_file)
     trainer.train()
diff --git a/setup.py b/setup.py
index 73953ac..c39b9e4 100644
--- a/setup.py
+++ b/setup.py
@@ -3,7 +3,9 @@
 setup(
     name="fignet",
     packages=[
-        package for package in find_packages() if package.startswith("fignet")
+        package
+        for package in find_packages()
+        if package.startswith("fignet") or package.startswith("rigid_fall")
     ],
     install_requires=[],
     eager_resources=["*"],
diff --git a/tests/test_trainer.py b/tests/test_trainer.py
index 793ecd4..93eeac1 100644
--- a/tests/test_trainer.py
+++ b/tests/test_trainer.py
@@ -20,69 +20,9 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
-import pytest
-import torch
+from fignet.trainer import create_trainer
 
-from fignet.logger import Logger
-from fignet.simulator import LearnedSimulator
-from fignet.trainer import Trainer
-
-
-@pytest.fixture
-def init_trainer():
-    latent_dim = 128
-
-    config = {
-        "test_data_path": "test_datasets/mujoco_moviA_500.npz",
-        "data_path": "test_datasets/mujoco_moviA_500.npz",
-        "data_config": {
-            "noise_std": 3e-5,
-            "connectivity_radius": 0.01,
-            "input_seq_length": 3,
-        },
-        "batch_size": 64,
-        "num_workers": 0,
-        # "model_file": "log_test/202407261408/models/weights_itr_49.ckpt",
-        # "train_state_file": "log_test/202407261408/models/train_state_itr_49.ckpt",
-        # "continue_log_from": "202407261408",
-        "use_cuda": False,
-        "logging_folder": "log_test",
-        "log_level": "info",
-        "lr_init": 1e-3,
-        "lr_decay_rate": 0.1,
-        "lr_decay_steps": 1e6,
-        "loss_report_step": 100,
-        "save_model_step": 100,
-        "eval_step": 100,
-        "training_steps": 500,
-        # "clip_norm": 1e-2,
-        "rollout_steps": 10,
-        "run_validate": True,
-        "num_eval_rollout": 1,
-        "save_video": False,
-        "warmup_steps": 10,
-    }
-    logger = Logger(config)
-    if torch.cuda.is_available() and config["use_cuda"]:
-        device = torch.device("cuda")
-        logger.print("Using GPU")
-    else:
-        device = torch.device("cpu")
-        logger.print("Using CPU")
-    sim = LearnedSimulator(
-        mesh_dimensions=3,
-        latent_dim=latent_dim,
-        nmessage_passing_steps=10,
-        nmlp_layers=2,
-        input_seq_length=config["data_config"]["input_seq_length"],
-        mlp_hidden_dim=latent_dim,
-        device=device,
-    )
-    trainer = Trainer(sim=sim, logger=logger, config=config, device=device)
-
-    return trainer
-
-
-def test_simulator(init_trainer):
-    trainer = init_trainer
+def test_simulator():
+    trainer = create_trainer("config/test_train.yaml")
     trainer.train()
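For reference, the `create_trainer` helper introduced in `fignet/trainer.py` above can also be called outside `scripts/train.py` and the test, for example from a notebook or a custom entry point. A minimal sketch, assuming the repository root is the current working directory (since `create_trainer` joins the config path with `os.getcwd()`) and reusing the `config/test_train.yaml` file added by this change:

```python
# Minimal usage sketch of the create_trainer helper added in this change.
# Assumes the current working directory is the repository root, because
# create_trainer resolves the config path relative to os.getcwd().
from fignet.trainer import create_trainer

# Builds the Logger, LearnedSimulator, and Trainer from a YAML config
# (a JSON config still loads, but prints a deprecation warning).
trainer = create_trainer(config_file="config/test_train.yaml")

# Runs the training loop for the number of steps given by training_steps.
trainer.train()
```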