Commit 436a2ba: introduce ptl (omron-sinicx#9)
* Introduce PTL functions

* Update checkpoint

* Update Dockerfile

* Fix scripts
yonetaniryo authored Dec 19, 2022
1 parent e3e85c4 commit 436a2ba
Showing 8 changed files with 112 additions and 203 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -1,6 +1,7 @@
 .venv
 **/*.egg-info/
 **/__pycache__/
+**/lightning_logs
 .ipynb_checkpoints/
 model
 outputs
21 changes: 17 additions & 4 deletions Dockerfile
@@ -1,10 +1,23 @@
-FROM pytorch/pytorch:1.13.1-cuda11.6-cudnn8-devel
+FROM nvidia/cuda:11.6.0-cudnn8-devel-ubuntu20.04
 
-WORKDIR /workspace
+ENV DEBIAN_FRONTEND=noninteractive
+RUN apt-get -y update && apt-get -y install --no-install-recommends software-properties-common libgl1-mesa-dev wget libssl-dev
 
+RUN apt-get -y install --no-install-recommends python3.8-dev python3.8-distutils python3-pip python3.8-venv
+# Set default python
+RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.8 1
+RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 1
+
+# clear cache
+RUN rm -rf /var/lib/apt/lists/*
 
-RUN pip install -U pip setuptools
+RUN pip3 install -U pip distlib setuptools wheel
 
 WORKDIR /workspace
 
 COPY src/ src/
 COPY pyproject.toml .
-RUN pip install -e .[dev]
+RUN pip3 install -e .[dev]
+RUN pip3 uninstall -y torch torchvision
+RUN pip3 install torch==1.12.1 torchvision==0.13.1 --extra-index-url https://download.pytorch.org/whl/cu116
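As a quick sanity check after building the image, the pinned wheels can be verified from inside the container (a sketch; expected values follow the pins above):

# Verify the pinned torch build inside the rebuilt image.
import torch

print(torch.__version__)          # expected: 1.12.1+cu116
print(torch.cuda.is_available())  # True only when the container is started with GPU access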
37 changes: 21 additions & 16 deletions example.ipynb

Large diffs are not rendered by default.

Binary file not shown.
1 change: 1 addition & 0 deletions pyproject.toml
@@ -17,6 +17,7 @@ dependencies = [
     "numpy>=1.19.2",
     "tensorboard>=2.5",
     "moviepy>=1.0.3",
+    "pytorch-lightning==1.8.5.post0",
     "jupyterlab",
     "matplotlib",
     "tensorboardx",
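The only functional change here is the new pinned dependency. A quick post-install check (a sketch, assuming pip3 install -e .[dev] has completed):

# Confirm the pinned Lightning release is the one installed.
import pytorch_lightning as pl

assert pl.__version__ == "1.8.5.post0", pl.__version__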
8 changes: 4 additions & 4 deletions scripts/create_gif.py
@@ -1,12 +1,10 @@
 import json
 import os
 
 import hydra
 import moviepy.editor as mpy
-import torch
 from neural_astar.planner import NeuralAstar, VanillaAstar
 from neural_astar.utils.data import create_dataloader
-from neural_astar.utils.training import visualize_results
+from neural_astar.utils.training import load_from_ptl_checkpoint, visualize_results


@hydra.main(config_path="config", config_name="create_gif")
@@ -15,7 +13,9 @@ def main(config):

     if config.planner == "na":
         planner = NeuralAstar(encoder_arch=config.encoder)
-        planner.load_state_dict(torch.load(f"{config.modeldir}/{dataname}/best.pt"))
+        planner.load_state_dict(
+            load_from_ptl_checkpoint(f"{config.modeldir}/{dataname}")
+        )
     else:
         planner = VanillaAstar()

120 changes: 17 additions & 103 deletions scripts/train.py
@@ -2,27 +2,17 @@
Author: Ryo Yonetani
Affiliation: OSX
"""
 from __future__ import annotations
 
 import os
-import subprocess
-from datetime import datetime
 
 import hydra
-import numpy as np
+import pytorch_lightning as pl
 import torch
-import torch.nn as nn
-import torch.optim as optim
-from neural_astar.planner import NeuralAstar, VanillaAstar
+from neural_astar.planner import NeuralAstar
 from neural_astar.utils.data import create_dataloader
-from neural_astar.utils.training import (
-    Metrics,
-    calc_metrics,
-    run_planner,
-    set_global_seeds,
-    visualize_results,
-)
-from torch.utils.tensorboard import SummaryWriter
-from tqdm import tqdm
+from neural_astar.utils.training import PlannerModule, set_global_seeds
+from pytorch_lightning.callbacks import ModelCheckpoint


@hydra.main(config_path="config", config_name="train")
@@ -36,98 +26,22 @@ def main(config):
val_loader = create_dataloader(
config.dataset + ".npz", "valid", config.params.batch_size, shuffle=False
)
-    test_loader = create_dataloader(
-        config.dataset + ".npz", "test", config.params.batch_size, shuffle=False
-    )
 
     # planners
-    device = "cuda" if torch.cuda.is_available() else "cpu"
     neural_astar = NeuralAstar(encoder_arch=config.encoder, Tmax=config.Tmax)
-    neural_astar.to(device)
-    vanilla_astar = VanillaAstar()
-    vanilla_astar.to(device)
-
-    # training setup
-    opt = optim.RMSprop(neural_astar.parameters(), lr=config.params.lr)
-    criterion = nn.L1Loss()
+    checkpoint_callback = ModelCheckpoint(
+        monitor="metrics/h_mean", save_weights_only=True, mode="max"
+    )

-    # logger setup
+    module = PlannerModule(neural_astar, config)
     logdir = f"{config.logdir}/{os.path.basename(config.dataset)}"
-    writer = SummaryWriter(f"{logdir}/tb")
-    h_mean_best = -1.0

-    for e in range(config.params.num_epochs):
-        train_loss, val_loss, p_opt, p_exp, h_mean = 0.0, 0.0, 0.0, 0.0, 0.0
-
-        # training
-        for batch in tqdm(train_loader, desc="training", ncols=60):
-            neural_astar.train()
-            loss, na_outputs = run_planner(batch, neural_astar, criterion)
-            train_loss += loss.item()
-            opt.zero_grad()
-            loss.backward()
-            opt.step()
-        train_loss /= len(train_loader)
-
-        # validation
-        with torch.no_grad():
-            for batch in tqdm(val_loader, desc="validation", ncols=60):
-                neural_astar.eval()
-                loss, na_outputs = run_planner(batch, neural_astar, criterion)
-                _, va_outputs = run_planner(batch, vanilla_astar, criterion)
-                metrics = calc_metrics(na_outputs, va_outputs)
-                val_loss += loss
-                p_opt += metrics.p_opt
-                p_exp += metrics.p_exp
-                h_mean += metrics.h_mean
-        val_loss /= len(val_loader)
-        p_opt /= len(val_loader)
-        p_exp /= len(val_loader)
-        h_mean /= len(val_loader)
-
-        # logging
-        print(
-            f"[epoch:{e:03d}] train_loss:{train_loss:.2e}, val_loss:{val_loss:.2e}, ",
-            Metrics(p_opt, p_exp, h_mean),
-        )
-
-        writer.add_scalar("metrics/train_loss", train_loss, e)
-        writer.add_scalar("metrics/val_loss", val_loss, e)
-        writer.add_scalar("metrics/p_opt", p_opt, e)
-        writer.add_scalar("metrics/p_exp", p_exp, e)
-        writer.add_scalar("metrics/h_mean", h_mean, e)
-
-        va_results = visualize_results(batch[0], va_outputs)
-        na_results = visualize_results(batch[0], na_outputs)
-        writer.add_image("vis/astar", va_results, e, dataformats="HWC")
-        writer.add_image("vis/neural-astar", na_results, e, dataformats="HWC")
-
-        # checkpointing
-        if h_mean > h_mean_best:
-            print(f"best score updated: {h_mean_best:0.3f} -> {h_mean:0.3f}")
-            h_mean_best = h_mean
-            subprocess.run(["rm", "-rf", f"{logdir}/best.pt"])
-            torch.save(neural_astar.state_dict(), f"{logdir}/best.pt")
-    writer.close()
-
-    # testing
-    neural_astar.load_state_dict(torch.load(f"{logdir}/best.pt"))
-    p_opt, p_exp, h_mean = 0.0, 0.0, 0.0
-    with torch.no_grad():
-        for batch in tqdm(test_loader, desc="test", ncols=60):
-            neural_astar.eval()
-            loss, na_outputs = run_planner(batch, neural_astar, criterion)
-            _, va_outputs = run_planner(batch, vanilla_astar, criterion)
-            metrics = calc_metrics(na_outputs, va_outputs)
-            p_opt += metrics.p_opt
-            p_exp += metrics.p_exp
-            h_mean += metrics.h_mean
-    p_opt /= len(test_loader)
-    p_exp /= len(test_loader)
-    h_mean /= len(test_loader)
-
-    print(f"[final performance]", Metrics(p_opt, p_exp, h_mean))
-    np.savetxt(f"{logdir}/score.txt", [p_opt, p_exp, h_mean])
+    trainer = pl.Trainer(
+        accelerator="gpu" if torch.cuda.is_available() else "cpu",
+        log_every_n_steps=1,
+        default_root_dir=logdir,
+        max_epochs=config.params.num_epochs,
+        callbacks=[checkpoint_callback],
+    )
+    trainer.fit(module, train_loader, val_loader)
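With ModelCheckpoint monitoring metrics/h_mean in max mode, the best weights now land under default_root_dir instead of the old best.pt. A sketch of locating the newest checkpoint after a run (the logdir value is illustrative; Lightning numbers the version_* directories automatically):

# Find the checkpoint written by ModelCheckpoint; this mirrors the glob used
# by load_from_ptl_checkpoint in neural_astar.utils.training.
from glob import glob

logdir = "model/mazes_032_moore_c8"  # illustrative: config.logdir/<dataset basename>
ckpts = sorted(glob(f"{logdir}/**/*.ckpt", recursive=True))  # assumes one run finished
print(ckpts[-1])  # e.g. .../lightning_logs/version_0/checkpoints/epoch=....ckpt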


if __name__ == "__main__":
127 changes: 51 additions & 76 deletions src/neural_astar/utils/training.py
@@ -6,113 +6,88 @@
from __future__ import annotations

import random
-from dataclasses import dataclass
-from typing import Sequence, Tuple, Union
+import re
+from glob import glob

 import numpy as np
+import pytorch_lightning as pl
 import torch
 import torch.nn as nn
+import torch.optim
+from neural_astar.planner.astar import VanillaAstar
from neural_astar.planner.differentiable_astar import AstarOutput
from PIL import Image
-from torch.nn.modules.loss import _Loss
from torchvision.utils import make_grid

-EPS = 1e-10


-@dataclass
-class Metrics:
-    p_opt: float
-    p_exp: float
-    h_mean: float
-
-    def __repr__(self):
-        return f"optimality: {self.p_opt:0.3f}, efficiency: {self.p_exp:0.3f}, h_mean: {self.h_mean:0.3f}"
-
-
-def run_planner(
-    batch: Tuple[torch.tensor, torch.tensor, torch.tensor, torch.tensor],
-    planner: nn.Module,
-    criterion: _Loss,
-) -> Tuple[torch.tensor, AstarOutput]:
-    """
-    Run planner on a given batch
-
-    Args:
-        batch (Tuple[torch.tensor, torch.tensor, torch.tensor, torch.tensor]): input batch
-        planner (nn.Module): planner
-        criterion (_Loss): loss function
-
-    Returns:
-        Tuple[torch.tensor, AstarOutput]: computed loss + planner output
-    """
-    device = "cuda" if torch.cuda.is_available() else "cpu"
-    map_designs, start_maps, goal_maps, opt_trajs = batch
-    map_designs = map_designs.to(device)
-    start_maps = start_maps.to(device)
-    goal_maps = goal_maps.to(device)
-    opt_trajs = opt_trajs.to(device)
-    planner_outputs = planner(map_designs, start_maps, goal_maps)
-    loss = criterion(planner_outputs.histories, opt_trajs)
-
-    return loss, planner_outputs
+def load_from_ptl_checkpoint(checkpoint_path: str) -> dict:
+    """
+    Load model weights from PyTorch Lightning checkpoint.
+
+    Args:
+        checkpoint_path (str): (parent) directory where .ckpt is stored.
+
+    Returns:
+        dict: model state dict
+    """
+    ckpt_file = sorted(glob(f"{checkpoint_path}/**/*.ckpt", recursive=True))[-1]
+    print(f"load {ckpt_file}")
+    state_dict = torch.load(ckpt_file)["state_dict"]
+    state_dict_extracted = dict()
+    for key in state_dict:
+        if "planner" in key:
+            state_dict_extracted[re.split("planner.", key)[-1]] = state_dict[key]
+
+    return state_dict_extracted
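Note: PlannerModule (below) stores the network as self.planner, so keys in the saved state_dict carry a planner. prefix; the helper strips that prefix before the weights go to load_state_dict. A minimal usage sketch (the checkpoint directory and encoder choice are illustrative):

# Restore a trained NeuralAstar planner from a PTL run.
from neural_astar.planner import NeuralAstar
from neural_astar.utils.training import load_from_ptl_checkpoint

planner = NeuralAstar(encoder_arch="CNN")  # "CNN" is an assumed encoder choice
planner.load_state_dict(load_from_ptl_checkpoint("model/mazes_032_moore_c8"))
planner.eval()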

-def calc_metrics(na_outputs: AstarOutput, va_outputs: AstarOutput) -> Metrics:
-    """
-    Calculate opt, exp, and hmean metrics for problem instances each with a single starting point
-
-    Args:
-        na_outputs (AstarOutput): outputs from Neural A*
-        va_outputs (AstarOutput): outputs from vanilla A*
-
-    Returns:
-        Metrics: opt, exp, and hmean values
-    """
-    pathlen_astar = va_outputs.paths.sum((1, 2, 3)).detach().cpu().numpy()
-    pathlen_na = na_outputs.paths.sum((1, 2, 3)).detach().cpu().numpy()
-    p_opt = (pathlen_astar == pathlen_na).mean()
-
-    exp_astar = va_outputs.histories.sum((1, 2, 3)).detach().cpu().numpy()
-    exp_na = na_outputs.histories.sum((1, 2, 3)).detach().cpu().numpy()
-    p_exp = np.maximum((exp_astar - exp_na) / exp_astar, 0.0).mean()
-
-    h_mean = 2.0 / (1.0 / (p_opt + EPS) + 1.0 / (p_exp + EPS))
-
-    return Metrics(p_opt, p_exp, h_mean)
+class PlannerModule(pl.LightningModule):
+    def __init__(self, planner, config):
+        super().__init__()
+        self.planner = planner
+        self.vanilla_astar = VanillaAstar()
+        self.config = config
+
+    def forward(self, map_designs, start_maps, goal_maps):
+        return self.planner(map_designs, start_maps, goal_maps)
+
+    def configure_optimizers(self) -> torch.optim.Optimizer:
+        return torch.optim.RMSprop(self.planner.parameters(), self.config.params.lr)
+
+    def training_step(self, train_batch, batch_idx):
+        map_designs, start_maps, goal_maps, opt_trajs = train_batch
+        outputs = self.forward(map_designs, start_maps, goal_maps)
+        loss = nn.L1Loss()(outputs.histories, opt_trajs)
+        self.log("metrics/train_loss", loss)
+
+        return loss
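The training signal is the L1 loss between the planner's search histories and the ground-truth optimal trajectories, as in the replaced run_planner. A toy illustration (shapes (B, 1, H, W) are assumed from the data loader; values made up):

# Mean absolute difference between a search-history map and an optimal-path map.
import torch
import torch.nn as nn

histories = torch.tensor([[[[0.0, 1.0], [1.0, 1.0]]]])  # cells the planner expanded
opt_trajs = torch.tensor([[[[0.0, 1.0], [0.0, 1.0]]]])  # ground-truth optimal path
print(nn.L1Loss()(histories, opt_trajs).item())  # 0.25: one mismatched cell of four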

-def calc_metrics_from_multiple_results(
-    na_outputs_list: Sequence[AstarOutput], va_outputs_list: Sequence[AstarOutput]
-) -> Metrics:
-    """
-    Calculate opt, exp, and hmean metrics for problem instances each with multiple starting points
-
-    Args:
-        na_outputs (Sequence[AstarOutput]): Sequence of outputs from Neural A*
-        va_outputs (Sequence[AstarOutput]): Sequence of outputs from vanilla A*
-
-    Returns:
-        Metrics: opt, exp, and hmean values
-    """
-    p_opt_list, p_exp_list = [], []
-    for na_outputs, va_outputs in zip(na_outputs_list, va_outputs_list):
-        pathlen_astar = va_outputs.paths.sum((1, 2, 3)).detach().cpu().numpy()
-        pathlen_na = na_outputs.paths.sum((1, 2, 3)).detach().cpu().numpy()
-        p_opt_list.append(pathlen_astar == pathlen_na)
-
-        exp_astar = va_outputs.histories.sum((1, 2, 3)).detach().cpu().numpy()
-        exp_na = na_outputs.histories.sum((1, 2, 3)).detach().cpu().numpy()
-        p_exp_list.append(np.maximum((exp_astar - exp_na) / exp_astar, 0.0))
-    p_opt = np.vstack(p_opt_list).mean(0)
-    p_exp = np.vstack(p_exp_list).mean(0)
-    h_mean = 2.0 / (1.0 / (p_opt + EPS) + 1.0 / (p_exp + EPS))
-
-    return Metrics(p_opt.mean(), p_exp.mean(), h_mean.mean())
+    def validation_step(self, val_batch, batch_idx):
+        map_designs, start_maps, goal_maps, opt_trajs = val_batch
+        outputs = self.forward(map_designs, start_maps, goal_maps)
+        loss = nn.L1Loss()(outputs.histories, opt_trajs)
+        va_outputs = self.vanilla_astar(map_designs, start_maps, goal_maps)
+
+        pathlen_astar = va_outputs.paths.sum((1, 2, 3)).detach().cpu().numpy()
+        pathlen_model = outputs.paths.sum((1, 2, 3)).detach().cpu().numpy()
+        p_opt = (pathlen_astar == pathlen_model).mean()
+
+        exp_astar = va_outputs.histories.sum((1, 2, 3)).detach().cpu().numpy()
+        exp_na = outputs.histories.sum((1, 2, 3)).detach().cpu().numpy()
+        p_exp = np.maximum((exp_astar - exp_na) / exp_astar, 0.0).mean()
+
+        h_mean = 2.0 / (1.0 / (p_opt + 1e-10) + 1.0 / (p_exp + 1e-10))
+
+        self.log("metrics/val_loss", loss)
+        self.log("metrics/p_opt", p_opt)
+        self.log("metrics/p_exp", p_exp)
+        self.log("metrics/h_mean", h_mean)
+
+        return loss
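validation_step folds the old calc_metrics logic into the module: p_opt is the fraction of instances whose predicted path matches the vanilla A* path length, p_exp the relative reduction in expanded nodes, and h_mean their harmonic mean. A worked toy example (numbers invented):

# Two toy instances: one optimal path, moderate node savings.
import numpy as np

pathlen_astar = np.array([10.0, 12.0])  # vanilla A* path lengths
pathlen_model = np.array([10.0, 13.0])  # Neural A* path lengths
p_opt = (pathlen_astar == pathlen_model).mean()  # 0.5

exp_astar = np.array([80.0, 100.0])  # nodes expanded by vanilla A*
exp_na = np.array([40.0, 90.0])      # nodes expanded by Neural A*
p_exp = np.maximum((exp_astar - exp_na) / exp_astar, 0.0).mean()  # (0.5 + 0.1) / 2 = 0.3

h_mean = 2.0 / (1.0 / (p_opt + 1e-10) + 1.0 / (p_exp + 1e-10))
print(h_mean)  # 0.375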


 def visualize_results(
-    map_designs: torch.tensor, planner_outputs: Union[AstarOutput, dict], scale: int = 1
+    map_designs: torch.tensor, planner_outputs: AstarOutput, scale: int = 1
 ) -> np.ndarray:
"""
Create a visualization of search results
