Skip to content

Commit

Permalink
Update pytorch-lightning, pytorch, and docker compose (#34)
Browse files Browse the repository at this point in the history
* Remove deprecated gpus argument

* Formatting

* Fix epoch collectors

* Fix device error

* Use latest pytorch and cuda

* Need to use docker-compose, see docker/compose#9681

* This works, but it's not ideal

* Better, at least as long as BuildKit does not allow access to the GPU
during build.

See docker/compose#9681

* Relax these
  • Loading branch information
benemer authored Jan 16, 2024
1 parent 5e45832 commit b04651a
Show file tree
Hide file tree
Showing 9 changed files with 32 additions and 32 deletions.
4 changes: 2 additions & 2 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM pytorch/pytorch:1.10.0-cuda11.3-cudnn8-devel
FROM pytorch/pytorch:2.0.1-cuda11.7-cudnn8-devel

ENV PROJECT=/mos4d
RUN mkdir -p $PROJECT
Expand All @@ -21,7 +21,7 @@ RUN rm -rf $PROJECT

RUN pip install -U git+https://github.com/NVIDIA/MinkowskiEngine -v --no-deps \
--install-option="--force_cuda" \
--install-option="--cuda_home=/usr/local/cuda-11.3" \
--install-option="--cuda_home=/usr/local/cuda-11.7" \
--install-option="--blas=openblas"


Expand Down
12 changes: 6 additions & 6 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -3,22 +3,22 @@ export GROUP_ID:=$(shell id -g)

build:
@echo Build docker image...
@docker-compose build project
@DOCKER_BUILDKIT=0 docker compose build project

test: check-env
@echo NVIDIA and CUDA setup
@docker-compose run project nvidia-smi
@docker compose run project nvidia-smi
@echo Pytorch CUDA setup installed?
@docker-compose run project python3 -c "import torch; print(torch.cuda.is_available())"
@docker compose run project python3 -c "import torch; print(torch.cuda.is_available())"
@echo MinkowskiEngine installed?
@docker-compose run project python3 -c "import MinkowskiEngine as ME; print(ME.__version__)"
@docker compose run project python3 -c "import MinkowskiEngine as ME; print(ME.__version__)"

run: check-env
@docker-compose run project
@docker compose run project

clean:
@echo Removing docker image...
@docker-compose rm project
@docker compose rm project


check-env:
Expand Down
2 changes: 1 addition & 1 deletion scripts/predict_confidences.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ def main(weights, sequence, dt, poses, transform):
model.freeze()

# Setup trainer
trainer = Trainer(gpus=1, logger=False)
trainer = Trainer(accelerator="gpu", devices=1, logger=False)

# Infer!
trainer.predict(model, data.test_dataloader())
Expand Down
4 changes: 2 additions & 2 deletions scripts/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,6 @@
default=None,
)
def main(config, weights, checkpoint):

if checkpoint:
cfg = torch.load(checkpoint)["hyper_parameters"]
else:
Expand Down Expand Up @@ -72,7 +71,8 @@ def main(config, weights, checkpoint):

# Setup trainer
trainer = Trainer(
gpus=1,
accelerator="gpu",
devices=1,
logger=tb_logger,
max_epochs=cfg["TRAIN"]["MAX_EPOCH"],
accumulate_grad_batches=cfg["TRAIN"]["ACC_BATCHES"],
Expand Down
13 changes: 6 additions & 7 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,11 @@
description="Receding Moving Object Segmentation in 3D LiDAR Data Using Sparse 4D Convolutions",
packages=find_packages(where="src"),
install_requires=[
"Click>=7.0",
"numpy>=1.20.3",
"pytorch_lightning>=1.6.4",
"PyYAML>=6.0",
"tqdm>=4.62.3",
"torch",
"ninja",
"Click",
"numpy",
"pytorch_lightning",
"tensorboard",
"PyYAML",
"tqdm",
],
)
1 change: 0 additions & 1 deletion src/mos4d/datasets/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,7 +212,6 @@ def __getitem__(self, idx):
past_files = self.filenames[seq][from_idx : to_idx : self.skip]
list_past_point_clouds = [self.read_point_cloud(f) for f in past_files]
for i, pcd in enumerate(list_past_point_clouds):

# Transform to current viewpoint
if self.transform:
from_pose = self.poses[seq][past_indices[i]]
Expand Down
1 change: 0 additions & 1 deletion src/mos4d/models/MinkowskiEngine/resnet.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,6 @@ def __init__(self, in_channels, out_channels, D=3):
self.weight_initialization()

def network_initialization(self, in_channels, out_channels, D):

self.inplanes = self.INIT_DIM
self.conv1 = nn.Sequential(
ME.MinkowskiConvolution(
Expand Down
1 change: 0 additions & 1 deletion src/mos4d/models/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@ def __init__(self, n_classes, ignore_index):
self.ignore_index = ignore_index

def compute_confusion_matrix(self, pred_logits: torch.Tensor, gt_labels: torch.Tensor):

# Set ignored classes to -inf to not influence softmax
pred_logits[:, self.ignore_index] = -float("inf")

Expand Down
26 changes: 15 additions & 11 deletions src/mos4d/models/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,9 @@ def __init__(self, hparams: dict):

self.ClassificationMetrics = ClassificationMetrics(self.n_classes, self.ignore_index)

self.training_step_outputs = []
self.validation_step_outputs = []

def getLoss(self, out: ME.TensorField, past_labels: list):
loss = self.MOSLoss.compute_loss(out, past_labels)
return loss
Expand All @@ -70,20 +73,20 @@ def training_step(self, batch: tuple, batch_idx, dataloader_index=0):
self.get_step_confusion_matrix(out, past_labels, s).detach().cpu()
)

self.training_step_outputs.append(dict_confusion_matrix)
torch.cuda.empty_cache()
return {"loss": loss, "dict_confusion_matrix": dict_confusion_matrix}

def training_epoch_end(self, training_step_outputs):
list_dict_confusion_matrix = [
output["dict_confusion_matrix"] for output in training_step_outputs
]
return loss

def on_train_epoch_end(self):
for s in range(self.n_past_steps):
agg_confusion_matrix = torch.zeros(self.n_classes, self.n_classes)
for dict_confusion_matrix in list_dict_confusion_matrix:
for dict_confusion_matrix in self.training_step_outputs:
agg_confusion_matrix = agg_confusion_matrix.add(dict_confusion_matrix[s])
iou = self.ClassificationMetrics.getIoU(agg_confusion_matrix)
self.log("train_moving_iou_step{}".format(s), iou[2].item())

self.training_step_outputs.clear()
torch.cuda.empty_cache()

def validation_step(self, batch: tuple, batch_idx):
Expand All @@ -101,17 +104,18 @@ def validation_step(self, batch: tuple, batch_idx):
self.get_step_confusion_matrix(out, past_labels, s).detach().cpu()
)

self.validation_step_outputs.append(dict_confusion_matrix)
torch.cuda.empty_cache()
return dict_confusion_matrix

def validation_epoch_end(self, validation_step_outputs):
def on_validation_epoch_end(self):
for s in range(self.n_past_steps):
agg_confusion_matrix = torch.zeros(self.n_classes, self.n_classes)
for dict_confusion_matrix in validation_step_outputs:
for dict_confusion_matrix in self.validation_step_outputs:
agg_confusion_matrix = agg_confusion_matrix.add(dict_confusion_matrix[s])
iou = self.ClassificationMetrics.getIoU(agg_confusion_matrix)
self.log("val_moving_iou_step{}".format(s), iou[2].item())

self.validation_step_outputs.clear()
torch.cuda.empty_cache()

def predict_step(self, batch: tuple, batch_idx: int, dataloader_idx: int = None):
Expand Down Expand Up @@ -163,8 +167,8 @@ def get_step_confusion_matrix(self, out, past_labels, step):
t = round(-step * self.dt_prediction, 3)
mask = out.coordinates[:, -1].isclose(torch.tensor(t))
pred_logits = out.features[mask].detach().cpu()
gt_labels = torch.cat(past_labels, dim=0).detach().cpu()
gt_labels = gt_labels[mask][:, 0]
gt_labels = torch.cat(past_labels, dim=0)
gt_labels = gt_labels[mask][:, 0].detach().cpu()
confusion_matrix = self.ClassificationMetrics.compute_confusion_matrix(
pred_logits, gt_labels
)
Expand Down

0 comments on commit b04651a

Please sign in to comment.