Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Preprocessing: Add WSI prefetching #32

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion stamp/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,8 @@ def run_cli(args: argparse.Namespace):
only_feature_extraction=c.only_feature_extraction,
keep_dir_structure=c.keep_dir_structure if 'keep_dir_structure' in c else False,
device=c.device,
normalization_template=normalization_template_path
normalization_template=normalization_template_path,
preload_wsi=c.preload_wsi if 'preload_wsi' in c else False
)
case "train":
require_configs(
Expand Down
1 change: 1 addition & 0 deletions stamp/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ preprocessing:
only_feature_extraction: false # Only perform feature extraction (intermediate images (background rejected, [normalized]) have to exist)
cores: 8 # CPU cores to use
device: cuda:0 # device to run feature extraction on (cpu, cuda, cuda:0, etc.)
preload_wsi: true # Preload the whole-slide image into a temporary directory (helpful when the slides are accessed over a network)

modeling:
clini_table: # Path to clini_table file (.xlsx or .csv)
Expand Down
144 changes: 83 additions & 61 deletions stamp/preprocessing/wsi_norm.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from contextlib import contextmanager
import logging
import os
import shutil
import openslide
from tqdm import tqdm
import PIL
Expand All @@ -19,6 +20,7 @@
from datetime import timedelta
from pathlib import Path
from random import shuffle
import tempfile
import torch
from typing import Optional
from .helpers import stainNorm_Macenko
Expand All @@ -41,6 +43,8 @@ def lock_file(slide_path: Path):
Path(f"{slide_path}.lock").touch()
except PermissionError:
pass # No write permissions for wsi directory
except OSError:
pass # Lock file could not be created for another OS-level reason (e.g. read-only file system)
try:
yield
finally:
Expand All @@ -53,6 +57,9 @@ def test_wsidir_write_permissions(wsi_dir: Path):
except PermissionError:
logging.warning("No write permissions for wsi directory! If multiple stamp processes are running "
"in parallel, the final summary may show an incorrect number of slides processed.")
except OSError:
logging.warning("No write permissions for wsi directory! If multiple stamp processes are running "
"in parallel, the final summary may show an incorrect number of slides processed.")
finally:
clean_lockfile(testfile)

Expand All @@ -67,7 +74,7 @@ def save_image(image, path: Path):
def preprocess(output_dir: Path, wsi_dir: Path, model_path: Path, cache_dir: Path, norm: bool,
del_slide: bool, only_feature_extraction: bool, cache: bool = True, cores: int = 8,
target_microns: int = 256, patch_size: int = 224, keep_dir_structure: bool = False,
device: str = "cuda", normalization_template: Path = None, feat_extractor: str = "ctp"):
device: str = "cuda", normalization_template: Path = None, feat_extractor: str = "ctp", preload_wsi: bool = False):
# Clean up potentially old leftover .lock files
for lockfile in wsi_dir.glob("**/*.lock"):
if time.time() - os.path.getmtime(lockfile) > 20:
Expand Down Expand Up @@ -168,73 +175,88 @@ def preprocess(output_dir: Path, wsi_dir: Path, model_path: Path, cache_dir: Pat
canny_norm_patch_list, coords_list, total = process_slide_jpg(slide_jpg)
print(f"Loaded {img_name}, {len(canny_norm_patch_list)}/{total} tiles remain")
else:
try:
slide = openslide.OpenSlide(slide_url)
except openslide.lowlevel.OpenSlideUnsupportedFormatError:
logging.error("Unsupported format for slide, continuing...")
error_slides.append(slide_name)
continue
except Exception as e:
logging.error(f"Failed loading slide, continuing... Error: {e}")
error_slides.append(slide_name)
continue
with tempfile.TemporaryDirectory() as temp_dir:
try:
if preload_wsi:
slide_tmp_dir = Path(temp_dir)
slide_tmp_file = slide_tmp_dir / slide_url.name

start_time = time.time()
try:
slide_array = load_slide(slide=slide, target_mpp=target_mpp, cores=cores)
except MPPExtractionError:
if del_slide:
logging.error("MPP missing in slide metadata, deleting slide and continuing...")
if os.path.exists(slide_url):
os.remove(slide_url)
else:
logging.error("MPP missing in slide metadata, continuing...")
error_slides.append(slide_name)
continue
except openslide.lowlevel.OpenSlideError as e:
print("")
logging.error(f"Failed loading slide, continuing... Error: {e}")
error_slides.append(slide_name)
continue
shutil.copy(slide_url, slide_tmp_file)

# Some slide formats (.mrxs) come with an additional directory which needs to be transferred as well
slide_folder_name = slide_url.with_suffix('')
if slide_folder_name.is_dir():
slide_folder_tmp = slide_tmp_dir / slide_folder_name.name
shutil.copytree(slide_folder_name, slide_folder_tmp)

# Remove .SVS from memory
del slide
print(f"\nLoaded slide: {time.time() - start_time:.2f} seconds")
print(f"\nSize of WSI: {slide_array.shape}")

if cache:
# Save raw .svs jpg
raw_image = PIL.Image.fromarray(slide_array)
save_image(raw_image, slide_cache_dir/"slide.jpg")
slide = openslide.OpenSlide(slide_tmp_file)
else:
slide = openslide.OpenSlide(slide_url)
except openslide.lowlevel.OpenSlideUnsupportedFormatError:
logging.error("Unsupported format for slide, continuing...")
error_slides.append(slide_name)
continue
except Exception as e:
logging.error(f"Failed loading slide, continuing... Error: {e}")
error_slides.append(slide_name)
continue

#Do edge detection here and reject unnecessary tiles BEFORE normalisation
bg_reject_array, rejected_tile_array, patch_shapes = reject_background(img=slide_array, patch_size=patch_shape, step=step_size, cores=cores)
start_time = time.time()
try:
slide_array = load_slide(slide=slide, target_mpp=target_mpp, cores=cores)
except MPPExtractionError:
if del_slide:
logging.error("MPP missing in slide metadata, deleting slide and continuing...")
if os.path.exists(slide_url):
os.remove(slide_url)
else:
logging.error("MPP missing in slide metadata, continuing...")
error_slides.append(slide_name)
continue
except openslide.lowlevel.OpenSlideError as e:
print("")
logging.error(f"Failed loading slide, continuing... Error: {e}")
error_slides.append(slide_name)
continue

start_time = time.time()
# Pass raw slide_array for getting the initial concentrations, bg_reject_array for actual normalisation
if norm:
print(f"Normalising slide...")
canny_img, img_norm_wsi_jpg, canny_norm_patch_list, coords_list = normalizer.transform(slide_array, bg_reject_array,
rejected_tile_array, patch_shapes, cores=cores)
print(f"\nNormalised slide: {time.time() - start_time:.2f} seconds")
# Remove .SVS from memory
del slide
print(f"\nLoaded slide: {time.time() - start_time:.2f} seconds")
print(f"\nSize of WSI: {slide_array.shape}")

if cache:
save_image(img_norm_wsi_jpg, slide_cache_dir/"norm_slide.jpg")
else:
canny_img, canny_norm_patch_list, coords_list = get_raw_tile_list(slide_array.shape, bg_reject_array,
rejected_tile_array, patch_shapes)
# Save raw .svs jpg
raw_image = PIL.Image.fromarray(slide_array)
save_image(raw_image, slide_cache_dir/"slide.jpg")

if cache:
print("Saving Canny background rejected image...")
save_image(canny_img, slide_cache_dir/"canny_slide.jpg")
#Do edge detection here and reject unnecessary tiles BEFORE normalisation
bg_reject_array, rejected_tile_array, patch_shapes = reject_background(img=slide_array, patch_size=patch_shape, step=step_size, cores=cores)

# Remove original slide jpg from memory
del slide_array

# Optionally remove the original slide from harddrive
if del_slide:
print("Deleting slide from local folder...")
if os.path.exists(slide_url):
os.remove(slide_url)
start_time = time.time()
# Pass raw slide_array for getting the initial concentrations, bg_reject_array for actual normalisation
if norm:
print(f"Normalising slide...")
canny_img, img_norm_wsi_jpg, canny_norm_patch_list, coords_list = normalizer.transform(slide_array, bg_reject_array,
rejected_tile_array, patch_shapes, cores=cores)
print(f"\nNormalised slide: {time.time() - start_time:.2f} seconds")
if cache:
save_image(img_norm_wsi_jpg, slide_cache_dir/"norm_slide.jpg")
else:
canny_img, canny_norm_patch_list, coords_list = get_raw_tile_list(slide_array.shape, bg_reject_array,
rejected_tile_array, patch_shapes)

if cache:
print("Saving Canny background rejected image...")
save_image(canny_img, slide_cache_dir/"canny_slide.jpg")

# Remove original slide jpg from memory
del slide_array

# Optionally remove the original slide from harddrive
if del_slide:
print("Deleting slide from local folder...")
if os.path.exists(slide_url):
os.remove(slide_url)

print(f"\nExtracting {model_name} features from slide...")
start_time = time.time()
Expand Down