diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..0e0c7a3 --- /dev/null +++ b/.gitignore @@ -0,0 +1,111 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +env/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# dotenv +.env + +# virtualenv +.venv +venv/ +ENV/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ + +# IDE settings +.vscode/.envrc +.direnv +*~ +*.root +*.npz +*.dat +*.pdf diff --git a/scripts/README_extract.md b/scripts/README_extract.md new file mode 100644 index 0000000..b07af5f --- /dev/null +++ b/scripts/README_extract.md @@ -0,0 +1,48 @@ +# Get `toy-sp` (`toy1`) data: +Toy1 data can be downloaded from https://www.phy.bnl.gov/~bviren/tmp/ls4gan/data/toy1/. +1. To download the both image `.npz` files and meta data: + > `wget -r --no-parent https://www.phy.bnl.gov/~bviren/tmp/ls4gan/data/toy1/` +1. 
To download only the image `.npz` files: + > `wget -r --no-parent https://www.phy.bnl.gov/~bviren/tmp/ls4gan/data/toy1/job_output_dir` +1. To get images generated with one particular seed (example): + > `wget -r --no-parent https://www.phy.bnl.gov/~bviren/tmp/ls4gan/data/toy1/job_output_dir/toyzero-100-10-17698759_1048280_804.tar` + +When the download is done, we can find the tar files that contain the images in the following folder: +> `www.phy.bnl.gov/~bviren/tmp/ls4gan/data/toy1/job_output_dir` + +# Extract `toy-sp` (`toy1`) data: +We use `extract_toy-sp.py` to extract the tar files and get a merged dataset that contains all images (generated with different seeds). + +Usage: +1. `python extract_toy-sp.py -d www.phy.bnl.gov/~bviren/tmp/ls4gan/data/toy1/job_output_dir/ -s seedwise -m merged -v True`; + Note: Sampled pairs of images will be saved to `flora:/tmp/LS4GAN/toy-sp/sample_plots/` +1. `python extract_toy-sp.py -d www.phy.bnl.gov/~bviren/tmp/ls4gan/data/toy1/job_output_dir/ -m merged -v True`; + Note: If the seedwise folder is not specified, the folders for each seed will be saved at `flora:/tmp/LS4GAN/toy-sp/seedwise/`. +1. `python extract_toy-sp.py -d www.phy.bnl.gov/~bviren/tmp/ls4gan/data/toy1/job_output_dir/ -m merged`; + Note: Do not plot sampled pairs. + +# Bulk data downloading and pre-generated window csvs: +These can be found in the `flora:/data/LS4GAN` folder of the `flora` machine. +1. **Merged data**: A gathering of all toy1 image `.npz` files Brett generated. +1. **Seed-wise data**: The seed-wise data contains the same set of toy1 images as the merged data, but grouped and saved by the seed. + + | file | md5sum | + | ------------------------------ | -------------------------------- | + | 2021-09-01_toy-sp_merged.tgz | bfbd560f65955d66d5d174f0f410eee8 | + | 2021-09-01_toy-sp_seedwise.tgz | b5dde35f59283313998f5ecea51b3340 | +1. **Pre-generated window csvs**: in the `flora:/data/LS4GAN/toy-sp_merged_windows`. 
+ - Shared parameters for `toytools/scripts/preprocess`: + - `--plane`: U; + - `-n` (numbers of windows per image): 4; + - Shared parameters for `toytools/scripts/train_test_split`: + - `--test-size`: 4000; + - For window size `128x128`: + - `--min-signal`: 300, 500; + - For window size `512x512`: + - `--min-signal`: 500, 1000, 2000; + - Window file naming convention: + > `ms--.csv`; + > + > `ms--_train.csv`; + > + > `ms--_test.csv` diff --git a/scripts/extract_toy-sp.py b/scripts/extract_toy-sp.py new file mode 100644 index 0000000..9b7eb91 --- /dev/null +++ b/scripts/extract_toy-sp.py @@ -0,0 +1,209 @@ +#!/usr/bin/env python +# coding: utf-8 + + + +import pandas as pd +import numpy as np +from pathlib import Path +import tarfile +import os +import shutil +import sys +from tqdm import tqdm +import argparse + +import matplotlib.pyplot as plt + + + +# Extraction functions +def extract_files(tar, member_folder, file_extension, save_folder='.'): + """ + Extract all files with from folder of a tarfile object + Input: + - tar (tarfile instance): a tarfile instance to extract from. + - member_folder (str): a folder the contains the files you need. Must be a member in . + - file_extension (str): file extension for the files you need. + - save_folder (str): the folder to extract to. + """ + + Path(save_folder).mkdir(exist_ok=True, parents=True) + # locate all the files in the member_folder that have file_extension as extension. 
# Extraction functions
def extract_files(tar, member_folder, file_extension, save_folder='.'):
    """
    Extract the files with a given extension from one folder of a tarfile object.

    Only regular files that sit directly inside `member_folder` are taken.
    Each one is streamed to `save_folder/<file name>`, so the directory tree
    of the archive is never recreated on disk and there is nothing to move or
    clean up afterwards.

    Input:
        - tar (tarfile instance): an open tarfile instance to extract from.
        - member_folder (str): the archive folder that contains the files you
            need. Must be a member name of `tar`.
        - file_extension (str): file extension of the files you need, with or
            without the leading dot (e.g. 'npz').
        - save_folder (str or Path): the folder to extract to. Created when
            it does not exist.
    Output:
        - list of Path: the files written to `save_folder`, in archive order.
    """
    save_dir = Path(save_folder)
    save_dir.mkdir(exist_ok=True, parents=True)

    # Member names inside a tar archive always use '/' as separator.
    prefix = member_folder.rstrip('/') + '/'
    suffix = '.' + file_extension.lstrip('.')

    # Keep direct children only. Requiring the separator after the folder
    # name keeps look-alike siblings ('fake-fake-old/...') out, and refusing
    # any further '/' rejects nested files as well as '../' escapes.
    wanted = []
    for tarinfo in tar.getmembers():
        if not (tarinfo.isfile() and tarinfo.name.startswith(prefix)):
            continue
        basename = tarinfo.name[len(prefix):]
        if '/' in basename or not basename.endswith(suffix):
            continue
        wanted.append((tarinfo, basename))

    print(f'\tsave {len(wanted)} {file_extension} files to {save_folder}')

    extracted = []
    for tarinfo, basename in wanted:
        # Path(...).name drops any remaining platform-specific directory part.
        target = save_dir / Path(basename).name
        with tar.extractfile(tarinfo) as source, open(target, 'wb') as sink:
            shutil.copyfileobj(source, sink)
        extracted.append(target)
    return extracted


def get_seed(fname):
    """
    Get the seed token from the stem of a tar file name.

    The token is the text after the last '-' up to the first '_' that follows,
    e.g. 'toyzero-100-10-17698759_1048280_804' gives '17698759'.
    Input:
        - fname (str): file name without the '.tar' extension.
    Output:
        - str: the seed token.
    """
    return fname.rsplit('-', 1)[-1].split('_', 1)[0]


def extract_toy_sp(tarfname, folder_base):
    """
    Extract all the npz files (both fake and real) from one tar file.

    The files end up in `<folder_base>/seed-<seed>/fake` and
    `<folder_base>/seed-<seed>/real`. An existing `seed-<seed>` folder is
    removed first. A tar file that cannot be read, or that lacks one of the
    two image folders, is reported on stdout and skipped.

    Input:
        - tarfname (str or Path): the .tar file name.
        - folder_base (str or Path): the folder under which the seed-<seed>
            folders are located.
    Output:
        - bool: True when both folders were extracted, False otherwise.
    """
    seed = get_seed(Path(tarfname).stem)
    print(f'seed = {seed}')
    save_folder = Path(folder_base) / f'seed-{seed}'
    if save_folder.exists():
        shutil.rmtree(save_folder)

    # Opening the archive is part of the guarded region: an unreadable tar
    # file is exactly the situation the message below is meant for.
    try:
        with tarfile.open(tarfname, 'r') as tar:
            names = [tarinfo.name for tarinfo in tar.getmembers()]
            fake = next((name for name in names if name.endswith('fake-fake')), None)
            real = next((name for name in names if name.endswith('real-fake')), None)
            if fake is None or real is None:
                raise tarfile.TarError('no member ends with fake-fake / real-fake')
            extract_files(tar, fake, 'npz', save_folder/'fake')
            extract_files(tar, real, 'npz', save_folder/'real')
            print()
    except (tarfile.TarError, OSError, EOFError) as err:
        print(f'There is something wrong with the tar file {tarfname}')
        print(f'\t{type(err).__name__}: {err}')
        return False
    return True
def _str2bool(value):
    """
    Convert a command-line token such as 'True', 'no' or '0' into a bool.

    `type=bool` cannot be used for this: bool('False') is True because every
    non-empty string is truthy.
    """
    if isinstance(value, bool):
        return value
    token = value.strip().lower()
    if token in ('true', 't', 'yes', 'y', '1'):
        return True
    if token in ('false', 'f', 'no', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError(f'expected a boolean value, got {value!r}')


def parse_cmdargs(argv=None):
    """
    Parse the command-line arguments of the extraction script.

    Input:
        - argv (list of str or None): the arguments to parse. None (default)
            makes argparse read them from sys.argv.
    Output:
        - argparse.Namespace with the attributes
            data_path (str), seed_path (str or None), merged_path (str)
            and visualize (bool).
    """
    # The text is a description; passed positionally it would become `prog`
    # and replace the script name in the usage line.
    parser = argparse.ArgumentParser(
        description = "Extract toy-sp (toy1) data Brett generated"
    )

    parser.add_argument(
        '--data_path',
        '-d',
        help     = 'The location of the tar files',
        dest     = 'data_path',
        type     = str,
        required = True,
    )

    parser.add_argument(
        '--seed_path',
        '-s',
        help    = (
            'The location for the seed folders. '
            'Images generated with seed=<seed> are contained in '
            '<seed_path>/seed-<seed>.'
        ),
        dest    = 'seed_path',
        type    = str,
        default = None,
    )

    parser.add_argument(
        '--merged_path',
        '-m',
        dest     = 'merged_path',
        help     = (
            'The location of the merged dataset. '
            'The folder will contain two subfolders: fake and real.'
        ),
        type     = str,
        required = True,
    )

    # `-v True` / `-v False` keep working; a bare `-v` means True.
    parser.add_argument(
        '--visualize',
        '-v',
        help    = 'whether to visualize a few pairs of fake and real images',
        dest    = 'visualize',
        default = False,
        nargs   = '?',
        const   = True,
        type    = _str2bool,
    )

    return parser.parse_args(argv)
# Locations used when the command line does not provide one.
DEFAULT_SEED_PATH = '/tmp/LS4GAN/toy-sp/seedwise/'
SAMPLE_PLOT_PATH = '/tmp/LS4GAN/toy-sp/sample_plots'


def merge_seedwise(seed_path, merged_path):
    """
    Copy the images of every seed folder into one merged dataset.

    Each `<seed_path>/seed-<seed>/{fake,real}/*npz` file is copied to
    `<merged_path>/{fake,real}/` with 'gauss' in its file name replaced by
    '<seed>-', so that images of different seeds do not overwrite each other.

    Input:
        - seed_path (str or Path): the folder holding the seed-<seed> folders.
        - merged_path (str or Path): the folder of the merged dataset.
    Output:
        - (Path, Path): the merged `fake` and `real` folders.
    """
    merged_folder = Path(merged_path)
    targets = {kind: merged_folder / kind for kind in ('fake', 'real')}
    for target in targets.values():
        target.mkdir(exist_ok=True, parents=True)

    print('Merging:')
    for folder in sorted(Path(seed_path).glob('seed-*')):
        # `.name`, not `.stem`: a seed containing a dot must not be truncated.
        seed = folder.name.split('-')[-1]
        print(f'seed = {seed}')

        for kind, target in targets.items():
            npz_fnames = sorted((folder / kind).glob('*npz'))
            print(f'\tcopy {len(npz_fnames)} to the {target} folder')
            for npz_fname in npz_fnames:
                npz_fname_new = npz_fname.name.replace('gauss', f'{seed}-')
                shutil.copy(npz_fname, target / npz_fname_new)
        print()

    return targets['fake'], targets['real']


def plot_sample_pairs(merged_fake, merged_real, image_save_folder, num_samples=5):
    """
    Plot a few random fake/real image pairs and their difference.

    Fake and real images are paired by file name, so a file missing on one
    side cannot shift the pairing. The output folder is emptied first.

    Input:
        - merged_fake (str or Path): folder with the fake npz files.
        - merged_real (str or Path): folder with the real npz files.
        - image_save_folder (str or Path): where the png files are written.
        - num_samples (int): maximum number of pairs to plot.
    """
    def load_image(fname):
        # Each npz file is read through its first stored array.
        with np.load(fname) as f:
            return f[f.files[0]]

    fake_by_name = {fname.name: fname for fname in Path(merged_fake).glob('*npz')}
    real_by_name = {fname.name: fname for fname in Path(merged_real).glob('*npz')}
    paired_names = sorted(fake_by_name.keys() & real_by_name.keys())

    image_save_folder = Path(image_save_folder)
    if image_save_folder.exists():
        shutil.rmtree(image_save_folder)
    image_save_folder.mkdir(exist_ok=True, parents=True)
    print('\033[96m' + f'Find sample images at {image_save_folder}' + '\033[0m')

    if not paired_names:
        print('No fake/real image pair found, nothing to plot')
        return

    # Sampling without replacement cannot draw more items than exist.
    num_samples = min(num_samples, len(paired_names))
    indices = np.random.choice(len(paired_names), num_samples, replace=False)

    for idx in tqdm(indices):
        name = paired_names[idx]
        image_fake = load_image(fake_by_name[name])
        image_real = load_image(real_by_name[name])

        fig, axes = plt.subplots(1, 3, figsize=(20, 5))
        axes[0].pcolormesh(image_fake)
        axes[1].pcolormesh(image_real)
        axes[2].pcolormesh(image_fake - image_real)
        fig.savefig(image_save_folder/f'sample_{idx}.png', dpi=200, bbox_inches='tight')
        # pyplot keeps every figure alive until it is closed explicitly.
        plt.close(fig)


def main():
    """
    Extract the tar files seed by seed, merge the images, optionally plot samples.

    Each seed folder looks like
        seed-<seed>/
            - fake/
            - real/
    """
    cmdargs = parse_cmdargs()
    data_path = cmdargs.data_path
    seed_path = cmdargs.seed_path
    merged_path = cmdargs.merged_path
    visualize = cmdargs.visualize

    # Explicit checks instead of `assert`, which is stripped under `python -O`.
    if merged_path is None:
        raise SystemExit('the location of the merged dataset (-m) is required')

    print('Extracting:')
    if data_path is None or not Path(data_path).exists():
        raise SystemExit(f"{data_path} does not exist")

    tarfnames = sorted(Path(data_path).glob('*.tar'))
    if not tarfnames:
        raise SystemExit(f"{data_path} does not contain any tar files")

    if not seed_path:
        seed_path = DEFAULT_SEED_PATH
        print('\033[96m' + f'Find seedwise data at {seed_path}' + '\033[0m')
    Path(seed_path).mkdir(exist_ok=True, parents=True)

    for tarfname in tarfnames:
        extract_toy_sp(tarfname, seed_path)

    merged_fake, merged_real = merge_seedwise(seed_path, merged_path)

    if visualize:
        print('Generating sample plots')
        plot_sample_pairs(merged_fake, merged_real, SAMPLE_PLOT_PATH)


if __name__ == '__main__':
    main()
def _preload_data(self):
    """Load every image pair once into a single float32 array.

    The array is backed by a `multiprocessing.Array` of `ctypes.c_float`
    and has shape `(len(self._df), 2, *image_shape)`; along axis 1 the
    entries follow the order returned by `self._load_image_pair`.
    NOTE(review): the shared allocation is presumably meant to let worker
    processes read one copy of the data -- confirm with the author.

    Returns:
        numpy.ndarray of dtype float32. For an empty dataset an ordinary
        empty array of shape `(0, 2, 0, 0)` is returned, since there is no
        sample to take the image shape from.
    """
    data_sz = len(self._df)
    if data_sz == 0:
        return np.empty((0, 2, 0, 0), dtype = np.float32)

    # The first pair fixes the image shape; keep it so that sample 0 is
    # not read a second time inside the loop.
    first_pair = self._load_image_pair(0)
    image_shape = tuple(np.shape(first_pair[0]))
    n_values = data_sz * 2 * int(np.prod(image_shape))

    shared_alloc = mp.Array(ctypes.c_float, n_values)
    shared_data = np.ctypeslib.as_array(shared_alloc.get_obj())
    shared_data = shared_data.reshape((data_sz, 2) + image_shape)

    shared_data[0, 0], shared_data[0, 1] = first_pair
    for i in range(1, data_sz):
        shared_data[i, 0], shared_data[i, 1] = self._load_image_pair(i)

    return shared_data
def __len__(self):
    """Return the number of rows in `self._df`."""
    return len(self._df)

def __getitem__(self, index):
    """Return the image pair at `index`, transformed when a transform is set.

    With `self._is_in_mem` set the pair is read from the preloaded array
    `self._shared_data`; otherwise it is loaded by
    `self._load_image_pair`.

    Returns:
        list of two images, in the order used by `_load_image_pair`.
    """
    if self._is_in_mem:
        cached = self._shared_data[index]
        # Hand out copies: plain indexing yields views, so an in-place
        # transform would otherwise modify the preloaded data for every
        # later access. The load-on-demand branch returns fresh arrays
        # as well.
        images = [cached[0].copy(), cached[1].copy()]
    else:
        images = self._load_image_pair(index)

    if self._transform is not None:
        images = [ self._transform(x) for x in images ]

    return images