Merge pull request #181 from uio-bmi/fix_dataset_import
Fix dataset import
LonnekeScheffer authored Dec 20, 2024
2 parents 717d059 + b7331b4 commit 4f66aac
Showing 13 changed files with 149 additions and 94 deletions.
12 changes: 8 additions & 4 deletions docs/source/_static/files/quickstart/galaxy/quickstart.yaml
@@ -1,9 +1,9 @@
definitions:
datasets:
my_dataset: # user-defined dataset name
-      format: Pickle
+      format: AIRR
params:
-        path: dataset.yaml # 'dataset' is the default name given by the Create dataset tool
+        dataset_file: dataset.yaml # 'dataset' is the default name given by the Create dataset tool

encodings:
my_kmer_frequency: # user-defined encoding name
@@ -14,7 +14,8 @@ definitions:
my_logistic_regression: LogisticRegression # user-defined ML model name: ML model type (no user-specified parameters)

reports:
-    my_coefficients: Coefficients # user-defined report name: report type (no user-specified parameters)
+    my_benchmark: MLSettingsPerformance # user-defined report name: report type (no user-specified parameters)
+    my_coefficients: Coefficients

instructions:
my_training_instruction: # user-defined instruction name
@@ -41,10 +42,13 @@
split_count: 1
training_percentage: 1 # use all data for training

+    reports:
+    - my_benchmark

optimization_metric: balanced_accuracy # the metric to optimize during nested cross-validation when comparing multiple models
metrics: # other metrics to compute for reference
- auc
- precision
- recall

-    number_of_processes: 4 # processes for parallelization
+    number_of_processes: 8 # processes for parallelization
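For anyone updating an existing Galaxy quickstart spec by hand, the sketch below regenerates the changed dataset block with the new format and parameter name. It is a minimal illustration, not part of the commit: the Python/PyYAML route is only one of several ways to emit this YAML, and the surrounding encodings, reports and instructions sections are assumed to stay as shown above.

import yaml  # PyYAML, assumed available

# Rebuild only the dataset block this commit changes:
# Pickle -> AIRR, and 'path' -> 'dataset_file'.
spec_fragment = {
    "definitions": {
        "datasets": {
            "my_dataset": {
                "format": "AIRR",
                "params": {
                    # 'dataset' is the default name given by the Create dataset tool
                    "dataset_file": "dataset.yaml",
                },
            }
        }
    }
}

print(yaml.safe_dump(spec_fragment, sort_keys=False))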
108 changes: 65 additions & 43 deletions immuneML/IO/dataset_import/DataImport.py
@@ -13,7 +13,7 @@
from immuneML.IO.dataset_import.DatasetImportParams import DatasetImportParams
from immuneML.data_model.AIRRSequenceSet import AIRRSequenceSet, AminoAcidXEncoding, DNANEncoding
from immuneML.data_model.SequenceSet import Repertoire, build_dynamic_airr_sequence_set_dataclass
-from immuneML.data_model.bnp_util import bnp_write_to_file, write_yaml, read_yaml
+from immuneML.data_model.bnp_util import bnp_write_to_file, write_yaml, read_yaml, write_dataset_yaml
from immuneML.data_model.datasets.Dataset import Dataset
from immuneML.data_model.datasets.ElementDataset import SequenceDataset, ReceptorDataset, ElementDataset
from immuneML.data_model.datasets.RepertoireDataset import RepertoireDataset
@@ -63,11 +63,43 @@ def import_dataset_from_yaml(self):
return dataset

def import_repertoire_dataset(self) -> RepertoireDataset:
if self.params.dataset_file is not None and self.params.dataset_file.is_file():
imported_dataset_yaml = read_yaml(self.params.dataset_file)

self.params.path = self.params.dataset_file.parent

if self.params.metadata_file is None:
self.params.metadata_file = self.params.dataset_file.parent / imported_dataset_yaml['metadata_file']

imported_identifier = imported_dataset_yaml['identifier']
imported_labels = imported_dataset_yaml['labels']

# type_dict = imported_dataset_yaml['type_dict']
else:
imported_labels = None
imported_identifier = None

self.check_or_discover_metadata_file()
metadata = self.import_repertoire_metadata(self.params.metadata_file)
repertoires = self.load_repertoires(metadata)
new_metadata_file = ImportHelper.make_new_metadata_file(repertoires, metadata, self.params.result_path, self.dataset_name)
# type_dict = self.determine_repertoire_type_dict(repertoires)
labels = self.determine_repertoire_dataset_labels(metadata, imported_labels=imported_labels)

dataset_filename = self.params.result_path / f"{self.dataset_name}.yaml"
dataset_yaml = RepertoireDataset.create_metadata_dict(metadata_file=new_metadata_file,
# type_dict=type_dict,
labels=labels,
name=self.dataset_name,
identifier=imported_identifier)
write_dataset_yaml(dataset_filename, dataset_yaml)

return RepertoireDataset(labels=labels,
repertoires=repertoires, metadata_file=new_metadata_file, name=self.dataset_name,
dataset_file=dataset_filename, identifier=dataset_yaml["identifier"])

def import_repertoire_metadata(self, metadata_file_path):
try:
-            metadata = pd.read_csv(self.params.metadata_file, sep=",")
+            metadata = pd.read_csv(metadata_file_path, sep=",")
if "identifier" in metadata.columns:
assert len(list(metadata["identifier"])) == len(set(list(metadata["identifier"]))), \
(f"DataImport: if the field 'identifier' is supplied, each repertoire must have "
@@ -80,27 +112,45 @@ def import_repertoire_dataset(self) -> RepertoireDataset:

ParameterValidator.assert_keys_present(metadata.columns.tolist(), ["filename"], self.__class__.__name__,
f'{self.dataset_name}: params: metadata_file')
return metadata

def load_repertoires(self, metadata):
PathBuilder.build(self.params.result_path / "repertoires/")

with Pool(self.params.number_of_processes) as pool:
repertoires = pool.map(self.load_repertoire_object, [row for _, row in metadata.iterrows()])

-        new_metadata_file = ImportHelper.make_new_metadata_file(repertoires, metadata, self.params.result_path,
-                                                                self.dataset_name)
return repertoires

def determine_repertoire_type_dict(self, repertoires):
try:
if all(repertoires[0].metadata['type_dict_dynamic_fields'] == rep.metadata['type_dict_dynamic_fields'] for
rep
in repertoires[1:]):
return repertoires[0].metadata['type_dict_dynamic_fields']
else:
raise RuntimeError()
except Exception as e:
logging.warning(f'{DataImport.__name__}: dynamic fields for the dataset {self.dataset_name} could not be '
f'extracted, some repertoires have different fields.')
return {}

-        potential_labels = list(set(metadata.columns.tolist()) - {"filename", 'type_dict_dynamic_fields'})
-        dataset_filename, dataset_file_content = self._make_dataset_file_for_repertoire_dataset(repertoires)
def determine_repertoire_dataset_labels(self, metadata, imported_labels=None):
potential_label_names = list(set(metadata.columns.tolist()) - {"filename", "type_dict_dynamic_fields", "identifier", "subject_id"})
potential_labels = {key: list(set(metadata[key].values.tolist())) for key in potential_label_names}

-        if 'labels' in dataset_file_content and dataset_file_content['labels']:
-            if any(label not in potential_labels for label in dataset_file_content['labels']):
-                logging.warning(f"{DataImport.__name__}: {self.dataset_name}: an error occurred when importing "
-                                f"dataset. Labels specified in the dataset file could not be found in the repertoire "
if imported_labels is not None:
labels = imported_labels
if any(label not in potential_label_names for label in imported_labels):
logging.warning(f"{DataImport.__name__}: an error occurred when importing dataset {self.dataset_name}. "
f"Labels specified in the dataset file ({imported_labels}) could not be found in the repertoire "
f"fields. Proceeding with the following labels: {potential_labels}.")
labels = potential_labels
else:
labels = potential_labels

return labels

-        return RepertoireDataset(labels={key: list(set(metadata[key].values.tolist())) for key in potential_labels},
-                                 repertoires=repertoires, metadata_file=new_metadata_file, name=self.dataset_name,
-                                 dataset_file=dataset_filename)

def import_element_dataset(self, dataset_class: Type, filter_func=None):
if self.params.dataset_file is not None and self.params.dataset_file.is_file():
@@ -136,34 +186,6 @@ def import_sequence_dataset(self) -> SequenceDataset:
def import_receptor_dataset(self) -> ReceptorDataset:
return self.import_element_dataset(ReceptorDataset, ImportHelper.filter_illegal_receptors)

-    def check_or_discover_metadata_file(self):
-        if self.params.metadata_file is None and self.params.dataset_file and self.params.dataset_file.is_file():
-            dataset_metadata = read_yaml(self.params.dataset_file)
-            if 'metadata_file' in dataset_metadata:
-                self.params.metadata_file = self.params.dataset_file.parent / dataset_metadata['metadata_file']
-
-    def _make_dataset_file_for_repertoire_dataset(self, repertoires: List[Repertoire]) -> Tuple[Path, dict]:
-        dataset_filename = self.params.result_path / f"{self.dataset_name}.yaml"
-
-        metadata = read_yaml(self.params.dataset_file) if self.params.dataset_file else {}
-
-        metadata = {**{'dataset_name': self.dataset_name, 'example_count': len(repertoires)}, **metadata}
-
-        try:
-            if all(repertoires[0].metadata['type_dict_dynamic_fields'] == rep.metadata['type_dict_dynamic_fields'] for
-                   rep
-                   in repertoires[1:]):
-                metadata['type_dict_dynamic_fields'] = repertoires[0].metadata['type_dict_dynamic_fields']
-            else:
-                raise RuntimeError()
-        except Exception as e:
-            logging.warning(f'{DataImport.__name__}: dynamic fields for the dataset {self.dataset_name} could not be '
-                            f'extracted, some repertoires have different fields.')
-
-        write_yaml(dataset_filename, metadata)
-
-        return dataset_filename, metadata

def _construct_element_dataset_data_dict(self, filenames, filter_func) -> dict:
final_df = None

@@ -195,7 +217,7 @@ def _write_element_dataset_metadata_file(self, dataset_class, filename, type_dic
type_dict=type_dict,
name=self.dataset_name,
labels=possible_labels)
-        write_yaml(dataset_filename, metadata)
+        write_dataset_yaml(dataset_filename, metadata)
return dataset_filename

def load_repertoire_object(self, metadata_row: pd.Series) -> Repertoire:
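Taken together, the changes above reorder repertoire import around the dataset .yaml file: if it exists, the metadata file, labels and identifier are read from it (with the metadata path resolved relative to the yaml), and only then are repertoires loaded and a fresh dataset yaml written out. A condensed, standalone sketch of that resolution step follows; it mirrors the logic in import_repertoire_dataset but stubs immuneML's read_yaml with PyYAML, so the function name and signature here are illustrative, not the project API.

from pathlib import Path
from typing import Optional, Tuple

import yaml  # stand-in for immuneML's read_yaml


def resolve_dataset_yaml(dataset_file: Optional[Path],
                         metadata_file: Optional[Path]) -> Tuple[Optional[Path], Optional[dict], Optional[str]]:
    # Mirrors the new import_repertoire_dataset preamble: the dataset yaml,
    # when present, supplies the metadata file name, labels and identifier.
    imported_labels, imported_identifier = None, None
    if dataset_file is not None and dataset_file.is_file():
        dataset_yaml = yaml.safe_load(dataset_file.read_text())
        if metadata_file is None:
            # metadata_file is stored by name, relative to the dataset yaml
            metadata_file = dataset_file.parent / dataset_yaml['metadata_file']
        imported_identifier = dataset_yaml['identifier']
        imported_labels = dataset_yaml['labels']
    return metadata_file, imported_labels, imported_identifier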
14 changes: 14 additions & 0 deletions immuneML/data_model/bnp_util.py
@@ -32,6 +32,20 @@ def bnp_read_from_file(filename: Path, buffer_type: bnp.io.delimited_buffers.Del
with bnp.open(str(filename), buffer_type=buffer_type) as file:
return file.read() # TODO: fix - throws error when empty file (no lines after header)

def write_dataset_yaml(filename: Path, yaml_dict):
for mandatory_field in ["identifier", "dataset_type", "name", "labels"]:
assert mandatory_field in yaml_dict.keys(), f"Error exporting {filename.stem}: missing field {mandatory_field}"

if yaml_dict["dataset_type"] == "RepertoireDataset":
assert "metadata_file" in yaml_dict.keys(), f"Error exporting {filename.stem}: missing field metadata_file"

if yaml_dict["dataset_type"] in ("SequenceDataset", "ReceptorDataset"):
assert "filename" in yaml_dict.keys(), f"Error exporting {filename.stem}: missing field filename"
assert "type_dict_dynamic_fields" in yaml_dict.keys(), f"Error exporting {filename.stem}: missing field type_dict_dynamic_fields"

    assert isinstance(yaml_dict["labels"], dict) or yaml_dict["labels"] is None, "labels format must be dict or None"

write_yaml(filename, yaml_dict)

def write_yaml(filename: Path, yaml_dict):
with filename.open('w') as file:
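Since write_dataset_yaml is now the single exit point for dataset metadata, a short usage sketch may help; the field values below are invented for illustration, but the mandatory keys match the asserts above (plus metadata_file, which is required for a RepertoireDataset):

from pathlib import Path

from immuneML.data_model.bnp_util import write_dataset_yaml

dataset_yaml = {
    'identifier': 'abc123',                    # illustrative identifier
    'dataset_type': 'RepertoireDataset',
    'name': 'my_dataset',
    'labels': {'signal_disease': [True, False]},
    'metadata_file': 'metadata.csv',           # mandatory for RepertoireDataset
}

write_dataset_yaml(Path('my_dataset.yaml'), dataset_yaml)  # validates, then writes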
12 changes: 6 additions & 6 deletions immuneML/data_model/datasets/ElementDataset.py
@@ -16,8 +16,8 @@
from immuneML.data_model.SequenceParams import RegionType
from immuneML.data_model.SequenceSet import Receptor, ReceptorSequence, AIRRSequenceSet, \
build_dynamic_airr_sequence_set_dataclass, make_receptors_from_data, make_sequences_from_data
-from immuneML.data_model.bnp_util import write_yaml, bnp_write_to_file, bnp_read_from_file, read_yaml, \
-    extend_dataclass_with_dynamic_fields
+from immuneML.data_model.bnp_util import bnp_write_to_file, bnp_read_from_file, read_yaml, \
+    extend_dataclass_with_dynamic_fields, write_dataset_yaml
from immuneML.data_model.datasets.Dataset import Dataset


@@ -49,7 +49,7 @@ def create_metadata_dict(cls, dataset_class, filename, type_dict, name, labels,
"dataset_type": dataset_class if isinstance(dataset_class, str) else dataset_class.__name__,
"filename": filename,
"name": name,
"labels": labels,
"labels": {} if labels is None else labels,
"timestamp": str(datetime.now())}

@property
@@ -113,7 +113,7 @@ def build_from_objects(cls, sequences: List[ReceptorSequence], path: Path, name:
labels=labels)

metadata_filename = path / f'{name}.yaml'
-        write_yaml(metadata_filename, dataset_metadata)
+        write_dataset_yaml(metadata_filename, dataset_metadata)

return SequenceDataset(filename=filename, name=name, labels=labels, dynamic_fields=type_dict,
dataset_file=metadata_filename, bnp_dataclass=bnp_dc,
@@ -192,7 +192,7 @@ def build_from_objects(cls, receptors: List[Receptor], path: Path, name: str = N
name=name,
labels=labels)

-        write_yaml(metadata_filename, dataset_metadata)
+        write_dataset_yaml(metadata_filename, dataset_metadata)

return ReceptorDataset(filename=filename, name=name, labels=labels, dynamic_fields=type_dict,
dataset_file=metadata_filename, bnp_dataclass=bnp_dc,
@@ -220,7 +220,7 @@ def make_subset(self, example_indices, path, dataset_type: str):

metadata_filename = path / f'{name}.yaml'
metadata = read_yaml(self.dataset_file)
-        write_yaml(metadata_filename, {
+        write_dataset_yaml(metadata_filename, {
**metadata, **{'filename': f"{name}.tsv", 'name': name}
})

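The make_subset change above keeps the parent dataset's yaml as the template and overrides only what differs for the subset. A minimal sketch of that dict-merging pattern, with invented values standing in for a real dataset file:

# Parent metadata as read_yaml would return it (values invented here).
parent_metadata = {'dataset_type': 'SequenceDataset', 'filename': 'full.tsv',
                   'name': 'full', 'labels': {}, 'identifier': 'abc123',
                   'type_dict_dynamic_fields': {}}

name = 'full_subset'
# Later keys win in a dict merge, so filename and name are replaced
# while labels, identifier and type information carry over.
subset_metadata = {**parent_metadata, **{'filename': f'{name}.tsv', 'name': name}}
assert subset_metadata['filename'] == 'full_subset.tsv'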
22 changes: 11 additions & 11 deletions immuneML/data_model/datasets/RepertoireDataset.py
@@ -10,7 +10,7 @@
from immuneML import Constants
from immuneML.data_model.EncodedData import EncodedData
from immuneML.data_model.SequenceSet import Repertoire
-from immuneML.data_model.bnp_util import write_yaml
+from immuneML.data_model.bnp_util import write_yaml, write_dataset_yaml
from immuneML.data_model.datasets.Dataset import Dataset
from immuneML.util.ParameterValidator import ParameterValidator
from immuneML.util.PathBuilder import PathBuilder
@@ -44,15 +44,15 @@ def build_from_objects(cls, **kwargs):
dataset.labels = {label: list(set(values)) for label, values in dataset.get_metadata(label_names).items()}

dataset_file = PathBuilder.build(kwargs['path']) / 'dataset.yaml'
-        dataset_meta_content = cls.create_metadata_dict(type_dict={k: v for tmp_dict in [rep.metadata['type_dict_dynamic_fields']
-                                                                   for rep in kwargs['repertoires']]
-                                                                   for k, v in tmp_dict.items()},
-                                                        labels=dataset.labels,
+        dataset_meta_content = cls.create_metadata_dict(labels=dataset.labels,
                                                         identifier=dataset.identifier,
                                                         name=dataset.name,
                                                         metadata_file=str(metadata_path.name))
+        # type_dict={k: v for tmp_dict in [rep.metadata['type_dict_dynamic_fields']
+        #            for rep in kwargs['repertoires']]
+        #            for k, v in tmp_dict.items()},

-        write_yaml(dataset_file, dataset_meta_content)
+        write_dataset_yaml(dataset_file, dataset_meta_content)
dataset.dataset_file = dataset_file

return dataset
@@ -82,12 +82,12 @@ def build(cls, **kwargs):
return RepertoireDataset(**{**kwargs, **{"repertoires": repertoires}})

@classmethod
-    def create_metadata_dict(cls, metadata_file, type_dict, labels, identifier, name):
-        return {"metadata_file": metadata_file,
-                "type_dict_dynamic_fields": type_dict,
-                "labels": labels,
-                'identifier': identifier,
+    def create_metadata_dict(cls, metadata_file, labels, name, identifier=None):
+        return {"metadata_file": Path(metadata_file).name,
+                # "type_dict_dynamic_fields": type_dict,
+                "labels": {} if labels is None else labels,
                 "name": name,
+                "identifier": identifier if identifier is not None else uuid4().hex,
"dataset_type": cls.__name__,
"timestamp": datetime.now()}

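With the reworked signature above, callers no longer pass a type_dict, the metadata file is stored by name only, and an identifier is generated when omitted. A hedged sketch of a call under the new signature (the label values are invented):

from immuneML.data_model.datasets.RepertoireDataset import RepertoireDataset

meta = RepertoireDataset.create_metadata_dict(
    metadata_file='results/metadata.csv',  # stored as just 'metadata.csv'
    labels={'signal_disease': [True, False]},
    name='my_dataset',
)
# identifier was omitted, so a fresh uuid4().hex is filled in
assert len(meta['identifier']) == 32 and meta['metadata_file'] == 'metadata.csv'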
2 changes: 1 addition & 1 deletion immuneML/environment/Constants.py
@@ -1,6 +1,6 @@
class Constants:

VERSION = "3.0.5"
VERSION = "3.0.6"

# encoding constants
FEATURE_DELIMITER = "-"
20 changes: 12 additions & 8 deletions immuneML/ml_methods/generative_models/PWM.py
@@ -2,12 +2,14 @@
from datetime import datetime
from pathlib import Path
from typing import List
+from uuid import uuid4

import numpy as np
import pandas as pd

from immuneML.data_model.AIRRSequenceSet import AIRRSequenceSet
-from immuneML.data_model.bnp_util import write_yaml, read_yaml, get_sequence_field_name, make_full_airr_seq_set_df
+from immuneML.data_model.bnp_util import write_yaml, read_yaml, get_sequence_field_name, make_full_airr_seq_set_df, \
+    write_dataset_yaml
from immuneML.data_model.datasets.Dataset import Dataset
from immuneML.data_model.datasets.ElementDataset import SequenceDataset
from immuneML.data_model.SequenceParams import RegionType
@@ -145,15 +147,17 @@ def _export_gen_dataset(self, sequences: List[str], path: Path) -> SequenceDatas
'gen_model_name': [self.name for _ in range(count)]})

df = make_full_airr_seq_set_df(df)
+        filename = str(PathBuilder.build(path) / 'synthetic_dataset.tsv')

-        df.to_csv(str(PathBuilder.build(path) / 'synthetic_dataset.tsv'), sep='\t', index=False)
+        df.to_csv(filename, sep='\t', index=False)

-        write_yaml(path / 'synthetic_metadata.yaml', {
-            'dataset_type': 'SequenceDataset',
-            'type_dict_dynamic_fields': {'gen_model_name': 'str'},
-            'name': 'synthetic_dataset', 'labels': {'gen_model_name': [self.name]},
-            'timestamp': str(datetime.now())
-        })
+        dataset_yaml = SequenceDataset.create_metadata_dict(SequenceDataset,
+                                                            filename=filename,
+                                                            type_dict={'gen_model_name': str},
+                                                            name="synthetic_dataset",
+                                                            labels={'gen_model_name': [self.name]})
+
+        write_dataset_yaml(path / 'synthetic_metadata.yaml', dataset_yaml)

return SequenceDataset.build(path / 'synthetic_dataset.tsv', path / 'synthetic_metadata.yaml',
'synthetic_dataset')
2 changes: 1 addition & 1 deletion immuneML/ml_methods/generative_models/SimpleLSTM.py
@@ -251,7 +251,7 @@ def save_model(self, path: Path) -> Path:
write_yaml(filename=model_path / 'model_overview.yaml',
yaml_dict={**{k: v for k, v in vars(self).items() if k not in skip_keys_for_export},
**{'type': self.__class__.__name__, 'region_type': self.region_type.name,
-                          'sequence_type': self.sequence_type.name, 'locus': self.locus.name}}) # todo add 'dataset_type': 'SequenceDataset',
+                          'sequence_type': self.sequence_type.name, 'locus': self.locus.name}})

store_weights(self._model, model_path / 'state_dict.yaml')
