diff --git a/docs/source/_static/files/quickstart/galaxy/quickstart.yaml b/docs/source/_static/files/quickstart/galaxy/quickstart.yaml
index 70134b18c..ace30fa9f 100644
--- a/docs/source/_static/files/quickstart/galaxy/quickstart.yaml
+++ b/docs/source/_static/files/quickstart/galaxy/quickstart.yaml
@@ -1,9 +1,9 @@
 definitions:
     datasets:
         my_dataset: # user-defined dataset name
-            format: Pickle
+            format: AIRR
             params:
-                path: dataset.yaml # 'dataset' is the default name given by the Create dataset tool
+                dataset_file: dataset.yaml # 'dataset' is the default name given by the Create dataset tool
 
     encodings:
         my_kmer_frequency: # user-defined encoding name
@@ -14,7 +14,8 @@ definitions:
         my_logistic_regression: LogisticRegression # user-defined ML model name: ML model type (no user-specified parameters)
 
     reports:
-        my_coefficients: Coefficients # user-defined report name: report type (no user-specified parameters)
+        my_benchmark: MLSettingsPerformance # user-defined report name: report type (no user-specified parameters)
+        my_coefficients: Coefficients
 
 instructions:
     my_training_instruction: # user-defined instruction name
@@ -41,10 +42,13 @@ instructions:
             split_count: 1
             training_percentage: 1 # use all data for training
 
+            reports:
+                - my_benchmark
+
         optimization_metric: balanced_accuracy # the metric to optimize during nested cross-validation when comparing multiple models
         metrics: # other metrics to compute for reference
             - auc
             - precision
             - recall
 
-        number_of_processes: 4 # processes for parallelization
\ No newline at end of file
+        number_of_processes: 8 # processes for parallelization
\ No newline at end of file
diff --git a/immuneML/IO/dataset_import/DataImport.py b/immuneML/IO/dataset_import/DataImport.py
index ac50bf7df..2af1ca50a 100644
--- a/immuneML/IO/dataset_import/DataImport.py
+++ b/immuneML/IO/dataset_import/DataImport.py
@@ -13,7 +13,7 @@ from immuneML.IO.dataset_import.DatasetImportParams import DatasetImportParams
 from immuneML.data_model.AIRRSequenceSet import AIRRSequenceSet, AminoAcidXEncoding, DNANEncoding
 from immuneML.data_model.SequenceSet import Repertoire, build_dynamic_airr_sequence_set_dataclass
-from immuneML.data_model.bnp_util import bnp_write_to_file, write_yaml, read_yaml
+from immuneML.data_model.bnp_util import bnp_write_to_file, write_yaml, read_yaml, write_dataset_yaml
 from immuneML.data_model.datasets.Dataset import Dataset
 from immuneML.data_model.datasets.ElementDataset import SequenceDataset, ReceptorDataset, ElementDataset
 from immuneML.data_model.datasets.RepertoireDataset import RepertoireDataset
@@ -63,11 +63,43 @@ def import_dataset_from_yaml(self):
         return dataset
 
     def import_repertoire_dataset(self) -> RepertoireDataset:
+        if self.params.dataset_file is not None and self.params.dataset_file.is_file():
+            imported_dataset_yaml = read_yaml(self.params.dataset_file)
+
+            self.params.path = self.params.dataset_file.parent
+
+            if self.params.metadata_file is None:
+                self.params.metadata_file = self.params.dataset_file.parent / imported_dataset_yaml['metadata_file']
+
+            imported_identifier = imported_dataset_yaml['identifier']
+            imported_labels = imported_dataset_yaml['labels']
+
+            # type_dict = imported_dataset_yaml['type_dict']
+        else:
+            imported_labels = None
+            imported_identifier = None
 
-        self.check_or_discover_metadata_file()
+        metadata = self.import_repertoire_metadata(self.params.metadata_file)
+        repertoires = self.load_repertoires(metadata)
+        new_metadata_file = ImportHelper.make_new_metadata_file(repertoires, metadata, self.params.result_path,
+                                                                self.dataset_name)
+        # type_dict = self.determine_repertoire_type_dict(repertoires)
+        labels = self.determine_repertoire_dataset_labels(metadata, imported_labels=imported_labels)
+        dataset_filename = self.params.result_path / f"{self.dataset_name}.yaml"
+        dataset_yaml = RepertoireDataset.create_metadata_dict(metadata_file=new_metadata_file,
+                                                              # type_dict=type_dict,
+                                                              labels=labels,
+                                                              name=self.dataset_name,
+                                                              identifier=imported_identifier)
+        write_dataset_yaml(dataset_filename, dataset_yaml)
+
+        return RepertoireDataset(labels=labels,
+                                 repertoires=repertoires, metadata_file=new_metadata_file, name=self.dataset_name,
+                                 dataset_file=dataset_filename, identifier=dataset_yaml["identifier"])
+
+    def import_repertoire_metadata(self, metadata_file_path):
 
         try:
-            metadata = pd.read_csv(self.params.metadata_file, sep=",")
+            metadata = pd.read_csv(metadata_file_path, sep=",")
             if "identifier" in metadata.columns:
                 assert len(list(metadata["identifier"])) == len(set(list(metadata["identifier"]))), \
                     (f"DataImport: if the field 'identifier' is supplied, each repertoire must have "
@@ -80,27 +112,45 @@ def import_repertoire_dataset(self) -> RepertoireDataset:
 
             ParameterValidator.assert_keys_present(metadata.columns.tolist(), ["filename"], self.__class__.__name__,
                                                    f'{self.dataset_name}: params: metadata_file')
+        return metadata
 
+    def load_repertoires(self, metadata):
         PathBuilder.build(self.params.result_path / "repertoires/")
 
         with Pool(self.params.number_of_processes) as pool:
             repertoires = pool.map(self.load_repertoire_object, [row for _, row in metadata.iterrows()])
 
-        new_metadata_file = ImportHelper.make_new_metadata_file(repertoires, metadata, self.params.result_path,
-                                                                self.dataset_name)
+        return repertoires
+
+    def determine_repertoire_type_dict(self, repertoires):
+        try:
+            if all(repertoires[0].metadata['type_dict_dynamic_fields'] == rep.metadata['type_dict_dynamic_fields'] for
+                   rep
+                   in repertoires[1:]):
+                return repertoires[0].metadata['type_dict_dynamic_fields']
+            else:
+                raise RuntimeError()
+        except Exception as e:
+            logging.warning(f'{DataImport.__name__}: dynamic fields for the dataset {self.dataset_name} could not be '
+                            f'extracted, some repertoires have different fields.')
+            return {}
 
-        potential_labels = list(set(metadata.columns.tolist()) - {"filename", 'type_dict_dynamic_fields'})
-        dataset_filename, dataset_file_content = self._make_dataset_file_for_repertoire_dataset(repertoires)
+    def determine_repertoire_dataset_labels(self, metadata, imported_labels=None):
+        potential_label_names = list(set(metadata.columns.tolist()) - {"filename", "type_dict_dynamic_fields", "identifier", "subject_id"})
+        potential_labels = {key: list(set(metadata[key].values.tolist())) for key in potential_label_names}
 
-        if 'labels' in dataset_file_content and dataset_file_content['labels']:
-            if any(label not in potential_labels for label in dataset_file_content['labels']):
-                logging.warning(f"{DataImport.__name__}: {self.dataset_name}: an error occurred when importing "
-                                f"dataset. Labels specified in the dataset file could not be found in the repertoire "
+        if imported_labels is not None:
+            labels = imported_labels
+            if any(label not in potential_label_names for label in imported_labels):
+                logging.warning(f"{DataImport.__name__}: an error occurred when importing dataset {self.dataset_name}. "
+                                f"Labels specified in the dataset file ({imported_labels}) could not be found in the repertoire "
                                 f"fields. Proceeding with the following labels: {potential_labels}.")
+                labels = potential_labels
+        else:
+            labels = potential_labels
+
+        return labels
 
-        return RepertoireDataset(labels={key: list(set(metadata[key].values.tolist())) for key in potential_labels},
-                                 repertoires=repertoires, metadata_file=new_metadata_file, name=self.dataset_name,
-                                 dataset_file=dataset_filename)
 
 
     def import_element_dataset(self, dataset_class: Type, filter_func=None):
         if self.params.dataset_file is not None and self.params.dataset_file.is_file():
@@ -136,34 +186,6 @@ def import_sequence_dataset(self) -> SequenceDataset:
     def import_receptor_dataset(self) -> ReceptorDataset:
         return self.import_element_dataset(ReceptorDataset, ImportHelper.filter_illegal_receptors)
 
-    def check_or_discover_metadata_file(self):
-        if self.params.metadata_file is None and self.params.dataset_file and self.params.dataset_file.is_file():
-            dataset_metadata = read_yaml(self.params.dataset_file)
-            if 'metadata_file' in dataset_metadata:
-                self.params.metadata_file = self.params.dataset_file.parent / dataset_metadata['metadata_file']
-
-    def _make_dataset_file_for_repertoire_dataset(self, repertoires: List[Repertoire]) -> Tuple[Path, dict]:
-        dataset_filename = self.params.result_path / f"{self.dataset_name}.yaml"
-
-        metadata = read_yaml(self.params.dataset_file) if self.params.dataset_file else {}
-
-        metadata = {**{'dataset_name': self.dataset_name, 'example_count': len(repertoires)}, **metadata}
-
-        try:
-            if all(repertoires[0].metadata['type_dict_dynamic_fields'] == rep.metadata['type_dict_dynamic_fields'] for
-                   rep
-                   in repertoires[1:]):
-                metadata['type_dict_dynamic_fields'] = repertoires[0].metadata['type_dict_dynamic_fields']
-            else:
-                raise RuntimeError()
-        except Exception as e:
-            logging.warning(f'{DataImport.__name__}: dynamic fields for the dataset {self.dataset_name} could not be '
-                            f'extracted, some repertoires have different fields.')
-
-        write_yaml(dataset_filename, metadata)
-
-        return dataset_filename, metadata
-
     def _construct_element_dataset_data_dict(self, filenames, filter_func) -> dict:
         final_df = None
 
@@ -195,7 +217,7 @@ def _write_element_dataset_metadata_file(self, dataset_class, filename, type_dic
                                                       type_dict=type_dict,
                                                       name=self.dataset_name,
                                                       labels=possible_labels)
-        write_yaml(dataset_filename, metadata)
+        write_dataset_yaml(dataset_filename, metadata)
         return dataset_filename
 
     def load_repertoire_object(self, metadata_row: pd.Series) -> Repertoire:
diff --git a/immuneML/data_model/bnp_util.py b/immuneML/data_model/bnp_util.py
index 106ff554b..eed5b59d5 100644
--- a/immuneML/data_model/bnp_util.py
+++ b/immuneML/data_model/bnp_util.py
@@ -32,6 +32,20 @@ def bnp_read_from_file(filename: Path, buffer_type: bnp.io.delimited_buffers.Del
     with bnp.open(str(filename), buffer_type=buffer_type) as file:
         return file.read()  # TODO: fix - throws error when empty file (no lines after header)
 
+def write_dataset_yaml(filename: Path, yaml_dict):
+    for mandatory_field in ["identifier", "dataset_type", "name", "labels"]:
+        assert mandatory_field in yaml_dict.keys(), f"Error exporting {filename.stem}: missing field {mandatory_field}"
+
+    if yaml_dict["dataset_type"] == "RepertoireDataset":
+        assert "metadata_file" in yaml_dict.keys(), f"Error exporting {filename.stem}: missing field metadata_file"
+
+    if yaml_dict["dataset_type"] in ("SequenceDataset", "ReceptorDataset"):
+        assert "filename" in yaml_dict.keys(), f"Error exporting {filename.stem}: missing field filename"
+        assert "type_dict_dynamic_fields" in yaml_dict.keys(), f"Error exporting {filename.stem}: missing field type_dict_dynamic_fields"
+
+    assert isinstance(yaml_dict["labels"], dict) or yaml_dict["labels"] is None, "labels format must be dict or None"
+
+    write_yaml(filename, yaml_dict)
 
 def write_yaml(filename: Path, yaml_dict):
     with filename.open('w') as file:
diff --git a/immuneML/data_model/datasets/ElementDataset.py b/immuneML/data_model/datasets/ElementDataset.py
index 26a8cee7c..649cbfc70 100644
--- a/immuneML/data_model/datasets/ElementDataset.py
+++ b/immuneML/data_model/datasets/ElementDataset.py
@@ -16,8 +16,8 @@ from immuneML.data_model.SequenceParams import RegionType
 from immuneML.data_model.SequenceSet import Receptor, ReceptorSequence, AIRRSequenceSet, \
     build_dynamic_airr_sequence_set_dataclass, make_receptors_from_data, make_sequences_from_data
-from immuneML.data_model.bnp_util import write_yaml, bnp_write_to_file, bnp_read_from_file, read_yaml, \
-    extend_dataclass_with_dynamic_fields
+from immuneML.data_model.bnp_util import bnp_write_to_file, bnp_read_from_file, read_yaml, \
+    extend_dataclass_with_dynamic_fields, write_dataset_yaml
 from immuneML.data_model.datasets.Dataset import Dataset
 
 
@@ -49,7 +49,7 @@ def create_metadata_dict(cls, dataset_class, filename, type_dict, name, labels,
                 "dataset_type": dataset_class if isinstance(dataset_class, str) else dataset_class.__name__,
                 "filename": filename,
                 "name": name,
-                "labels": labels,
+                "labels": {} if labels is None else labels,
                 "timestamp": str(datetime.now())}
 
     @property
@@ -113,7 +113,7 @@ def build_from_objects(cls, sequences: List[ReceptorSequence], path: Path, name:
                                                            labels=labels)
 
         metadata_filename = path / f'{name}.yaml'
-        write_yaml(metadata_filename, dataset_metadata)
+        write_dataset_yaml(metadata_filename, dataset_metadata)
 
         return SequenceDataset(filename=filename, name=name, labels=labels, dynamic_fields=type_dict,
                                dataset_file=metadata_filename, bnp_dataclass=bnp_dc,
@@ -192,7 +192,7 @@ def build_from_objects(cls, receptors: List[Receptor], path: Path, name: str = N
                                                            name=name,
                                                            labels=labels)
 
-        write_yaml(metadata_filename, dataset_metadata)
+        write_dataset_yaml(metadata_filename, dataset_metadata)
 
         return ReceptorDataset(filename=filename, name=name, labels=labels, dynamic_fields=type_dict,
                                dataset_file=metadata_filename, bnp_dataclass=bnp_dc,
@@ -220,7 +220,7 @@ def make_subset(self, example_indices, path, dataset_type: str):
         metadata_filename = path / f'{name}.yaml'
 
         metadata = read_yaml(self.dataset_file)
-        write_yaml(metadata_filename, {
+        write_dataset_yaml(metadata_filename, {
             **metadata, **{'filename': f"{name}.tsv", 'name': name}
         })
 
diff --git a/immuneML/data_model/datasets/RepertoireDataset.py b/immuneML/data_model/datasets/RepertoireDataset.py
index 6e945b8b3..93fdd210d 100644
--- a/immuneML/data_model/datasets/RepertoireDataset.py
+++ b/immuneML/data_model/datasets/RepertoireDataset.py
@@ -10,7 +10,7 @@ from immuneML import Constants
 from immuneML.data_model.EncodedData import EncodedData
 from immuneML.data_model.SequenceSet import Repertoire
-from immuneML.data_model.bnp_util import write_yaml
+from immuneML.data_model.bnp_util import write_yaml, write_dataset_yaml
 from immuneML.data_model.datasets.Dataset import Dataset
 from immuneML.util.ParameterValidator import ParameterValidator
 from immuneML.util.PathBuilder import PathBuilder
 
@@ -44,15 +44,15 @@ def build_from_objects(cls, **kwargs):
             dataset.labels = {label: list(set(values)) for label, values in dataset.get_metadata(label_names).items()}
 
         dataset_file = PathBuilder.build(kwargs['path']) / 'dataset.yaml'
-        dataset_meta_content = cls.create_metadata_dict(type_dict={k: v for tmp_dict in [rep.metadata['type_dict_dynamic_fields']
-                                                                   for rep in kwargs['repertoires']]
-                                                                   for k, v in tmp_dict.items()},
-                                                        labels=dataset.labels,
+        dataset_meta_content = cls.create_metadata_dict(labels=dataset.labels,
                                                         identifier=dataset.identifier,
                                                         name=dataset.name,
                                                         metadata_file=str(metadata_path.name))
+        #type_dict={k: v for tmp_dict in [rep.metadata['type_dict_dynamic_fields']
+        #           for rep in kwargs['repertoires']]
+        #          for k, v in tmp_dict.items()},
 
-        write_yaml(dataset_file, dataset_meta_content)
+        write_dataset_yaml(dataset_file, dataset_meta_content)
         dataset.dataset_file = dataset_file
 
         return dataset
@@ -82,12 +82,12 @@ def build(cls, **kwargs):
         return RepertoireDataset(**{**kwargs, **{"repertoires": repertoires}})
 
     @classmethod
-    def create_metadata_dict(cls, metadata_file, type_dict, labels, identifier, name):
-        return {"metadata_file": metadata_file,
-                "type_dict_dynamic_fields": type_dict,
-                "labels": labels,
-                'identifier': identifier,
+    def create_metadata_dict(cls, metadata_file, labels, name, identifier=None):
+        return {"metadata_file": Path(metadata_file).name,
+                # "type_dict_dynamic_fields": type_dict,
+                "labels": {} if labels is None else labels,
                 "name": name,
+                "identifier": identifier if identifier is not None else uuid4().hex,
                 "dataset_type": cls.__name__,
                 "timestamp": datetime.now()}
 
diff --git a/immuneML/environment/Constants.py b/immuneML/environment/Constants.py
index 361bef25a..c0e11086d 100644
--- a/immuneML/environment/Constants.py
+++ b/immuneML/environment/Constants.py
@@ -1,6 +1,6 @@
 class Constants:
 
-    VERSION = "3.0.5"
+    VERSION = "3.0.6"
 
     # encoding constants
     FEATURE_DELIMITER = "-"
diff --git a/immuneML/ml_methods/generative_models/PWM.py b/immuneML/ml_methods/generative_models/PWM.py
index ec8860af9..c2a33c7f8 100644
--- a/immuneML/ml_methods/generative_models/PWM.py
+++ b/immuneML/ml_methods/generative_models/PWM.py
@@ -2,12 +2,14 @@
 from datetime import datetime
 from pathlib import Path
 from typing import List
+from uuid import uuid4
 
 import numpy as np
 import pandas as pd
 
 from immuneML.data_model.AIRRSequenceSet import AIRRSequenceSet
-from immuneML.data_model.bnp_util import write_yaml, read_yaml, get_sequence_field_name, make_full_airr_seq_set_df
+from immuneML.data_model.bnp_util import write_yaml, read_yaml, get_sequence_field_name, make_full_airr_seq_set_df, \
+    write_dataset_yaml
 from immuneML.data_model.datasets.Dataset import Dataset
 from immuneML.data_model.datasets.ElementDataset import SequenceDataset
 from immuneML.data_model.SequenceParams import RegionType
@@ -145,15 +147,17 @@ def _export_gen_dataset(self, sequences: List[str], path: Path) -> SequenceDatas
                            'gen_model_name': [self.name for _ in range(count)]})
 
         df = make_full_airr_seq_set_df(df)
+        filename = str(PathBuilder.build(path) / 'synthetic_dataset.tsv')
 
-        df.to_csv(str(PathBuilder.build(path) / 'synthetic_dataset.tsv'), sep='\t', index=False)
+        df.to_csv(filename, sep='\t', index=False)
 
-        write_yaml(path / 'synthetic_metadata.yaml', {
-            'dataset_type': 'SequenceDataset',
-            'type_dict_dynamic_fields': {'gen_model_name': 'str'},
-            'name': 'synthetic_dataset', 'labels': {'gen_model_name': [self.name]},
-            'timestamp': str(datetime.now())
-        })
+        dataset_yaml = SequenceDataset.create_metadata_dict(SequenceDataset,
+                                                            filename=filename,
+                                                            type_dict={'gen_model_name': str},
+                                                            name="synthetic_dataset",
+                                                            labels={'gen_model_name': [self.name]})
+
+        write_dataset_yaml(path / 'synthetic_metadata.yaml', dataset_yaml)
 
         return SequenceDataset.build(path / 'synthetic_dataset.tsv', path / 'synthetic_metadata.yaml',
                                      'synthetic_dataset')
diff --git a/immuneML/ml_methods/generative_models/SimpleLSTM.py b/immuneML/ml_methods/generative_models/SimpleLSTM.py
index 45ea88436..a7ae97437 100644
--- a/immuneML/ml_methods/generative_models/SimpleLSTM.py
+++ b/immuneML/ml_methods/generative_models/SimpleLSTM.py
@@ -251,7 +251,7 @@ def save_model(self, path: Path) -> Path:
         write_yaml(filename=model_path / 'model_overview.yaml',
                    yaml_dict={**{k: v for k, v in vars(self).items() if k not in skip_keys_for_export},
                               **{'type': self.__class__.__name__, 'region_type': self.region_type.name,
-                                 'sequence_type': self.sequence_type.name, 'locus': self.locus.name}})  # todo add 'dataset_type': 'SequenceDataset',
+                                 'sequence_type': self.sequence_type.name, 'locus': self.locus.name}})
 
         store_weights(self._model, model_path / 'state_dict.yaml')
 
diff --git a/immuneML/util/RepertoireBuilder.py b/immuneML/util/RepertoireBuilder.py
index 5cf78b76f..fe03db5d5 100644
--- a/immuneML/util/RepertoireBuilder.py
+++ b/immuneML/util/RepertoireBuilder.py
@@ -8,7 +8,8 @@
 from immuneML.data_model.AIRRSequenceSet import AIRRSequenceSet
 from immuneML.data_model.SequenceSet import ReceptorSequence, Repertoire
-from immuneML.data_model.bnp_util import write_yaml, build_dynamic_bnp_dataclass_obj, make_full_airr_seq_set_df
+from immuneML.data_model.bnp_util import build_dynamic_bnp_dataclass_obj, make_full_airr_seq_set_df, \
+    write_dataset_yaml, write_yaml
 from immuneML.data_model.datasets.RepertoireDataset import RepertoireDataset
 from immuneML.util.PathBuilder import PathBuilder
 
 
@@ -94,19 +95,18 @@ def build_dataset(sequences: list, path: Path, labels: dict = None, seq_metadata
                       subject_ids: list = None, name: str = "d1"):
         reps, metadata_file = RepertoireBuilder.build(sequences, path, labels, seq_metadata, subject_ids, name)
 
-        type_dict = {k: v for tmp_dict in [rep.metadata['type_dict_dynamic_fields'] for rep in reps]
-                     for k, v in tmp_dict.items()}
+        # type_dict = {k: v for tmp_dict in [rep.metadata['type_dict_dynamic_fields'] for rep in reps]
+        #              for k, v in tmp_dict.items()}
 
         labels_unique = {k: list(set(v)) for k, v in labels.items()} if isinstance(labels, dict) else {}
         identifier = uuid.uuid4().hex
 
-        metadata_yaml = RepertoireDataset.create_metadata_dict(type_dict=type_dict,
-                                                               labels=labels_unique,
+        metadata_yaml = RepertoireDataset.create_metadata_dict(labels=labels_unique,
                                                                identifier=identifier,
                                                                metadata_file=str(metadata_file.name),
                                                                name=name)
 
-        write_yaml(path / f'{name}.yaml', metadata_yaml)
+        write_dataset_yaml(path / f'{name}.yaml', metadata_yaml)
 
         return RepertoireDataset(repertoires=reps, metadata_file=metadata_file, name=name, labels=labels_unique,
                                  dataset_file=path / f'{name}.yaml', identifier=identifier)
diff --git a/immuneML/workflows/instructions/ligo_simulation/LigoSimInstruction.py b/immuneML/workflows/instructions/ligo_simulation/LigoSimInstruction.py
index e55457d27..cf28d470d 100644
--- a/immuneML/workflows/instructions/ligo_simulation/LigoSimInstruction.py
+++ b/immuneML/workflows/instructions/ligo_simulation/LigoSimInstruction.py
@@ -19,9 +19,8 @@
 from immuneML.IO.dataset_export.AIRRExporter import AIRRExporter
 from immuneML.app.LigoApp import SimError
-from immuneML.data_model.AIRRSequenceSet import AIRRSequenceSet
 from immuneML.data_model.SequenceSet import Repertoire
-from immuneML.data_model.bnp_util import bnp_write_to_file, write_yaml
+from immuneML.data_model.bnp_util import bnp_write_to_file, write_dataset_yaml
 from immuneML.data_model.datasets.ElementDataset import ReceptorDataset
 from immuneML.data_model.datasets.ElementDataset import SequenceDataset
 from immuneML.data_model.datasets.RepertoireDataset import RepertoireDataset
@@ -189,16 +188,21 @@ def _simulate_receptor_dataset(self, labels: dict):
 
         type_dict = {k: v for k, v, default_value in self.custom_fields}
 
-        write_yaml(metadata_filename, {
-            'labels': labels,
-            'type_dict_dynamic_fields': {key: AIRRSequenceSet.TYPE_TO_STR[val] for key, val in type_dict.items()},
-            'filename': data_filename
-        })
+        dataset_name = 'simulated_dataset'
 
         dataset_cls = ReceptorDataset if self.state.simulation.paired else SequenceDataset
 
-        self.state.resulting_dataset = dataset_cls.build(data_filename, metadata_filename=metadata_filename,
-                                                         name='simulated_dataset', labels=labels)
+        metadata_yaml = dataset_cls.create_metadata_dict(dataset_class=dataset_cls,
+                                                         filename=data_filename,
+                                                         type_dict=type_dict,
+                                                         name=dataset_name,
+                                                         labels=labels)  # todo: identifier not explicitly passed on to dataset
+
+        write_dataset_yaml(metadata_filename, metadata_yaml)
+
+        self.state.resulting_dataset = dataset_cls.build(data_filename,
+                                                         metadata_filename=metadata_filename,
+                                                         name=dataset_name, labels=labels)
 
     def _parse_example_output(self, result) -> dict:
         if self.state.simulation.is_repertoire:
diff --git a/immuneML/workflows/instructions/train_gen_model/TrainGenModelInstruction.py b/immuneML/workflows/instructions/train_gen_model/TrainGenModelInstruction.py
index 4bafb490f..75fdf6247 100644
--- a/immuneML/workflows/instructions/train_gen_model/TrainGenModelInstruction.py
+++ b/immuneML/workflows/instructions/train_gen_model/TrainGenModelInstruction.py
@@ -3,14 +3,11 @@
 from dataclasses import field, dataclass
 from pathlib import Path
 from typing import Dict, List
-from uuid import uuid4
 
 import numpy as np
 
 from immuneML.IO.dataset_export.AIRRExporter import AIRRExporter
-from immuneML.data_model.AIRRSequenceSet import AIRRSequenceSet
-from immuneML.data_model.bnp_util import merge_dataclass_objects, bnp_write_to_file, get_type_dict_from_bnp_object, \
-    write_yaml
+from immuneML.data_model.bnp_util import merge_dataclass_objects, bnp_write_to_file, write_dataset_yaml
 from immuneML.data_model.datasets.Dataset import Dataset
 from immuneML.data_model.datasets.ElementDataset import SequenceDataset
 from immuneML.hyperparameter_optimization.config.SplitType import SplitType
@@ -164,7 +161,7 @@ def _make_combined_dataset(self):
                                                              name=f'combined_{self.state.name}_dataset',
                                                              labels={'gen_model_name': [self.method.name, ''],
                                                                      "from_gen_model": [True, False]})
-        write_yaml(path / f'combined_{self.state.name}_dataset.yaml', metadata_yaml)
+        write_dataset_yaml(path / f'combined_{self.state.name}_dataset.yaml', metadata_yaml)
 
         self.state.combined_dataset = SequenceDataset.build(
             metadata_filename=path / f'combined_{self.state.name}_dataset.yaml',
diff --git a/test/IO/dataset_import/test_AIRRImport.py b/test/IO/dataset_import/test_AIRRImport.py
index d27c214f3..31f5231de 100644
--- a/test/IO/dataset_import/test_AIRRImport.py
+++ b/test/IO/dataset_import/test_AIRRImport.py
@@ -141,6 +141,15 @@ def test_import_exported_dataset(self):
             self.assertListEqual([getattr(sequence, attribute) for sequence in dataset1.repertoires[0].sequences()],
                                  [getattr(sequence, attribute) for sequence in dataset2.repertoires[0].sequences()])
 
+        d3_params = DefaultParamsLoader.load(EnvironmentSettings.default_params_path / "datasets/", "AIRR")
+        d3_params["dataset_file"] = path / "imported" / "airr_repertoire_dataset1.yaml"
+        d3_params["result_path"] = path / "imported_from_dataset_file"
+        dataset3 = AIRRImport(d3_params, "airr_repertoire_dataset3").import_dataset()
+
+        for attribute in ["sequence_aa", "sequence", "v_call", "j_call", "locus", "metadata", "vj_in_frame"]:
+            self.assertListEqual([getattr(sequence, attribute) for sequence in dataset1.repertoires[0].sequences()],
+                                 [getattr(sequence, attribute) for sequence in dataset3.repertoires[0].sequences()])
+
         shutil.rmtree(path)
 
     def test_minimal_dataset(self):
diff --git a/test/ml_methods/test_simple_lstm.py b/test/ml_methods/test_simple_lstm.py
index 035a7ddde..45b13c003 100644
--- a/test/ml_methods/test_simple_lstm.py
+++ b/test/ml_methods/test_simple_lstm.py
@@ -31,3 +31,4 @@ def test_simple_lstm():
     assert all(sequence_df['cdr3_aa'].str.len() >= 1)
 
     shutil.rmtree(path)
+