Skip to content

Commit

Permalink
add total count to MatchedSequencesEncoder - add #184
Browse files Browse the repository at this point in the history
  • Loading branch information
pavlovicmilena committed Jan 15, 2025
1 parent 62d03a5 commit 8705e5a
Show file tree
Hide file tree
Showing 4 changed files with 37 additions and 12 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@ max_edit_distance: 2 # max Levenshtein distance
sum_matches: False
reads: all
normalize: False
output_count_as_feature: False
40 changes: 32 additions & 8 deletions immuneML/encodings/reference_encoding/MatchedSequencesEncoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,9 @@ class MatchedSequencesEncoder(DatasetEncoder):
- normalize (bool): If True, the sequence matches are divided by the total number of unique sequences in the
repertoire (when reads = unique) or the total number of reads in the repertoire (when reads = all).
- output_count_as_feature (bool): if True, the encoded repertoire is represented by the matches and by the total number
of sequences (or reads) in the repertoire, as defined by the reads parameter above; by default this is False
**YAML specification:**
Expand All @@ -73,26 +76,33 @@ class MatchedSequencesEncoder(DatasetEncoder):
max_edit_distance: 1
"""

def __init__(self, max_edit_distance: int, reference: List[ReceptorSequence], reads: ReadsType, sum_matches: bool,
             normalize: bool, output_count_as_feature: bool = False, name: str = None):
    """
    Store encoder configuration and precompute the number of output features.

    Args:
        max_edit_distance: maximum Levenshtein distance for a repertoire sequence to count as a match
        reference: reference sequences to match the repertoire sequences against
        reads: whether matches are counted per unique sequence or per read (ReadsType)
        sum_matches: if True, all matches are collapsed into a single summed feature
        normalize: if True, match counts are divided by the repertoire's total (unique or all) reads
        output_count_as_feature: if True, the repertoire's total sequence/read count is appended
            as one extra feature; defaults to False for backward compatibility
        name: optional encoder name
    """
    super().__init__(name=name)
    self.max_edit_distance = max_edit_distance
    self.reference_sequences = reference
    self.reads = reads
    self.sum_matches = sum_matches
    self.normalize = normalize
    self.output_count_as_feature = output_count_as_feature
    # one feature per reference sequence (or a single feature when summing), plus an
    # optional extra feature holding the repertoire's total sequence/read count
    self.feature_count = (1 if self.sum_matches else len(self.reference_sequences)) + int(output_count_as_feature)
@staticmethod
def _prepare_parameters(max_edit_distance: int, reference: dict, reads: str, sum_matches: bool, normalize: bool,
                        output_count_as_feature: bool = False, name: str = None):
    """
    Validate user-supplied parameters and build the keyword arguments for the encoder constructor.

    Raises:
        RuntimeError: when both normalize and output_count_as_feature are True, since the total
            sequence/read count can either be used for normalization or be emitted as a separate
            feature, but not both at the same time.
    """
    location = "MatchedSequencesEncoder"

    ParameterValidator.assert_type_and_value(max_edit_distance, int, location, "max_edit_distance", min_inclusive=0)
    ParameterValidator.assert_type_and_value(sum_matches, bool, location, "sum_matches")
    ParameterValidator.assert_type_and_value(normalize, bool, location, "normalize")
    # bug fix: the committed code swapped the value/name pairs here — it validated
    # output_count_as_feature under the reported name "normalize" and then re-validated
    # normalize under the name "output_count_as_feature", producing misleading error messages
    ParameterValidator.assert_type_and_value(output_count_as_feature, bool, location, "output_count_as_feature")
    ParameterValidator.assert_in_valid_list(reads.upper(), [item.name for item in ReadsType], location, "reads")

    if output_count_as_feature and normalize:
        raise RuntimeError(f"{MatchedSequencesEncoder.__name__}: normalize and output_count_as_feature cannot \n"
                           f"be both set to True at the same time. The sequence count (or reads count) can either \n"
                           f"be used for normalization or included as a separate output.")

    reference_sequences = MatchedReferenceUtil.prepare_reference(reference_params=reference, location=location, paired=False)

    return {
        "max_edit_distance": max_edit_distance,
        "reference": reference_sequences,
        "reads": ReadsType[reads.upper()],
        "sum_matches": sum_matches,
        "normalize": normalize,
        "output_count_as_feature": output_count_as_feature,
        "name": name
    }

Expand All @@ -127,7 +138,8 @@ def _prepare_caching_params(self, dataset, params: EncoderParams):
for seq in self.reference_sequences]),
"reads": self.reads.name,
"sum_matches": self.sum_matches,
"normalize": self.normalize}
"normalize": self.normalize,
"output_count_as_feature": self.output_count_as_feature}

return (("dataset_identifiers", tuple(dataset.get_example_ids())),
("dataset_metadata", dataset.metadata_file),
Expand All @@ -143,7 +155,12 @@ def _encode_new_dataset(self, dataset, params: EncoderParams):
encoded_repertoires = self._normalize(dataset, encoded_repertoires) if self.normalize else encoded_repertoires

feature_annotations = None if self.sum_matches else self._get_feature_info()
feature_names = [f"sum_of_{self.reads.value}_reads"] if self.sum_matches else list(feature_annotations["sequence_desc"])
if self.sum_matches:
feature_names = [f"sum_of_{self.reads.value}_reads"]
if self.output_count_as_feature:
feature_names += ['sequence_count_in_repertoire']
else:
feature_names = list(feature_annotations["sequence_desc"])

encoded_dataset = dataset.clone()
encoded_dataset.encoded_data = EncodedData(
Expand Down Expand Up @@ -177,7 +194,7 @@ def _get_feature_info(self):
- j call
"""

features = [[] for i in range(0, self.feature_count)]
features = [[] for i in range(0, len(self.reference_sequences))]

for i, sequence in enumerate(self.reference_sequences):
features[i] = [sequence.sequence_id,
Expand All @@ -187,6 +204,9 @@ def _get_feature_info(self):
sequence.j_call,
self._get_sequence_desc(sequence)]

if self.output_count_as_feature:
features += [['', '', '', '', '', 'sequence_count_in_repertoire']]

features = pd.DataFrame(features, columns=["sequence_id", "locus", "sequence", "v_call", "j_call", "sequence_desc"])
if features['sequence_desc'].unique().shape[0] < features.shape[0]:
features.loc[:, 'sequence_desc'] = [row['sequence_desc'] + "_" + row['sequence_id'] for ind, row in features.iterrows()]
Expand Down Expand Up @@ -246,4 +266,8 @@ def _compute_matches_to_reference(self, repertoire: Repertoire):
match_count = 1 if self.reads == ReadsType.UNIQUE else repertoire_seq.duplicate_count
matches[matches_idx] += match_count

if self.output_count_as_feature:
duplicate_counts = repertoire.data.duplicate_count
matches[-1] = sum(duplicate_counts) if self.reads == ReadsType.ALL else len(duplicate_counts)

return matches
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
from unittest import TestCase

from immuneML.caching.CacheType import CacheType
from immuneML.data_model.datasets.RepertoireDataset import RepertoireDataset
from immuneML.data_model.SequenceParams import Chain, RegionType
from immuneML.encodings.EncoderParams import EncoderParams
from immuneML.encodings.reference_encoding.MatchedSequencesEncoder import MatchedSequencesEncoder
Expand Down Expand Up @@ -67,7 +66,8 @@ def test__encode_new_dataset(self):
"max_edit_distance": 0,
"reads": reads,
"sum_matches": False,
"normalize": normalize
"normalize": normalize,
"output_count_as_feature": False
})

encoded = encoder.encode(dataset, EncoderParams(
Expand Down
4 changes: 2 additions & 2 deletions test/reports/encoding_reports/test_Matches.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
import pandas as pd

from immuneML.caching.CacheType import CacheType
from immuneML.data_model.datasets.RepertoireDataset import RepertoireDataset
from immuneML.data_model.SequenceParams import Chain
from immuneML.encodings.EncoderParams import EncoderParams
from immuneML.encodings.reference_encoding.MatchedReceptorsEncoder import MatchedReceptorsEncoder
Expand Down Expand Up @@ -145,7 +144,8 @@ def create_encoded_matchedsequences(self, path):
"max_edit_distance": 0,
"reads": "all",
"sum_matches": False,
"normalize": False
"normalize": False,
"output_count_as_feature": False
})

encoded = encoder.encode(dataset, EncoderParams(
Expand Down

0 comments on commit 8705e5a

Please sign in to comment.