Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

IDNotExistError when trying to use NegativeGeneratorHypergeom #504

Open
kmanpearl opened this issue Feb 3, 2025 · 0 comments
Open

IDNotExistError when trying to use NegativeGeneratorHypergeom #504

kmanpearl opened this issue Feb 3, 2025 · 0 comments

Comments

@kmanpearl
Copy link

code:

from obnb.data import GOBP, BioGRID
from obnb.dataset import Dataset as OBNBDataset
from obnb.label.filters import Compose, EntityExistenceFilter, LabelsetRangeFilterSize, LabelsetRangeFilterSplit, NegativeGeneratorHypergeom
from obnb.label.split import RatioPartition
from obnb.util.converter import GenePropertyConverter

# Reproduction inputs. Both data objects are created with root='data'.
# NOTE(review): presumably obnb downloads/caches the raw files under that
# directory on first use -- confirm against the obnb docs.
function_labels = GOBP(root='data')
network = BioGRID(root='data')
# Converter for the gene property named "PubMedCount"; handed to the splitter
# below as its `property_converter`.
pubmedcnt_converter = GenePropertyConverter(root='data', name="PubMedCount")
# Ratio-based partition with ratios 0.6 / 0.2 / 0.2.
# NOTE(review): presumably train/val/test, with genes ordered by PubMed count
# in descending order (ascending=False) -- confirm against RatioPartition docs.
sb_splitter = RatioPartition(0.6, 0.2, 0.2, ascending=False,property_converter=pubmedcnt_converter)
# Threshold passed as `p_thresh` to NegativeGeneratorHypergeom in make_dataset.
negatives_p_thresh = 0.05

def make_dataset(graph, labels, splitter):
    """Filter ``labels`` in place and build an ``OBNBDataset`` from the result.

    The filters are wrapped in ``Compose``, which calls them one after another
    on the label collection, in the order listed. ``labels.iapply`` applies
    them in place, so the ``labels`` object passed in by the caller is
    modified.

    Args:
        graph: Network object; must provide ``node_ids`` and
            ``to_dense_graph()``.
        labels: Label collection to filter (mutated by this call).
        splitter: Splitter used both by ``LabelsetRangeFilterSplit`` and by the
            returned dataset.

    Returns:
        The ``OBNBDataset`` built from ``graph``, the dense-graph feature and
        the filtered ``labels``.

    Note:
        Reads the module-level ``negatives_p_thresh``. As reported in this
        issue, the run fails with ``IDNotExistError`` inside
        ``LabelsetRangeFilterSplit`` (at
        ``lsc.entity[lsc.get_negative(label_id)]``) when
        ``NegativeGeneratorHypergeom`` is part of the pipeline.
    """
    labels.iapply(
                Compose(
                    # Only use genes that are present in the network
                    EntityExistenceFilter(list(graph.node_ids)),
                    # Remove any labelsets with fewer than 15 network genes
                    LabelsetRangeFilterSize(min_val=15),
                    # Selective negatives using hyper-geom test; removing this
                    # filter makes the error reported in this issue go away
                    NegativeGeneratorHypergeom(p_thresh=negatives_p_thresh),
                    # Make sure each split has at least 5 positive examples;
                    # this is the filter in which the traceback is raised
                    LabelsetRangeFilterSplit(min_val=5, splitter=splitter),
                ),
    )
    # `labels` has been filtered in place by the call above.
    return OBNBDataset(
        graph=graph,
        feature=graph.to_dense_graph().to_feature(),
        transform='Node2Vec',
        label=labels,
        splitter=splitter,
        resolve=True)

# Failing entry point: this call raises the IDNotExistError shown in the
# traceback below.
gobp_sb = make_dataset(graph=network, labels=function_labels, splitter=sb_splitter)

error:

---------------------------------------------------------------------------
IDNotExistError                           Traceback (most recent call last)
Cell In[8], line 1
----> 1 gobp_sb = make_dataset(graph=network, labels=function_labels, splitter=sb_splitter)

Cell In[7], line 8, in make_dataset(graph, labels, splitter)
      7 def make_dataset(graph, labels, splitter):
----> 8     labels.iapply(
      9                 Compose(
     10                     # Only use genes that are present in the network
     11                     EntityExistenceFilter(list(graph.node_ids)),
     12                     # Remove any labelsets with less than 15 network genes
     13                     LabelsetRangeFilterSize(min_val=15),
     14                     # Selective negatives using hyper-geom test
     15                     NegativeGeneratorHypergeom(p_thresh=negatives_p_thresh),
     16                     # Make sure each split has at least 5 positive examples
     17                     LabelsetRangeFilterSplit(min_val=5, splitter=splitter),
     18                 ),
     19     )
     20     return OBNBDataset(
     21         graph=graph,
     22         feature=graph.to_dense_graph().to_feature(),
   (...)
     25         splitter=splitter,
     26         resolve=True)

File ~/miniconda3/envs/study_bias/lib/python3.12/site-packages/obnb/label/collection.py:492, in LabelsetCollection.iapply(self, filter_func, progress_bar)
    486 def iapply(self, filter_func, progress_bar: bool = False):
    487     """Apply filter to labelsets inplace.
    488 
    489     This is a shortcut for calling self.apply(filter_func, inplace=True).
    490 
    491     """
--> 492     self.apply(filter_func, inplace=True, progress_bar=progress_bar)

File ~/miniconda3/envs/study_bias/lib/python3.12/site-packages/obnb/label/collection.py:483, in LabelsetCollection.apply(self, filter_func, inplace, progress_bar)
    481 checkers.checkType("inplace", bool, inplace)
    482 obj = self if inplace else self.copy()
--> 483 filter_func(obj, progress_bar)
    484 return obj

File ~/miniconda3/envs/study_bias/lib/python3.12/site-packages/obnb/label/filters/base.py:113, in Compose.__call__(self, lsc, progress_bar)
    111 def __call__(self, lsc, progress_bar):
    112     for filter_ in self.filters:
--> 113         filter_.__call__(lsc, progress_bar)
    114         self.logger.info(lsc.stats())

File ~/miniconda3/envs/study_bias/lib/python3.12/site-packages/obnb/label/filters/base.py:81, in BaseFilter.__call__(self, lsc, progress_bar)
     79 pbar = tqdm(entity_ids, desc=f"{self!r}", disable=not progress_bar)
     80 for entity_id in pbar:
---> 81     if self.criterion(val_getter(entity_id)):
     82         mod_fun(entity_id)
     83         self.logger.debug(
     84             f"Modification ({self.mod_name}) criterion met for "
     85             f"{entity_id!r}",
     86         )

File ~/miniconda3/envs/study_bias/lib/python3.12/site-packages/obnb/label/filters/range_filter.py:166, in LabelsetRangeFilterSplit.get_val_getter.<locals>.val_getter(label_id)
    164 def val_getter(label_id):
    165     y_all, masks = lsc.split(self.splitter, **self.kwargs)
--> 166     neg_idx = lsc.entity[lsc.get_negative(label_id)]
    167     self.logger.debug(f"{label_id = } {neg_idx = }")
    168     # TODO: make label_ids to index mapping?

File ~/miniconda3/envs/study_bias/lib/python3.12/site-packages/obnb/util/idhandler.py:87, in IDlst.__getitem__(self, identifier)
     85     return self._getitem_sinlge(identifier)
     86 elif isinstance(identifier, Iterable):
---> 87     return self._getitem_multiple(identifier)
     88 else:
     89     raise TypeError(
     90         f"ID key(s) must be string or iterables of string, "
     91         f"not {type(identifier)!r}",
     92     )

File ~/miniconda3/envs/study_bias/lib/python3.12/site-packages/obnb/util/idhandler.py:102, in IDlst._getitem_multiple(self, identifiers)
    100 idx_lst = []
    101 for identifier in identifiers:
--> 102     idx_lst.append(self._getitem_sinlge(identifier))
    103 return np.array(idx_lst)

File ~/miniconda3/envs/study_bias/lib/python3.12/site-packages/obnb/util/idhandler.py:202, in IDmap._getitem_sinlge(self, identifier)
    201 def _getitem_sinlge(self, identifier):
--> 202     self._check_ID_existence(identifier, True)
    203     return self._map[identifier]

File ~/miniconda3/envs/study_bias/lib/python3.12/site-packages/obnb/util/idhandler.py:111, in IDlst._check_ID_existence(self, identifier, existence)
    109     raise IDExistsError(f"Existing ID {identifier!r}")
    110 elif existence & (identifier not in self):
--> 111     raise IDNotExistError(f"Unknown ID {identifier!r}")

IDNotExistError: Unknown ID '5557'

I have tested multiple versions of the above function. Without NegativeGeneratorHypergeom it works fine, but whenever this filter is included the error is raised, even if it is the only filter applied.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant