diff --git a/dedupe/api.py b/dedupe/api.py index 71edb4c5b..b33b0a1ba 100644 --- a/dedupe/api.py +++ b/dedupe/api.py @@ -9,7 +9,6 @@ import itertools import logging import multiprocessing -import os import pickle import sqlite3 import tempfile @@ -98,12 +97,27 @@ class IntegralMatching(Matching): def score(self, pairs: RecordPairs) -> Scores: """ - Scores pairs of records. Returns pairs of tuples of records id and - associated probabilities that the pair of records are match + Scores pairs of records. Returns a numpy structured array of scores. Args: - pairs: Iterator of pairs of records - + pairs: Iterator of pairs of records, such as from the output of :func:`pairs` + + Returns: + A numpy + `structured array `_ + with a with a dtype of `[('pairs', id_type, 2), ('score', 'f4')]` + where dtype is either a str or int, + and score is a 32-bit float in the range (0, 1]. + The 'pairs' column contains pairs of ids of + the records compared and the 'score' column contains + the similarity score for that pair of records. + + This array will be a numpy.array when self.num_cores is 1, + and a numpy.memmap when self.num_cores is greater than 1. + This memmap will automatically clean itself up, you don't + have to worry about it. + + For each pair, the smaller id will be first. """ try: matches = core.scoreDuplicates( @@ -175,7 +189,6 @@ def partition( clusters = self.cluster(pair_scores, threshold) clusters = self._add_singletons(data, clusters) clusters = list(clusters) - _cleanup_scores(pair_scores) return clusters def _add_singletons(self, data: Data, clusters: Clusters) -> Clusters: @@ -514,7 +527,6 @@ def join( links = pair_scores[pair_scores["score"] > threshold] links = list(links) - _cleanup_scores(pair_scores) return links def one_to_one(self, scores: Scores, threshold: float = 0.0) -> Links: @@ -805,6 +817,8 @@ def score(self, blocks: Blocks) -> Generator[Scores, None, None]: Args: blocks: Iterator of blocks of records + Yields: + Structured numpy arrays. See :meth:`dedupe.Dedupe.score` for more info. """ matches = core.scoreGazette( @@ -946,7 +960,7 @@ def __init__( Args: settings_file: A file object containing settings info produced from the - :func:`~dedupe.api.ActiveMatching.write_settings` method. + :meth:`dedupe.Dedupe.write_settings` method. num_cores: The number of cpus to use for parallel processing, defaults to the number of cpus @@ -1468,14 +1482,3 @@ def flatten_training( y.extend([encoded_y] * len(pairs)) return examples, numpy.array(y) - - -def _cleanup_scores(arr: Scores) -> None: - try: - mmap_file = arr.filename # type: ignore - except AttributeError: - pass - else: - del arr - if mmap_file: - os.remove(mmap_file) diff --git a/dedupe/core.py b/dedupe/core.py index f89a33f59..2923be93b 100644 --- a/dedupe/core.py +++ b/dedupe/core.py @@ -10,6 +10,7 @@ import os import queue import tempfile +import weakref from typing import TYPE_CHECKING, overload import numpy @@ -176,9 +177,29 @@ def scoreDuplicates( else: scored_pairs = numpy.array([], dtype=dtype) + # Monkeypatch in these extra methods and attributes. + # See https://docs.python.org/3/library/weakref.html#comparing-finalizers-with-del-methods + scored_pairs.remove = weakref.finalize(scored_pairs, _cleanup_scores, scored_pairs) # type: ignore[union-attr] + scored_pairs.removed = property(_is_removed) # type: ignore[union-attr] + return scored_pairs +def _cleanup_scores(arr: Scores) -> None: + try: + mmap_file = arr.filename # type: ignore + except AttributeError: + pass + else: + del arr + if mmap_file: + os.remove(mmap_file) + + +def _is_removed(self): + return not self.remove.alive + + def fillQueue( queue: _Queue, iterable: Iterable[Any], stop_signals: int, chunk_size: int = 20000 ) -> None: diff --git a/pyproject.toml b/pyproject.toml index 6d4674602..964cf56c3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,10 +4,12 @@ requires = ["setuptools", "cython"] [tool.mypy] +python_version = "3.10" plugins = "numpy.typing.mypy_plugin" ignore_missing_imports = true files = "dedupe" check_untyped_defs = true +show_error_codes = true [tool.pytest.ini_options] minversion = "7.1"