Skip to content

Commit

Permalink
Merge pull request #2404 from moj-analytical-services/better_docstrings
Browse files Browse the repository at this point in the history
Better docstrings
  • Loading branch information
RobinL authored Sep 16, 2024
2 parents 775590f + 2d24505 commit 52059b5
Show file tree
Hide file tree
Showing 7 changed files with 49 additions and 30 deletions.
22 changes: 21 additions & 1 deletion splink/internals/linker_components/clustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,14 @@ def cluster_pairwise_predictions_at_threshold(
SplinkDataFrame: A SplinkDataFrame containing a list of all IDs, clustered
into groups based on the desired match threshold.
Examples:
```python
df_predict = linker.inference.predict(threshold_match_probability=0.5)
df_clustered = linker.clustering.cluster_pairwise_predictions_at_threshold(
df_predict, threshold_match_probability=0.95
)
```
"""

# Feeding in df_predict forces materialisation, if it exists in your database
pipeline = CTEPipeline()
nodes_with_tf = compute_df_concat_with_tf(self._linker, pipeline)
Expand Down Expand Up @@ -248,6 +254,20 @@ def compute_graph_metrics(
attribute "edges" for edge metrics table
attribute "clusters" for cluster metrics table
Examples:
```python
df_predict = linker.inference.predict(threshold_match_probability=0.5)
df_clustered = linker.clustering.cluster_pairwise_predictions_at_threshold(
df_predict, threshold_match_probability=0.95
)
graph_metrics = linker.clustering.compute_graph_metrics(
df_predict, df_clustered, threshold_match_probability=0.95
)
node_metrics = graph_metrics.nodes.as_pandas_dataframe()
edge_metrics = graph_metrics.edges.as_pandas_dataframe()
cluster_metrics = graph_metrics.clusters.as_pandas_dataframe()
```
"""
if threshold_match_probability is None:
threshold_match_probability = df_clustered.metadata.get(
Expand Down
21 changes: 11 additions & 10 deletions splink/internals/linker_components/evaluation.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,7 @@ def accuracy_analysis_from_labels_column(
(emphasis on recall) and β=0.5 (emphasis on precision)
- `"p4"` - an extended F1 score with specificity and NPV included
- `"phi"` - φ coefficient or Matthews correlation coefficient (MCC)
Examples:
```py
linker.evaluation.accuracy_analysis_from_labels_column("ground_truth", add_metrics=["f1"])
Expand Down Expand Up @@ -244,13 +245,14 @@ def accuracy_analysis_from_labels_table(
(emphasis on recall) and β=0.5 (emphasis on precision)
- `"p4"` - an extended F1 score with specificity and NPV included
- `"phi"` - φ coefficient or Matthews correlation coefficient (MCC)
Returns:
altair.Chart: An altair chart
Examples:
```py
linker.accuracy_analysis_from_labels_table("ground_truth", add_metrics=["f1"])
```
Returns:
altair.Chart: An altair chart
""" # noqa: E501

allowed = ["specificity", "npv", "accuracy", "f1", "f2", "f0_5", "p4", "phi"]
Expand Down Expand Up @@ -313,6 +315,9 @@ def prediction_errors_from_labels_column(
threshold_match_probability (float, optional): Threshold above which a score
is considered to be a match. Defaults to 0.5.
Returns:
SplinkDataFrame: Table containing false positives and negatives
Examples:
```py
linker.evaluation.prediction_errors_from_labels_column(
Expand All @@ -321,9 +326,6 @@ def prediction_errors_from_labels_column(
include_false_positives=False
).as_pandas_dataframe()
```
Returns:
SplinkDataFrame: Table containing false positives and negatives
"""
return prediction_errors_from_label_column(
self._linker,
Expand Down Expand Up @@ -352,16 +354,15 @@ def unlinkables_chart(
the title of the output chart.
as_dict (bool, optional): If True, return a dict version of the chart.
Returns:
altair.Chart: An altair chart
Examples:
After estimating the parameters of the model, run:
```py
linker.evaluation.unlinkables_chart()
```
Returns:
altair.Chart: An altair chart
"""

# Link our initial df on itself and calculate the % of unlinkable entries
Expand Down
8 changes: 4 additions & 4 deletions splink/internals/linker_components/inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,10 @@ def deterministic_link(self) -> SplinkDataFrame:
Deterministic linkage, however, is likely to result in missed links
(false negatives).
Returns:
SplinkDataFrame: A SplinkDataFrame of the pairwise comparisons.
Examples:
```py
Expand All @@ -76,10 +80,6 @@ def deterministic_link(self) -> SplinkDataFrame:
linker = Linker(df, settings, db_api=db_api)
splink_df = linker.inference.deterministic_link()
```
Returns:
SplinkDataFrame: A SplinkDataFrame of the pairwise comparisons.
"""
pipeline = CTEPipeline()
# Allows clustering during a deterministic linkage.
Expand Down
2 changes: 1 addition & 1 deletion splink/internals/linker_components/misc.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ def query_sql(self, sql, output_type="pandas"):
Examples:
```py
linker = Linker(df, settings, db_api)
df_predict = linker.predict()
df_predict = linker.inference.predict()
linker.misc.query_sql(f"select * from {df_predict.physical_name} limit 10")
```
Expand Down
11 changes: 8 additions & 3 deletions splink/internals/linker_components/table_management.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,10 @@ def compute_tf_table(self, column_name: str) -> SplinkDataFrame:
>>>
# On subsequent data linking job, read this table rather than recompute
df_first_name_tf = pd.read_parquet("folder/first_name_tf")
df_first_name_tf.createOrReplaceTempView("__splink__df_tf_first_name")
linker.table_management.register_term_frequency_lookup(
df_first_name_tf, "first_name"
)
```
Expand Down Expand Up @@ -207,8 +210,10 @@ def register_term_frequency_lookup(self, input_data, col_name, overwrite=False):
{"first_name": "alfie", "tf_first_name": 0.013},
]
tf_df = pd.DataFrame(tf_table)
linker.table_management.register_term_frequency_lookup(tf_df,
"first_name")
linker.table_management.register_term_frequency_lookup(
tf_df,
"first_name"
)
```
"""

Expand Down
13 changes: 3 additions & 10 deletions splink/internals/linker_components/training.py
Original file line number Diff line number Diff line change
Expand Up @@ -247,14 +247,7 @@ def estimate_parameters_using_expectation_maximisation(
[this PR](https://github.com/moj-analytical-services/splink/pull/734) for
the rationale.
Examples:
Default behaviour
```py
br_training = block_on("first_name", "dob")
linker.training.estimate_parameters_using_expectation_maximisation(
br_training
)
```
Args:
blocking_rule (BlockingRuleCreator | str): The blocking rule used to
Expand All @@ -276,9 +269,9 @@ def estimate_parameters_using_expectation_maximisation(
Examples:
```py
blocking_rule = block_on("first_name", "surname")
br_training = block_on("first_name", "dob")
linker.training.estimate_parameters_using_expectation_maximisation(
blocking_rule
br_training
)
```
Expand Down
2 changes: 1 addition & 1 deletion splink/internals/linker_components/visualisations.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,7 +212,7 @@ def tf_adjustment_chart(
this or `n_most_freq` set to None, all values will be shown.
Default to 10.
vals_to_include (list, optional): Specific values for which to show term
sfrequency adjustments.
frequency adjustments.
Defaults to None.
as_dict (bool, optional): If True, return the chart as a dictionary.
Expand Down

0 comments on commit 52059b5

Please sign in to comment.