Skip to content

Commit

Permalink
Merge pull request #2404 from moj-analytical-services/better_docstrings
Browse files Browse the repository at this point in the history
Better docstrings
  • Loading branch information
RobinL authored Sep 16, 2024
2 parents 775590f + 2d24505 commit 52059b5
Show file tree
Hide file tree
Showing 7 changed files with 49 additions and 30 deletions.
22 changes: 21 additions & 1 deletion splink/internals/linker_components/clustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,14 @@ def cluster_pairwise_predictions_at_threshold(
SplinkDataFrame: A SplinkDataFrame containing a list of all IDs, clustered
into groups based on the desired match threshold.
Examples:
```python
df_predict = linker.inference.predict(threshold_match_probability=0.5)
df_clustered = linker.clustering.cluster_pairwise_predictions_at_threshold(
df_predict, threshold_match_probability=0.95
)
```
"""

# Feeding in df_predict forces materialisation, if it exists in your database
pipeline = CTEPipeline()
nodes_with_tf = compute_df_concat_with_tf(self._linker, pipeline)
Expand Down Expand Up @@ -248,6 +254,20 @@ def compute_graph_metrics(
attribute "edges" for edge metrics table
attribute "clusters" for cluster metrics table
Examples:
```python
df_predict = linker.inference.predict(threshold_match_probability=0.5)
df_clustered = linker.clustering.cluster_pairwise_predictions_at_threshold(
df_predict, threshold_match_probability=0.95
)
graph_metrics = linker.clustering.compute_graph_metrics(
df_predict, df_clustered, threshold_match_probability=0.95
)
node_metrics = graph_metrics.nodes.as_pandas_dataframe()
edge_metrics = graph_metrics.edges.as_pandas_dataframe()
cluster_metrics = graph_metrics.clusters.as_pandas_dataframe()
```
"""
if threshold_match_probability is None:
threshold_match_probability = df_clustered.metadata.get(
Expand Down
21 changes: 11 additions & 10 deletions splink/internals/linker_components/evaluation.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,7 @@ def accuracy_analysis_from_labels_column(
(emphasis on recall) and β=0.5 (emphasis on precision)
- `"p4"` - an extended F1 score with specificity and NPV included
- `"phi"` - φ coefficient or Matthews correlation coefficient (MCC)
Examples:
```py
linker.evaluation.accuracy_analysis_from_labels_column("ground_truth", add_metrics=["f1"])
Expand Down Expand Up @@ -244,13 +245,14 @@ def accuracy_analysis_from_labels_table(
(emphasis on recall) and β=0.5 (emphasis on precision)
- `"p4"` - an extended F1 score with specificity and NPV included
- `"phi"` - φ coefficient or Matthews correlation coefficient (MCC)
Returns:
altair.Chart: An altair chart
Examples:
```py
linker.accuracy_analysis_from_labels_table("ground_truth", add_metrics=["f1"])
```
Returns:
altair.Chart: An altair chart
""" # noqa: E501

allowed = ["specificity", "npv", "accuracy", "f1", "f2", "f0_5", "p4", "phi"]
Expand Down Expand Up @@ -313,6 +315,9 @@ def prediction_errors_from_labels_column(
threshold_match_probability (float, optional): Threshold above which a score
is considered to be a match. Defaults to 0.5.
Returns:
SplinkDataFrame: Table containing false positives and negatives
Examples:
```py
linker.evaluation.prediction_errors_from_labels_column(
Expand All @@ -321,9 +326,6 @@ def prediction_errors_from_labels_column(
include_false_positives=False
).as_pandas_dataframe()
```
Returns:
SplinkDataFrame: Table containing false positives and negatives
"""
return prediction_errors_from_label_column(
self._linker,
Expand Down Expand Up @@ -352,16 +354,15 @@ def unlinkables_chart(
the title of the output chart.
as_dict (bool, optional): If True, return a dict version of the chart.
Returns:
altair.Chart: An altair chart
Examples:
After estimating the parameters of the model, run:
```py
linker.evaluation.unlinkables_chart()
```
Returns:
altair.Chart: An altair chart
"""

# Link our initial df on itself and calculate the % of unlinkable entries
Expand Down
8 changes: 4 additions & 4 deletions splink/internals/linker_components/inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,10 @@ def deterministic_link(self) -> SplinkDataFrame:
Deterministic linkage, however, is likely to result in missed links
(false negatives).
Returns:
SplinkDataFrame: A SplinkDataFrame of the pairwise comparisons.
Examples:
```py
Expand All @@ -76,10 +80,6 @@ def deterministic_link(self) -> SplinkDataFrame:
linker = Linker(df, settings, db_api=db_api)
splink_df = linker.inference.deterministic_link()
```
Returns:
SplinkDataFrame: A SplinkDataFrame of the pairwise comparisons.
"""
pipeline = CTEPipeline()
# Allows clustering during a deterministic linkage.
Expand Down
2 changes: 1 addition & 1 deletion splink/internals/linker_components/misc.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ def query_sql(self, sql, output_type="pandas"):
Examples:
```py
linker = Linker(df, settings, db_api)
df_predict = linker.predict()
df_predict = linker.inference.predict()
linker.misc.query_sql(f"select * from {df_predict.physical_name} limit 10")
```
Expand Down
11 changes: 8 additions & 3 deletions splink/internals/linker_components/table_management.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,10 @@ def compute_tf_table(self, column_name: str) -> SplinkDataFrame:
>>>
# On subsequent data linking job, read this table rather than recompute
df_first_name_tf = pd.read_parquet("folder/first_name_tf")
df_first_name_tf.createOrReplaceTempView("__splink__df_tf_first_name")
linker.table_management.register_term_frequency_lookup(
df_first_name_tf, "first_name"
)
```
Expand Down Expand Up @@ -207,8 +210,10 @@ def register_term_frequency_lookup(self, input_data, col_name, overwrite=False):
{"first_name": "alfie", "tf_first_name": 0.013},
]
tf_df = pd.DataFrame(tf_table)
linker.table_management.register_term_frequency_lookup(tf_df,
"first_name")
linker.table_management.register_term_frequency_lookup(
tf_df,
"first_name"
)
```
"""

Expand Down
13 changes: 3 additions & 10 deletions splink/internals/linker_components/training.py
Original file line number Diff line number Diff line change
Expand Up @@ -247,14 +247,7 @@ def estimate_parameters_using_expectation_maximisation(
[this PR](https://github.com/moj-analytical-services/splink/pull/734) for
the rationale.
Examples:
Default behaviour
```py
br_training = block_on("first_name", "dob")
linker.training.estimate_parameters_using_expectation_maximisation(
br_training
)
```
Args:
blocking_rule (BlockingRuleCreator | str): The blocking rule used to
Expand All @@ -276,9 +269,9 @@ def estimate_parameters_using_expectation_maximisation(
Examples:
```py
blocking_rule = block_on("first_name", "surname")
br_training = block_on("first_name", "dob")
linker.training.estimate_parameters_using_expectation_maximisation(
blocking_rule
br_training
)
```
Expand Down
2 changes: 1 addition & 1 deletion splink/internals/linker_components/visualisations.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,7 +212,7 @@ def tf_adjustment_chart(
this or `n_most_freq` set to None, all values will be shown.
Default to 10.
vals_to_include (list, optional): Specific values for which to show term
sfrequency adjustments.
frequency adjustments.
Defaults to None.
as_dict (bool, optional): If True, return the chart as a dictionary.
Expand Down

0 comments on commit 52059b5

Please sign in to comment.