Skip to content

Commit

Permalink
[#162] Add hlink notice to the top of new files, add logging to RenameVectorAttributes
Browse files Browse the repository at this point in the history
  • Loading branch information
riley-harper committed Nov 20, 2024
1 parent b2dfa4e commit 444c6a7
Show file tree
Hide file tree
Showing 4 changed files with 31 additions and 2 deletions.
18 changes: 16 additions & 2 deletions hlink/linking/transformers/rename_vector_attributes.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,24 @@
# This file is part of the ISRDI's hlink.
# For copyright and licensing information, see the NOTICE and LICENSE files
# in this project's top-level directory, and also on-line at:
# https://github.com/ipums/hlink

import logging

from pyspark import keyword_only
from pyspark.ml import Transformer
from pyspark.ml.param.shared import HasInputCol, Param, Params, TypeConverters
from pyspark.sql import DataFrame

logger = logging.getLogger(__name__)


class RenameVectorAttributes(Transformer, HasInputCol):
"""
A custom transformer which renames the attributes or "slot names" of a
given input column of type vector. This is helpful when you don't have
complete control over the names of the attributes, but you need them to
look a certain way.
complete control over the names of the attributes when they are created,
but you still need them to look a certain way.
For example, LightGBM can't handle vector attributes with colons in their
names. But the Spark Interaction class creates vector attributes named with
Expand Down Expand Up @@ -61,6 +70,11 @@ def _transform(self, dataset: DataFrame) -> DataFrame:
metadata = dataset.schema[input_col].metadata
attributes_by_type = metadata["ml_attr"]["attrs"]

logger.debug(
f"Renaming the attributes of vector column '{input_col}': "
f"replacing {to_replace} with '{replacement_str}'"
)

# The attributes are grouped by type, which may be numeric, binary, or
# nominal. We don't care about the type here; we'll just rename all of
# the attributes.
Expand Down
5 changes: 5 additions & 0 deletions hlink/tests/core/classifier_test.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
# This file is part of the ISRDI's hlink.
# For copyright and licensing information, see the NOTICE and LICENSE files
# in this project's top-level directory, and also on-line at:
# https://github.com/ipums/hlink

from hlink.linking.core.classifier import choose_classifier
from hlink.tests.markers import requires_lightgbm

Expand Down
5 changes: 5 additions & 0 deletions hlink/tests/markers.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
# This file is part of the ISRDI's hlink.
# For copyright and licensing information, see the NOTICE and LICENSE files
# in this project's top-level directory, and also on-line at:
# https://github.com/ipums/hlink

import pytest

try:
Expand Down
5 changes: 5 additions & 0 deletions hlink/tests/transformers_test.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
# This file is part of the ISRDI's hlink.
# For copyright and licensing information, see the NOTICE and LICENSE files
# in this project's top-level directory, and also on-line at:
# https://github.com/ipums/hlink

from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler

Expand Down

0 comments on commit 444c6a7

Please sign in to comment.