🖤

ipums · Jun 26, 2024 · 059f172 · 059f172
1 parent c58c31c
commit 059f172
Show file tree

Hide file tree

Showing 12 changed files with 350 additions and 274 deletions.
diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -179,6 +179,7 @@
 copybutton_prompt_text = r">>> |\.\.\. |\$ |In \[\d*\]: | {2,5}\.\.\.: | {5,8}: "
 copybutton_prompt_is_regexp = True
 
+
 # this is a workaround to filter out forward reference issue in
 # sphinx_autodoc_typehints
 class FilterPandasTypeAnnotationWarning(pylogging.Filter):
@@ -196,6 +197,7 @@ def filter(self, record: pylogging.LogRecord) -> bool:
     FilterPandasTypeAnnotationWarning()
 )
 
+
 # based on pandas/doc/source/conf.py
 def linkcode_resolve(domain, info):
     """Determine the URL corresponding to Python object."""

diff --git a/src/ipumspy/__init__.py b/src/ipumspy/__init__.py
@@ -1,5 +1,6 @@
 """
 Convenience imports for ipumspy
 """
+
 from .__version__ import __version__
 from .api import *
diff --git a/src/ipumspy/__version__.py b/src/ipumspy/__version__.py
@@ -1,6 +1,7 @@
 """
 Simple version wrapper
 """
+
 try:
     import importlib.metadata as metadata
 except ImportError:

diff --git a/src/ipumspy/api/__init__.py b/src/ipumspy/api/__init__.py
@@ -1,6 +1,7 @@
 """
 Convenience imports from api
 """
+
 from .core import IpumsApiClient
 from .exceptions import *
 from .extract import *
diff --git a/src/ipumspy/api/core.py b/src/ipumspy/api/core.py
@@ -1,6 +1,7 @@
 """
 Core utilities for interacting with the IPUMS API
 """
+
 import copy
 import time
 import warnings
@@ -164,7 +165,7 @@ def submit_extract(
         Returns:
             The number of the extract for the passed user account
         """
-        
+
         if not isinstance(extract, BaseExtract):
             extract = copy.deepcopy(extract)
             if "microdata" in BaseExtract._collection_type_to_extract:
@@ -537,12 +538,13 @@ def get_all_sample_info(self, collection: str) -> Dict:
             sample ids, values are sample descriptions
         """
         samples = self.get(
-            f"{self.base_url}/metadata/samples", 
+            f"{self.base_url}/metadata/samples",
             params={
-                "collection": collection, 
+                "collection": collection,
                 "pageSize": 2500,
-                "version": self.api_version}
-            ).json()
+                "version": self.api_version,
+            },
+        ).json()
         # make it into the expected dict
         samples_dict = {}
         for item in samples["data"]:

diff --git a/src/ipumspy/api/exceptions.py b/src/ipumspy/api/exceptions.py
@@ -1,6 +1,7 @@
 """
 Exceptions which can arise in calls to the IPUMS API
 """
+
 __all__ = ["IpumsApiException", "TransientIpumsApiException"]
 
 

diff --git a/src/ipumspy/api/extract.py b/src/ipumspy/api/extract.py
@@ -1,6 +1,7 @@
 """
 Wrappers for payloads to ship to the IPUMS API
 """
+
 from __future__ import annotations
 
 import warnings
@@ -27,7 +28,7 @@ class ModifiedExtractWarning(Warning):
 
 @dataclass
 class IpumsObject(ABC):
-    
+
     def update(self, attribute: str, value: Any):
         """
         Update Variable features
@@ -40,7 +41,7 @@ def update(self, attribute: str, value: Any):
             setattr(self, attribute, value)
         else:
             raise KeyError(f"{type(self).__name__} has no attribute '{attribute}'.")
-        
+
     @abstractmethod
     def build(self):
         pass
@@ -66,7 +67,6 @@ class Variable(IpumsObject):
     def __post_init__(self):
         self.name = self.name.upper()
 
-
     def build(self):
         """Format Variable information for API Extract submission"""
         built_var = self.__dict__.copy()
@@ -103,10 +103,10 @@ class TimeUseVariable(IpumsObject):
     """IPUMS Time Use Variable name"""
     owner: Optional[str] = ""
     """email address associated with your IPUMS account. Only required for user-defined Time Use Variables."""
-    
+
     def __post_init__(self):
         self.name = self.name.lower()
-    
+
     def build(self):
         """Format Time Use Variable information for API Extract submission"""
         built_tuv = self.__dict__.copy()
@@ -115,7 +115,9 @@ def build(self):
         # only include the owner field if one is specified
         if self.owner != "":
             if "@" not in self.owner:
-                raise ValueError("'owner' must be the email address associated with your IPUMS user account.")
+                raise ValueError(
+                    "'owner' must be the email address associated with your IPUMS user account."
+                )
             else:
                 built_tuv["owner"] = built_tuv.pop("owner")
         else:
@@ -228,7 +230,7 @@ def _snake_to_camel(self, kwarg_dict):
             # if the value of the kwarg is also a dict
             if isinstance(kwarg_dict[key], dict):
                 self._snake_to_camel(kwarg_dict[key])
-                    
+
             # create camelCase equivalent
             key_list = key.split("_")
             # join capitalized versions of all parts except the first
@@ -245,11 +247,15 @@ def _validate_list_args(self, list_arg, arg_obj):
         # this bit feels extra sketch, but it seems like a better solution
         # than just having the BaseExtract(**kwargs) method of instantiating
         # an extract object quietly leave out variable-level extract features
-        
-        # before diving into any duplicate validation, make sure the list argument the user provided 
+
+        # before diving into any duplicate validation, make sure the list argument the user provided
         # is only strings or only IPUMS objects. Raise a useful error and ask the user to fix it themselves
-        if not all(isinstance(i, str) for i in list_arg) and not all(isinstance(i, IpumsObject) for i in list_arg):
-            raise TypeError(f"The items in {list_arg} must all be string type or {arg_obj} type.")
+        if not all(isinstance(i, str) for i in list_arg) and not all(
+            isinstance(i, IpumsObject) for i in list_arg
+        ):
+            raise TypeError(
+                f"The items in {list_arg} must all be string type or {arg_obj} type."
+            )
         if isinstance(list_arg, dict) and arg_obj is Variable:
             args = _unpack_variables_dict(list_arg)
             return args
@@ -287,7 +293,6 @@ def _validate_list_args(self, list_arg, arg_obj):
             # and return a list of the relevant objects
             unique_list = list(dict.fromkeys(list_arg))
             return [arg_obj(i) for i in unique_list]
-
 
     def extract_api_version(self, kwargs_dict: Dict[str, Any]) -> str:
         # check to see if version is specified in kwargs_dict
@@ -352,7 +357,7 @@ def __init__(
     ):
         """
         Class for defining an extract for an IPUMS microdata collection.
-        
+
         Args:
             collection: name of an IPUMS data collection
             samples: list of sample IDs from an IPUMS microdata collection
@@ -362,15 +367,15 @@ def __init__(
             data_structure: nested dict with "rectangular", "hierarchical", or "household-only" as first-level key.
                             "rectangular" extracts require further specification of "on" : <record type>.
                             Default {"rectangular": {"on": "P"}} requests an extract rectangularized on the "P" record.
-            time_use_variables: a list of IPUMS Time Use Variable names or Objects. This argument is only valid for IPUMS ATUS, 
-                                MTUS, and AHTUS data collections. If the list contains user-created Time Use Variables, these 
+            time_use_variables: a list of IPUMS Time Use Variable names or Objects. This argument is only valid for IPUMS ATUS,
+                                MTUS, and AHTUS data collections. If the list contains user-created Time Use Variables, these
                                 must be passed as a list of TimeUseVariable objects with the 'owner' field specified.
-                            
+
         Keyword Args:
             data_quality_flags: a boolean value which, if True, adds the data quality flags for each variable included in the `variables` list
                                 if a data quality flag exists for that variable.
             sample_members: a dictionary of non-default sample members to include for Time Use collections where keys are strings
-                            indicating sample member type and values are boolean. This argument is only valid for IPUMS ATUS, 
+                            indicating sample member type and values are boolean. This argument is only valid for IPUMS ATUS,
                             MTUS, and AHTUS data collections. Valid keys include 'include_non_respondents' and 'include_household_members'.
         """
 
@@ -396,23 +401,27 @@ def __init__(
         self._kwarg_warning(kwargs)
         # make the kwargs camelCase
         self.kwargs = self._snake_to_camel(kwargs)
-        
+
         # I don't love this, but it also seems overkill to make a separate extract class
         # just for these features
         self.time_use_variables = time_use_variables
         if self.time_use_variables is not None:
             # XXX remove when the server-side error messaging is improved
             if self.collection in ["atus", "mtus", "ahtus"]:
-                self.time_use_variables = self._validate_list_args(self.time_use_variables, TimeUseVariable)
+                self.time_use_variables = self._validate_list_args(
+                    self.time_use_variables, TimeUseVariable
+                )
             else:
-                raise ValueError(f"Time use variables are unavailable for the IPUMS {self.collection.upper()} data collection")
+                raise ValueError(
+                    f"Time use variables are unavailable for the IPUMS {self.collection.upper()} data collection"
+                )
 
     def build(self) -> Dict[str, Any]:
         """
         Convert the object into a dictionary to be passed to the IPUMS API
         as a JSON string
         """
-        built =  {
+        built = {
             "description": self.description,
             "dataFormat": self.data_format,
             "dataStructure": self.data_structure,
@@ -426,15 +435,17 @@ def build(self) -> Dict[str, Any]:
         }
 
         if self.time_use_variables is not None:
-            built["timeUseVariables"] = {tuv.name.upper(): tuv.build() for tuv in self.time_use_variables}
-
+            built["timeUseVariables"] = {
+                tuv.name.upper(): tuv.build() for tuv in self.time_use_variables
+            }
+
         # XXX shoehorn fix until server-side bug is fixed
         if self.collection == "meps":
             for variable in built["variables"].keys():
                 built["variables"][variable].pop("attachedCharacteristics")
-            
+
         return built
-        
+
     def attach_characteristics(self, variable: Union[Variable, str], of: List[str]):
         """
         A method to update existing IPUMS Extract Variable objects
@@ -494,7 +505,6 @@ def select_cases(
             )
 
 
-
 def extract_from_dict(dct: Dict[str, Any]) -> Union[BaseExtract, List[BaseExtract]]:
     """
     Convert an extract that is currently specified as a dictionary (usually from a file)
@@ -510,13 +520,13 @@ def extract_from_dict(dct: Dict[str, Any]) -> Union[BaseExtract, List[BaseExtrac
     if "extracts" in dct:
         # We are returning several extracts
         return [extract_from_dict(extract) for extract in dct["extracts"]]
-    
+
     def _camel_to_snake(key):
         # don't mess with case for boolean values
         if isinstance(key, bool):
             return key
         cap_idx = [0] + [key.index(i) for i in key if i.isupper()]
-        parts_list = [key[i:j].lower() for i,j in zip(cap_idx, cap_idx[1:]+[None])]
+        parts_list = [key[i:j].lower() for i, j in zip(cap_idx, cap_idx[1:] + [None])]
         snake = "_".join(parts_list)
         return snake
 
@@ -525,8 +535,8 @@ def _make_snake_ext(ext_dict):
             if isinstance(ext_dict[key], dict):
                 if key not in ["variables", "samples", "timeUseVariables"]:
                     ext_dict[key] = _make_snake_ext(ext_dict[key])
-        return {_camel_to_snake(k):v for k,v in ext_dict.items()}
-    
+        return {_camel_to_snake(k): v for k, v in ext_dict.items()}
+
     ext_dict = _make_snake_ext(dct)
     # XXX To Do: When MicrodataExtract is no longer the only extract class,
     # this method will need to differentiate between the different collection types

diff --git a/src/ipumspy/ddi.py b/src/ipumspy/ddi.py
@@ -72,7 +72,11 @@ def numpy_type(self) -> type:
         and hence even for integers it is "float64".
         """
         # always return a numpy float if it isn't a character var
-        if self.vartype == "numeric" or self.vartype == "integer" or self.vartype == "float":
+        if (
+            self.vartype == "numeric"
+            or self.vartype == "integer"
+            or self.vartype == "float"
+        ):
             return np.float64
         return str
 
@@ -101,7 +105,11 @@ def pandas_type_efficient(self) -> type:
         https://pandas-docs.github.io/pandas-docs-travis/user_guide/integer_na.html
         It can be considered as a mix between `self.pandas_type` and `self.numpy_type`
         """
-        if self.vartype == "numeric" or self.vartype == "integer" or self.vartype == "float":
+        if (
+            self.vartype == "numeric"
+            or self.vartype == "integer"
+            or self.vartype == "float"
+        ):
             return np.float64
         return pd.StringDtype()
 
@@ -127,7 +135,7 @@ def read(cls, elt: Element, ddi_namespace: str) -> VariableDescription:
                 vartype == "float"
             else:
                 vartype = "integer"
-        
+
         labels_dict = {}
         for cat in elt.findall("./ddi:catgry", namespaces):
             label = cat.find("./ddi:labl", namespaces).text