introduce float var_type and update data types accordingly

ipums · Jun 13, 2024 · 8fbc598 · 8fbc598
1 parent 34919e0
commit 8fbc598
Show file tree

Hide file tree

Showing 2 changed files with 32 additions and 19 deletions.
diff --git a/src/ipumspy/ddi.py b/src/ipumspy/ddi.py
@@ -56,21 +56,23 @@ def python_type(self) -> type:
         """
         The Python type of this variable.
         """
-        if self.vartype == "numeric":
+        if self.vartype == "numeric" or self.vartype == "integer":
             if (self.shift is None) or (self.shift == 0):
                 return int
             return float
-        return str
+        elif self.vartype == "float":
+            return float
+        else:
+            return str
 
     @property
     def numpy_type(self) -> type:
         """
         The Numpy type of this variable. Note that this type must support nullability,
         and hence even for integers it is "float64".
         """
-        if self.vartype == "numeric":
-            if (self.shift is None) or (self.shift == 0):
-                return np.float64
+        # always return a numpy float if it isn't a character var
+        if self.vartype == "numeric" or self.vartype == "integer" or self.vartype == "float":
             return np.float64
         return str
 
@@ -81,11 +83,15 @@ def pandas_type(self) -> type:
         pandas dtypes, and so the integer type is "Int64" and the string type is
         "string" (instead of "object")
         """
-        if self.vartype == "numeric":
+        if self.vartype == "numeric" or self.vartype == "integer":
             if (self.shift is None) or (self.shift == 0):
                 return pd.Int64Dtype()
+            # this should probably actually return pd.Float64Dtype()
             return np.float64
-        return pd.StringDtype()
+        elif self.vartype == "float":
+            return np.float64
+        else:
+            return pd.StringDtype()
 
     @property
     def pandas_type_efficient(self) -> type:
@@ -95,9 +101,7 @@ def pandas_type_efficient(self) -> type:
         https://pandas-docs.github.io/pandas-docs-travis/user_guide/integer_na.html
         It can be considered as a mix between `self.pandas_type` and `self.numpy_type`
         """
-        if self.vartype == "numeric":
-            if (self.shift is None) or (self.shift == 0):
-                return np.float64
+        if self.vartype == "numeric" or self.vartype == "integer" or self.vartype == "float":
             return np.float64
         return pd.StringDtype()
 
@@ -115,12 +119,21 @@ def read(cls, elt: Element, ddi_namespace: str) -> VariableDescription:
         namespaces = {"ddi": ddi_namespace}
 
         vartype = elt.find("./ddi:varFormat", namespaces).attrib["type"]
+        # DDI has one "numeric" type: wide (>10) continuous vars become float, the rest integer
+        intvl = elt.attrib["intrvl"]
+        wid = int(elt.find("./ddi:location", namespaces).attrib["width"])
+        if vartype == "numeric":
+            if intvl == "contin" and wid > 10:
+                vartype = "float"
+            else:
+                vartype = "integer"
+
         labels_dict = {}
         for cat in elt.findall("./ddi:catgry", namespaces):
             label = cat.find("./ddi:labl", namespaces).text
             value = cat.find("./ddi:catValu", namespaces).text
             # make values integers when possible
-            if vartype == "numeric":
+            if vartype == "numeric" or vartype == "integer":
                 labels_dict[label] = int(value)
             else:
                 labels_dict[label] = value

diff --git a/tests/test_ddi.py b/tests/test_ddi.py
@@ -279,14 +279,14 @@ def test_ddi_codebook_hierarchical(cps_ddi_hierarchical: ddi.Codebook):
 def test_get_all_types(cps_ddi: ddi.Codebook, cps_df: pd.DataFrame):
 
     var_types = {
-        "YEAR": "numeric",
-        "SERIAL": "numeric",
-        "HWTSUPP": "numeric",
-        "STATEFIP": "numeric",
-        "MONTH": "numeric",
-        "PERNUM": "numeric",
-        "WTSUPP": "numeric",
-        "INCTOT": "numeric",
+        "YEAR": "integer",
+        "SERIAL": "integer",
+        "HWTSUPP": "integer",
+        "STATEFIP": "integer",
+        "MONTH": "integer",
+        "PERNUM": "integer",
+        "WTSUPP": "integer",
+        "INCTOT": "integer",
     }
 
     assert cps_ddi.get_all_types(type_format="vartype") == var_types