Skip to content

Commit

Permalink
introduce float var_type and update data types accordingly
Browse files Browse the repository at this point in the history
  • Loading branch information
renae-r committed Jun 13, 2024
1 parent 34919e0 commit 8fbc598
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 19 deletions.
35 changes: 24 additions & 11 deletions src/ipumspy/ddi.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,21 +56,23 @@ def python_type(self) -> type:
"""
The Python type of this variable.
"""
if self.vartype == "numeric":
if self.vartype == "numeric" or self.vartype == "integer":
if (self.shift is None) or (self.shift == 0):
return int
return float
return str
elif self.vartype == "float":
return float
else:
return str

@property
def numpy_type(self) -> type:
"""
The Numpy type of this variable. Note that this type must support nullability,
and hence even for integers it is "float64".
"""
if self.vartype == "numeric":
if (self.shift is None) or (self.shift == 0):
return np.float64
# always return a numpy float if it isn't a character var
if self.vartype == "numeric" or self.vartype == "integer" or self.vartype == "float":
return np.float64
return str

Expand All @@ -81,11 +83,15 @@ def pandas_type(self) -> type:
pandas dtypes, and so the integer type is "Int64" and the string type is
"string" (instead of "object")
"""
if self.vartype == "numeric":
if self.vartype == "numeric" or self.vartype == "integer":
if (self.shift is None) or (self.shift == 0):
return pd.Int64Dtype()
# this should probably actually return pd.Float64Dtype()
return np.float64
return pd.StringDtype()
elif self.vartype == "float":
return np.float64
else:
return pd.StringDtype()

@property
def pandas_type_efficient(self) -> type:
Expand All @@ -95,9 +101,7 @@ def pandas_type_efficient(self) -> type:
https://pandas-docs.github.io/pandas-docs-travis/user_guide/integer_na.html
It can be considered as a mix between `self.pandas_type` and `self.numpy_type`
"""
if self.vartype == "numeric":
if (self.shift is None) or (self.shift == 0):
return np.float64
if self.vartype == "numeric" or self.vartype == "integer" or self.vartype == "float":
return np.float64
return pd.StringDtype()

Expand All @@ -115,12 +119,21 @@ def read(cls, elt: Element, ddi_namespace: str) -> VariableDescription:
namespaces = {"ddi": ddi_namespace}

vartype = elt.find("./ddi:varFormat", namespaces).attrib["type"]
# for DDI where no distinction is made between integer and float:
# split the generic "numeric" type into "float" or "integer" using the
# variable's interval attribute and its column width.
intvl = elt.attrib["intrvl"]
wid = int(elt.find("./ddi:location", namespaces).attrib["width"])
if vartype == "numeric":
    if intvl == "contin" and wid > 10:
        # Must be an assignment: the previous `vartype == "float"` was a
        # comparison whose result was discarded, so wide continuous
        # variables silently stayed "numeric" and never became "float".
        vartype = "float"
    else:
        vartype = "integer"

labels_dict = {}
for cat in elt.findall("./ddi:catgry", namespaces):
label = cat.find("./ddi:labl", namespaces).text
value = cat.find("./ddi:catValu", namespaces).text
# make values integers when possible
if vartype == "numeric":
if vartype == "numeric" or vartype == "integer":
labels_dict[label] = int(value)
else:
labels_dict[label] = value
Expand Down
16 changes: 8 additions & 8 deletions tests/test_ddi.py
Original file line number Diff line number Diff line change
Expand Up @@ -279,14 +279,14 @@ def test_ddi_codebook_hierarchical(cps_ddi_hierarchical: ddi.Codebook):
def test_get_all_types(cps_ddi: ddi.Codebook, cps_df: pd.DataFrame):

var_types = {
"YEAR": "numeric",
"SERIAL": "numeric",
"HWTSUPP": "numeric",
"STATEFIP": "numeric",
"MONTH": "numeric",
"PERNUM": "numeric",
"WTSUPP": "numeric",
"INCTOT": "numeric",
"YEAR": "integer",
"SERIAL": "integer",
"HWTSUPP": "integer",
"STATEFIP": "integer",
"MONTH": "integer",
"PERNUM": "integer",
"WTSUPP": "integer",
"INCTOT": "integer",
}

assert cps_ddi.get_all_types(type_format="vartype") == var_types
Expand Down

0 comments on commit 8fbc598

Please sign in to comment.