Skip to content

Commit

Permalink
Merge pull request #17 from staadecker/ms/handle-missing
Browse files Browse the repository at this point in the history
Implement `drop_unmatched`, `keep_unmatched` and `add_dim`
  • Loading branch information
staadecker authored Mar 27, 2024
2 parents 2571fec + af13b62 commit fcedb8a
Show file tree
Hide file tree
Showing 16 changed files with 780 additions and 342 deletions.
13 changes: 4 additions & 9 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,19 +8,14 @@ jobs:
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Set up Python 3.10
- name: Set up Python 3.8
uses: actions/setup-python@v4
with:
python-version: '3.10'
python-version: '3.8'
- name: Install dependencies
run: pip install .
- name: Run ATS
uses: codecov/codecov-ats@v0
env:
CODECOV_STATIC_TOKEN: ${{ secrets.CODECOV_STATIC_TOKEN }}
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
run: pip install --editable .[dev]
- name: Run tests and collect coverage
run: pytest --cov app ${{ env.CODECOV_ATS_TESTS }}
run: pytest --cov=src/pyoframe
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v4-beta
with:
Expand Down
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# Pyoframe

[![codecov](https://codecov.io/gh/staadecker/pyoframe/graph/badge.svg?token=8258XESRYQ)](https://codecov.io/gh/staadecker/pyoframe)

A library to rapidly formulate large optimization models using Pandas or Polars dataframes.

## Acknowledgments
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ dev = [
"isort",
"pip-tools",
"pytest",
"pytest-cov",
"pre-commit",
"gurobipy",
]
Expand Down
3 changes: 2 additions & 1 deletion src/pyoframe/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,12 @@

from pyoframe.monkey_patch import patch_dataframe_libraries
from pyoframe.constraints import Constraint, sum, sum_by, Set
from pyoframe.constants import Config
from pyoframe.variables import Variable
from pyoframe.model import Model
from pyoframe.objective import Objective
from pyoframe.constants import VType

patch_dataframe_libraries()

__all__ = ["sum", "sum_by", "Constraint", "Variable", "Model", "Objective", "Set", "VType"]
__all__ = ["sum", "sum_by", "Constraint", "Variable", "Model", "Objective", "Set", "VType", "Config"]
190 changes: 190 additions & 0 deletions src/pyoframe/arithmetic.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,190 @@
from typing import TYPE_CHECKING, List, Optional
import polars as pl

from pyoframe.constants import COEF_KEY, RESERVED_COL_KEYS, VAR_KEY, UnmatchedStrategy

if TYPE_CHECKING:
from pyoframe.constraints import Expression


class PyoframeError(Exception):
    """Error raised by pyoframe when expressions cannot be combined.

    Within this module it signals unmatched index values or missing
    dimensions encountered while adding expressions.
    """


def add_expressions(*expressions: "Expression") -> "Expression":
    """Add the given expressions, reporting failures with full context.

    The actual work is delegated to ``add_expressions_internal``. If that
    call raises a ``PyoframeError``, a new ``PyoframeError`` is raised
    (chained to the original) whose message lists every operand, rendered
    with ``to_str``, followed by the original error text.

    Parameters
    ----------
    *expressions
        The expressions to add together.

    Returns
    -------
    Expression
        Whatever ``add_expressions_internal`` returns for these operands.

    Raises
    ------
    PyoframeError
        If ``add_expressions_internal`` raised a ``PyoframeError``.
    """
    try:
        total = add_expressions_internal(*expressions)
    except PyoframeError as error:
        # Render each operand so the user can see exactly what was being added.
        rendered_operands = " + ".join(
            [e.to_str(include_header=True, include_footer=False) for e in expressions]
        )
        message = (
            "Failed to add expressions:\n"
            + rendered_operands
            + "\nDue to error:\n"
            + str(error)
        )
        raise PyoframeError(message) from error
    return total


def add_expressions_internal(*expressions: "Expression") -> "Expression":
    """
    Add two or more expressions and return the result as a new expression.

    In the simplest case (all operands have the same dimensions and no join is
    required) the operands' data frames are concatenated, grouped by the
    dimension columns plus ``VAR_KEY``, and the remaining columns are summed.

    When the operands' dimensions differ, or when a join is required, the sum is
    computed two operands at a time. For a pair with differing dimensions, each
    operand is first passed through ``_add_dimension`` against the other. For a
    pair that requires a join, the operands' ``unmatched_strategy`` values decide
    whether index rows present on only one side are dropped, kept, or reported
    as an error.

    Parameters
    ----------
    *expressions
        At least two expressions to add.

    Returns
    -------
    Expression
        Built with ``expressions[0]._new(...)`` from the summed data.

    Raises
    ------
    PyoframeError
        If unmatched index values are found while a strategy of
        ``UnmatchedStrategy.ERROR`` applies (or from ``_add_dimension``).
    AssertionError
        If fewer than two expressions are given, or an internal invariant fails.
    """
    assert len(expressions) > 1, "Need at least two expressions to add together."

    dims = expressions[0].dimensions_unsafe
    # True if any later operand's dimension names differ from the first's (order-insensitive).
    has_dim_conflict = any(
        sorted(dims) != sorted(expr.dimensions_unsafe) for expr in expressions[1:]
    )
    # Truthy only when the first operand has dimensions AND at least one operand's
    # strategy is something other than KEEP. Note this is `dims` itself (a falsy
    # value) rather than False when `dims` is empty.
    requires_join = dims and any(
        expr.unmatched_strategy != UnmatchedStrategy.KEEP for expr in expressions
    )

    # If we cannot use a single .concat, compute the sum in a pairwise manner
    # (the handling below for conflicts and joins only supports two operands).
    if len(expressions) > 2 and (has_dim_conflict or requires_join):
        result = expressions[0]
        for expr in expressions[1:]:
            result = add_expressions_internal(result, expr)
        return result

    if has_dim_conflict:
        assert len(expressions) == 2
        # Expand each operand against the other so both end up with the same dimensions.
        expressions = (
            _add_dimension(expressions[0], expressions[1]),
            _add_dimension(expressions[1], expressions[0]),
        )
        assert sorted(expressions[0].dimensions_unsafe) == sorted(
            expressions[1].dimensions_unsafe
        )

    # Re-read the dimensions: they may have changed if _add_dimension was applied above.
    dims = expressions[0].dimensions_unsafe
    # Check no dims conflict
    assert all(
        sorted(dims) == sorted(expr.dimensions_unsafe) for expr in expressions[1:]
    )
    if requires_join:
        assert len(expressions) == 2
        assert dims != []
        left, right = expressions[0], expressions[1]

        # Order so that drop always comes before keep, and keep always comes before error.
        # This halves the number of strategy pairs the branches below must handle.
        # Only the local left/right names are swapped; `expressions` is untouched.
        if (left.unmatched_strategy, right.unmatched_strategy) in (
            (UnmatchedStrategy.ERROR, UnmatchedStrategy.DROP),
            (UnmatchedStrategy.ERROR, UnmatchedStrategy.KEEP),
            (UnmatchedStrategy.KEEP, UnmatchedStrategy.DROP),
        ):
            left, right = right, left

        def get_indices(expr):
            # Unique combinations of the dimension columns, in first-seen order.
            return expr.data.select(dims).unique(maintain_order=True)

        # Defaults: branches below only overwrite the side(s) they need to filter.
        left_data, right_data = left.data, right.data

        strat = (left.unmatched_strategy, right.unmatched_strategy)

        if strat == (UnmatchedStrategy.DROP, UnmatchedStrategy.DROP):
            # Keep on each side only the rows whose index also exists on the other side.
            left_data = left.data.join(get_indices(right), how="inner", on=dims)
            right_data = right.data.join(get_indices(left), how="inner", on=dims)
        elif strat == (UnmatchedStrategy.ERROR, UnmatchedStrategy.ERROR):
            # Only validates; both sides' data are used unmodified afterwards.
            outer_join = get_indices(left).join(
                get_indices(right), how="outer", on=dims
            )
            # NOTE(review): relies on the outer join leaving the left key column null
            # for right-only rows and exposing the right keys under a "_right" suffix
            # (polars' default suffix) - confirm against the pinned polars version.
            if outer_join.get_column(dims[0]).null_count() > 0:
                raise PyoframeError(
                    "Dataframe has unmatched values. If this is intentional, use .drop_unmatched() or .keep_unmatched()\n"
                    + str(
                        outer_join.filter(outer_join.get_column(dims[0]).is_null())
                    )
                )
            if outer_join.get_column(dims[0] + "_right").null_count() > 0:
                raise PyoframeError(
                    "Dataframe has unmatched values. If this is intentional, use .drop_unmatched() or .keep_unmatched()\n"
                    + str(
                        outer_join.filter(
                            outer_join.get_column(dims[0] + "_right").is_null()
                        )
                    )
                )
        elif strat == (UnmatchedStrategy.DROP, UnmatchedStrategy.KEEP):
            # Rebuild the left side on top of the right side's indices, so left rows
            # whose index is absent from the right are not carried over.
            # NOTE(review): being a left join, right indices with no match on the left
            # presumably yield rows with null non-key columns - confirm this is intended.
            left_data = get_indices(right).join(left.data, how="left", on=dims)
        elif strat == (UnmatchedStrategy.DROP, UnmatchedStrategy.ERROR):
            # Same reshaping as above, but a null COEF_KEY after the join is treated as
            # a right-side index with no match on the left, which is an error here.
            left_data = get_indices(right).join(left.data, how="left", on=dims)
            if left_data.get_column(COEF_KEY).null_count() > 0:
                raise PyoframeError(
                    "Dataframe has unmatched values. If this is intentional, use .drop_unmatched() or .keep_unmatched()\n"
                    + str(
                        left_data.filter(left_data.get_column(COEF_KEY).is_null())
                    )
                )
        elif strat == (UnmatchedStrategy.KEEP, UnmatchedStrategy.ERROR):
            # Anti join: rows of the right side whose index does not appear on the left.
            unmatched = right.data.join(get_indices(left), how="anti", on=dims)
            if len(unmatched) > 0:
                raise PyoframeError(
                    "Dataframe has unmatched values. If this is intentional, use .drop_unmatched() or .keep_unmatched()\n"
                    + str(unmatched)
                )
        else:
            # (KEEP, KEEP) never gets here because requires_join is falsy in that case;
            # every other pair is covered above after the reordering.
            assert False, "This code should've never been reached!"

        expr_data = [left_data, right_data]
    else:
        expr_data = [expr.data for expr in expressions]

    # Sort columns to allow for concat
    expr_data = [e.select(sorted(e.columns)) for e in expr_data]

    data = pl.concat(expr_data, how="vertical_relaxed")
    # Merge rows that share the same index and variable by summing their other columns.
    data = data.group_by(dims + [VAR_KEY], maintain_order=True).sum()
    return expressions[0]._new(data)


def _add_dimension(self: "Expression", target: "Expression") -> "Expression":
    """Expand ``self`` so that it also carries the dimensions of ``target``.

    ``self`` is returned unchanged when ``target`` has no dimensions or when
    ``self`` already has every dimension of ``target``. Otherwise the unique
    dimension rows of ``target`` are joined onto ``self.data``:

    * with no dimensions in common, a cross join is used;
    * with ``UnmatchedStrategy.DROP``, an inner join on the common dimensions;
    * otherwise, a left join on the common dimensions, which must not leave
      any null in the first missing dimension column.

    Raises
    ------
    PyoframeError
        If a missing dimension is not listed in ``self.allowed_new_dims``, or
        if the left join leaves unmatched rows.
    """
    target_dims = target.dimensions
    if target_dims is None:
        return self

    own_dims = self.dimensions
    if own_dims is None:
        shared_dims, missing_dims = [], target_dims
    else:
        shared_dims = [name for name in own_dims if name in target_dims]
        missing_dims = [name for name in target_dims if name not in own_dims]

    # Nothing to add: we already have every dimension of the target.
    if not missing_dims:
        return self

    # New dimensions must have been explicitly permitted on this expression.
    if not set(missing_dims).issubset(set(self.allowed_new_dims)):
        raise PyoframeError(
            f"Dataframe has missing dimensions {missing_dims}. If this is intentional, use .add_dim()\n{self.data}"
        )

    target_index = target.data.select(target_dims).unique(maintain_order=True)

    if not shared_dims:
        broadcast = self.data.join(target_index, how="cross")
        return self._new(broadcast)

    # With the drop strategy, an inner join is enough to take the target's shape.
    if self.unmatched_strategy == UnmatchedStrategy.DROP:
        trimmed = self.data.join(target_index, on=shared_dims, how="inner")
        return self._new(trimmed)

    expanded = self.data.join(target_index, on=shared_dims, how="left")
    unmatched_count = expanded.get_column(missing_dims[0]).null_count()
    if unmatched_count > 0:
        raise PyoframeError(
            f"Cannot add dimension {missing_dims} since it contains unmatched values. If this is intentional, consider using .drop_unmatched()"
        )
    return self._new(expanded)


def get_dimensions(df: pl.DataFrame) -> Optional[List[str]]:
    """
    List the dimension columns of a DataFrame.

    Every column whose name is not in ``RESERVED_COL_KEYS`` counts as a
    dimension. Column order is preserved.

    Parameters
    ----------
    df
        The DataFrame to inspect.

    Returns
    -------
    list of str or None
        The dimension column names, or None (never an empty list) when there
        are none, so that callers must handle that case explicitly.

    Examples
    --------
    >>> import polars as pl
    >>> get_dimensions(pl.DataFrame({"x": [1, 2, 3], "y": [1, 2, 3]}))
    ['x', 'y']
    >>> get_dimensions(pl.DataFrame({"__variable_id": [1, 2, 3]}))
    """
    dimension_cols = [name for name in df.columns if name not in RESERVED_COL_KEYS]
    return dimension_cols or None
9 changes: 9 additions & 0 deletions src/pyoframe/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@

RESERVED_COL_KEYS = (COEF_KEY, VAR_KEY)

class Config:
    """Container for package-wide settings, stored as class attributes."""

    # Off by default.
    # NOTE(review): presumably switches off the unmatched-value checks performed when
    # combining expressions; where it is read is not visible here - confirm.
    disable_unmatched_checks = False


class ConstraintSense(Enum):
LE = "<="
Expand All @@ -31,6 +34,12 @@ class VType(Enum):
INTEGER = "integer"


class UnmatchedStrategy(Enum):
    """
    How index values that appear in only one operand are treated when two
    expressions are added (see add_expressions_internal in arithmetic.py).
    """

    # Unmatched values cause a PyoframeError to be raised.
    ERROR = "error"
    # Rows are restricted via joins against the other operand's index values.
    DROP = "drop"
    # When every operand uses KEEP, no join is performed and all rows are kept.
    KEEP = "keep"


# This is a hack to get the Literal type for VType
# See: https://stackoverflow.com/questions/67292470/type-hinting-enum-member-value-in-python
ObjSenseValue = Literal["minimize", "maximize"]
Expand Down
Loading

0 comments on commit fcedb8a

Please sign in to comment.