Skip to content

Commit

Permalink
Merge branch 'main' into joss-paper
Browse files Browse the repository at this point in the history
  • Loading branch information
jmafoster1 authored Jan 10, 2025
2 parents 4c1410b + 641107f commit 8a8ba15
Show file tree
Hide file tree
Showing 2 changed files with 66 additions and 11 deletions.
44 changes: 33 additions & 11 deletions causal_testing/estimation/genetic_programming_regression_fitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,6 @@ def __init__(
)
self.sympy_conversions[name] = conversion

print(self.pset.mapping)
creator.create("FitnessMin", base.Fitness, weights=(-1.0,))
creator.create("Individual", gp.PrimitiveTree, fitness=creator.FitnessMin)

Expand Down Expand Up @@ -235,6 +234,8 @@ def simplify(self, expression: gp.PrimitiveTree) -> sympy.core.Expr:
:return: The simplified expression as a sympy Expr object.
"""
if isinstance(expression, str):
expression = creator.Individual(gp.PrimitiveTree.from_string(expression, self.pset))
return sympy.simplify(self._stringify_for_sympy(expression))

def repair(self, expression: gp.PrimitiveTree) -> gp.PrimitiveTree:
Expand All @@ -248,7 +249,7 @@ def repair(self, expression: gp.PrimitiveTree) -> gp.PrimitiveTree:
"""
eq = f"{self.outcome} ~ {' + '.join(str(x) for x in self.split(expression))}"
try:
# Create model, fit (run) it, give estimates from it]
# Create model, fit (run) it, give estimates from it
model = smf.ols(eq, self.df)
res = model.fit()

Expand Down Expand Up @@ -278,16 +279,25 @@ def fitness(self, expression: gp.PrimitiveTree) -> float:
"""
old_settings = np.seterr(all="raise")
try:
# Create model, fit (run) it, give estimates from it]
if isinstance(expression, str):
expression = creator.Individual(gp.PrimitiveTree.from_string(expression, self.pset))

# Create model, fit (run) it, give estimates from it
func = gp.compile(expression, self.pset)
y_estimates = pd.Series([func(**x) for _, x in self.df[self.features].iterrows()])
y_estimates = pd.Series(
[func(**x) for _, x in self.df[self.features].iterrows()],
index=self.df.index,
)

# Calc errors using an improved normalised mean squared
# Calculate errors using the normalised root mean square error (nrmse),
# which is normalised with respect to the range
sqerrors = (self.df[self.outcome] - y_estimates) ** 2
mean_squared = sqerrors.sum() / len(self.df)
nmse = mean_squared / (self.df[self.outcome].sum() / len(self.df))
nrmse = np.sqrt(sqerrors.sum() / len(self.df)) / (self.df[self.outcome].max() - self.df[self.outcome].min())

if pd.isnull(nrmse) or nrmse.real != nrmse or y_estimates.dtype != self.df.dtypes[self.outcome]:
return (float("inf"),)

return (nmse,)
return (nrmse,)

# Fitness value of infinite if error - not return 1
except (
Expand Down Expand Up @@ -321,18 +331,29 @@ def make_offspring(self, population: list, num_offspring: int) -> list:
offspring.append(child)
return offspring

def run_gp(self, ngen: int, pop_size: int = 20, num_offspring: int = 10, seeds: list = None) -> gp.PrimitiveTree:
# pylint: disable=too-many-arguments
def run_gp(
self,
ngen: int,
pop_size: int = 20,
num_offspring: int = 10,
seeds: list = None,
repair: bool = True,
) -> gp.PrimitiveTree:
"""
Execute Genetic Programming to find the best expression using a mu+lambda algorithm.
:param ngen: The maximum number of generations.
:param pop_size: The population size.
:param num_offspring: The number of new individuals per generation.
:param seeds: Seed individuals for the initial population.
:param repair: Whether to run the linear regression repair operator (defaults to True).
:return: The best candididate expression.
"""
population = [self.toolbox.repair(ind) for ind in self.toolbox.population(n=pop_size)]
population = self.toolbox.population(n=pop_size)
if repair:
population = [self.toolbox.repair(ind) for ind in population]
if seeds is not None:
for seed in seeds:
ind = creator.Individual(gp.PrimitiveTree.from_string(seed, self.pset))
Expand All @@ -348,7 +369,8 @@ def run_gp(self, ngen: int, pop_size: int = 20, num_offspring: int = 10, seeds:
for _ in range(1, ngen + 1):
# Vary the population
offspring = self.make_offspring(population, num_offspring)
offspring = [self.toolbox.repair(ind) for ind in offspring]
if repair:
offspring = [self.toolbox.repair(ind) for ind in offspring]

# Evaluate the individuals with an invalid fitness
for ind in offspring:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,10 +1,43 @@
import unittest
import pandas as pd
from operator import sub

from causal_testing.estimation.genetic_programming_regression_fitter import GP


def root(x):
return x**0.5


class TestGP(unittest.TestCase):
def test_init_invalid_fun_name(self):
with self.assertRaises(ValueError):
GP(df=pd.DataFrame(), features=[], outcome="", max_order=2, sympy_conversions={"power_1": ""})

def test_simplify_string(self):
gp = GP(
df=None,
features=["x1"],
outcome=None,
max_order=1,
)
self.assertEqual(str(gp.simplify("power_1(x1)")), "x1")

def test_fitness(self):
gp = GP(
df=pd.DataFrame({"x1": [1, 2, 3], "outcome": [2, 3, 4]}),
features=["x1"],
outcome="outcome",
max_order=0,
)
self.assertEqual(gp.fitness("add(x1, 1)"), (0,))

def test_fitness_inf(self):
gp = GP(
df=pd.DataFrame({"x1": [1, 2, 3], "outcome": [2, 3, 4]}),
features=["x1"],
outcome="outcome",
max_order=0,
extra_operators=[(root, 1)],
)
self.assertEqual(gp.fitness("root(-1)"), (float("inf"),))

0 comments on commit 8a8ba15

Please sign in to comment.