gnosis · kongzii · Feb 20, 2024 · Feb 16, 2024 · Feb 16, 2024 · Feb 16, 2024
diff --git a/prediction_market_agent_tooling/benchmark/agents.py b/prediction_market_agent_tooling/benchmark/agents.py
@@ -1,8 +1,8 @@
 import random
 import typing as t
+from datetime import datetime
 
 from prediction_market_agent_tooling.benchmark.utils import (
-    EvaluatedQuestion,
     OutcomePrediction,
     Prediction,
 )
@@ -13,51 +13,84 @@ def __init__(self, agent_name: str, max_workers: t.Optional[int] = None):
         self.agent_name = agent_name
         self.max_workers = max_workers  # Limit the number of workers that can run this worker in parallel threads
 
-    def evaluate(self, market_question: str) -> EvaluatedQuestion:
-        raise NotImplementedError
+    def is_predictable(self, market_question: str) -> bool:
+        """
+        Override if the agent can decide to not predict the question, before doing the hard work.
+        """
+        return True
 
-    def research(self, market_question: str) -> t.Optional[str]:
+    def predict(self, market_question: str) -> Prediction:
+        """
+        Predict the outcome of the market question.
+        """
         raise NotImplementedError
 
-    def predict(
-        self, market_question: str, researched: str, evaluated: EvaluatedQuestion
+    def check_and_predict(self, market_question: str) -> Prediction:
+        is_predictable = self.is_predictable(market_question=market_question)
+        if not is_predictable:
+            return Prediction(is_predictable=is_predictable)
+        return self.predict(market_question=market_question)
+
+    def is_predictable_restricted(
+        self,
+        market_question: str,
+        time_restriction_up_to: datetime,
+    ) -> bool:
+        """
+        Override if the agent can decide to not predict the question, before doing the hard work.
+
+        Data used for the evaluation must be restricted to the time_restriction_up_to.
+        """
+        return True
+
+    def predict_restricted(
+        self,
+        market_question: str,
+        time_restriction_up_to: datetime,
     ) -> Prediction:
+        """
+        Predict the outcome of the market question.
+
+        Data used for the prediction must be restricted to the time_restriction_up_to.
+        """
         raise NotImplementedError
 
-    def evaluate_research_predict(self, market_question: str) -> Prediction:
-        eval = self.evaluate(market_question=market_question)
-        if not eval.is_predictable:
-            return Prediction(evaluation=eval)
-        researched = self.research(market_question=market_question)
-        if researched is None:
-            return Prediction(evaluation=eval)
-        return self.predict(
+    def check_and_predict_restricted(
+        self,
+        market_question: str,
+        time_restriction_up_to: datetime,
+    ) -> Prediction:
+        """
+        Data used must be restricted to the time_restriction_up_to.
+        """
+        is_predictable = self.is_predictable_restricted(
+            market_question=market_question,
+            time_restriction_up_to=time_restriction_up_to,
+        )
+        if not is_predictable:
+            return Prediction(is_predictable=is_predictable)
+        return self.predict_restricted(
             market_question=market_question,
-            researched=researched,
-            evaluated=eval,
+            time_restriction_up_to=time_restriction_up_to,
         )
 
 
 class RandomAgent(AbstractBenchmarkedAgent):
-    def evaluate(self, market_question: str) -> EvaluatedQuestion:
-        return EvaluatedQuestion(question=market_question, is_predictable=True)
-
-    def research(self, market_question: str) -> str:
-        return ""  # No research for a random agent, but can't be None.
-
-    def predict(
-        self, market_question: str, researched: str, evaluated: EvaluatedQuestion
-    ) -> Prediction:
+    def predict(self, market_question: str) -> Prediction:
         p_yes, confidence = random.random(), random.random()
         return Prediction(
-            evaluation=evaluated,
             outcome_prediction=OutcomePrediction(
                 p_yes=p_yes,
                 confidence=confidence,
                 info_utility=None,
             ),
         )
 
+    def predict_restricted(
+        self, market_question: str, time_restriction_up_to: datetime
+    ) -> Prediction:
+        return self.predict(market_question)
+
 
 class FixedAgent(AbstractBenchmarkedAgent):
     def __init__(
@@ -66,21 +99,17 @@ def __init__(
         super().__init__(agent_name, max_workers)
         self.fixed_answer = fixed_answer
 
-    def evaluate(self, market_question: str) -> EvaluatedQuestion:
-        return EvaluatedQuestion(question=market_question, is_predictable=True)
-
-    def research(self, market_question: str) -> str:
-        return ""  # No research for a fixed agent, but can't be None.
-
-    def predict(
-        self, market_question: str, researched: str, evaluated: EvaluatedQuestion
-    ) -> Prediction:
+    def predict(self, market_question: str) -> Prediction:
         p_yes, confidence = 1.0 if self.fixed_answer else 0.0, 1.0
         return Prediction(
-            evaluation=evaluated,
             outcome_prediction=OutcomePrediction(
                 p_yes=p_yes,
                 confidence=confidence,
                 info_utility=None,
             ),
         )
+
+    def predict_restricted(
+        self, market_question: str, time_restriction_up_to: datetime
+    ) -> Prediction:
+        return self.predict(market_question)
diff --git a/prediction_market_agent_tooling/benchmark/benchmark.py b/prediction_market_agent_tooling/benchmark/benchmark.py
@@ -13,6 +13,7 @@
 from prediction_market_agent_tooling.benchmark.agents import AbstractBenchmarkedAgent
 from prediction_market_agent_tooling.benchmark.utils import (
     Market,
+    MarketResolution,
     Prediction,
     PredictionsCache,
     get_llm_api_call_cost,
@@ -134,8 +135,13 @@ def run_agents(self, enable_timing: bool = True) -> None:
             def get_prediction_result(market: Market) -> tuple[str, Prediction]:
                 with get_openai_callback() as cb:
                     start = time.time()
-                    prediction = agent.evaluate_research_predict(
-                        market_question=market.question
+                    prediction = (
+                        agent.check_and_predict(market_question=market.question)
+                        if not market.is_resolved
+                        else agent.check_and_predict_restricted(
+                            market_question=market.question,
+                            time_restriction_up_to=market.created_time,  # TODO: Add support for resolved_at and any time in between.
+                        )
                     )
 
                     prediction.time = time.time() - start if enable_timing else None
@@ -263,8 +269,9 @@ def _compute_correct_outcome_percentage(
 
         correct_outcome_count = 0
         for p, m in zip(predictions, markets):
-            if (check_not_none(p.outcome_prediction).p_yes > 0.5 and m.p_yes > 0.5) or (
-                check_not_none(p.outcome_prediction).p_yes < 0.5 and m.p_yes < 0.5
+            if (
+                check_not_none(p.outcome_prediction).probable_resolution
+                == m.probable_resolution
             ):
                 correct_outcome_count += 1
 
@@ -279,8 +286,18 @@ def _compute_precision_and_recall_percentages(
         if not predictions:
             return None, None
 
-        ground_truth = [m.p_yes > 0.5 for m in markets]
-        y_pred = [check_not_none(p.outcome_prediction).p_yes > 0.5 for p in predictions]
+        ground_truth = [
+            (1 if m.probable_resolution == MarketResolution.YES else 0) for m in markets
+        ]
+        y_pred = [
+            (
+                1
+                if check_not_none(p.outcome_prediction).probable_resolution
+                == MarketResolution.YES
+                else 0
+            )
+            for p in predictions
+        ]
 
         precision = precision_score(
             ground_truth, y_pred, pos_label=pos_label, zero_division=0.0
@@ -332,9 +349,7 @@ def _compute_mean_time(
     def _compute_ratio_evaluated_as_answerable(
         self, predictions: t.List[Prediction], markets: t.List[Market]
     ) -> float:
-        return sum(
-            1 for p in predictions if p.evaluation and p.evaluation.is_predictable
-        ) / len(predictions)
+        return sum(1 for p in predictions if p.is_predictable) / len(predictions)
 
     def _compute_ratio_answered(
         self, predictions: t.List[Prediction], markets: t.List[Market]
@@ -374,27 +389,27 @@ def get_markets_summary(self) -> t.Dict[str, t.List[str | float]]:
             ]
             markets_summary[f"{agent} p_yes"] = [
                 (
-                    p.outcome_prediction.p_yes
-                    if p.evaluation
-                    and p.evaluation.is_predictable
+                    f"{p.outcome_prediction.p_yes:.2f} [{p.outcome_prediction.probable_resolution.value}]"
+                    if p.is_predictable
                     and p.outcome_prediction  # Is answerable and answered
-                    else "N/A"
-                    if not p.evaluation
-                    and not p.outcome_prediction  # Not evaluated for some reason
-                    else "S"
-                    if p.evaluation
-                    and not p.evaluation.is_predictable  # Skipped (evaluated to be not predictable)
-                    else "F"
-                    if p.evaluation
-                    and p.evaluation.is_predictable
-                    and not p.outcome_prediction  # Failed (no prediction)
-                    else should_not_happen(
-                        f"Unexpected case in get_markets_summary() for {p}."
+                    else (
+                        "S"
+                        if not p.is_predictable  # Skipped (evaluated to be not predictable)
+                        else (
+                            "F"
+                            if p.is_predictable
+                            and not p.outcome_prediction  # Failed (no prediction)
+                            else should_not_happen(
+                                f"Unexpected case in get_markets_summary() for {p}."
+                            )
+                        )
-                    f"{p.outcome_prediction.p_yes:.2f} [{p.outcome_prediction.probable_resolution.value}]"
-                    if p.is_predictable
-                    and p.outcome_prediction  # Is answerable and answered
-                    else "N/A"
-                    if not p.evaluation
-                    and not p.outcome_prediction  # Not evaluated for some reason
-                    else "S"
-                    if p.evaluation
-                    and not p.evaluation.is_predictable  # Skipped (evaluated to be not predictable)
-                    else "F"
-                    if p.evaluation
-                    and p.evaluation.is_predictable
-                    and not p.outcome_prediction  # Failed (no prediction)
-                    else should_not_happen(
-                        f"Unexpected case in get_markets_summary() for {p}."
-                    else (
-                        "S"
-                        if not p.is_predictable  # Skipped (evaluated to be not predictable)
-                        else (
-                            "F"
-                            if p.is_predictable
-                            and not p.outcome_prediction  # Failed (no prediction)
-                            else should_not_happen(
-                                f"Unexpected case in get_markets_summary() for {p}."
-                            )
-                        )
+                    f"{p.outcome_prediction.p_yes:.2f} [{p.outcome_prediction.probable_resolution.value}]"
+                    if p.is_predictable
+                    and p.outcome_prediction  # Is answerable and answered
+                    else (
+                        "S"
+                        if not p.is_predictable  # Skipped (evaluated to be not predictable)
+                        else (
+                            "F"
+                            if p.is_predictable
+                            and not p.outcome_prediction  # Failed (no prediction)
+                            else "Unexpected case"  # Log or handle unexpected cases more gracefully
-                    f"{p.outcome_prediction.p_yes:.2f} [{p.outcome_prediction.probable_resolution.value}]"
-                    if p.is_predictable
-                    and p.outcome_prediction  # Is answerable and answered
-                    else "N/A"
-                    if not p.evaluation
-                    and not p.outcome_prediction  # Not evaluated for some reason
-                    else "S"
-                    if p.evaluation
-                    and not p.evaluation.is_predictable  # Skipped (evaluated to be not predictable)
-                    else "F"
-                    if p.evaluation
-                    and p.evaluation.is_predictable
-                    and not p.outcome_prediction  # Failed (no prediction)
-                    else should_not_happen(
-                        f"Unexpected case in get_markets_summary() for {p}."
-                    else (
-                        "S"
-                        if not p.is_predictable  # Skipped (evaluated to be not predictable)
-                        else (
-                            "F"
-                            if p.is_predictable
-                            and not p.outcome_prediction  # Failed (no prediction)
-                            else should_not_happen(
-                                f"Unexpected case in get_markets_summary() for {p}."
-                            )
-                        )
+                    f"{p.outcome_prediction.p_yes:.2f} [{p.outcome_prediction.probable_resolution.value}]"
+                    if p.is_predictable
+                    and p.outcome_prediction  # Is answerable and answered
+                    else (
+                        "S"
+                        if not p.is_predictable  # Skipped (evaluated to be not predictable)
+                        else (
+                            "F"
+                            if p.is_predictable
+                            and not p.outcome_prediction  # Failed (no prediction)
+                            else "Unexpected case"  # Log or handle unexpected cases more gracefully
                     )
                 )
                 for p in agent_predictions
             ]
-        markets_summary[f"reference p_yes"] = [m.p_yes for m in self.markets]
+        markets_summary[f"reference p_yes"] = [
+            f"{m.p_yes:.2f} [{m.probable_resolution}]" for m in self.markets
+        ]
         return markets_summary
 
     def calculate_expected_returns(
@@ -409,21 +424,20 @@ def calculate_expected_returns(
         # TODO: Add support for different bet sizes -- if we bet a low amount (such as <10 units), the real shares will be very close to that we calculate below (bet_units / share_price),
         # but if one bets a lot, it will change the share price along the way, and so he/she receives less than `bet_units / share_price`, but it's more complicated to calculate.
         bet_units = 10  # Assuming the agent always bet 10 units per market.
-        buy_yes_threshold = 0.5  # If the agent's prediction is > 50% it should buy "yes", otherwise "no".
 
         assert prediction.outcome_prediction is not None
         # Assume that market starts at 50/50 and so the price is 0.5 at the time we are buying it,
         # we can't use {yes,no}_outcome_price atm, because it would just cancel out to EV = 0.0,
         # as it's the same as the probability.
         yes_shares = (
             bet_units / 0.5  # market.yes_outcome_price
-            if prediction.outcome_prediction.p_yes > buy_yes_threshold
+            if prediction.outcome_prediction.probable_resolution == MarketResolution.YES
             and market.yes_outcome_price > 0
             else 0
         )
         no_shares = (
             bet_units / 0.5  # market.no_outcome_price
-            if prediction.outcome_prediction.p_yes <= buy_yes_threshold
+            if prediction.outcome_prediction.probable_resolution == MarketResolution.NO
             and market.no_outcome_price > 0
             else 0
         )