Fix #7105: Expose metrics directly on conversation rather than inside events #7107

Closed
wants to merge 3 commits into from
openhands/events/stream.py (38 additions, 0 deletions)
@@ -428,3 +428,41 @@ def get_matching_events(
break

return matching_events

    def get_metrics(self):
        """Get the accumulated metrics from all events in the stream.

        This method extracts metrics from events that contain them and returns
        the aggregated metrics object.

        Returns:
            Metrics: The metrics object containing accumulated cost and token usage data.
            Returns None if no metrics are found.
        """
        from openhands.llm.metrics import Metrics

        # Look for events with metrics
        metrics = None
        events_with_metrics = []

        try:
            # First collect all events with metrics
            for event in self.get_events():
                if hasattr(event, 'llm_metrics') and event.llm_metrics is not None:
                    events_with_metrics.append(event)

            # Then merge them if any were found
            if events_with_metrics:
                # Get the first event with metrics to initialize our metrics object
                first_event = events_with_metrics[0]
                if first_event.llm_metrics is not None:
                    metrics = Metrics(model_name=first_event.llm_metrics.model_name)

                # Merge metrics from all events
                for event in events_with_metrics:
                    if event.llm_metrics is not None:
                        metrics.merge(event.llm_metrics)
        except Exception as e:
            logger.error(f'Error retrieving metrics from events: {e}')

        return metrics
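
As a quick illustration (not part of this diff), here is a minimal sketch of how a caller might use the new `EventStream.get_metrics()`. The `sid` value and the `MagicMock` file store mirror the unit tests added later in this PR and are assumptions, not production wiring.

```python
# Illustrative sketch only: reading aggregated LLM metrics from an event stream.
# The MagicMock file store mirrors the unit tests in this PR; a real deployment
# would pass an actual FileStore implementation instead.
from unittest.mock import MagicMock

from openhands.events.stream import EventStream

stream = EventStream(sid='example-conversation', file_store=MagicMock())

metrics = stream.get_metrics()
if metrics is None:
    print('No LLM metrics recorded on any event yet.')
else:
    prompt = sum(u.prompt_tokens for u in metrics.token_usages)
    completion = sum(u.completion_tokens for u in metrics.token_usages)
    print(f'cost={metrics.accumulated_cost} prompt={prompt} completion={completion}')
```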
openhands/server/routes/conversation.py (68 additions, 0 deletions)
@@ -7,6 +7,74 @@
app = APIRouter(prefix='/api/conversations/{conversation_id}')


@app.get('/metrics')
async def get_conversation_metrics(request: Request):
    """Retrieve the conversation metrics.

    This endpoint returns the accumulated cost and token usage metrics for the conversation.
    Metrics are retrieved directly from the runtime's state rather than reconstructed from events,
    providing a more accurate representation of costs, including those not associated with events.

    Args:
        request (Request): The incoming FastAPI request object.

    Returns:
        JSONResponse: A JSON response containing the metrics data.
    """
    try:
        if not hasattr(request.state, 'conversation'):
            return JSONResponse(
                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                content={'error': 'No conversation found in request state'},
            )

        conversation = request.state.conversation

        # Get metrics directly from the conversation's runtime state
        metrics = conversation.get_metrics()

        # If no metrics from state, fall back to event stream metrics for backward compatibility
        if not metrics and hasattr(conversation.event_stream, 'get_metrics'):
            metrics = conversation.event_stream.get_metrics()

        if not metrics:
            # Return empty metrics if not available
            return JSONResponse(
                status_code=status.HTTP_200_OK,
                content={
                    'accumulated_cost': 0.0,
                    'total_prompt_tokens': 0,
                    'total_completion_tokens': 0,
                    'total_tokens': 0,
                },
            )

        # Calculate total tokens
        total_prompt_tokens = sum(usage.prompt_tokens for usage in metrics.token_usages)
        total_completion_tokens = sum(
            usage.completion_tokens for usage in metrics.token_usages
        )
        total_tokens = total_prompt_tokens + total_completion_tokens

        return JSONResponse(
            status_code=status.HTTP_200_OK,
            content={
                'accumulated_cost': metrics.accumulated_cost,
                'total_prompt_tokens': total_prompt_tokens,
                'total_completion_tokens': total_completion_tokens,
                'total_tokens': total_tokens,
            },
        )
    except Exception as e:
        logger.error(f'Error getting conversation metrics: {e}')
        return JSONResponse(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            content={
                'error': f'Error getting conversation metrics: {e}',
            },
        )
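
For context (not part of the diff), a hedged sketch of how a client might call the new endpoint. The base URL, conversation id, and the absence of authentication are assumptions that will differ per deployment; the response keys mirror the JSON assembled by the route above.

```python
# Illustrative client call; URL, conversation id, and auth handling are assumptions.
import requests

BASE_URL = 'http://localhost:3000'        # hypothetical local server
CONVERSATION_ID = 'example-conversation'  # placeholder id

resp = requests.get(f'{BASE_URL}/api/conversations/{CONVERSATION_ID}/metrics')
resp.raise_for_status()

data = resp.json()
# Keys mirror the JSON built by the route above.
print(data['accumulated_cost'], data['total_prompt_tokens'],
      data['total_completion_tokens'], data['total_tokens'])
```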


@app.get('/config')
async def get_remote_runtime_config(request: Request):
    """Retrieve the runtime configuration.
openhands/server/session/conversation.py (21 additions, 0 deletions)
@@ -46,3 +46,24 @@ async def disconnect(self):
        if self.event_stream:
            self.event_stream.close()
        asyncio.create_task(call_sync_from_async(self.runtime.close))

    def get_metrics(self):
        """Get metrics directly from the runtime's state.

        This method retrieves metrics from the runtime's state object rather than
        reconstructing them from events, providing a more accurate representation
        of costs and token usage, including those not associated with events.

        Returns:
            Metrics: The metrics object containing accumulated cost and token usage data.
            Returns None if no metrics are available or if the runtime has no state.
        """
        try:
            if hasattr(self.runtime, 'state') and self.runtime.state:
                return self.runtime.state.metrics
            return None
        except Exception as e:
            from openhands.core.logger import openhands_logger as logger

            logger.error(f'Error retrieving metrics from runtime state: {e}')
            return None
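
The PR's unit tests below cover only the event-stream path. Purely as an illustrative companion, and assuming the class defined in openhands/server/session/conversation.py is named `Conversation`, a mocked-runtime check of this new method could look roughly like this.

```python
# Illustrative sketch, not part of this PR: exercising get_metrics with a mocked runtime.
from unittest.mock import MagicMock

from openhands.llm.metrics import Metrics
from openhands.server.session.conversation import Conversation  # assumed class name


def test_get_metrics_prefers_runtime_state():
    state_metrics = Metrics(model_name='gpt-4')

    fake_conversation = MagicMock()  # stands in for a Conversation instance
    fake_conversation.runtime.state.metrics = state_metrics

    # Call the real method against the mock "self"; it only touches self.runtime.
    assert Conversation.get_metrics(fake_conversation) is state_metrics
```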
tests/unit/events/test_stream.py (129 additions, 0 deletions)
@@ -0,0 +1,129 @@
from unittest.mock import MagicMock, patch

from openhands.events.event import Event
from openhands.events.stream import EventStream
from openhands.llm.metrics import Metrics


class TestEventStream:
    def test_get_metrics_empty_stream(self):
        """Test that get_metrics returns None for an empty stream."""
        sid = 'test-stream-id'
        file_store = MagicMock()
        stream = EventStream(sid=sid, file_store=file_store)
        assert stream.get_metrics() is None

    def test_get_metrics_no_metrics_in_events(self):
        """Test that get_metrics returns None when no events have metrics."""
        sid = 'test-stream-id'
        file_store = MagicMock()
        stream = EventStream(sid=sid, file_store=file_store)
        event = MagicMock(spec=Event)
        event.llm_metrics = None

        with patch.object(stream, 'get_events', return_value=[event]):
            assert stream.get_metrics() is None

    def test_get_metrics_with_metrics(self):
        """Test that get_metrics correctly aggregates metrics from events."""
        sid = 'test-stream-id'
        file_store = MagicMock()
        stream = EventStream(sid=sid, file_store=file_store)

        # Create mock events with metrics
        event1 = MagicMock(spec=Event)
        metrics1 = Metrics(model_name='gpt-4')
        metrics1.add_token_usage(
            prompt_tokens=10,
            completion_tokens=20,
            cache_read_tokens=0,
            cache_write_tokens=0,
            response_id='resp1',
        )
        event1.llm_metrics = metrics1

        event2 = MagicMock(spec=Event)
        metrics2 = Metrics(model_name='gpt-4')
        metrics2.add_token_usage(
            prompt_tokens=15,
            completion_tokens=25,
            cache_read_tokens=0,
            cache_write_tokens=0,
            response_id='resp2',
        )
        event2.llm_metrics = metrics2

        with patch.object(stream, 'get_events', return_value=[event1, event2]):
            result = stream.get_metrics()

            assert result is not None
            assert result.model_name == 'gpt-4'
            # Check token usages are merged correctly
            total_prompt_tokens = sum(
                usage.prompt_tokens for usage in result.token_usages
            )
            total_completion_tokens = sum(
                usage.completion_tokens for usage in result.token_usages
            )
            assert total_prompt_tokens == 25  # 10 + 15
            assert total_completion_tokens == 45  # 20 + 25
            assert len(result.token_usages) == 2

    def test_get_metrics_with_exception(self):
        """Test that get_metrics handles exceptions gracefully."""
        sid = 'test-stream-id'
        file_store = MagicMock()
        stream = EventStream(sid=sid, file_store=file_store)

        with patch.object(
            stream, 'get_events', side_effect=Exception('Test exception')
        ):
            assert stream.get_metrics() is None

    def test_get_metrics_with_mixed_events(self):
        """Test that get_metrics correctly handles a mix of events with and without metrics."""
        sid = 'test-stream-id'
        file_store = MagicMock()
        stream = EventStream(sid=sid, file_store=file_store)

        # Create mock events, some with metrics and some without
        event1 = MagicMock(spec=Event)
        metrics1 = Metrics(model_name='gpt-4')
        metrics1.add_token_usage(
            prompt_tokens=10,
            completion_tokens=20,
            cache_read_tokens=0,
            cache_write_tokens=0,
            response_id='resp1',
        )
        event1.llm_metrics = metrics1

        event2 = MagicMock(spec=Event)
        event2.llm_metrics = None

        event3 = MagicMock(spec=Event)
        metrics3 = Metrics(model_name='gpt-4')
        metrics3.add_token_usage(
            prompt_tokens=15,
            completion_tokens=25,
            cache_read_tokens=0,
            cache_write_tokens=0,
            response_id='resp3',
        )
        event3.llm_metrics = metrics3

        with patch.object(stream, 'get_events', return_value=[event1, event2, event3]):
            result = stream.get_metrics()

            assert result is not None
            assert result.model_name == 'gpt-4'
            # Check token usages are merged correctly
            total_prompt_tokens = sum(
                usage.prompt_tokens for usage in result.token_usages
            )
            total_completion_tokens = sum(
                usage.completion_tokens for usage in result.token_usages
            )
            assert total_prompt_tokens == 25  # 10 + 15
            assert total_completion_tokens == 45  # 20 + 25
            assert len(result.token_usages) == 2