Added automatic json extraction from the response #21

Merged 4 commits · Mar 22, 2024

Changes from 2 commits
7 changes: 6 additions & 1 deletion allms/domain/response.py
@@ -5,13 +5,18 @@
from allms.domain.input_data import InputData


class ResponseParsingOutput(BaseModel):
response: typing.Optional[typing.Any]
error_message: typing.Optional[str]


class ResponseData(BaseModel):
response: typing.Optional[typing.Any] = None
input_data: typing.Optional[InputData] = None

number_of_prompt_tokens: typing.Optional[int] = None
number_of_generated_tokens: typing.Optional[int] = None
error: typing.Optional[typing.Union[str, Exception]] = None
error: typing.Optional[str] = None

# Without this, only classes inheriting from the pydantic BaseModel are allowed as field types. Exception isn't
# such a class and that's why we need it.
34 changes: 3 additions & 31 deletions allms/models/abstract.py
@@ -13,7 +13,6 @@
from langchain.chat_models.base import BaseChatModel
from langchain.output_parsers import PydanticOutputParser
from langchain.prompts import ChatPromptTemplate
from langchain.schema import OutputParserException
from langchain_core.language_models.llms import create_base_retry_decorator
from langchain_core.prompts import HumanMessagePromptTemplate, SystemMessagePromptTemplate
from langchain_core.prompts.prompt import PromptTemplate
@@ -34,6 +33,7 @@
from allms.domain.prompt_dto import SummaryOutputClass, KeywordsOutputClass
from allms.domain.response import ResponseData
from allms.utils.long_text_processing_utils import get_max_allowed_number_of_tokens
from allms.utils.response_parsing_utils import ResponseParser

logger = logging.getLogger(__name__)

Expand All @@ -58,6 +58,7 @@ def __init__(
self._is_long_text_bypass_enabled: bool = False # Should be false till we fully implement support for long sequences in our package
self._aggregation_strategy: AggregationLogicForLongInputData = AggregationLogicForLongInputData.SIMPLE_CONCATENATION
self._parser: typing.Optional[PydanticOutputParser] = None
self._json_pattern = re.compile(r"{.*?}", re.DOTALL)

if max_output_tokens >= model_total_max_tokens:
raise ValueError("max_output_tokens has to be lower than model_total_max_tokens")
@@ -103,38 +104,9 @@ def generate(
)

if output_data_model_class:
return self._parse_model_output(model_responses)
return ResponseParser(self._parser).parse_model_output(model_responses)
return model_responses

def _parse_response(self, model_response_data: ResponseData) -> typing.Tuple[str, typing.Optional[str]]:
try:
return self._parser.parse(model_response_data.response), None
except OutputParserException as output_parser_exception:
return None, OutputParserException(
f"An OutputParserException has occurred for "
f"The response from model: {model_response_data.response}\n"
f"The exception message: {output_parser_exception}"
)

def _parse_model_output(self, model_responses_data: typing.List[ResponseData]) -> typing.List[ResponseData]:
parsed_responses = []
for model_response_data in model_responses_data:
if not model_response_data.error:
response, error_message = self._parse_response(model_response_data)

parsed_responses.append(ResponseData(
input_data=model_response_data.input_data,
response=response,
error=error_message,
number_of_prompt_tokens=model_response_data.number_of_prompt_tokens,
number_of_generated_tokens=model_response_data.number_of_generated_tokens

))
else:
parsed_responses.append(model_response_data)

return parsed_responses

async def _generate(
self,
prompt: str,
70 changes: 70 additions & 0 deletions allms/utils/response_parsing_utils.py
@@ -0,0 +1,70 @@
import re
import typing

from langchain.output_parsers import PydanticOutputParser
from langchain.schema import OutputParserException

from allms.domain.response import ResponseData, ResponseParsingOutput


class ResponseParser:
def __init__(self, parser: PydanticOutputParser) -> None:
self._json_pattern = re.compile(r"{.*?}", re.DOTALL)
self._parser = parser

def _clean_extracted_json(self, extracted_json: str) -> str:
json_without_newlines = extracted_json.replace("\\n", "")
json_without_backslashes = json_without_newlines.replace("\\", "")

return json_without_backslashes

def _extract_json_from_response(self, model_response_data: ResponseData) -> str:
search_results = self._json_pattern.findall(model_response_data.response)

if len(search_results) == 0:
return model_response_data.response

return self._clean_extracted_json(search_results[0])

def _parse_response(
self,
model_response_data: ResponseData
) -> ResponseParsingOutput:
raw_response = self._extract_json_from_response(model_response_data)

try:
return ResponseParsingOutput(
response=self._parser.parse(raw_response),
error_message=None
)
except OutputParserException as output_parser_exception:
return ResponseParsingOutput(
response=None,
error_message=f"""
An OutputParserException has occurred for the model response: {raw_response}
The exception message: {output_parser_exception}
"""
)

def parse_model_output(
self,
model_responses_data: typing.List[ResponseData]
) -> typing.List[ResponseData]:
parsed_responses = []

for model_response_data in model_responses_data:
if not model_response_data.error:
response_with_error = self._parse_response(model_response_data)

parsed_responses.append(ResponseData(
input_data=model_response_data.input_data,
response=response_with_error.response,
error=response_with_error.error_message,
number_of_prompt_tokens=model_response_data.number_of_prompt_tokens,
number_of_generated_tokens=model_response_data.number_of_generated_tokens

))
else:
parsed_responses.append(model_response_data)

return parsed_responses
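
For orientation (not part of the diff): a minimal sketch of how the new ResponseParser could be exercised on its own, reusing SummaryOutputClass from allms.domain.prompt_dto as the target schema. The chatty response below is modelled on the parametrized test cases added further down; responses that already carry an error are passed through unchanged.

```python
from langchain.output_parsers import PydanticOutputParser

from allms.domain.prompt_dto import SummaryOutputClass
from allms.domain.response import ResponseData
from allms.utils.response_parsing_utils import ResponseParser

# Wrap the Pydantic parser for the target schema in the new ResponseParser.
response_parser = ResponseParser(PydanticOutputParser(pydantic_object=SummaryOutputClass))

# A chatty model response: the JSON object is wrapped in extra prose, so the
# non-greedy {.*?} pattern first extracts it before Pydantic parsing.
raw = ResponseData(response='Sure! Here is the JSON: {"summary": "This is the model output"}')
parsed = response_parser.parse_model_output([raw])

print(parsed[0].response)  # summary='This is the model output'
print(parsed[0].error)     # None
```
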
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "allms"
version = "1.0.1"
version = "1.0.2"
description = ""
authors = ["Allegro Opensource <[email protected]>"]
readme = "README.md"
42 changes: 40 additions & 2 deletions tests/test_output_parser.py
@@ -2,6 +2,7 @@
from unittest.mock import patch

from langchain.schema import OutputParserException
import pytest

from allms.domain.input_data import InputData
from allms.domain.prompt_dto import SummaryOutputClass, KeywordsOutputClass
@@ -41,7 +42,44 @@ def test_output_parser_returns_error_when_model_output_returns_different_field(s
# WHEN & THEN
for model in models.values():
model_response = model.generate(prompt, input_data, SummaryOutputClass)
assert type(model_response[0].error) == OutputParserException
assert "OutputParserException" in model_response[0].error
assert model_response[0].response is None

@patch("langchain.chains.base.Chain.arun")
@patch("langchain_community.llms.vertexai.VertexAI.get_num_tokens")
@pytest.mark.parametrize("json_response", [
("{\"summary\": \"This is the model output\"}"),
("Sure! Here's the JSON you wanted: {\"summary\": \"This is the model output\"} Have a nice day!"),
("<<SYS>>\\n{\\n \"summary\": \"This is the model output\"\\n}\\n<</SYS>>"),
("{\\\"summary\\\": \\\"This is the model output\\\"}\\n}")
])
def test_output_parser_extracts_json_from_response(self, tokens_mock, chain_run_mock, models, json_response):
# GIVEN
chain_run_mock.return_value = json_response
tokens_mock.return_value = 1

input_data = [InputData(input_mappings={"text": "Some dummy text"}, id="1")]
prompt = "Some Dummy Prompt {text}"

# WHEN & THEN
for model in models.values():
model_response = model.generate(prompt, input_data, SummaryOutputClass)
assert model_response[0].response == SummaryOutputClass(summary="This is the model output")

@patch("langchain.chains.base.Chain.arun")
@patch("langchain_community.llms.vertexai.VertexAI.get_num_tokens")
def test_output_parser_returns_error_when_json_is_garbled(self, tokens_mock, chain_run_mock, models):
# GIVEN
chain_run_mock.return_value = "Sure! Here's the JSON you wanted: {\"summary: \"text\"}"
tokens_mock.return_value = 1

input_data = [InputData(input_mappings={"text": "Some dummy text"}, id="1")]
prompt = "Some Dummy Prompt {text}"

# WHEN & THEN
for model in models.values():
model_response = model.generate(prompt, input_data, SummaryOutputClass)
assert "OutputParserException" in model_response[0].error
assert model_response[0].response is None

@patch("langchain.chains.base.Chain.arun")
@@ -94,4 +132,4 @@ def test_model_output_when_input_data_is_empty(self, tokens_mock, chain_run_mock
for model in models.values():
model_response = model.generate(prompt, None, KeywordsOutputClass)
assert model_response[0].response is None
assert type(model_response[0].error) == OutputParserException
assert "OutputParserException" in model_response[0].error