diff --git a/instill/helpers/__init__.py b/instill/helpers/__init__.py
index bc52916e..c34038e9 100644
--- a/instill/helpers/__init__.py
+++ b/instill/helpers/__init__.py
@@ -1,15 +1,26 @@
-from instill.helpers.protobufs.parse import (
-    Metadata,
-    construct_image_to_image_infer_response,
-    construct_image_to_image_metadata_response,
-    construct_infer_response,
-    construct_metadata_response,
-    construct_text_generation_chat_infer_response,
-    construct_text_generation_chat_metadata_response,
-    construct_text_generation_infer_response,
-    construct_text_generation_metadata_response,
-    construct_text_to_image_infer_response,
-    construct_text_to_image_metadata_response,
-    construct_visual_question_answering_infer_response,
-    construct_visual_question_answering_metadata_response,
+# pylint: disable=no-name-in-module
+from instill.helpers.protobufs.ray_pb2 import TriggerRequest, TriggerResponse
+from instill.helpers.ray_io import (
+    construct_task_classification_output,
+    construct_task_detection_output,
+    construct_task_image_to_image_output,
+    construct_task_instance_segmentation_output,
+    construct_task_keypoint_output,
+    construct_task_ocr_output,
+    construct_task_semantic_segmentation_output,
+    construct_task_text_generation_chat_output,
+    construct_task_text_generation_output,
+    construct_task_text_to_image_output,
+    construct_task_visual_question_answering_output,
+    parse_task_classification_to_vision_input,
+    parse_task_detection_to_vision_input,
+    parse_task_image_to_image_input,
+    parse_task_instance_segmentation_to_vision_input,
+    parse_task_keypoint_to_vision_input,
+    parse_task_ocr_to_vision_input,
+    parse_task_semantic_segmentation_to_vision_input,
+    parse_task_text_generation_chat_to_conversation_input,
+    parse_task_text_generation_to_conversation_input,
+    parse_task_text_to_image_input,
+    parse_task_visual_question_answering_to_conversation_multimodal_input,
 )
diff --git a/instill/helpers/const.py b/instill/helpers/const.py
index 644ba03b..e1bedf13 100644
--- a/instill/helpers/const.py
+++ b/instill/helpers/const.py
@@ -4,6 +4,8 @@
 import numpy as np
 
+PROMPT_ROLES = ["user", "assistant", "system"]
+
 
 class DataType(Enum):
     TYPE_BOOL = 1
@@ -21,63 +23,49 @@ class DataType(Enum):
     TYPE_STRING = 13
 
 
-class TextGenerationInput:
-    prompt = ""
+class VisionInput:
+    image: np.ndarray
+
+
+class ConversationInput:
+    conversation: List[Dict[str, str]]
+    max_new_tokens = 100
+    temperature = 0.8
+    top_k = 1
+    seed = 0
+    stop_words: Any = ""  # Optional
+    extra_params: Dict[str, str] = {}
+
+
+class ConversationMultiModelInput:
+    conversation: List[Union[Dict[str, Union[str, Dict[str, str]]]]]
     prompt_images: Union[List[np.ndarray], None] = None
-    chat_history: Union[List[str], None] = None
-    system_message: Union[str, None] = None
     max_new_tokens = 100
     temperature = 0.8
     top_k = 1
-    random_seed = 0
+    seed = 0
     stop_words: Any = ""  # Optional
     extra_params: Dict[str, str] = {}
 
 
 class TextToImageInput:
-    prompt_image: Union[np.ndarray, None] = None
     prompt = ""
-    negative_prompt = ""
     steps = 5
-    guidance_scale = 7.5
+    cfg_scale = 7.5
     seed = 0
     samples = 1
     extra_params: Dict[str, str] = {}
 
 
 class ImageToImageInput:
-    prompt_image: Union[np.ndarray, None] = None
-    prompt = ""
+    prompt = ""
+    prompt_image: Union[np.ndarray, None] = None
     steps = 5
-    guidance_scale = 7.5
+    cfg_scale = 7.5
     seed = 0
     samples = 1
-    extra_params: Dict[str, str] = {}
-
-
-class TextGenerationChatInput:
-    prompt = ""
-    prompt_images: Union[List[np.ndarray], None] = None
-    chat_history: Union[List[str], None] = None
-    system_message: Union[str, None] = None
-    max_new_tokens = 100
-    temperature = 0.8
-    top_k = 1
-    random_seed = 0
-    stop_words: Any = ""  # Optional
-    extra_params: Dict[str, str] = {}
-
-
-class VisualQuestionAnsweringInput:
-    prompt = ""
-    prompt_images: Union[List[np.ndarray], None] = None
-    chat_history: Union[List[str], None] = None
-    system_message: Union[str, None] = None
-    max_new_tokens = 100
-    temperature = 0.8
-    top_k = 1
-    random_seed = 0
-    stop_words: Any = ""  # Optional
+    low_threshold = 100
+    high_threshold = 200
     extra_params: Dict[str, str] = {}
diff --git a/instill/helpers/errors.py b/instill/helpers/errors.py
index 03023bc2..e6ade39f 100644
--- a/instill/helpers/errors.py
+++ b/instill/helpers/errors.py
@@ -16,3 +16,13 @@
     def __str__(
         self,
     ) -> str:
         return f"model config file `instill.yaml` is missing {self.field} field"
+
+
+class InvalidInputException(Exception):
+    def __str__(self) -> str:
+        return "trigger request input error"
+
+
+class InvalidOutputShapeException(Exception):
+    def __str__(self) -> str:
+        return "outputs length not matched"
diff --git a/instill/helpers/protobufs/parse.py b/instill/helpers/protobufs/parse.py
deleted file mode 100644
index 07ff568d..00000000
--- a/instill/helpers/protobufs/parse.py
+++ /dev/null
@@ -1,479 +0,0 @@
-from dataclasses import dataclass
-from typing import List
-
-from instill.helpers.protobufs.ray_pb2 import (
-    ModelMetadataRequest,
-    ModelMetadataResponse,
-    RayServiceCallRequest,
-    RayServiceCallResponse,
-    InferTensor,
-)
-
-from instill.helpers.const import DataType
-
-
-@dataclass
-class Metadata:
-    name: str
-    datatype: str
-    shape: list
-
-
-def construct_metadata_response(
-    req: ModelMetadataRequest,
-    inputs: List[Metadata],
-    outputs: List[Metadata],
-) -> ModelMetadataResponse:
-    resp = ModelMetadataResponse(
-        name=req.name,
-        versions=req.version,
-        framework="python",
-        inputs=[],
-        outputs=[],
-    )
-
-    for i in inputs:
-        resp.inputs.append(
-            ModelMetadataResponse.TensorMetadata(
-                name=i.name,
-                datatype=i.datatype,
-                shape=i.shape,
-            )
-        )
-
-    for o in outputs:
-        resp.outputs.append(
-            ModelMetadataResponse.TensorMetadata(
-                name=o.name,
-                datatype=o.datatype,
-                shape=o.shape,
-            )
-        )
-
-    return resp
-
-
-def construct_infer_response(
-    req: RayServiceCallRequest,
-    outputs: List[Metadata],
-    raw_outputs: List[bytes],
-) -> RayServiceCallResponse:
-    resp = RayServiceCallResponse(
-        model_name=req.model_name,
-        model_version=req.model_version,
-        outputs=[],
-        raw_output_contents=[],
-    )
-
-    for o in outputs:
-        resp.outputs.append(
-            InferTensor(
-                name=o.name,
-                datatype=o.datatype,
-                shape=o.shape,
-            )
-        )
-
-    resp.raw_output_contents.extend(raw_outputs)
-
-    return resp
-
-
-def construct_text_generation_metadata_response(
-    req: ModelMetadataRequest,
-) -> ModelMetadataResponse:
-    return construct_metadata_response(
-        req=req,
-        inputs=[
-            Metadata(
-                name="prompt",
-                datatype=str(DataType.TYPE_STRING.name),
-                shape=[1],
-            ),
-            Metadata(
-                name="prompt_images",
-                datatype=str(DataType.TYPE_STRING.name),
-                shape=[1],
-            ),
-            Metadata(
-                name="chat_history",
-                datatype=str(DataType.TYPE_STRING.name),
-                shape=[1],
-            ),
-            Metadata(
-                name="system_message",
-                datatype=str(DataType.TYPE_STRING.name),
-                shape=[1],
-            ),
-            Metadata(
-                name="max_new_tokens",
-                datatype=str(DataType.TYPE_UINT32.name),
-                shape=[1],
-            ),
-            Metadata(
-                name="temperature",
-                datatype=str(DataType.TYPE_FP32.name),
-                shape=[1],
-            ),
-            Metadata(
-                name="top_k",
-                datatype=str(DataType.TYPE_UINT32.name),
-                shape=[1],
-            ),
-            Metadata(
-                name="random_seed",
-                datatype=str(DataType.TYPE_UINT64.name),
-                shape=[1],
-            ),
-            Metadata(
-                name="extra_params",
-                datatype=str(DataType.TYPE_STRING.name),
-                shape=[1],
-            ),
-        ],
-        outputs=[
-            Metadata(
-                name="text",
-                datatype=str(DataType.TYPE_STRING.name),
-                shape=[-1, -1],
-            ),
-        ],
-    )
-
-
-def construct_text_generation_infer_response(
-    req: RayServiceCallRequest,
-    shape: list,
-    raw_outputs: List[bytes],
-):
-    return construct_infer_response(
-        req=req,
-        outputs=[
-            Metadata(
-                name="text",
-                datatype=str(DataType.TYPE_STRING.name),
-                shape=shape,
-            )
-        ],
-        raw_outputs=raw_outputs,
-    )
-
-
-def construct_text_generation_chat_metadata_response(
-    req: ModelMetadataRequest,
-) -> ModelMetadataResponse:
-    return construct_metadata_response(
-        req=req,
-        inputs=[
-            Metadata(
-                name="prompt",
-                datatype=str(DataType.TYPE_STRING.name),
-                shape=[1],
-            ),
-            Metadata(
-                name="prompt_images",
-                datatype=str(DataType.TYPE_STRING.name),
-                shape=[1],
-            ),
-            Metadata(
-                name="chat_history",
-                datatype=str(DataType.TYPE_STRING.name),
-                shape=[1],
-            ),
-            Metadata(
-                name="system_message",
-                datatype=str(DataType.TYPE_STRING.name),
-                shape=[1],
-            ),
-            Metadata(
-                name="max_new_tokens",
-                datatype=str(DataType.TYPE_UINT32.name),
-                shape=[1],
-            ),
-            Metadata(
-                name="temperature",
-                datatype=str(DataType.TYPE_FP32.name),
-                shape=[1],
-            ),
-            Metadata(
-                name="top_k",
-                datatype=str(DataType.TYPE_UINT32.name),
-                shape=[1],
-            ),
-            Metadata(
-                name="random_seed",
-                datatype=str(DataType.TYPE_UINT64.name),
-                shape=[1],
-            ),
-            Metadata(
-                name="extra_params",
-                datatype=str(DataType.TYPE_STRING.name),
-                shape=[1],
-            ),
-        ],
-        outputs=[
-            Metadata(
-                name="text",
-                datatype=str(DataType.TYPE_STRING.name),
-                shape=[-1, -1],
-            ),
-        ],
-    )
-
-
-def construct_text_generation_chat_infer_response(
-    req: RayServiceCallRequest,
-    shape: list,
-    raw_outputs: List[bytes],
-):
-    return construct_infer_response(
-        req=req,
-        outputs=[
-            Metadata(
-                name="text",
-                datatype=str(DataType.TYPE_STRING.name),
-                shape=shape,
-            )
-        ],
-        raw_outputs=raw_outputs,
-    )
-
-
-def construct_text_to_image_metadata_response(
-    req: ModelMetadataRequest,
-):
-    return construct_metadata_response(
-        req=req,
-        inputs=[
-            Metadata(
-                name="prompt",
-                datatype=str(DataType.TYPE_STRING.name),
-                shape=[1],
-            ),
-            Metadata(
-                name="negative_prompt",
-                datatype=str(DataType.TYPE_STRING.name),
-                shape=[1],
-            ),
-            Metadata(
-                name="prompt_image",
-                datatype=str(DataType.TYPE_STRING.name),
-                shape=[1],
-            ),
-            Metadata(
-                name="samples",
-                datatype=str(DataType.TYPE_INT32.name),
-                shape=[1],
-            ),
-            Metadata(
-                name="scheduler",
-                datatype=str(DataType.TYPE_STRING.name),
-                shape=[1],
-            ),
-            Metadata(
-                name="steps",
-                datatype=str(DataType.TYPE_INT32.name),
-                shape=[1],
-            ),
-            Metadata(
-                name="guidance_scale",
-                datatype=str(DataType.TYPE_FP32.name),
-                shape=[1],
-            ),
-            Metadata(
-                name="seed",
-                datatype=str(DataType.TYPE_INT64.name),
-                shape=[1],
-            ),
-            Metadata(
-                name="extra_params",
-                datatype=str(DataType.TYPE_STRING.name),
-                shape=[1],
-            ),
-        ],
-        outputs=[
-            Metadata(
-                name="images",
-                datatype=str(DataType.TYPE_FP32.name),
-                shape=[-1, -1, -1, -1],
-            ),
-        ],
-    )
-
-
-def construct_text_to_image_infer_response(
-    req: RayServiceCallRequest,
-    shape: list,
-    raw_outputs: List[bytes],
-):
-    return construct_infer_response(
-        req=req,
-        outputs=[
-            Metadata(
-                name="images",
-                datatype=str(DataType.TYPE_FP32.name),
-                shape=shape,
-            )
-        ],
-        raw_outputs=raw_outputs,
-    )
-
-
-def construct_image_to_image_metadata_response(
-    req: ModelMetadataRequest,
-):
-    return construct_metadata_response(
-        req=req,
-        inputs=[
-            Metadata(
-                name="prompt",
-                datatype=str(DataType.TYPE_STRING.name),
-                shape=[1],
-            ),
-            Metadata(
-                name="negative_prompt",
-                datatype=str(DataType.TYPE_STRING.name),
-                shape=[1],
-            ),
-            Metadata(
-                name="prompt_image",
-                datatype=str(DataType.TYPE_STRING.name),
-                shape=[1],
-            ),
-            Metadata(
-                name="samples",
-                datatype=str(DataType.TYPE_INT32.name),
-                shape=[1],
-            ),
-            Metadata(
-                name="scheduler",
-                datatype=str(DataType.TYPE_STRING.name),
-                shape=[1],
-            ),
-            Metadata(
-                name="steps",
-                datatype=str(DataType.TYPE_INT32.name),
-                shape=[1],
-            ),
-            Metadata(
-                name="guidance_scale",
-                datatype=str(DataType.TYPE_FP32.name),
-                shape=[1],
-            ),
-            Metadata(
-                name="seed",
-                datatype=str(DataType.TYPE_INT64.name),
-                shape=[1],
-            ),
-            Metadata(
-                name="extra_params",
-                datatype=str(DataType.TYPE_STRING.name),
-                shape=[1],
-            ),
-        ],
-        outputs=[
-            Metadata(
-                name="images",
-                datatype=str(DataType.TYPE_FP32.name),
-                shape=[-1, -1, -1, -1],
-            ),
-        ],
-    )
-
-
-def construct_image_to_image_infer_response(
-    req: RayServiceCallRequest,
-    shape: list,
-    raw_outputs: List[bytes],
-):
-    return construct_infer_response(
-        req=req,
-        outputs=[
-            Metadata(
-                name="images",
-                datatype=str(DataType.TYPE_FP32.name),
-                shape=shape,
-            )
-        ],
-        raw_outputs=raw_outputs,
-    )
-
-
-def construct_visual_question_answering_metadata_response(
-    req: ModelMetadataRequest,
-):
-    return construct_metadata_response(
-        req=req,
-        inputs=[
-            Metadata(
-                name="prompt",
-                datatype=str(DataType.TYPE_STRING.name),
-                shape=[1],
-            ),
-            Metadata(
-                name="prompt_images",
-                datatype=str(DataType.TYPE_STRING.name),
-                shape=[1],
-            ),
-            Metadata(
-                name="chat_history",
-                datatype=str(DataType.TYPE_STRING.name),
-                shape=[1],
-            ),
-            Metadata(
-                name="system_message",
-                datatype=str(DataType.TYPE_STRING.name),
-                shape=[1],
-            ),
-            Metadata(
-                name="max_new_tokens",
-                datatype=str(DataType.TYPE_UINT32.name),
-                shape=[1],
-            ),
-            Metadata(
-                name="temperature",
-                datatype=str(DataType.TYPE_FP32.name),
-                shape=[1],
-            ),
-            Metadata(
-                name="top_k",
-                datatype=str(DataType.TYPE_UINT32.name),
-                shape=[1],
-            ),
-            Metadata(
-                name="random_seed",
-                datatype=str(DataType.TYPE_UINT64.name),
-                shape=[1],
-            ),
-            Metadata(
-                name="extra_params",
-                datatype=str(DataType.TYPE_STRING.name),
-                shape=[1],
-            ),
-        ],
-        outputs=[
-            Metadata(
-                name="text",
-                datatype=str(DataType.TYPE_STRING.name),
-                shape=[-1, -1],
-            ),
-        ],
-    )
-
-
-def construct_visual_question_answering_infer_response(
-    req: RayServiceCallRequest,
-    shape: list,
-    raw_outputs: List[bytes],
-):
-    return construct_infer_response(
-        req=req,
-        outputs=[
-            Metadata(
-                name="text",
-                datatype=str(DataType.TYPE_STRING.name),
-                shape=shape,
-            )
-        ],
-        raw_outputs=raw_outputs,
-    )
diff --git a/instill/helpers/protobufs/ray_pb2.py b/instill/helpers/protobufs/ray_pb2.py
index 0fbd8150..db5a5a82 100644
--- a/instill/helpers/protobufs/ray_pb2.py
+++ b/instill/helpers/protobufs/ray_pb2.py
@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-
 # Generated by the protocol buffer compiler. DO NOT EDIT!
-# source: ray.proto
+# source: ray/v1/ray.proto
+# Protobuf Python Version: 4.25.1
 """Generated protocol buffer code."""
 from google.protobuf import descriptor as _descriptor
 from google.protobuf import descriptor_pool as _descriptor_pool
@@ -11,30 +12,21 @@
 _sym_db = _symbol_database.Default()
 
+from google.protobuf import struct_pb2 as google_dot_protobuf_dot_struct__pb2
 
-DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\tray.proto\x12\tray.serve\"5\n\x14ModelMetadataRequest\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x0f\n\x07version\x18\x02 \x01(\t\"<\n\x0bInferTensor\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x10\n\x08\x64\x61tatype\x18\x02 \x01(\t\x12\r\n\x05shape\x18\x03 \x03(\x03\"\x8e\x02\n\x15ModelMetadataResponse\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x10\n\x08versions\x18\x02 \x03(\t\x12\x11\n\tframework\x18\x03 \x01(\t\x12?\n\x06inputs\x18\x04 \x03(\x0b\x32/.ray.serve.ModelMetadataResponse.TensorMetadata\x12@\n\x07outputs\x18\x05 \x03(\x0b\x32/.ray.serve.ModelMetadataResponse.TensorMetadata\x1a?\n\x0eTensorMetadata\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x10\n\x08\x64\x61tatype\x18\x02 \x01(\t\x12\r\n\x05shape\x18\x03 \x03(\x03\"\x80\x02\n\x15RayServiceCallRequest\x12\x12\n\nmodel_name\x18\x01 \x01(\t\x12\x15\n\rmodel_version\x18\x02 \x01(\t\x12&\n\x06inputs\x18\x03 \x03(\x0b\x32\x16.ray.serve.InferTensor\x12L\n\x07outputs\x18\x06 \x03(\x0b\x32;.ray.serve.RayServiceCallRequest.InferRequestedOutputTensor\x12\x1a\n\x12raw_input_contents\x18\x07 \x03(\x0c\x1a*\n\x1aInferRequestedOutputTensor\x12\x0c\n\x04name\x18\x01 \x01(\t\"\x89\x01\n\x16RayServiceCallResponse\x12\x12\n\nmodel_name\x18\x01 \x01(\t\x12\x15\n\rmodel_version\x18\x02 \x01(\t\x12\'\n\x07outputs\x18\x05 \x03(\x0b\x32\x16.ray.serve.InferTensor\x12\x1b\n\x13raw_output_contents\x18\x06 \x03(\x0c\x32\xb5\x01\n\nRayService\x12T\n\rModelMetadata\x12\x1f.ray.serve.ModelMetadataRequest\x1a .ray.serve.ModelMetadataResponse\"\x00\x12Q\n\x08__call__\x12 .ray.serve.RayServiceCallRequest\x1a!.ray.serve.RayServiceCallResponse\"\x00\x42\x07Z\x05./rayb\x06proto3')
+DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x10ray/v1/ray.proto\x12\x06ray.v1\x1a\x1cgoogle/protobuf/struct.proto\">\n\x0eTriggerRequest\x12,\n\x0btask_inputs\x18\x01 \x03(\x0b\x32\x17.google.protobuf.Struct\"@\n\x0fTriggerResponse\x12-\n\x0ctask_outputs\x18\x01 \x03(\x0b\x32\x17.google.protobuf.Struct2J\n\nRayService\x12<\n\x07Trigger\x12\x16.ray.v1.TriggerRequest\x1a\x17.ray.v1.TriggerResponse\"\x00\x42\rZ\x0b./rayserverb\x06proto3')
 
 _globals = globals()
 _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
-_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'ray_pb2', _globals)
+_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'ray.v1.ray_pb2', _globals)
 if _descriptor._USE_C_DESCRIPTORS == False:
-  DESCRIPTOR._options = None
-  DESCRIPTOR._serialized_options = b'Z\005./ray'
-  _globals['_MODELMETADATAREQUEST']._serialized_start=24
-  _globals['_MODELMETADATAREQUEST']._serialized_end=77
-  _globals['_INFERTENSOR']._serialized_start=79
-  _globals['_INFERTENSOR']._serialized_end=139
-  _globals['_MODELMETADATARESPONSE']._serialized_start=142
-  _globals['_MODELMETADATARESPONSE']._serialized_end=412
-  _globals['_MODELMETADATARESPONSE_TENSORMETADATA']._serialized_start=349
-  _globals['_MODELMETADATARESPONSE_TENSORMETADATA']._serialized_end=412
-  _globals['_RAYSERVICECALLREQUEST']._serialized_start=415
-  _globals['_RAYSERVICECALLREQUEST']._serialized_end=671
-  _globals['_RAYSERVICECALLREQUEST_INFERREQUESTEDOUTPUTTENSOR']._serialized_start=629
-  _globals['_RAYSERVICECALLREQUEST_INFERREQUESTEDOUTPUTTENSOR']._serialized_end=671
-  _globals['_RAYSERVICECALLRESPONSE']._serialized_start=674
-  _globals['_RAYSERVICECALLRESPONSE']._serialized_end=811
-  _globals['_RAYSERVICE']._serialized_start=814
-  _globals['_RAYSERVICE']._serialized_end=995
+  _globals['DESCRIPTOR']._options = None
+  _globals['DESCRIPTOR']._serialized_options = b'Z\013./rayserver'
+  _globals['_TRIGGERREQUEST']._serialized_start=58
+  _globals['_TRIGGERREQUEST']._serialized_end=120
+  _globals['_TRIGGERRESPONSE']._serialized_start=122
+  _globals['_TRIGGERRESPONSE']._serialized_end=186
+  _globals['_RAYSERVICE']._serialized_start=188
+  _globals['_RAYSERVICE']._serialized_end=262
 # @@protoc_insertion_point(module_scope)
diff --git a/instill/helpers/protobufs/ray_pb2.pyi b/instill/helpers/protobufs/ray_pb2.pyi
index 4a67105b..cab0b3ac 100644
--- a/instill/helpers/protobufs/ray_pb2.pyi
+++ b/instill/helpers/protobufs/ray_pb2.pyi
@@ -1,3 +1,4 @@
+from google.protobuf import struct_pb2 as _struct_pb2
 from google.protobuf.internal import containers as _containers
 from google.protobuf import descriptor as _descriptor
 from google.protobuf import message as _message
@@ -5,74 +6,14 @@ from typing import ClassVar as _ClassVar, Iterable as _Iterable, Mapping as _Map
 
 DESCRIPTOR: _descriptor.FileDescriptor
 
-class ModelMetadataRequest(_message.Message):
-    __slots__ = ["name", "version"]
-    NAME_FIELD_NUMBER: _ClassVar[int]
-    VERSION_FIELD_NUMBER: _ClassVar[int]
-    name: str
-    version: str
-    def __init__(self, name: _Optional[str] = ..., version: _Optional[str] = ...) -> None: ...
+class TriggerRequest(_message.Message):
+    __slots__ = ("task_inputs",)
+    TASK_INPUTS_FIELD_NUMBER: _ClassVar[int]
+    task_inputs: _containers.RepeatedCompositeFieldContainer[_struct_pb2.Struct]
+    def __init__(self, task_inputs: _Optional[_Iterable[_Union[_struct_pb2.Struct, _Mapping]]] = ...) -> None: ...
 
-class InferTensor(_message.Message):
-    __slots__ = ["name", "datatype", "shape"]
-    NAME_FIELD_NUMBER: _ClassVar[int]
-    DATATYPE_FIELD_NUMBER: _ClassVar[int]
-    SHAPE_FIELD_NUMBER: _ClassVar[int]
-    name: str
-    datatype: str
-    shape: _containers.RepeatedScalarFieldContainer[int]
-    def __init__(self, name: _Optional[str] = ..., datatype: _Optional[str] = ..., shape: _Optional[_Iterable[int]] = ...) -> None: ...
-
-class ModelMetadataResponse(_message.Message):
-    __slots__ = ["name", "versions", "framework", "inputs", "outputs"]
-    class TensorMetadata(_message.Message):
-        __slots__ = ["name", "datatype", "shape"]
-        NAME_FIELD_NUMBER: _ClassVar[int]
-        DATATYPE_FIELD_NUMBER: _ClassVar[int]
-        SHAPE_FIELD_NUMBER: _ClassVar[int]
-        name: str
-        datatype: str
-        shape: _containers.RepeatedScalarFieldContainer[int]
-        def __init__(self, name: _Optional[str] = ..., datatype: _Optional[str] = ..., shape: _Optional[_Iterable[int]] = ...) -> None: ...
-    NAME_FIELD_NUMBER: _ClassVar[int]
-    VERSIONS_FIELD_NUMBER: _ClassVar[int]
-    FRAMEWORK_FIELD_NUMBER: _ClassVar[int]
-    INPUTS_FIELD_NUMBER: _ClassVar[int]
-    OUTPUTS_FIELD_NUMBER: _ClassVar[int]
-    name: str
-    versions: _containers.RepeatedScalarFieldContainer[str]
-    framework: str
-    inputs: _containers.RepeatedCompositeFieldContainer[ModelMetadataResponse.TensorMetadata]
-    outputs: _containers.RepeatedCompositeFieldContainer[ModelMetadataResponse.TensorMetadata]
-    def __init__(self, name: _Optional[str] = ..., versions: _Optional[_Iterable[str]] = ..., framework: _Optional[str] = ..., inputs: _Optional[_Iterable[_Union[ModelMetadataResponse.TensorMetadata, _Mapping]]] = ..., outputs: _Optional[_Iterable[_Union[ModelMetadataResponse.TensorMetadata, _Mapping]]] = ...) -> None: ...
-
-class RayServiceCallRequest(_message.Message):
-    __slots__ = ["model_name", "model_version", "inputs", "outputs", "raw_input_contents"]
-    class InferRequestedOutputTensor(_message.Message):
-        __slots__ = ["name"]
-        NAME_FIELD_NUMBER: _ClassVar[int]
-        name: str
-        def __init__(self, name: _Optional[str] = ...) -> None: ...
-    MODEL_NAME_FIELD_NUMBER: _ClassVar[int]
-    MODEL_VERSION_FIELD_NUMBER: _ClassVar[int]
-    INPUTS_FIELD_NUMBER: _ClassVar[int]
-    OUTPUTS_FIELD_NUMBER: _ClassVar[int]
-    RAW_INPUT_CONTENTS_FIELD_NUMBER: _ClassVar[int]
-    model_name: str
-    model_version: str
-    inputs: _containers.RepeatedCompositeFieldContainer[InferTensor]
-    outputs: _containers.RepeatedCompositeFieldContainer[RayServiceCallRequest.InferRequestedOutputTensor]
-    raw_input_contents: _containers.RepeatedScalarFieldContainer[bytes]
-    def __init__(self, model_name: _Optional[str] = ..., model_version: _Optional[str] = ..., inputs: _Optional[_Iterable[_Union[InferTensor, _Mapping]]] = ..., outputs: _Optional[_Iterable[_Union[RayServiceCallRequest.InferRequestedOutputTensor, _Mapping]]] = ..., raw_input_contents: _Optional[_Iterable[bytes]] = ...) -> None: ...
-
-class RayServiceCallResponse(_message.Message):
-    __slots__ = ["model_name", "model_version", "outputs", "raw_output_contents"]
-    MODEL_NAME_FIELD_NUMBER: _ClassVar[int]
-    MODEL_VERSION_FIELD_NUMBER: _ClassVar[int]
-    OUTPUTS_FIELD_NUMBER: _ClassVar[int]
-    RAW_OUTPUT_CONTENTS_FIELD_NUMBER: _ClassVar[int]
-    model_name: str
-    model_version: str
-    outputs: _containers.RepeatedCompositeFieldContainer[InferTensor]
-    raw_output_contents: _containers.RepeatedScalarFieldContainer[bytes]
-    def __init__(self, model_name: _Optional[str] = ..., model_version: _Optional[str] = ..., outputs: _Optional[_Iterable[_Union[InferTensor, _Mapping]]] = ..., raw_output_contents: _Optional[_Iterable[bytes]] = ...) -> None: ...
+class TriggerResponse(_message.Message):
+    __slots__ = ("task_outputs",)
+    TASK_OUTPUTS_FIELD_NUMBER: _ClassVar[int]
+    task_outputs: _containers.RepeatedCompositeFieldContainer[_struct_pb2.Struct]
+    def __init__(self, task_outputs: _Optional[_Iterable[_Union[_struct_pb2.Struct, _Mapping]]] = ...) -> None: ...
diff --git a/instill/helpers/protobufs/ray_pb2_grpc.py b/instill/helpers/protobufs/ray_pb2_grpc.py
index cb1dec7b..7206243f 100644
--- a/instill/helpers/protobufs/ray_pb2_grpc.py
+++ b/instill/helpers/protobufs/ray_pb2_grpc.py
@@ -2,7 +2,7 @@
 """Client and server classes corresponding to protobuf-defined services."""
 import grpc
 
-import ray_pb2 as ray__pb2
+import ray_pb2 as ray_dot_v1_dot_ray__pb2
 
 
 class RayServiceStub(object):
@@ -15,15 +15,10 @@ def __init__(self, channel):
         Args:
            channel: A grpc.Channel.
""" - self.ModelMetadata = channel.unary_unary( - '/ray.serve.RayService/ModelMetadata', - request_serializer=ray__pb2.ModelMetadataRequest.SerializeToString, - response_deserializer=ray__pb2.ModelMetadataResponse.FromString, - ) - self.__call__ = channel.unary_unary( - '/ray.serve.RayService/__call__', - request_serializer=ray__pb2.RayServiceCallRequest.SerializeToString, - response_deserializer=ray__pb2.RayServiceCallResponse.FromString, + self.Trigger = channel.unary_unary( + '/ray.v1.RayService/Trigger', + request_serializer=ray_dot_v1_dot_ray__pb2.TriggerRequest.SerializeToString, + response_deserializer=ray_dot_v1_dot_ray__pb2.TriggerResponse.FromString, ) @@ -31,16 +26,8 @@ class RayServiceServicer(object): """Ray service for internal process """ - def ModelMetadata(self, request, context): - """ModelMetadata method receives a ModelMetadataRequest message and - returns a ModelMetadataResponse - """ - context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details('Method not implemented!') - raise NotImplementedError('Method not implemented!') - - def __call__(self, request, context): - """__call__ method is the defaut trigger entry for ray deployment + def Trigger(self, request, context): + """Trigger method is the defaut trigger entry for ray deployment """ context.set_code(grpc.StatusCode.UNIMPLEMENTED) context.set_details('Method not implemented!') @@ -49,19 +36,14 @@ def __call__(self, request, context): def add_RayServiceServicer_to_server(servicer, server): rpc_method_handlers = { - 'ModelMetadata': grpc.unary_unary_rpc_method_handler( - servicer.ModelMetadata, - request_deserializer=ray__pb2.ModelMetadataRequest.FromString, - response_serializer=ray__pb2.ModelMetadataResponse.SerializeToString, - ), - '__call__': grpc.unary_unary_rpc_method_handler( - servicer.__call__, - request_deserializer=ray__pb2.RayServiceCallRequest.FromString, - response_serializer=ray__pb2.RayServiceCallResponse.SerializeToString, + 'Trigger': grpc.unary_unary_rpc_method_handler( + servicer.Trigger, + request_deserializer=ray_dot_v1_dot_ray__pb2.TriggerRequest.FromString, + response_serializer=ray_dot_v1_dot_ray__pb2.TriggerResponse.SerializeToString, ), } generic_handler = grpc.method_handlers_generic_handler( - 'ray.serve.RayService', rpc_method_handlers) + 'ray.v1.RayService', rpc_method_handlers) server.add_generic_rpc_handlers((generic_handler,)) @@ -71,24 +53,7 @@ class RayService(object): """ @staticmethod - def ModelMetadata(request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary(request, target, '/ray.serve.RayService/ModelMetadata', - ray__pb2.ModelMetadataRequest.SerializeToString, - ray__pb2.ModelMetadataResponse.FromString, - options, channel_credentials, - insecure, call_credentials, compression, wait_for_ready, timeout, metadata) - - @staticmethod - def __call__(request, + def Trigger(request, target, options=(), channel_credentials=None, @@ -98,8 +63,8 @@ def __call__(request, wait_for_ready=None, timeout=None, metadata=None): - return grpc.experimental.unary_unary(request, target, '/ray.serve.RayService/__call__', - ray__pb2.RayServiceCallRequest.SerializeToString, - ray__pb2.RayServiceCallResponse.FromString, + return grpc.experimental.unary_unary(request, target, '/ray.v1.RayService/Trigger', + ray_dot_v1_dot_ray__pb2.TriggerRequest.SerializeToString, + ray_dot_v1_dot_ray__pb2.TriggerResponse.FromString, options, 
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
diff --git a/instill/helpers/ray_io.py b/instill/helpers/ray_io.py
index c63f7888..5de4ba66 100644
--- a/instill/helpers/ray_io.py
+++ b/instill/helpers/ray_io.py
@@ -1,706 +1,1035 @@
+# pylint: disable=no-member,no-name-in-module, inconsistent-return-statements
 import base64
 import io
-import json
-import struct
-from json.decoder import JSONDecodeError
-from typing import List
+import re
+from typing import Dict, List, Union
 
-import numpy as np
+import requests
+from google.protobuf import json_format, struct_pb2
 from PIL import Image
 
+import instill.protogen.model.model.v1alpha.common_pb2 as commonpb
+import instill.protogen.model.model.v1alpha.model_pb2 as modelpb
+import instill.protogen.model.model.v1alpha.task_classification_pb2 as classificationpb
+import instill.protogen.model.model.v1alpha.task_detection_pb2 as detectionpb
+import instill.protogen.model.model.v1alpha.task_image_to_image_pb2 as imagetoimagepb
+import instill.protogen.model.model.v1alpha.task_instance_segmentation_pb2 as instancesegmentationpb
+import instill.protogen.model.model.v1alpha.task_keypoint_pb2 as keypointpb
+import instill.protogen.model.model.v1alpha.task_ocr_pb2 as ocrpb
+import instill.protogen.model.model.v1alpha.task_semantic_segmentation_pb2 as semanticsegmentationpb
+import instill.protogen.model.model.v1alpha.task_text_generation_chat_pb2 as textgenerationchatpb
+import instill.protogen.model.model.v1alpha.task_text_generation_pb2 as textgenerationpb
+import instill.protogen.model.model.v1alpha.task_text_to_image_pb2 as texttoimagepb
+import instill.protogen.model.model.v1alpha.task_visual_question_answering_pb2 as visualquestionansweringpb
 from instill.helpers.const import (
+    PROMPT_ROLES,
+    ConversationInput,
+    ConversationMultiModelInput,
     ImageToImageInput,
-    TextGenerationChatInput,
-    TextGenerationInput,
     TextToImageInput,
-    VisualQuestionAnsweringInput,
+    VisionInput,
 )
+from instill.helpers.errors import InvalidInputException, InvalidOutputShapeException
+from instill.helpers.protobufs.ray_pb2 import TriggerRequest, TriggerResponse
+
+
+def base64_to_pil_image(base64_str):
+    return Image.open(
+        io.BytesIO(
+            base64.b64decode(
+                re.sub(
+                    "^data:image/.+;base64,",
+                    "",
+                    base64_str,
+                )
+            )
+        )
+    )
+
+
+def url_to_pil_image(url):
+    resp = requests.get(url, timeout=10)
+    resp.raise_for_status()
+    return Image.open(io.BytesIO(resp.content))
+
+
+def protobuf_to_struct(pb_msg):
+    """Convert Protobuf message to Struct"""
+    dict_data = json_format.MessageToDict(pb_msg)
+
+    # Convert dictionary to struct_pb2.Struct
+    struct_pb = struct_pb2.Struct()
+    json_format.ParseDict(dict_data, struct_pb)
+
+    return struct_pb
+
+
+def struct_to_protobuf(struct_pb, pb_message_type):
+    """Convert Struct to Protobuf message"""
+    dict_data = json_format.MessageToDict(struct_pb)
+
+    # Parse dictionary to Protobuf message
+    pb_msg = pb_message_type()
+    json_format.ParseDict(dict_data, pb_msg)
+
+    return pb_msg
+
+
+def struct_to_dict(struct_obj):
+    """Convert Protobuf Struct to dictionary"""
+    if isinstance(struct_obj, struct_pb2.Struct):
+        return {k: struct_to_dict(v) for k, v in struct_obj.fields.items()}
+    if isinstance(struct_obj, struct_pb2.ListValue):
+        return [struct_to_dict(v) for v in struct_obj.values]
+    if isinstance(struct_obj, struct_pb2.Value):
+        kind = struct_obj.WhichOneof("kind")
+        if kind == "null_value":
+            return None
+        if kind == "number_value":
+            return struct_obj.number_value
+        if kind == "string_value":
+            return struct_obj.string_value
+        if kind == "bool_value":
+            return struct_obj.bool_value
+        if kind == "struct_value":
+            return struct_to_dict(struct_obj.struct_value)
+        if kind == "list_value":
+            return struct_to_dict(struct_obj.list_value)
+    else:
+        return struct_obj
+
+
+def parse_task_classification_to_vision_input(
+    request: TriggerRequest,
+) -> List[VisionInput]:
+    input_list = []
+    for task_input in request.task_inputs:
+        task_input_pb = struct_to_protobuf(task_input, modelpb.TaskInput)
+
+        classification_pb: classificationpb.ClassificationInput = (
+            task_input_pb.classification
+        )
+
+        inp = VisionInput()
+        if (
+            classification_pb.image_base64 != "" and classification_pb.image_url != ""
+        ) or (
+            classification_pb.image_base64 == "" and classification_pb.image_url == ""
+        ):
+            raise InvalidInputException
+        if classification_pb.image_base64 != "":
+            inp.image = base64_to_pil_image(classification_pb.image_base64)
+        elif classification_pb.image_url != "":
+            inp.image = url_to_pil_image(classification_pb.image_url)
+
+        input_list.append(inp)
+
+    return input_list
+
+
+def construct_task_classification_output(
+    categories: List[str],
+    scores: List[float],
+) -> TriggerResponse:
+    if not len(categories) == len(scores):
+        raise InvalidOutputShapeException
+
+    task_outputs = []
+    for category, score in zip(categories, scores):
+
+        task_outputs.append(
+            protobuf_to_struct(
+                modelpb.TaskOutput(
+                    classification=classificationpb.ClassificationOutput(
+                        category=category, score=score
+                    )
+                )
+            )
+        )
+    return TriggerResponse(task_outputs=task_outputs)
 
-def serialize_byte_tensor(input_tensor):
-    """
-    Serializes a bytes tensor into a flat numpy array of length prepended
-    bytes. The numpy array should use dtype of np.object_. For np.bytes_,
-    numpy will remove trailing zeros at the end of byte sequence and because
-    of this it should be avoided.
-    Parameters
-    ----------
-    input_tensor : np.array
-        The bytes tensor to serialize.
-    Returns
-    -------
-    serialized_bytes_tensor : np.array
-        The 1-D numpy array of type uint8 containing the serialized bytes in 'C' order.
-    Raises
-    ------
-    InferenceServerException
-        If unable to serialize the given tensor.
-    """
-    if input_tensor.size == 0:
-        return ()
-
-    # If the input is a tensor of string/bytes objects, then must flatten those
-    # into a 1-dimensional array containing the 4-byte byte size followed by the
-    # actual element bytes. All elements are concatenated together in "C" order.
-    if (input_tensor.dtype == np.object_) or (input_tensor.dtype.type == np.bytes_):
-        flattened_ls: list = []
-        for obj in np.nditer(input_tensor, flags=["refs_ok"], order="C"):
-            # If directly passing bytes to BYTES type,
-            # don't convert it to str as Python will encode the
-            # bytes which may distort the meaning
-            assert isinstance(obj, np.ndarray)
-            if input_tensor.dtype == np.object_:
-                if isinstance(obj.item(), bytes):
-                    s = obj.item()
-                else:
-                    s = str(obj.item()).encode("utf-8")
-            else:
-                s = obj.item()
-            flattened_ls.append(struct.pack("<I", len(s)))
+def parse_task_detection_to_vision_input(
+    request: TriggerRequest,
+) -> List[VisionInput]:
+    input_list = []
+    for task_input in request.task_inputs:
+        task_input_pb = struct_to_protobuf(task_input, modelpb.TaskInput)
 
+        detection_pb: detectionpb.DetectionInput = task_input_pb.detection
 
-def deserialize_bytes_tensor(encoded_tensor):
-    """
-    Deserializes an encoded bytes tensor into an
-    numpy array of dtype of python objects
-
-    Parameters
-    ----------
-    encoded_tensor : bytes
-        The encoded bytes tensor where each element
-        has its length in first 4 bytes followed by
-        the content
-    Returns
-    -------
-    string_tensor : np.array
-        The 1-D numpy array of type object containing the
-        deserialized bytes in 'C' order.
+        inp = VisionInput()
+        if (detection_pb.image_base64 != "" and detection_pb.image_url != "") or (
+            detection_pb.image_base64 == "" and detection_pb.image_url == ""
+        ):
+            raise InvalidInputException
+        if detection_pb.image_base64 != "":
+            inp.image = base64_to_pil_image(detection_pb.image_base64)
+        elif detection_pb.image_url != "":
+            inp.image = url_to_pil_image(detection_pb.image_url)
 
-    """
-    strs = []
-    offset = 0
-    val_buf = encoded_tensor
-    while offset < len(val_buf):
-        l = struct.unpack_from("<I", val_buf, offset)[0]
 
-    def parse_task_text_generation_input(request) -> TextGenerationInput:
-        text_generation_input = TextGenerationInput()
-
-        for i, b_input_tensor in zip(request.inputs, request.raw_input_contents):
-            input_name = i.name
-
-            if input_name == "prompt":
-                input_tensor = deserialize_bytes_tensor(b_input_tensor)
-                text_generation_input.prompt = str(input_tensor[0].decode("utf-8"))
-                print(
-                    f"[DEBUG] input `prompt` type\
-                    ({type(text_generation_input.prompt)}): {text_generation_input.prompt}"
-                )
+        input_list.append(inp)
 
-            if input_name == "prompt_images":
-                input_tensors = deserialize_bytes_tensor(b_input_tensor)
-                images = []
-                for enc in input_tensors:
-                    if len(enc) == 0:
-                        continue
-                    try:
-                        enc_json = json.loads(str(enc.decode("utf-8")))
-                        if len(enc_json) == 0:
-                            continue
-                        decoded_enc = enc_json[0]
-                    except JSONDecodeError:
-                        print("[DEBUG] WARNING `enc_json` parsing faield!")
-                    # pil_img = Image.open(io.BytesIO(enc.astype(bytes)))  # RGB
-                    pil_img = Image.open(io.BytesIO(base64.b64decode(decoded_enc)))
-
-                    image = np.array(pil_img)
-                    if len(image.shape) == 2:  # gray image
-                        raise ValueError(
-                            f"The image shape with {image.shape} is "
-                            f"not in acceptable"
-                        )
-                    images.append(image)
-                # TODO: check wethere there are issues in batch size dimention
-                text_generation_input.prompt_images = images
-                print(
-                    "[DEBUG] input `prompt_images` type"
-                    f"({type(text_generation_input.prompt_images)}): "
-                    f"{text_generation_input.prompt_images}"
-                )
+    return input_list
 
-            if input_name == "chat_history":
-                input_tensor = deserialize_bytes_tensor(b_input_tensor)
-                chat_history_str = str(input_tensor[0].decode("utf-8"))
-                print(
-                    "[DEBUG] input `chat_history_str` type"
-                    f"({type(chat_history_str)}): "
-                    f"{chat_history_str}"
-                )
-                try:
-                    text_generation_input.chat_history = json.loads(chat_history_str)
-                except JSONDecodeError:
-                    print("[DEBUG] WARNING `extra_params` parsing faield!")
-                    continue
 
-            if input_name == "system_message":
-                input_tensor = deserialize_bytes_tensor(b_input_tensor)
-                text_generation_input.system_message = str(
-                    input_tensor[0].decode("utf-8")
-                )
-                print(
-                    "[DEBUG] input `system_message` type"
-                    f"({type(text_generation_input.system_message)}): "
-                    f"{text_generation_input.system_message}"
-                )
+def construct_task_detection_output(
+    categories: List[List[str]],
+    scores: List[List[float]],
+    bounding_boxes: List[List[tuple]],
+) -> TriggerResponse:
+    """Construct trigger output for detection task
 
-            if input_name == "max_new_tokens":
-                text_generation_input.max_new_tokens = int.from_bytes(
-                    b_input_tensor, "little"
-                )
-                print(
-                    "[DEBUG] input `max_new_tokens` type"
-                    f"({type(text_generation_input.max_new_tokens)}): "
-                    f"{text_generation_input.max_new_tokens}"
+    Args:
+        categories (List[List[str]]): for each image input, the list of detected object's category
+        scores (List[List[float]]): for each image input, the list of detected object's score
+        bounding_boxes (List[List[tuple]]): for each image input, the list of detected object's bbox, with the format
+            (top, left, width, height)
+    """
+    if not len(categories) == len(scores) == len(bounding_boxes):
+        raise InvalidOutputShapeException
+
+    task_outputs = []
+    for category, score, bbox in zip(categories, scores, bounding_boxes):
+        objects = []
+        for cat, sc, bb in zip(category, score, bbox):
+            objects.append(
+                detectionpb.DetectionObject(
+                    category=cat,
+                    score=sc,
+                    bounding_box=commonpb.BoundingBox(
+                        top=bb[0],
+                        left=bb[1],
+                        width=bb[2],
+                        height=bb[3],
+                    ),
                 )
-
-            if input_name == "top_k":
-                text_generation_input.top_k = int.from_bytes(b_input_tensor, "little")
-                print(
-                    "[DEBUG] input `top_k` type"
-                    f"({type(text_generation_input.top_k)}): "
-                    f"{text_generation_input.top_k}"
+            )
+        task_outputs.append(
+            protobuf_to_struct(
+                modelpb.TaskOutput(
+                    detection=detectionpb.DetectionOutput(objects=objects)
                 )
+            )
+        )
 
-            if input_name == "temperature":
-                text_generation_input.temperature = struct.unpack("f", b_input_tensor)[
-                    0
-                ]
-                print(
-                    "[DEBUG] input `temperature` type"
-                    f"({type(text_generation_input.temperature)}): "
-                    f"{text_generation_input.temperature}"
-                )
-                text_generation_input.temperature = round(
-                    text_generation_input.temperature, 2
-                )
+    return TriggerResponse(task_outputs=task_outputs)
 
-            if input_name == "random_seed":
-                text_generation_input.random_seed = int.from_bytes(
-                    b_input_tensor, "little"
-                )
-                print(
-                    "[DEBUG] input `random_seed` type"
-                    f"({type(text_generation_input.random_seed)}): "
-                    f"{text_generation_input.random_seed}"
-                )
 
-            if input_name == "extra_params":
-                input_tensor = deserialize_bytes_tensor(b_input_tensor)
-                extra_params_str = str(input_tensor[0].decode("utf-8"))
-                print(
-                    "[DEBUG] input `extra_params` type"
-                    f"({type(extra_params_str)}): "
-                    f"{extra_params_str}"
-                )
+def parse_task_ocr_to_vision_input(
+    request: TriggerRequest,
+) -> List[VisionInput]:
+    input_list = []
+    for task_input in request.task_inputs:
+        task_input_pb = struct_to_protobuf(task_input, modelpb.TaskInput)
 
-                try:
-                    text_generation_input.extra_params = json.loads(extra_params_str)
-                except JSONDecodeError:
-                    print("[DEBUG] WARNING `extra_params` parsing faield!")
-                    continue
+        ocr_pb: ocrpb.OcrInput = task_input_pb.ocr
 
-        return text_generation_input
+        inp = VisionInput()
+        if (ocr_pb.image_base64 != "" and ocr_pb.image_url != "") or (
+            ocr_pb.image_base64 == "" and ocr_pb.image_url == ""
+        ):
+            raise InvalidInputException
+        if ocr_pb.image_base64 != "":
+            inp.image = base64_to_pil_image(ocr_pb.image_base64)
+        elif ocr_pb.image_url != "":
+            inp.image = url_to_pil_image(ocr_pb.image_url)
 
-    @staticmethod
-    def parse_task_text_generation_output(sequences: list):
-        text_outputs = [seq["generated_text"].encode("utf-8") for seq in sequences]
+        input_list.append(inp)
 
-        return serialize_byte_tensor(np.asarray(text_outputs))
+    return input_list
 
-    @staticmethod
-    def parse_task_text_to_image_input(request) -> TextToImageInput:
-        text_to_image_input = TextToImageInput()
 
-        for i, b_input_tensor in zip(request.inputs, request.raw_input_contents):
-            input_name = i.name
+def construct_task_ocr_output(
+    texts: List[List[str]],
+    scores: List[List[float]],
+    bounding_boxes: List[List[tuple]],
+) -> TriggerResponse:
+    """Construct trigger output for ocr task
 
-            if input_name == "prompt":
-                input_tensor = deserialize_bytes_tensor(b_input_tensor)
-                text_to_image_input.prompt = str(input_tensor[0].decode("utf-8"))
-                print(
-                    f"[DEBUG] input `prompt` type\
-                    ({type(text_to_image_input.prompt)}): {text_to_image_input.prompt}"
+    Args:
+        texts (List[List[str]]): for each image input, the list of detected text
+        scores (List[List[float]]): for each image input, the list of detected text's score
+        bounding_boxes (List[List[tuple]]): for each image input, the list of detected text's bbox, with the format
+            (top, left, width, height)
+    """
+    if not len(texts) == len(scores) == len(bounding_boxes):
+        raise InvalidOutputShapeException
+
+    task_outputs = []
+    for text, score, bbox in zip(texts, scores, bounding_boxes):
+        objects = []
+        for txt, sc, bb in zip(text, score, bbox):
+            objects.append(
+                ocrpb.OcrObject(
+                    text=txt,
+                    score=sc,
+                    bounding_box=commonpb.BoundingBox(
+                        top=bb[0],
+                        left=bb[1],
+                        width=bb[2],
+                        height=bb[3],
+                    ),
                 )
-
-            if input_name == "negative_prompt":
-                input_tensor = deserialize_bytes_tensor(b_input_tensor)
-                text_to_image_input.negative_prompt = str(
-                    input_tensor[0].decode("utf-8")
+            )
+        task_outputs.append(
+            protobuf_to_struct(modelpb.TaskOutput(ocr=ocrpb.OcrOutput(objects=objects)))
+        )
+
+    return TriggerResponse(task_outputs=task_outputs)
+
+
+def parse_task_instance_segmentation_to_vision_input(
+    request: TriggerRequest,
+) -> List[VisionInput]:
+    input_list = []
+    for task_input in request.task_inputs:
+        task_input_pb = struct_to_protobuf(task_input, modelpb.TaskInput)
+
+        instance_segmentation_pb: instancesegmentationpb.InstanceSegmentationInput = (
+            task_input_pb.instance_segmentation
+        )
+
+        inp = VisionInput()
+        if (
+            instance_segmentation_pb.image_base64 != ""
+            and instance_segmentation_pb.image_url != ""
+        ) or (
+            instance_segmentation_pb.image_base64 == ""
+            and instance_segmentation_pb.image_url == ""
+        ):
+            raise InvalidInputException
+        if instance_segmentation_pb.image_base64 != "":
+            inp.image = base64_to_pil_image(
+                instance_segmentation_pb.image_base64,
+            )
+        elif instance_segmentation_pb.image_url != "":
+            inp.image = url_to_pil_image(instance_segmentation_pb.image_url)
+
+        input_list.append(inp)
+
+    return input_list
+
+
+def construct_task_instance_segmentation_output(
+    rles: List[List[str]],
+    categories: List[List[str]],
+    scores: List[List[float]],
+    bounding_boxes: List[List[tuple]],
+) -> TriggerResponse:
+    """Construct trigger output for instance segmentation task
+
+    Args:
+        rles (List[List[str]]): for each image input, the list of detected object's rle
+        categories (List[List[str]]): for each image input, the list of detected object's category
+        scores (List[List[float]]): for each image input, the list of detected object's score
+        bounding_boxes (List[List[tuple]]): for each image input, the list of detected object's bbox, with the format
+            (top, left, width, height)
+    """
+    if not len(rles) == len(categories) == len(scores) == len(bounding_boxes):
+        raise InvalidOutputShapeException
+
+    task_outputs = []
+    for rle, category, score, bbox in zip(rles, categories, scores, bounding_boxes):
+        objects = []
+        for r, cat, sc, bb in zip(rle, category, score, bbox):
+            objects.append(
+                instancesegmentationpb.InstanceSegmentationObject(
+                    rle=r,
+                    category=cat,
+                    score=sc,
+                    bounding_box=commonpb.BoundingBox(
+                        top=bb[0],
+                        left=bb[1],
+                        width=bb[2],
+                        height=bb[3],
+                    ),
                 )
-                print(
-                    f"[DEBUG] input `negative_prompt` type\
-                    ({type(text_to_image_input.negative_prompt)}): {text_to_image_input.negative_prompt}"
+            )
+        task_outputs.append(
+            protobuf_to_struct(
+                modelpb.TaskOutput(
+                    instance_segmentation=instancesegmentationpb.InstanceSegmentationOutput(
+                        objects=objects
+                    )
                 )
-
-            if input_name == "steps":
-                text_to_image_input.steps = int.from_bytes(b_input_tensor, "little")
-                print(
-                    f"[DEBUG] input `steps` type\
-                    ({type(text_to_image_input.steps)}): {text_to_image_input.steps}"
+            )
+        )
+
+    return TriggerResponse(task_outputs=task_outputs)
+
+
+def parse_task_semantic_segmentation_to_vision_input(
+    request: TriggerRequest,
+) -> List[VisionInput]:
+    input_list = []
+    for task_input in request.task_inputs:
+        task_input_pb = struct_to_protobuf(task_input, modelpb.TaskInput)
+
+        semantic_segmentation_pb: semanticsegmentationpb.SemanticSegmentationInput = (
+            task_input_pb.semantic_segmentation
+        )
+
+        inp = VisionInput()
+        if (
+            semantic_segmentation_pb.image_base64 != ""
+            and semantic_segmentation_pb.image_url != ""
+        ) or (
+            semantic_segmentation_pb.image_base64 == ""
+            and semantic_segmentation_pb.image_url == ""
+        ):
+            raise InvalidInputException
+        if semantic_segmentation_pb.image_base64 != "":
+            inp.image = base64_to_pil_image(
+                semantic_segmentation_pb.image_base64,
+            )
+        elif semantic_segmentation_pb.image_url != "":
+            inp.image = url_to_pil_image(semantic_segmentation_pb.image_url)
+
+        input_list.append(inp)
+
+    return input_list
+
+
+def construct_task_semantic_segmentation_output(
+    rles: List[List[str]],
+    categories: List[List[str]],
+) -> TriggerResponse:
+    """Construct trigger output for semantic segmentation task
+
+    Args:
+        rles (List[List[str]]): for each image input, the list of detected object's rle
+        categories (List[List[str]]): for each image input, the list of detected object's category
+    """
+    if not len(rles) == len(categories):
+        raise InvalidOutputShapeException
+
+    task_outputs = []
+    for rle, category in zip(rles, categories):
+        objects = []
+        for r, cat in zip(rle, category):
+            objects.append(
+                semanticsegmentationpb.SemanticSegmentationStuff(
+                    rle=r,
+                    category=cat,
                 )
-
-            if input_name == "seed":
-                text_to_image_input.seed = int.from_bytes(b_input_tensor, "little")
-                print(
-                    f"[DEBUG] input `seed` type\
-                    ({type(text_to_image_input.seed)}): {text_to_image_input.seed}"
+            )
+        task_outputs.append(
+            protobuf_to_struct(
+                modelpb.TaskOutput(
+                    semantic_segmentation=semanticsegmentationpb.SemanticSegmentationOutput(
+                        stuffs=objects
+                    )
                 )
+            )
+        )
 
-            if input_name == "guidance_scale":
-                text_to_image_input.guidance_scale = struct.unpack("f", b_input_tensor)[
-                    0
-                ]
-                print(
-                    f"[DEBUG] input `guidance_scale` type\
-                    ({type(text_to_image_input.guidance_scale)}): {text_to_image_input.guidance_scale}"
-                )
-                text_to_image_input.guidance_scale = round(
-                    text_to_image_input.guidance_scale, 2
-                )
+    return TriggerResponse(task_outputs=task_outputs)
 
-            if input_name == "samples":
-                text_to_image_input.samples = int.from_bytes(b_input_tensor, "little")
-                print(
-                    f"[DEBUG] input `samples` type\
-                    ({type(text_to_image_input.samples)}): {text_to_image_input.samples}"
-                )
 
-            if input_name == "extra_params":
-                input_tensor = deserialize_bytes_tensor(b_input_tensor)
-                extra_params_str = str(input_tensor[0].decode("utf-8"))
-                print(
-                    f"[DEBUG] input `extra_params` type\
-                    ({type(extra_params_str)}): {extra_params_str}"
-                )
+def parse_task_keypoint_to_vision_input(
+    request: TriggerRequest,
+) -> List[VisionInput]:
+    input_list = []
+    for task_input in request.task_inputs:
+        task_input_pb = struct_to_protobuf(task_input, modelpb.TaskInput)
 
-                try:
-                    text_to_image_input.extra_params = json.loads(extra_params_str)
-                except JSONDecodeError:
-                    print("[DEBUG] WARNING `extra_params` parsing faield!")
-                    continue
+        keypoint_pb: keypointpb.KeypointInput = task_input_pb.keypoint
+
+        inp = VisionInput()
+        if (keypoint_pb.image_base64 != "" and keypoint_pb.image_url != "") or (
+            keypoint_pb.image_base64 == "" and keypoint_pb.image_url == ""
+        ):
+            raise InvalidInputException
+        if keypoint_pb.image_base64 != "":
+            inp.image = base64_to_pil_image(keypoint_pb.image_base64)
+        elif keypoint_pb.image_url != "":
+            inp.image = url_to_pil_image(keypoint_pb.image_url)
 
-        return text_to_image_input
+        input_list.append(inp)
 
-    @staticmethod
-    def parse_task_text_to_image_output(image):
-        return np.asarray(image).tobytes()
+    return input_list
 
-    @staticmethod
-    def parse_task_image_to_image_input(request) -> ImageToImageInput:
-        image_to_image_input = ImageToImageInput()
 
-        for i, b_input_tensor in zip(request.inputs, request.raw_input_contents):
-            input_name = i.name
+def construct_task_keypoint_output(
+    keypoints: List[List[List[tuple]]],
+    scores: List[List[float]],
+    bounding_boxes: List[List[tuple]],
+) -> TriggerResponse:
+    """Construct trigger output for keypoint task
 
-            if input_name == "prompt_image":
-                input_tensors = deserialize_bytes_tensor(b_input_tensor)
-                images = []
-                for enc in input_tensors:
-                    pil_img = Image.open(io.BytesIO(enc.astype(bytes)))  # RGB
-                    image = np.array(pil_img)
-                    if len(image.shape) == 2:  # gray image
+    Args:
+        keypoints (List[List[List[tuple]]]): for each image input, the list of detected object's keypoints,
+            with the format (x_coordinate, y_coordinate, visibility)
+        scores (List[List[float]]): for each image input, the list of detected object's score
+        bounding_boxes (List[List[tuple]]): for each image input, the list of detected object's bbox, with the format
+            (top, left, width, height)
+    """
+    if not len(keypoints) == len(scores) == len(bounding_boxes):
+        raise InvalidOutputShapeException
+
+    task_outputs = []
+    for keypoint, score, bbox in zip(keypoints, scores, bounding_boxes):
+        objects = []
+        for kps, sc, bb in zip(keypoint, score, bbox):
+            point_list = []
+            for kp in kps:
+                point_list.append(
+                    keypointpb.Keypoint(
+                        x=kp[0],
+                        y=kp[1],
+                        v=kp[2],
+                    )
+                )
+            objects.append(
+                keypointpb.KeypointObject(
+                    keypoints=point_list,
+                    score=sc,
+                    bounding_box=commonpb.BoundingBox(
+                        top=bb[0],
+                        left=bb[1],
+                        width=bb[2],
+                        height=bb[3],
+                    ),
+                )
+            )
+        task_outputs.append(
+            protobuf_to_struct(
+                modelpb.TaskOutput(keypoint=keypointpb.KeypointOutput(objects=objects))
+            )
+        )
+
+    return TriggerResponse(task_outputs=task_outputs)
+
+
+def parse_task_text_generation_to_conversation_input(
+    request: TriggerRequest,
+) -> List[ConversationInput]:
+
+    input_list = []
+
+    for task_input in request.task_inputs:
+        task_input_pb = struct_to_protobuf(task_input, modelpb.TaskInput)
+
+        text_generation_pb: textgenerationpb.TextGenerationInput = (
+            task_input_pb.text_generation
+        )
+
+        inp = ConversationInput()
+
+        conversation: List[Dict[str, str]] = []
+
+        # system message
+        if (
+            text_generation_pb.system_message is not None
+            and len(text_generation_pb.system_message) > 0
+        ):
+            conversation.append(
+                {"role": "system", "content": text_generation_pb.system_message}
+            )
+
+        # conversation history
+        if (
+            text_generation_pb.chat_history is not None
+            and len(text_generation_pb.chat_history) > 0
+        ):
+            for chat_entity in text_generation_pb.chat_history:
+                chat_message = None
+                if len(chat_entity["content"]) > 1:
+                    raise ValueError(
+                        "Multiple text messages detected"
+                        " in a single chat history entity"
+                    )
+
+                if chat_entity["content"][0]["type"] == "text":
+                    if "Content" in chat_entity["content"][0]:
+                        chat_message = chat_entity["content"][0]["Content"]["Text"]
+                    elif "Text" in chat_entity["content"][0]:
+                        chat_message = chat_entity["content"][0]["Text"]
+                    elif "text" in chat_entity["content"][0]:
+                        chat_message = chat_entity["content"][0]["text"]
+                    else:
                         raise ValueError(
-                            f"The image shape with {image.shape} is "
-                            f"not in acceptable"
+                            f"Unknown structure of chat_history: {text_generation_pb.chat_history}"
                         )
-                    images.append(image)
-                image_to_image_input.prompt_image = images[0]
-                print(
-                    f"[DEBUG] input `prompt_image` type\
-                    ({type(image_to_image_input.prompt_image)}): {image_to_image_input.prompt_image}"
-                )
+                else:
+                    raise ValueError(
+                        "Unsupported chat_history message type"
+                        ", expected 'text'"
+                        f" but got {chat_entity['content'][0]['type']}"
+                    )
 
-            if input_name == "prompt":
-                input_tensor = deserialize_bytes_tensor(b_input_tensor)
-                image_to_image_input.prompt = str(input_tensor[0].decode("utf-8"))
-                print(
-                    f"[DEBUG] input `prompt` type\
-                    ({type(image_to_image_input.prompt)}): {image_to_image_input.prompt}"
-                )
+                if chat_entity["role"] not in PROMPT_ROLES:
+                    raise ValueError(
+                        f"Role `{chat_entity['role']}` is not in supported "
+                        f"role list ({','.join(PROMPT_ROLES)})"
+                    )
+                if (
+                    chat_entity["role"] == PROMPT_ROLES[-1]
+                    and text_generation_pb.system_message is not None
+                    and len(text_generation_pb.system_message) > 0
+                ):
+                    continue
+                if chat_message is None:
+                    raise ValueError(
+                        f"No message found in chat_history. {chat_message}"
+                    )
 
-            if input_name == "steps":
-                image_to_image_input.steps = int.from_bytes(b_input_tensor, "little")
-                print(
-                    f"[DEBUG] input `steps` type\
-                    ({type(image_to_image_input.steps)}): {image_to_image_input.steps}"
-                )
+                if len(conversation) == 1 and chat_entity["role"] != PROMPT_ROLES[0]:
+                    conversation.append({"role": "user", "content": " "})
 
-            if input_name == "seed":
-                image_to_image_input.seed = int.from_bytes(b_input_tensor, "little")
-                print(
-                    f"[DEBUG] input `seed` type\
-                    ({type(image_to_image_input.seed)}): {image_to_image_input.seed}"
-                )
+                if (
+                    len(conversation) > 0
+                    and conversation[-1]["role"] == chat_entity["role"]
+                ):
+                    last_conversation = conversation.pop()
+                    chat_message = f"{last_conversation['content']}\n\n{chat_message}"
 
-            if input_name == "guidance_scale":
-                image_to_image_input.guidance_scale = struct.unpack(
-                    "f", b_input_tensor
-                )[0]
-                print(
-                    f"[DEBUG] input `guidance_scale` type\
-                    ({type(image_to_image_input.guidance_scale)}): {image_to_image_input.guidance_scale}"
-                )
-                image_to_image_input.guidance_scale = round(
-                    image_to_image_input.guidance_scale, 2
+                conversation.append(
+                    {"role": chat_entity["role"], "content": chat_message}
                 )
 
-            if input_name == "samples":
-                image_to_image_input.samples = int.from_bytes(b_input_tensor, "little")
-                print(
-                    f"[DEBUG] input `samples` type\
-                    ({type(image_to_image_input.samples)}): {image_to_image_input.samples}"
-                )
+        # conversation
+        prompt = text_generation_pb.prompt
+        if len(conversation) > 0 and conversation[-1]["role"] == PROMPT_ROLES[0]:
+            last_conversation = conversation.pop()
+            prompt = f"{last_conversation['content']}\n\n{prompt}"
 
-            if input_name == "extra_params":
-                input_tensor = deserialize_bytes_tensor(b_input_tensor)
-                extra_params_str = str(input_tensor[0].decode("utf-8"))
-                print(
-                    f"[DEBUG] input `extra_params` type\
-                    ({type(extra_params_str)}): {extra_params_str}"
-                )
+        conversation.append({"role": "user", "content": prompt})
 
-                try:
-                    image_to_image_input.extra_params = json.loads(extra_params_str)
-                except JSONDecodeError:
-                    print("[DEBUG] WARNING `extra_params` parsing faield!")
-                    continue
+        inp.conversation = conversation
 
-        return image_to_image_input
+        # max new tokens
+        if text_generation_pb.max_new_tokens is not None:
+            inp.max_new_tokens = text_generation_pb.max_new_tokens
 
-    @staticmethod
-    def parse_task_image_to_image_output(image):
-        return np.asarray(image).tobytes()
+        # temperature
+        if text_generation_pb.temperature is not None:
+            inp.temperature = text_generation_pb.temperature
 
-    @staticmethod
-    def parse_task_text_generation_chat_input(request) -> TextGenerationChatInput:
-        text_generation_chat_input = TextGenerationChatInput()
+        # top k
+        if text_generation_pb.top_k is not None:
+            inp.top_k = text_generation_pb.top_k
 
-        for i, b_input_tensor in zip(request.inputs, request.raw_input_contents):
-            input_name = i.name
+        # seed
+        if text_generation_pb.seed is not None:
+            inp.seed = text_generation_pb.seed
 
-            if input_name == "prompt":
-                input_tensor = deserialize_bytes_tensor(b_input_tensor)
-                text_generation_chat_input.prompt = str(input_tensor[0].decode("utf-8"))
-                print(
-                    f"[DEBUG] input `prompt` type\
-                    ({type(text_generation_chat_input.prompt)}): {text_generation_chat_input.prompt}"
-                )
+        input_list.append(inp)
+
+    return input_list
+
+
+def construct_task_text_generation_output(texts: List[str]) -> TriggerResponse:
+    task_outputs = []
+
+    for text in texts:
+        task_outputs.append(
+            protobuf_to_struct(
+                modelpb.TaskOutput(
+                    text_generation=textgenerationpb.TextGenerationOutput(text=text)
                 )
+            )
+        )
+
+    return TriggerResponse(task_outputs=task_outputs)
+
+
+def parse_task_text_generation_chat_to_conversation_input(
+    request: TriggerRequest,
+) -> List[ConversationInput]:
+
+    input_list = []
+
+    for task_input in request.task_inputs:
+        task_input_pb = struct_to_protobuf(task_input, modelpb.TaskInput)
+
+        text_generation_chat_pb: textgenerationchatpb.TextGenerationChatInput = (
+            task_input_pb.text_generation_chat
+        )
+
+        inp = ConversationInput()
+
+        conversation = []
+
+        # system message
+        if (
+            text_generation_chat_pb.system_message is not None
+            and len(text_generation_chat_pb.system_message) > 0
+        ):
+            conversation.append(
+                {
+                    "role": "system",
+                    "content": text_generation_chat_pb.system_message,
+                }
+            )
+
+        # conversation history
+        if (
+            text_generation_chat_pb.chat_history is not None
+            and len(text_generation_chat_pb.chat_history) > 0
+        ):
+            for chat_entity in text_generation_chat_pb.chat_history:
+                chat_message = None
+                if len(chat_entity["content"]) > 1:
+                    raise ValueError(
+                        "Multiple text messages detected"
+                        " in a single chat history entity"
+                    )

-            if input_name == "prompt_images":
-                input_tensors = deserialize_bytes_tensor(b_input_tensor)
-                images = []
-                for enc in input_tensors:
-                    if len(enc) == 0:
-                        continue
-                    try:
-                        enc_json = json.loads(str(enc.decode("utf-8")))
-                        if len(enc_json) == 0:
-                            continue
-                        decoded_enc = enc_json[0]
-                    except JSONDecodeError:
-                        print("[DEBUG] WARNING `enc_json` parsing faield!")
-                    # pil_img = Image.open(io.BytesIO(enc.astype(bytes)))  # RGB
-                    pil_img = Image.open(io.BytesIO(base64.b64decode(decoded_enc)))
-
-                    image = np.array(pil_img)
-                    if len(image.shape) == 2:  # gray image
+                if chat_entity["content"][0]["type"] == "text":
+                    if "Content" in chat_entity["content"][0]:
+                        chat_message = chat_entity["content"][0]["Content"]["Text"]
+                    elif "Text" in chat_entity["content"][0]:
+                        chat_message = chat_entity["content"][0]["Text"]
+                    elif "text" in chat_entity["content"][0]:
+                        chat_message = chat_entity["content"][0]["text"]
+                    else:
                         raise ValueError(
-                            f"The image shape with {image.shape} is "
-                            f"not in acceptable"
+                            f"Unknown structure of chat_history: {text_generation_chat_pb.chat_history}"
                         )
-                    images.append(image)
-                text_generation_chat_input.prompt_images = images
-                print(
-                    "[DEBUG] input `prompt_images` type"
-                    f"({type(text_generation_chat_input.prompt_images)}): "
-                    f"{text_generation_chat_input.prompt_images}"
-                )
+                else:
+                    raise ValueError(
+                        "Unsupported chat_history message type"
+                        ", expected 'text'"
+                        f" but got {chat_entity['content'][0]['type']}"
+                    )

-            if input_name == "chat_history":
-                input_tensor = deserialize_bytes_tensor(b_input_tensor)
-                chat_history_str = str(input_tensor[0].decode("utf-8"))
-                print(
-                    "[DEBUG] input `chat_history_str` type"
-                    f"({type(chat_history_str)}): "
-                    f"{chat_history_str}"
-                )
-                try:
-                    text_generation_chat_input.chat_history = json.loads(
-                        chat_history_str
+                if chat_entity["role"] not in PROMPT_ROLES:
+                    raise ValueError(
+                        f"Role `{chat_entity['role']}` is not in the supported"
+                        f" role list ({','.join(PROMPT_ROLES)})"
                     )
-                except JSONDecodeError:
-                    print("[DEBUG] WARNING `extra_params` parsing faield!")
+                if (
+                    chat_entity["role"] == PROMPT_ROLES[-1]
+                    and text_generation_chat_pb.system_message is not None
+                    and len(text_generation_chat_pb.system_message) > 0
+                ):
                     continue
+                if chat_message is None:
+                    raise ValueError(
+                        f"No message found in chat_history. {chat_entity}"
+                    )

-            if input_name == "system_message":
-                input_tensor = deserialize_bytes_tensor(b_input_tensor)
-                text_generation_chat_input.system_message = str(
-                    input_tensor[0].decode("utf-8")
-                )
-                print(
-                    "[DEBUG] input `system_message` type"
-                    f"({type(text_generation_chat_input.system_message)}): "
-                    f"{text_generation_chat_input.system_message}"
-                )
+                if len(conversation) == 1 and chat_entity["role"] != PROMPT_ROLES[0]:
+                    conversation.append({"role": "user", "content": " "})

-            if input_name == "max_new_tokens":
-                text_generation_chat_input.max_new_tokens = int.from_bytes(
-                    b_input_tensor, "little"
-                )
-                print(
-                    "[DEBUG] input `max_new_tokens` type"
-                    f"({type(text_generation_chat_input.max_new_tokens)}): "
-                    f"{text_generation_chat_input.max_new_tokens}"
-                )
+                if (
+                    len(conversation) > 0
+                    and conversation[-1]["role"] == chat_entity["role"]
+                ):
+                    last_conversation = conversation.pop()
+                    chat_message = f"{last_conversation['content']}\n\n{chat_message}"

-            if input_name == "top_k":
-                text_generation_chat_input.top_k = int.from_bytes(
-                    b_input_tensor, "little"
-                )
-                print(
-                    "[DEBUG] input `top_k` type"
-                    f"({type(text_generation_chat_input.top_k)}): "
-                    f"{text_generation_chat_input.top_k}"
+                conversation.append(
+                    {"role": chat_entity["role"], "content": chat_message}
                 )

-            if input_name == "temperature":
-                text_generation_chat_input.temperature = struct.unpack(
-                    "f", b_input_tensor
-                )[0]
-                print(
-                    "[DEBUG] input `temperature` type"
-                    f"({type(text_generation_chat_input.temperature)}): "
-                    f"{text_generation_chat_input.temperature}"
-                )
-                text_generation_chat_input.temperature = round(
-                    text_generation_chat_input.temperature, 2
-                )
+        # conversation
+        prompt = text_generation_chat_pb.prompt
+        if len(conversation) > 0 and conversation[-1]["role"] == PROMPT_ROLES[0]:
+            last_conversation = conversation.pop()
+            prompt = f"{last_conversation['content']}\n\n{prompt}"

-            if input_name == "random_seed":
-                text_generation_chat_input.random_seed = int.from_bytes(
-                    b_input_tensor, "little"
-                )
-                print(
-                    "[DEBUG] input `random_seed` type"
-                    f"({type(text_generation_chat_input.random_seed)}): "
-                    f"{text_generation_chat_input.random_seed}"
-                )
+        conversation.append({"role": "user", "content": prompt})

-            if input_name == "extra_params":
-                input_tensor = deserialize_bytes_tensor(b_input_tensor)
-                extra_params_str = str(input_tensor[0].decode("utf-8"))
-                print(
-                    "[DEBUG] input `extra_params` type"
-                    f"({type(extra_params_str)}): "
-                    f"{extra_params_str}"
-                )
+        inp.conversation = conversation

-                try:
-                    text_generation_chat_input.extra_params = json.loads(
-                        extra_params_str
-                    )
-                except JSONDecodeError:
-                    print("[DEBUG] WARNING `extra_params` parsing faield!")
-                    continue
+        # max new tokens
+        if text_generation_chat_pb.max_new_tokens is not None:
+            inp.max_new_tokens = text_generation_chat_pb.max_new_tokens

-        return text_generation_chat_input
+        # temperature
+        if text_generation_chat_pb.temperature is not None:
+            inp.temperature = text_generation_chat_pb.temperature

-    @staticmethod
-    def parse_task_text_generation_chat_output(sequences: list):
-        text_outputs = [seq["generated_text"].encode("utf-8") for seq in sequences]
+        # top k
+        if text_generation_chat_pb.top_k is not None:
+            inp.top_k = text_generation_chat_pb.top_k

-        return serialize_byte_tensor(np.asarray(text_outputs))
+        # seed
+        if text_generation_chat_pb.seed is not None:
+            inp.seed = text_generation_chat_pb.seed

-    @staticmethod
-    def parse_task_visual_question_answering_input(
-        request,
-    ) -> VisualQuestionAnsweringInput:
-        text_visual_question_answering_input = VisualQuestionAnsweringInput()
+        input_list.append(inp)

-        for i, b_input_tensor in zip(request.inputs, request.raw_input_contents):
-            input_name = i.name
+    return input_list

-            if input_name == "prompt":
-                input_tensor = deserialize_bytes_tensor(b_input_tensor)
-                text_visual_question_answering_input.prompt = str(
-                    input_tensor[0].decode("utf-8")
-                )
-                print(
-                    "[DEBUG] input `prompt` type"
-                    f"({type(text_visual_question_answering_input.prompt)}): "
-                    f"{text_visual_question_answering_input.prompt}"
-                )

-            if input_name == "prompt_images":
-                input_tensors = deserialize_bytes_tensor(b_input_tensor)
-                images = []
-                for enc in input_tensors:
-                    if len(enc) == 0:
-                        continue
-                    try:
-                        enc_json = json.loads(str(enc.decode("utf-8")))
-                        if len(enc_json) == 0:
-                            continue
-                        decoded_enc = enc_json[0]
-                    except JSONDecodeError:
-                        print("[DEBUG] WARNING `enc_json` parsing faield!")
-                    # pil_img = Image.open(io.BytesIO(enc.astype(bytes)))  # RGB
-                    pil_img = Image.open(io.BytesIO(base64.b64decode(decoded_enc)))
-
-                    image = np.array(pil_img)
-                    if len(image.shape) == 2:  # gray image
-                        raise ValueError(
-                            f"The image shape with {image.shape} is "
-                            f"not in acceptable"
-                        )
-                    images.append(image)
-                # TODO: check wethere there are issues in batch size dimention
-                text_visual_question_answering_input.prompt_images = images
-                print(
-                    "[DEBUG] input `prompt_images` type"
-                    f"({type(text_visual_question_answering_input.prompt_images)}): "
-                    f"{text_visual_question_answering_input.prompt_images}"
-                )
+def construct_task_text_generation_chat_output(texts: List[str]) -> TriggerResponse:
+    task_outputs = []

-            if input_name == "chat_history":
-                input_tensor = deserialize_bytes_tensor(b_input_tensor)
-                chat_history_str = str(input_tensor[0].decode("utf-8"))
-                print(
-                    "[DEBUG] input `chat_history_str` type"
-                    f"({type(chat_history_str)}): "
-                    f"{chat_history_str}"
+    for text in texts:
+        task_outputs.append(
+            protobuf_to_struct(
+                modelpb.TaskOutput(
+                    text_generation_chat=textgenerationchatpb.TextGenerationChatOutput(
+                        text=text
+                    )
                 )
-                try:
-                    text_visual_question_answering_input.chat_history = json.loads(
-                        chat_history_str
+            )
+        )
+
+    return TriggerResponse(task_outputs=task_outputs)
+
+
+def parse_task_visual_question_answering_to_conversation_multimodal_input(
+    request: TriggerRequest,
+) -> List[ConversationMultiModelInput]:
+
+    input_list = []
+
+    for task_input in request.task_inputs:
+        task_input_pb = struct_to_protobuf(task_input, modelpb.TaskInput)
+
+        visual_question_answering_pb: (
+            visualquestionansweringpb.VisualQuestionAnsweringInput
+        ) = task_input_pb.visual_question_answering
+
+        inp = ConversationMultiModelInput()
+
+        conversation: List[Union[Dict[str, Union[str, Dict[str, str]]]]] = []
+
+        # system message
+        if (
+            visual_question_answering_pb.system_message is not None
+            and len(visual_question_answering_pb.system_message) > 0
+        ):
+            conversation.append(
+                {
+                    "role": "system",
+                    "content": {
+                        "type": "text",
+                        "content": visual_question_answering_pb.system_message,
+                    },
+                }
+            )
+
+        # conversation history
+        if (
+            visual_question_answering_pb.chat_history is not None
+            and len(visual_question_answering_pb.chat_history) > 0
+        ):
+            for chat_entity in visual_question_answering_pb.chat_history:
+                chat_dict = json_format.MessageToDict(chat_entity)
+                conversation.append(chat_dict)
+
+        # conversation
+        prompt = visual_question_answering_pb.prompt
+        if len(conversation) > 0 and conversation[-1]["role"] == PROMPT_ROLES[0]:
+            last_conversation = conversation.pop()
+            prompt = f"{last_conversation['content']['content']}\n\n{prompt}"  # type: ignore
+
+        conversation.append(
+            {
+                "role": "user",
+                "content": {
+                    "type": "text",
+                    "content": prompt,
+                },
+            }
+        )
+
+        inp.conversation = conversation
+
+        # prompt images
+        prompt_image_list = []
+        if (
+            visual_question_answering_pb.prompt_images is not None
+            and len(visual_question_answering_pb.prompt_images) > 0
+        ):
+            for prompt_image in visual_question_answering_pb.prompt_images:
+                if (
+                    prompt_image.prompt_image_base64 != ""
+                    and prompt_image.prompt_image_url != ""
+                ) or (
+                    prompt_image.prompt_image_base64 == ""
+                    and prompt_image.prompt_image_url == ""
+                ):
+                    raise InvalidInputException
+                if prompt_image.prompt_image_base64 != "":
+                    prompt_image_list.append(
+                        base64_to_pil_image(prompt_image.prompt_image_base64)
                     )
-                except JSONDecodeError:
-                    print("[DEBUG] WARNING `extra_params` parsing faield!")
-                    continue
+                elif prompt_image.prompt_image_url != "":
+                    prompt_image_list.append(
+                        url_to_pil_image(prompt_image.prompt_image_url)
+                    )
+        inp.prompt_images = prompt_image_list

-            if input_name == "system_message":
-                input_tensor = deserialize_bytes_tensor(b_input_tensor)
-                text_visual_question_answering_input.system_message = str(
-                    input_tensor[0].decode("utf-8")
-                )
-                print(
-                    "[DEBUG] input `system_message` type"
-                    f"({type(text_visual_question_answering_input.system_message)}): "
-                    f"{text_visual_question_answering_input.system_message}"
-                )
+        # max new tokens
+        if visual_question_answering_pb.max_new_tokens is not None:
+            inp.max_new_tokens = visual_question_answering_pb.max_new_tokens

-            if input_name == "max_new_tokens":
-                text_visual_question_answering_input.max_new_tokens = int.from_bytes(
-                    b_input_tensor, "little"
-                )
-                print(
-                    "[DEBUG] input `max_new_tokens` type"
-                    f"({type(text_visual_question_answering_input.max_new_tokens)}): "
-                    f"{text_visual_question_answering_input.max_new_tokens}"
-                )
+        # temperature
+        if visual_question_answering_pb.temperature is not None:
+            inp.temperature = visual_question_answering_pb.temperature

-            if input_name == "top_k":
-                text_visual_question_answering_input.top_k = int.from_bytes(
-                    b_input_tensor, "little"
-                )
-                print(
-                    "[DEBUG] input `top_k` type"
-                    f"({type(text_visual_question_answering_input.top_k)}): "
-                    f"{text_visual_question_answering_input.top_k}"
-                )
+        # top k
+        if visual_question_answering_pb.top_k is not None:
+            inp.top_k = visual_question_answering_pb.top_k

-            if input_name == "temperature":
-                text_visual_question_answering_input.temperature = struct.unpack(
-                    "f", b_input_tensor
-                )[0]
-                print(
-                    "[DEBUG] input `temperature` type"
-                    f"({type(text_visual_question_answering_input.temperature)}): "
-                    f"{text_visual_question_answering_input.temperature}"
-                )
-                text_visual_question_answering_input.temperature = round(
-                    text_visual_question_answering_input.temperature, 2
-                )
+        # seed
+        if visual_question_answering_pb.seed is not None:
+            inp.seed = visual_question_answering_pb.seed

-            if input_name == "random_seed":
-                text_visual_question_answering_input.random_seed = int.from_bytes(
-                    b_input_tensor, "little"
-                )
-                print(
-                    "[DEBUG] input `random_seed` type"
-                    f"({type(text_visual_question_answering_input.random_seed)}): "
-                    f"{text_visual_question_answering_input.random_seed}"
+        input_list.append(inp)
+
+    return input_list
+
+
+def construct_task_visual_question_answering_output(
+    texts: List[str],
+) -> TriggerResponse:
+    task_outputs = []
+
+    for text in texts:
+        task_outputs.append(
+            protobuf_to_struct(
+                modelpb.TaskOutput(
+                    visual_question_answering=visualquestionansweringpb.VisualQuestionAnsweringOutput(
+                        text=text
+                    )
                 )
+            )
+        )
+
+    return TriggerResponse(task_outputs=task_outputs)
+
+
+def parse_task_text_to_image_input(
+    request: TriggerRequest,
+) -> List[TextToImageInput]:
+
+    input_list = []
+
+    for task_input in request.task_inputs:
+        task_input_pb = struct_to_protobuf(task_input, modelpb.TaskInput)
+
+        text_to_image_pb: texttoimagepb.TextToImageInput = task_input_pb.text_to_image
+
+        inp = TextToImageInput()
+
+        # prompt
+        inp.prompt = text_to_image_pb.prompt
+
+        # steps
+        if text_to_image_pb.steps is not None:
+            inp.steps = text_to_image_pb.steps
+
+        # cfg scale
+        if text_to_image_pb.cfg_scale is not None:
+            inp.cfg_scale = text_to_image_pb.cfg_scale
+
+        # samples
+        if text_to_image_pb.samples is not None:
+            inp.samples = text_to_image_pb.samples
+
+        # seed
+        if text_to_image_pb.seed is not None:
+            inp.seed = text_to_image_pb.seed
+
+        input_list.append(inp)
+
+    return input_list
+

-            if input_name == "extra_params":
-                input_tensor = deserialize_bytes_tensor(b_input_tensor)
-                extra_params_str = str(input_tensor[0].decode("utf-8"))
-                print(
-                    "[DEBUG] input `extra_params` type"
-                    f"({type(extra_params_str)}): "
-                    f"{extra_params_str}"
+def construct_task_text_to_image_output(
+    images: List[List[str]],
+) -> TriggerResponse:
+    """Construct trigger output for the text to image task
+
+    Args:
+        images (List[List[str]]): for each input prompt, the list of generated images, `samples` images per prompt
+    """
+    task_outputs = []
+
+    for imgs in images:
+        task_outputs.append(
+            protobuf_to_struct(
+                modelpb.TaskOutput(
+                    text_to_image=texttoimagepb.TextToImageOutput(images=imgs)
                 )
+            )
+        )

-                try:
-                    text_visual_question_answering_input.extra_params = json.loads(
-                        extra_params_str
-                    )
-                except JSONDecodeError:
-                    print("[DEBUG] WARNING `extra_params` parsing faield!")
-                    continue
+    return TriggerResponse(task_outputs=task_outputs)
+
+
+def parse_task_image_to_image_input(
+    request: TriggerRequest,
+) -> List[ImageToImageInput]:
+
+    input_list = []

-        return text_visual_question_answering_input
+    for task_input in request.task_inputs:
+        task_input_pb = struct_to_protobuf(task_input, modelpb.TaskInput)

-    @staticmethod
-    def parse_task_visual_question_answering_output(sequences: list):
-        text_outputs = [seq["generated_text"].encode("utf-8") for seq in sequences]
+        image_to_image_pb: imagetoimagepb.ImageToImageInput = (
+            task_input_pb.image_to_image
+        )

-        return serialize_byte_tensor(np.asarray(text_outputs))
+        inp = ImageToImageInput()

+        # prompt
+        inp.prompt = image_to_image_pb.prompt

-class RawIO:
-    @staticmethod
-    def parse_byte_tensor(byte_tensor) -> List[str]:
-        input_tensors = deserialize_bytes_tensor(byte_tensor)
-        outs = [str(tensor.decode("utf-8")) for tensor in input_tensors]
+        # prompt images
+        if (
+            image_to_image_pb.prompt_image_base64 != ""
+            and image_to_image_pb.prompt_image_url != ""
+        ) or (
+            image_to_image_pb.prompt_image_base64 == ""
+            and image_to_image_pb.prompt_image_url == ""
+        ):
+            raise InvalidInputException
+        if image_to_image_pb.prompt_image_base64 != "":
+            inp.prompt_image = base64_to_pil_image(
+                image_to_image_pb.prompt_image_base64
+            )
+        elif image_to_image_pb.prompt_image_url != "":
+            inp.prompt_image = url_to_pil_image(image_to_image_pb.prompt_image_url)

-        return outs
+        # steps
+        if image_to_image_pb.steps is not None:
+            inp.steps = image_to_image_pb.steps

-    @staticmethod
-    def parse_unsigned_int_tensor(int_tensor) -> int:
-        return int.from_bytes(int_tensor, "little")
+        # cfg scale
+        if image_to_image_pb.cfg_scale is not None:
+            inp.cfg_scale = image_to_image_pb.cfg_scale

-    @staticmethod
-    def parse_signed_int_tensor(int_tensor) -> int:
-        return int.from_bytes(int_tensor, "little", signed=True)
+        # samples
+        if image_to_image_pb.samples is not None:
+            inp.samples = image_to_image_pb.samples

-    @staticmethod
-    def parse_float_tensor(float_tensor) -> float:
-        return struct.unpack("f", float_tensor)[0]
+        # seed
+        if image_to_image_pb.seed is not None:
+            inp.seed = image_to_image_pb.seed
+
+        input_list.append(inp)
+
+    return input_list
+
+
+def construct_task_image_to_image_output(
+    images: List[List[str]],
+) -> TriggerResponse:
+    """Construct trigger output for the image to image task
+
+    Args:
+        images (List[List[str]]): for each input prompt, the list of generated images, `samples` images per prompt
+    """
+    task_outputs = []
+
+    for imgs in images:
+        task_outputs.append(
+            protobuf_to_struct(
+                modelpb.TaskOutput(
+                    image_to_image=imagetoimagepb.ImageToImageOutput(images=imgs)
+                )
+            )
+        )

-    @staticmethod
-    def parse_boolean_tensor(bool_tensor) -> bool:
-        return struct.unpack("?", bool_tensor)[0]
+    return TriggerResponse(task_outputs=task_outputs)
diff --git a/poetry.lock b/poetry.lock
index 025cc837..72668b2a 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -211,8 +211,8 @@ files = [
 lazy-object-proxy = ">=1.4.0"
 typing-extensions = {version = ">=4.0.0", markers = "python_version < \"3.11\""}
 wrapt = [
-    {version = ">=1.11,<2", markers = "python_version < \"3.11\""},
     {version = ">=1.14,<2", markers = "python_version >= \"3.11\""},
+    {version = ">=1.11,<2", markers = "python_version < \"3.11\""},
 ]
 
 [[package]]
@@ -720,10 +720,10 @@ isort = ">=4.3.21,<6.0"
 jinja2 = ">=2.10.1,<4.0"
 packaging = "*"
 pydantic = [
-    {version = ">=1.5.1,<2.4.0 || >2.4.0,<3.0", extras = ["email"], markers = "python_version < \"3.10\""},
-    {version = ">=1.9.0,<2.4.0 || >2.4.0,<3.0", extras = ["email"], markers = "python_version >= \"3.10\" and python_version < \"3.11\""},
     {version = ">=1.10.0,<2.0.0 || >2.0.0,<2.0.1 || >2.0.1,<2.4.0 || >2.4.0,<3.0", extras = ["email"], markers = "python_version >= \"3.12\" and python_version < \"4.0\""},
     {version = ">=1.10.0,<2.4.0 || >2.4.0,<3.0", extras = ["email"], markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
+    {version = ">=1.5.1,<2.4.0 || >2.4.0,<3.0", extras = ["email"], markers = "python_version < \"3.10\""},
+    {version = ">=1.9.0,<2.4.0 || >2.4.0,<3.0", extras = ["email"], markers = "python_version >= \"3.10\" and python_version < \"3.11\""},
 ]
 pyyaml = ">=6.0.1"
 toml = {version = ">=0.10.0,<1.0.0", markers = "python_version < \"3.11\""}
@@ -888,24 +888,22 @@ tests = ["asttokens (>=2.1.0)", "coverage", "coverage-enable-subprocess", "ipyth
 
 [[package]]
 name = "fastapi"
-version = "0.1.17"
+version = "0.109.2"
 description = "FastAPI framework, high performance, easy to learn, fast to code, ready for production"
 optional = false
-python-versions = ">=3.6"
+python-versions = ">=3.8"
 files = [
-    {file = "fastapi-0.1.17-py3-none-any.whl", hash = "sha256:a6aaad2f60684477480ac9d7a1c95e67f4696a722f184db467494bfdd5b8f29d"},
-    {file = "fastapi-0.1.17.tar.gz", hash = "sha256:a9a9b6cc32c38bab27a6549b94c44a30c70b485bc789d03de3aa8725f3394be5"},
+    {file = "fastapi-0.109.2-py3-none-any.whl", hash = "sha256:2c9bab24667293b501cad8dd388c05240c850b58ec5876ee3283c47d6e1e3a4d"},
+    {file = "fastapi-0.109.2.tar.gz", hash = "sha256:f3817eac96fe4f65a2ebb4baa000f394e55f5fccdaf7f75250804bc58f354f73"},
 ]
 
 [package.dependencies]
-pydantic = ">=0.17"
-starlette = ">=0.9.7"
+pydantic = ">=1.7.4,<1.8 || >1.8,<1.8.1 || >1.8.1,<2.0.0 || >2.0.0,<2.0.1 || >2.0.1,<2.1.0 || >2.1.0,<3.0.0"
">=1.7.4,<1.8 || >1.8,<1.8.1 || >1.8.1,<2.0.0 || >2.0.0,<2.0.1 || >2.0.1,<2.1.0 || >2.1.0,<3.0.0" +starlette = ">=0.36.3,<0.37.0" +typing-extensions = ">=4.8.0" [package.extras] -all = ["aiofiles", "email_validator", "graphene", "itsdangerous", "jinja2", "python-multipart", "pyyaml", "requests", "ujson", "ujson", "uvicorn"] -dev = ["passlib[bcrypt]", "pyjwt"] -doc = ["markdown-include", "mkdocs", "mkdocs-material"] -test = ["black", "email_validator", "isort", "mypy", "pytest (>=4.0.0)", "pytest-cov", "requests"] +all = ["email-validator (>=2.0.0)", "httpx (>=0.23.0)", "itsdangerous (>=1.1.0)", "jinja2 (>=2.11.2)", "orjson (>=3.2.1)", "pydantic-extra-types (>=2.0.0)", "pydantic-settings (>=2.0.0)", "python-multipart (>=0.0.7)", "pyyaml (>=5.3.1)", "ujson (>=4.0.1,!=4.0.2,!=4.1.0,!=4.2.0,!=4.3.0,!=5.0.0,!=5.1.0)", "uvicorn[standard] (>=0.12.0)"] [[package]] name = "filelock" @@ -2962,8 +2960,8 @@ files = [ astroid = ">=2.12.13,<=2.14.0-dev0" colorama = {version = ">=0.4.5", markers = "sys_platform == \"win32\""} dill = [ - {version = ">=0.2", markers = "python_version < \"3.11\""}, {version = ">=0.3.6", markers = "python_version >= \"3.11\""}, + {version = ">=0.2", markers = "python_version < \"3.11\""}, ] isort = ">=4.2.5,<6" mccabe = ">=0.6,<0.8" @@ -3168,7 +3166,6 @@ files = [ {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, - {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"}, {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, @@ -3350,8 +3347,8 @@ fastapi = {version = "*", optional = true, markers = "extra == \"serve\""} filelock = "*" frozenlist = "*" grpcio = [ - {version = ">=1.32.0", optional = true, markers = "python_version < \"3.10\" and extra == \"serve\""}, {version = ">=1.42.0", optional = true, markers = "python_version >= \"3.10\" and extra == \"serve\""}, + {version = ">=1.32.0", optional = true, markers = "python_version < \"3.10\" and extra == \"serve\""}, ] jsonschema = "*" memray = {version = "*", optional = true, markers = "sys_platform != \"win32\" and extra == \"serve\""} @@ -3420,13 +3417,13 @@ rpds-py = ">=0.7.0" [[package]] name = "requests" -version = "2.32.0" +version = "2.32.3" description = "Python HTTP for Humans." 
optional = false python-versions = ">=3.8" files = [ - {file = "requests-2.32.0-py3-none-any.whl", hash = "sha256:f2c3881dddb70d056c5bd7600a4fae312b2a300e39be6a118d30b90bd27262b5"}, - {file = "requests-2.32.0.tar.gz", hash = "sha256:fa5490319474c82ef1d2c9bc459d3652e3ae4ef4c4ebdd18a21145a47ca4b6b8"}, + {file = "requests-2.32.3-py3-none-any.whl", hash = "sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6"}, + {file = "requests-2.32.3.tar.gz", hash = "sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760"}, ] [package.dependencies] @@ -3740,13 +3737,13 @@ tests = ["cython", "littleutils", "pygments", "pytest", "typeguard"] [[package]] name = "starlette" -version = "0.36.2" +version = "0.36.3" description = "The little ASGI library that shines." optional = false python-versions = ">=3.8" files = [ - {file = "starlette-0.36.2-py3-none-any.whl", hash = "sha256:e53e086e620ba715f0c187daca92927b722484d148e70921f0de075936119536"}, - {file = "starlette-0.36.2.tar.gz", hash = "sha256:4134757b950f027c8f16028ec81787751bb44145df17c8b0fa84087b9851b42d"}, + {file = "starlette-0.36.3-py3-none-any.whl", hash = "sha256:13d429aa93a61dc40bf503e8c801db1f1bca3dc706b10ef2434a36123568f044"}, + {file = "starlette-0.36.3.tar.gz", hash = "sha256:90a671733cfb35771d8cc605e0b679d23b992f8dcfad48cc60b38cb29aeb7080"}, ] [package.dependencies] @@ -3887,6 +3884,20 @@ files = [ {file = "types_PyYAML-6.0.12.12-py3-none-any.whl", hash = "sha256:c05bc6c158facb0676674b7f11fe3960db4f389718e19e62bd2b84d6205cfd24"}, ] +[[package]] +name = "types-requests" +version = "2.32.0.20240712" +description = "Typing stubs for requests" +optional = false +python-versions = ">=3.8" +files = [ + {file = "types-requests-2.32.0.20240712.tar.gz", hash = "sha256:90c079ff05e549f6bf50e02e910210b98b8ff1ebdd18e19c873cd237737c1358"}, + {file = "types_requests-2.32.0.20240712-py3-none-any.whl", hash = "sha256:f754283e152c752e46e70942fa2a146b5bc70393522257bb85bd1ef7e019dcc3"}, +] + +[package.dependencies] +urllib3 = ">=2" + [[package]] name = "typing-extensions" version = "4.10.0" @@ -4439,4 +4450,4 @@ test = ["big-O", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more-it [metadata] lock-version = "2.0" python-versions = ">=3.8,<3.13" -content-hash = "1876de5ff065c34c43e4b345f4ba8963d82300262105ddfd30c7dd59438ffe35" +content-hash = "025e3ea2f8e21e4bf117fe6acfd61ff0782d73cf182a19568b095a85ed2ada29" diff --git a/pyproject.toml b/pyproject.toml index d7c561db..9e2fccf0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,15 +32,18 @@ grpcio = "^1.59.0" pyyaml = "^6.0.1" numpy = "^1.21.0" protobuf = "^4.24.2" -types-protobuf = "^4.24.0.1" -types-pyyaml = "^6.0.12.11" +requests = "^2.32.3" +fastapi = "^0.109.2" google-api-core = "^2.11.1" googleapis-common-protos = "^1.60.0" protoc-gen-openapiv2 = "^0.0.1" pydantic = ">=1.10.13" pillow = "^10.1.0" -ray = {version = "2.21.0", extras = ["serve"]} +ray = { version = "2.21.0", extras = ["serve"] } jsonschema = "^4.20.0" +types-requests = "^2.32.0.20240712" +types-protobuf = "^4.24.0.1" +types-pyyaml = "^6.0.12.11" [tool.poetry.dev-dependencies]
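
# ----------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the diff above): one way a model
# implementation might call the new TEXT_GENERATION helpers. `MyLLM`, its
# `self.pipeline` callable, and the flat prompt formatting are hypothetical;
# only TriggerRequest/TriggerResponse and the parse/construct helpers come from
# the instill.helpers exports introduced in this change.
# ----------------------------------------------------------------------------
from instill.helpers import (
    TriggerRequest,
    TriggerResponse,
    construct_task_text_generation_output,
    parse_task_text_generation_to_conversation_input,
)


class MyLLM:  # hypothetical model wrapper
    def ModelTrigger(self, request: TriggerRequest) -> TriggerResponse:
        # The parser yields one ConversationInput per TaskInput in the request.
        inputs = parse_task_text_generation_to_conversation_input(request)

        texts = []
        for inp in inputs:
            # inp.conversation is an ordered list of {"role", "content"} dicts:
            # system message, merged chat history, then the final user prompt.
            prompt = "\n".join(
                f"{m['role']}: {m['content']}" for m in inp.conversation
            )
            texts.append(self.pipeline(prompt, max_new_tokens=inp.max_new_tokens))

        # One TaskOutput per generated string, wrapped into a TriggerResponse.
        return construct_task_text_generation_output(texts=texts)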
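
# ----------------------------------------------------------------------------
# A second sketch (also illustrative) for the IMAGE_TO_IMAGE round trip.
# `MyDiffusionModel` and its `self.pipe` callable are hypothetical, and the
# base64 encoder is defined inline because the diff only shows the decoding
# direction (base64_to_pil_image / url_to_pil_image).
# ----------------------------------------------------------------------------
import base64
import io

from instill.helpers import (
    TriggerRequest,
    TriggerResponse,
    construct_task_image_to_image_output,
    parse_task_image_to_image_input,
)


def pil_to_base64(img) -> str:
    # Naive PNG/base64 encoder for a PIL image (illustrative only).
    buf = io.BytesIO()
    img.save(buf, format="PNG")
    return base64.b64encode(buf.getvalue()).decode("utf-8")


class MyDiffusionModel:  # hypothetical model wrapper
    def ModelTrigger(self, request: TriggerRequest) -> TriggerResponse:
        images = []
        for inp in parse_task_image_to_image_input(request):
            # inp.prompt_image arrives as an already-decoded PIL image; the
            # parser raises InvalidInputException unless exactly one of the
            # base64 payload or the URL was provided.
            generated = self.pipe(
                prompt=inp.prompt,
                image=inp.prompt_image,
                num_inference_steps=inp.steps,
                guidance_scale=inp.cfg_scale,
                num_images_per_prompt=inp.samples,
            )
            # construct_task_image_to_image_output expects List[List[str]]:
            # per input, a list of `samples` base64-encoded images.
            images.append([pil_to_base64(img) for img in generated])
        return construct_task_image_to_image_output(images=images)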