Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Add multimodal (audio, image and video) analysis toolkits #1496

Open
wants to merge 21 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 11 commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion camel/toolkits/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,10 @@
from .notion_toolkit import NotionToolkit
from .human_toolkit import HumanToolkit
from .stripe_toolkit import StripeToolkit
from .video_toolkit import VideoDownloaderToolkit
from .audio_analysis_toolkit import AudioAnalysisToolkit
from .image_analysis_toolkit import ImageAnalysisToolkit
from .video_analysis_toolkit import VideoAnalysisToolkit
from .video_downloader_toolkit import VideoDownloaderToolkit
from .dappier_toolkit import DappierToolkit

__all__ = [
Expand Down Expand Up @@ -73,6 +76,9 @@
'ArxivToolkit',
'HumanToolkit',
'VideoDownloaderToolkit',
'AudioAnalysisToolkit',
'ImageAnalysisToolkit',
'VideoAnalysisToolkit',
'StripeToolkit',
'MeshyToolkit',
'OpenBBToolkit',
Expand Down
116 changes: 116 additions & 0 deletions camel/toolkits/audio_analysis_toolkit.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
import base64
import logging
import os
from typing import List, Optional
from urllib.parse import urlparse

import openai
import requests

from camel.toolkits.base import BaseToolkit
from camel.toolkits.function_tool import FunctionTool

logger = logging.getLogger(__name__)


class AudioAnalysisToolkit(BaseToolkit):
    r"""A class representing a toolkit for audio operations.

    This class provides methods for processing and understanding audio data.
    """

    def __init__(self, cache_dir: Optional[str] = None):
        r"""Initialize the audio analysis toolkit.

        Args:
            cache_dir (Optional[str]): Directory used for cached audio
                artifacts. Defaults to ``'tmp/'`` when not provided.
        """
        # Falsy values (None, "") fall back to the default cache location.
        self.cache_dir = cache_dir or 'tmp/'
        # Client reads credentials from the OPENAI_API_KEY environment
        # variable by default.
        self.audio_client = openai.OpenAI()

    def ask_question_about_audio(self, audio_path: str, question: str) -> str:
        r"""Ask any question about the audio and get the answer using
        multimodal model.

        Args:
            audio_path (str): The path to the audio file. May be a local
                filesystem path or an HTTP(S) URL.
            question (str): The question to ask about the audio.

        Returns:
            str: The answer to the question.

        Raises:
            requests.HTTPError: If ``audio_path`` is a URL and the download
                fails.
            OSError: If ``audio_path`` is a local path that cannot be read.
        """
        logger.debug(
            "Calling ask_question_about_audio method for audio file "
            "`%s` and question `%s`.",
            audio_path,
            question,
        )

        parsed_url = urlparse(audio_path)
        is_url = all([parsed_url.scheme, parsed_url.netloc])

        if is_url:
            res = requests.get(audio_path)
            res.raise_for_status()
            audio_data = res.content
        else:
            # The context manager closes the file; no explicit close needed.
            with open(audio_path, "rb") as audio_file:
                audio_data = audio_file.read()
        encoded_string = base64.b64encode(audio_data).decode('utf-8')

        # For URLs, take the extension from the path component so query
        # strings (e.g. "?token=...") don't corrupt the detected format.
        ext_source = parsed_url.path if is_url else audio_path
        file_suffix = os.path.splitext(ext_source)[1]
        file_format = file_suffix[1:]

        text_prompt = f"""Answer the following question based on the given \
audio information:\n\n{question}"""

        completion = self.audio_client.chat.completions.create(
            model="gpt-4o-mini-audio-preview",
            messages=[
                {
                    "role": "system",
                    "content": "You are a helpful assistant specializing in \
audio analysis.",
                },
                {  # type: ignore[list-item, misc]
                    "role": "user",
                    "content": [
                        {"type": "text", "text": text_prompt},
                        {
                            "type": "input_audio",
                            "input_audio": {
                                "data": encoded_string,
                                "format": file_format,
                            },
                        },
                    ],
                },
            ],
        )  # type: ignore[misc]

        response: str = str(completion.choices[0].message.content)
        logger.debug("Response: %s", response)
        return response

    def get_tools(self) -> List[FunctionTool]:
        r"""Returns a list of FunctionTool objects representing the functions
        in the toolkit.

        Returns:
            List[FunctionTool]: A list of FunctionTool objects representing the
                functions in the toolkit.
        """
        return [FunctionTool(self.ask_question_about_audio)]
97 changes: 97 additions & 0 deletions camel/toolkits/image_analysis_toolkit.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
import base64
import logging
from typing import List
from urllib.parse import urlparse

from camel.models import OpenAIModel
from camel.toolkits.base import BaseToolkit
from camel.toolkits.function_tool import FunctionTool
from camel.types import ModelType

logger = logging.getLogger(__name__)


class ImageAnalysisToolkit(BaseToolkit):
    r"""A class representing a toolkit for image comprehension operations.

    This class provides methods for understanding images, such as identifying
    objects, text in images.
    """

    def _encode_image(self, image_path: str) -> str:
        r"""Return the base64-encoded contents of a local image file.

        Args:
            image_path (str): Path to the image file on disk.

        Returns:
            str: The file contents encoded as a base64 string.
        """
        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode("utf-8")

    def ask_question_about_image(self, question: str, image_path: str) -> str:
        r"""Ask a question about the image based on the image path.

        Args:
            question (str): The question to ask about the image.
            image_path (str): The path to the image file. May be a local
                filesystem path or an HTTP(S) URL.

        Returns:
            str: The answer to the question based on the image.
        """
        import mimetypes

        logger.debug(
            "Calling ask_question_about_image with question: `%s` and "
            "image_path: `%s`",
            question,
            image_path,
        )
        parsed_url = urlparse(image_path)
        is_url = all([parsed_url.scheme, parsed_url.netloc])

        _image_url = image_path

        if not is_url:
            # Guess the MIME type from the filename so PNG/GIF/WebP files are
            # not mislabeled; fall back to JPEG when the type is unknown.
            mime_type = mimetypes.guess_type(image_path)[0] or "image/jpeg"
            _image_url = (
                f"data:{mime_type};base64,{self._encode_image(image_path)}"
            )

        messages = [
            {
                "role": "system",
                "content": "You are a helpful assistant for \
image relevant tasks.",
            },
            {
                "role": "user",
                "content": [
                    {'type': 'text', 'text': question},
                    {
                        'type': 'image_url',
                        'image_url': {
                            'url': _image_url,
                        },
                    },
                ],
            },
        ]

        # NOTE(review): a fresh model client is built per call; consider
        # constructing it once in __init__ if call volume is high.
        model = OpenAIModel(model_type=ModelType.DEFAULT)
        resp = model.run(messages)  # type: ignore[arg-type]

        return str(resp.choices[0].message.content)  # type: ignore[union-attr]

    def get_tools(self) -> List[FunctionTool]:
        r"""Returns a list of FunctionTool objects representing the functions
        in the toolkit.

        Returns:
            List[FunctionTool]: A list of FunctionTool objects representing the
                functions in the toolkit.
        """
        return [
            FunctionTool(self.ask_question_about_image),
        ]
Loading
Loading