Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Add multimodal (audio, image and video) analysis toolkits #1496

Open
wants to merge 21 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 11 commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion camel/toolkits/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,10 @@
from .notion_toolkit import NotionToolkit
from .human_toolkit import HumanToolkit
from .stripe_toolkit import StripeToolkit
from .video_toolkit import VideoDownloaderToolkit
from .audio_analysis_toolkit import AudioAnalysisToolkit
from .image_analysis_toolkit import ImageAnalysisToolkit
from .video_analysis_toolkit import VideoAnalysisToolkit
from .video_downloader_toolkit import VideoDownloaderToolkit
from .dappier_toolkit import DappierToolkit

__all__ = [
Expand Down Expand Up @@ -73,6 +76,9 @@
'ArxivToolkit',
'HumanToolkit',
'VideoDownloaderToolkit',
'AudioAnalysisToolkit',
'ImageAnalysisToolkit',
'VideoAnalysisToolkit',
'StripeToolkit',
'MeshyToolkit',
'OpenBBToolkit',
Expand Down
116 changes: 116 additions & 0 deletions camel/toolkits/audio_analysis_toolkit.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
import base64
import logging
import os
from typing import List, Optional
from urllib.parse import urlparse

import openai
import requests

from camel.toolkits.base import BaseToolkit
from camel.toolkits.function_tool import FunctionTool

logger = logging.getLogger(__name__)


class AudioAnalysisToolkit(BaseToolkit):
    r"""A class representing a toolkit for audio operations.

    This class provides methods for processing and understanding audio data.
    """

    def __init__(self, cache_dir: Optional[str] = None):
        r"""Initialize the audio analysis toolkit.

        Args:
            cache_dir (Optional[str]): Directory used for cached audio
                artifacts. Defaults to ``'tmp/'`` when not provided.
        """
        # Falsy values (None, "") fall back to the default cache location.
        self.cache_dir = cache_dir or 'tmp/'
        # Client reads credentials from the OPENAI_API_KEY environment
        # variable by default.
        self.audio_client = openai.OpenAI()

    def ask_question_about_audio(self, audio_path: str, question: str) -> str:
        r"""Ask any question about the audio and get the answer using
        multimodal model.

        Args:
            audio_path (str): The path to the audio file. May be a local
                filesystem path or an HTTP(S) URL.
            question (str): The question to ask about the audio.

        Returns:
            str: The answer to the question.

        Raises:
            requests.HTTPError: If ``audio_path`` is a URL and the download
                fails.
            OSError: If ``audio_path`` is a local path that cannot be read.
        """
        logger.debug(
            "Calling ask_question_about_audio method for audio file "
            "`%s` and question `%s`.",
            audio_path,
            question,
        )

        parsed_url = urlparse(audio_path)
        is_url = all([parsed_url.scheme, parsed_url.netloc])

        if is_url:
            res = requests.get(audio_path)
            res.raise_for_status()
            audio_data = res.content
        else:
            # The context manager closes the file; no explicit close needed.
            with open(audio_path, "rb") as audio_file:
                audio_data = audio_file.read()
        encoded_string = base64.b64encode(audio_data).decode('utf-8')

        # For URLs, take the extension from the path component so query
        # strings (e.g. "?token=...") don't corrupt the detected format.
        ext_source = parsed_url.path if is_url else audio_path
        file_suffix = os.path.splitext(ext_source)[1]
        file_format = file_suffix[1:]

        text_prompt = f"""Answer the following question based on the given \
audio information:\n\n{question}"""

        completion = self.audio_client.chat.completions.create(
            model="gpt-4o-mini-audio-preview",
            messages=[
                {
                    "role": "system",
                    "content": "You are a helpful assistant specializing in \
audio analysis.",
                },
                {  # type: ignore[list-item, misc]
                    "role": "user",
                    "content": [
                        {"type": "text", "text": text_prompt},
                        {
                            "type": "input_audio",
                            "input_audio": {
                                "data": encoded_string,
                                "format": file_format,
                            },
                        },
                    ],
                },
            ],
        )  # type: ignore[misc]

        response: str = str(completion.choices[0].message.content)
        logger.debug("Response: %s", response)
        return response

    def get_tools(self) -> List[FunctionTool]:
        r"""Returns a list of FunctionTool objects representing the functions
        in the toolkit.

        Returns:
            List[FunctionTool]: A list of FunctionTool objects representing the
                functions in the toolkit.
        """
        return [FunctionTool(self.ask_question_about_audio)]
97 changes: 97 additions & 0 deletions camel/toolkits/image_analysis_toolkit.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
import base64
import logging
from typing import List
from urllib.parse import urlparse

from camel.models import OpenAIModel
from camel.toolkits.base import BaseToolkit
from camel.toolkits.function_tool import FunctionTool
from camel.types import ModelType

logger = logging.getLogger(__name__)


class ImageAnalysisToolkit(BaseToolkit):
    r"""A class representing a toolkit for image comprehension operations.

    This class provides methods for understanding images, such as identifying
    objects, text in images.
    """

    def _encode_image(self, image_path: str) -> str:
        r"""Return the base64-encoded contents of a local image file.

        Args:
            image_path (str): Path to the image file on disk.

        Returns:
            str: The file contents encoded as a base64 string.
        """
        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode("utf-8")

    def ask_question_about_image(self, question: str, image_path: str) -> str:
        r"""Ask a question about the image based on the image path.

        Args:
            question (str): The question to ask about the image.
            image_path (str): The path to the image file. May be a local
                filesystem path or an HTTP(S) URL.

        Returns:
            str: The answer to the question based on the image.
        """
        import mimetypes

        logger.debug(
            "Calling ask_question_about_image with question: `%s` and "
            "image_path: `%s`",
            question,
            image_path,
        )
        parsed_url = urlparse(image_path)
        is_url = all([parsed_url.scheme, parsed_url.netloc])

        _image_url = image_path

        if not is_url:
            # Guess the MIME type from the filename so PNG/GIF/WebP files are
            # not mislabeled; fall back to JPEG when the type is unknown.
            mime_type = mimetypes.guess_type(image_path)[0] or "image/jpeg"
            _image_url = (
                f"data:{mime_type};base64,{self._encode_image(image_path)}"
            )

        messages = [
            {
                "role": "system",
                "content": "You are a helpful assistant for \
image relevant tasks.",
            },
            {
                "role": "user",
                "content": [
                    {'type': 'text', 'text': question},
                    {
                        'type': 'image_url',
                        'image_url': {
                            'url': _image_url,
                        },
                    },
                ],
            },
        ]

        # NOTE(review): a fresh model client is built per call; consider
        # constructing it once in __init__ if call volume is high.
        model = OpenAIModel(model_type=ModelType.DEFAULT)
        resp = model.run(messages)  # type: ignore[arg-type]

        return str(resp.choices[0].message.content)  # type: ignore[union-attr]

    def get_tools(self) -> List[FunctionTool]:
        r"""Returns a list of FunctionTool objects representing the functions
        in the toolkit.

        Returns:
            List[FunctionTool]: A list of FunctionTool objects representing the
                functions in the toolkit.
        """
        return [
            FunctionTool(self.ask_question_about_image),
        ]
Loading
Loading