Merge pull request #30 from keitakn/feature/issue29

にじボイスのAPIからBase64の音声データを返すAPIが出たのでそれを利用するように変更
keitakn · Jan 13, 2025 · 178c1d6
2 parents a1ee62a + 7297163
commit 178c1d6
Show file tree

Hide file tree

Showing 7 changed files with 17 additions and 221 deletions.
diff --git a/backend/README.md b/backend/README.md
@@ -17,10 +17,6 @@ MacOSを利用する前提の手順になります。
 ```bash
 export GEMINI_API_KEY="https://aistudio.google.com/ で発行したAPIキー"
 export NIJIVOICE_API_KEY="https://platform.nijivoice.com/ で発行したAPIキー"
-export R2_ENDPOINT_URL="Cloudflareで作成したR2バケットのエンドポイントURLを指定（S3 APIの値）"
-export R2_ACCESS_KEY_ID="Cloudflareで作成したアクセスキーID"
-export R2_SECRET_ACCESS_KEY="Cloudflareで作成したアクセスシークレットキー"
-export R2_BUCKET_NAME="Cloudflareで作成したR2バケット名"
 ```
 
 ### uvのインストール

diff --git a/backend/docker-compose.yml b/backend/docker-compose.yml
@@ -8,10 +8,6 @@ services:
     environment:
       GEMINI_API_KEY: ${GEMINI_API_KEY}
       NIJIVOICE_API_KEY: ${NIJIVOICE_API_KEY}
-      R2_ENDPOINT_URL: ${R2_ENDPOINT_URL}
-      R2_ACCESS_KEY_ID: ${R2_ACCESS_KEY_ID}
-      R2_SECRET_ACCESS_KEY: ${R2_SECRET_ACCESS_KEY}
-      R2_BUCKET_NAME: ${R2_BUCKET_NAME}
     volumes:
       - ./Makefile:/Makefile
       - ./pyproject.toml:/pyproject.toml

diff --git a/backend/pyproject.toml b/backend/pyproject.toml
@@ -5,7 +5,6 @@ description = "AIとのリアルタイムなやり取りを行う為の実験用
 readme = "README.md"
 requires-python = ">=3.13"
 dependencies = [
-    "boto3>=1.35.91",
     "fastapi>=0.115.6",
     "google-genai>=0.4.0",
     "types-requests>=2.32.0.20241016",

diff --git a/backend/src/presentation/router/realtime_apis.py b/backend/src/presentation/router/realtime_apis.py
@@ -8,62 +8,12 @@
 from google import genai
 from google.genai.live import AsyncSession  # noqa: F401
 from log.logger import AppLogger
-import boto3
-from botocore.config import Config
-import uuid
-from datetime import datetime
 
 router = APIRouter()
 app_logger = AppLogger()
 
-# R2の設定
-r2 = boto3.client(
-    "s3",
-    endpoint_url=os.getenv("R2_ENDPOINT_URL"),
-    aws_access_key_id=os.getenv("R2_ACCESS_KEY_ID"),
-    aws_secret_access_key=os.getenv("R2_SECRET_ACCESS_KEY"),
-    config=Config(signature_version="s3v4"),
-    region_name="auto",
-)
-
-R2_BUCKET_NAME = os.getenv("R2_BUCKET_NAME")
-
-
-async def upload_to_r2(audio_url: str) -> str:
-    """
-    TTSから取得した音声ファイルをR2にアップロードし、署名付きURLを生成する
-    """
-    try:
-        # TTSの音声ファイルをダウンロード
-        response = requests.get(audio_url)
-        response.raise_for_status()
-
-        # 現在の日時を取得
-        now = datetime.now()
-
-        # UUIDを生成してファイルパスを構築
-        directory_uuid = str(uuid.uuid4())
-        file_key = f"anonymous-users/generated-audio-files/year={now.year:04d}/month={now.month:02d}/date={now.day:02d}/{directory_uuid}/audio.wav"
-
-        # R2にアップロード
-        r2.put_object(
-            Bucket=R2_BUCKET_NAME,
-            Key=file_key,
-            Body=response.content,
-            ContentType="audio/wav",
-        )
-
-        # 署名付きURLを生成（有効期限1時間）
-        url: str = r2.generate_presigned_url(
-            "get_object",
-            Params={"Bucket": R2_BUCKET_NAME, "Key": file_key},
-            ExpiresIn=3600,
-        )
-
-        return url
-    except Exception as e:
-        app_logger.logger.error(f"R2へのアップロード中にエラーが発生: {e}")
-        raise e
+TTS_API_URL = "https://api.nijivoice.com/api/platform/v1/voice-actors/16e979a8-cd0f-49d4-a4c4-7a25aa42e184/generate-encoded-voice"
+TTS_API_KEY = os.getenv("NIJIVOICE_API_KEY")
 
 
 class SendEmailDto(TypedDict):
@@ -215,9 +165,6 @@ async def create_google_calendar_event(
     "system_instruction": system_prompt,
 }
 
-TTS_API_URL = "https://api.nijivoice.com/api/platform/v1/voice-actors/16e979a8-cd0f-49d4-a4c4-7a25aa42e184/generate-voice"
-TTS_API_KEY = os.getenv("NIJIVOICE_API_KEY")
-
 
 @router.websocket("/realtime-apis/video-chat")
 async def video_chat_websocket_endpoint(websocket: WebSocket) -> None:
@@ -443,23 +390,17 @@ async def receive_from_gemini() -> None:
                                         tts_data = tts_response.json()
                                         if (
                                             "generatedVoice" in tts_data
-                                            and "audioFileUrl"
+                                            and "base64Audio"
                                             in tts_data["generatedVoice"]
                                         ):
-                                            tts_audio_url = tts_data["generatedVoice"][
-                                                "audioFileUrl"
+                                            base64_audio = tts_data["generatedVoice"][
+                                                "base64Audio"
                                             ]
-
-                                            # R2にアップロードして署名付きURLを取得
-                                            r2_audio_url = await upload_to_r2(
-                                                tts_audio_url
-                                            )
-
                                             await websocket.send_text(
-                                                json.dumps({"audio": r2_audio_url})
+                                                json.dumps({"audio": base64_audio})
                                             )
 
-                                        combined_text = ""
+                                            combined_text = ""
 
                                     # クライアント側にAI Assistantのターンが終わった事を知らせる
                                     await websocket.send_text(

diff --git a/backend/uv.lock b/backend/uv.lock
diff --git a/frontend/src/app/_components/InputPromptForm.tsx b/frontend/src/app/_components/InputPromptForm.tsx
@@ -48,10 +48,9 @@ class Response {
 type Message = {
   role: 'user' | 'assistant';
   message: string;
-  audioUrl?: string;
 };
 
-const log = logger.child({ module: 'src/app/_components/InputPromptForm.tsx' });
+const log = logger.child({ module: 'InputPromptForm' });
 
 export function InputPromptForm() {
   const [prompt, setPrompt] = useState<string>('');
@@ -141,8 +140,13 @@ export function InputPromptForm() {
         return;
       }
 
-      const response = await fetch(audioUrl.current);
-      const arrayBuffer = await response.arrayBuffer();
+      // Base64データをデコードしてArrayBufferに変換
+      const binaryString = atob(audioUrl.current);
+      const bytes = new Uint8Array(binaryString.length);
+      for (let i = 0; i < binaryString.length; i++) {
+        bytes[i] = binaryString.charCodeAt(i);
+      }
+      const arrayBuffer = bytes.buffer;
 
       const audioBuffer = await playAudioContextRef.current.decodeAudioData(arrayBuffer);
 
@@ -203,7 +207,6 @@ export function InputPromptForm() {
           setMessages(prev => [...prev, {
             role: 'assistant',
             message: lastAssistantMessage,
-            audioUrl: audioUrl.current || undefined,
           }]);
           newResponseMessage = '';
           setStreamingMessage('');
@@ -436,7 +439,6 @@ export function InputPromptForm() {
               avatar="/omochi.png"
               message={message.message}
               showFeedback
-              audioUrl={message.audioUrl}
             />
           );
         })}

diff --git a/frontend/src/app/_components/MessageCard.tsx b/frontend/src/app/_components/MessageCard.tsx
@@ -3,7 +3,7 @@
 import { Icon } from '@iconify/react';
 import { Avatar, Badge, Button, cn, Link, Tooltip } from '@nextui-org/react';
 import { useClipboard } from '@nextui-org/use-clipboard';
-import { type HTMLAttributes, type ReactNode, type RefObject, useCallback, useEffect, useRef, useState } from 'react';
+import { type HTMLAttributes, type ReactNode, type RefObject, useCallback, useRef, useState } from 'react';
 
 type Props = HTMLAttributes<HTMLDivElement> & {
   avatar?: string;
@@ -13,20 +13,17 @@ type Props = HTMLAttributes<HTMLDivElement> & {
   status?: 'success' | 'failed';
   attempts?: number;
   messageClassName?: string;
-  audioUrl?: string;
   onAttemptChange?: (attempt: number) => void;
   onMessageCopy?: (content: string | string[]) => void;
   onFeedback?: (feedback: 'like' | 'dislike') => void;
   onAttemptFeedback?: (feedback: 'like' | 'dislike' | 'same') => void;
 };
 
-export function MessageCard({ ref, avatar, message, showFeedback, attempts = 1, currentAttempt = 1, status, onMessageCopy, onAttemptChange, onFeedback, onAttemptFeedback, className, messageClassName, audioUrl, ...props }: Props & { ref?: RefObject<HTMLDivElement> }) {
+export function MessageCard({ ref, avatar, message, showFeedback, attempts = 1, currentAttempt = 1, status, onMessageCopy, onAttemptChange, onFeedback, onAttemptFeedback, className, messageClassName, ...props }: Props & { ref?: RefObject<HTMLDivElement> }) {
   const [feedback, setFeedback] = useState<'like' | 'dislike'>();
   const [attemptFeedback, setAttemptFeedback] = useState<'like' | 'dislike' | 'same'>();
-  const [isPlaying, setIsPlaying] = useState(false);
 
   const messageRef = useRef<HTMLDivElement>(null);
-  const audioRef = useRef<HTMLAudioElement | null>(null);
 
   const { copied, copy } = useClipboard();
 
@@ -86,55 +83,6 @@ export function MessageCard({ ref, avatar, message, showFeedback, attempts = 1,
     [onAttemptFeedback],
   );
 
-  const handlePlayAudio = async () => {
-    if (!audioUrl)
-      return;
-
-    if (audioRef.current) {
-      if (isPlaying) {
-        audioRef.current.pause();
-        setIsPlaying(false);
-        return;
-      }
-    }
-
-    try {
-      const audio = new Audio(audioUrl);
-      audioRef.current = audio;
-
-      // iOS対応の設定を追加
-      audio.playsInline = true;
-      audio.webkitPlaysInline = true;
-
-      audio.addEventListener('ended', () => {
-        setIsPlaying(false);
-        audioRef.current = null;
-      });
-
-      audio.addEventListener('error', () => {
-        setIsPlaying(false);
-        audioRef.current = null;
-      });
-
-      await audio.play();
-      setIsPlaying(true);
-    }
-    catch (error) {
-      console.error('音声再生エラー:', error);
-      setIsPlaying(false);
-    }
-  };
-
-  // コンポーネントのアンマウント時にクリーンアップ
-  useEffect(() => {
-    return () => {
-      if (audioRef.current) {
-        audioRef.current.pause();
-        audioRef.current = null;
-      }
-    };
-  }, []);
-
   return (
     <div {...props} ref={ref} className={cn('flex gap-3', className)}>
       <div className="relative flex-none">
@@ -162,20 +110,6 @@ export function MessageCard({ ref, avatar, message, showFeedback, attempts = 1,
           </div>
           {showFeedback && !hasFailed && (
             <div className="absolute right-2 top-2 flex rounded-full bg-content2 shadow-small">
-              {audioUrl && (
-                <Button
-                  isIconOnly
-                  radius="full"
-                  size="sm"
-                  variant="light"
-                  onPress={handlePlayAudio}
-                >
-                  <Icon
-                    className="text-lg text-default-600"
-                    icon={isPlaying ? 'solar:pause-circle-linear' : 'solar:play-circle-linear'}
-                  />
-                </Button>
-              )}
               <Button isIconOnly radius="full" size="sm" variant="light" onPress={handleCopy}>
                 {copied
                   ? (