From a571787248b7994e3c4dd1c7599d28bdd588822d Mon Sep 17 00:00:00 2001 From: carboxaminoo Date: Wed, 27 Mar 2024 12:28:21 +0900 Subject: [PATCH] Feat : version 1 --- data/audio/audio_check_dB.py | 52 +++++++++ data/audio/audio_crop.py | 107 ++++++++++++++++++ data/crawling/crawling_detect.py | 139 +++++++++++++++++++++++ data/crawling/crawling_rename_video.py | 30 +++++ data/crawling/crawling_select_csv.py | 42 +++++++ data/crawling/crawling_urlsave.py | 97 ++++++++++++++++ data/crawling/crawling_videosave.py | 53 +++++++++ data/image_clipseg2.py | 104 +++++++++++++++++ data/relabel/relabel_Vox_age.py | 84 ++++++++++++++ data/relabel/relabel_detect_getframe.py | 141 ++++++++++++++++++++++++ data/relabel/relabel_select_csv.py | 57 ++++++++++ data/total_audio_video_image.py | 102 +++++++++++++++++ data/video/video_clipimage.py | 31 ++++++ data/video/video_download.py | 74 +++++++++++++ 14 files changed, 1113 insertions(+) create mode 100644 data/audio/audio_check_dB.py create mode 100644 data/audio/audio_crop.py create mode 100644 data/crawling/crawling_detect.py create mode 100644 data/crawling/crawling_rename_video.py create mode 100644 data/crawling/crawling_select_csv.py create mode 100644 data/crawling/crawling_urlsave.py create mode 100644 data/crawling/crawling_videosave.py create mode 100644 data/image_clipseg2.py create mode 100644 data/relabel/relabel_Vox_age.py create mode 100644 data/relabel/relabel_detect_getframe.py create mode 100644 data/relabel/relabel_select_csv.py create mode 100644 data/total_audio_video_image.py create mode 100644 data/video/video_clipimage.py create mode 100644 data/video/video_download.py diff --git a/data/audio/audio_check_dB.py b/data/audio/audio_check_dB.py new file mode 100644 index 0000000..a1a4bda --- /dev/null +++ b/data/audio/audio_check_dB.py @@ -0,0 +1,52 @@ +import librosa +import numpy as np +import matplotlib.pyplot as plt + +''' +You can determine the minimum, maximum, and average dB values +to set a threshold for identifying voice regions based on dB levels. +After visually inspecting the waveform and setting a threshold, +adding 80 to it, you can conveniently apply this threshold value to `audio_crop.py`. 
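+
+For illustration, a minimal sketch of turning the numbers reported below into a
+`top_db` value for `librosa.effects.split` (the -60 dB threshold and the file path
+are placeholders, and the "+ 80" simply follows the rule described above):
+
+    import librosa
+
+    y, sr = librosa.load("voice2face-data/audio/input.wav", sr=None)
+    threshold_db = -60              # value read off the spectrogram plot
+    top_db = threshold_db + 80      # e.g. -60 dB -> top_db = 20
+    intervals = librosa.effects.split(y, top_db=top_db)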
+''' + +# Load audio file +audio_path = "voice2face-data/audio/input.wav" +y, sr = librosa.load(audio_path, sr=None) + +# Calculate spectrum and check maximum and minimum dB values +D = librosa.amplitude_to_db(librosa.stft(y), ref=np.max) +max_db = np.max(D) +min_db = np.min(D) + +# Set threshold value +threshold_db = -60 + +# Consider regions with dB values above the threshold as voice regions +voice_indices = np.where(D > threshold_db) + +print("Threshold:", threshold_db) +print("Maximum dB value in regions with voice:", np.max(D[voice_indices])) +print("Minimum dB value in regions with voice:", np.min(D[voice_indices])) + +# Calculate average dB value in regions with voice +average_db = np.mean(D[voice_indices]) +print("Average dB value in regions with voice:", average_db) + +# Plot waveform and spectrum +plt.figure(figsize=(12, 6)) + +# Plot waveform +plt.subplot(2, 1, 1) +plt.plot(y) +plt.title("Waveform") +plt.xlabel("Sample") +plt.ylabel("Amplitude") + +# Plot spectrum +plt.subplot(2, 1, 2) +librosa.display.specshow(D, sr=sr, x_axis='time', y_axis='log') +plt.colorbar(format='%+2.0f dB') +plt.title('Log-frequency power spectrogram') + +plt.tight_layout() +plt.show() diff --git a/data/audio/audio_crop.py b/data/audio/audio_crop.py new file mode 100644 index 0000000..8bb3f58 --- /dev/null +++ b/data/audio/audio_crop.py @@ -0,0 +1,107 @@ +import os +import librosa +import soundfile as sf +import matplotlib.pyplot as plt +from pydub import AudioSegment + +''' +Extracts human voice segments from an audio file and creates a new audio file with the detected voice segments +within a 10-second duration. + +Args: + audio_file (str): Path to the input audio file. If the file format is .m4a, it will be converted to .wav. + +Returns: + save_file (str): Path to the saved audio file with detected voice segments. +''' + +def detect_human_voice(audio_file): + ''' + Detects human voice segments in an audio file. + + Args: + audio_file (str): Path to the input audio file. + + Returns: + voice_indices (list): List containing indices of the detected voice segments. + ''' + # Read the audio file + y, sr = librosa.load(audio_file, sr=None) + + # Detect voice activity + # ----- Need to Modify threshold-----# + voice_segments = librosa.effects.split(y, top_db=18) + + # Generate indices of voice segments + voice_indices = [] + for start, end in voice_segments: + voice_indices.extend(range(start, end)) + + return voice_indices + +def save_full_audio_with_detected_voice(audio_file, save_file): + ''' + Saves the full audio file with detected voice segments. + + Args: + audio_file (str): Path to the input audio file. + save_file (str): Path to save the audio file with detected voice segments. 
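+
+    Example (illustrative; matches the paths used at the bottom of this script, after
+    the .m4a input has been converted to .wav):
+
+        save_full_audio_with_detected_voice(
+            "voice2face-data/audio/input.wav",
+            "voice2face-data/audio/detected_voice.wav",
+        )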
+ ''' + # Read the entire audio file + y, sr = librosa.load(audio_file, sr=None) + + # Detect human voice segments and get their indices + voice_indices = detect_human_voice(audio_file) + + # Extract human voice segments using the indices + combined_audio = y[voice_indices] + + # Save the extracted audio segments to a file + sf.write(save_file, combined_audio, sr) + + # Visualize and save the waveform of the original and detected voice segments + plt.figure(figsize=(12, 6)) + + # Original audio waveform + plt.subplot(2, 1, 1) + plt.plot(y) + plt.title("Original Audio Waveform") + plt.xlabel("Sample") + plt.ylabel("Amplitude") + + # Waveform of detected voice segments + plt.subplot(2, 1, 2) + plt.plot(combined_audio) + plt.title("Detected Voice Waveform") + plt.xlabel("Sample") + plt.ylabel("Amplitude") + + plt.tight_layout() + save_path = os.path.join(os.path.dirname(save_file), 'result') + if not os.path.exists(save_path): + os.makedirs(save_path) + save_file_path = os.path.join(save_path, os.path.basename(save_file[:-4] + "_waveform_comparison.png")) + plt.savefig(save_file_path) + + # Save the extracted audio segments to a file + audio_save_file_path = os.path.join(save_path, os.path.basename(save_file)) + sf.write(audio_save_file_path, combined_audio, sr) + + plt.show() + +# Define paths for the original file and the file to save with detected voice segments +# ------Need to modify path------ # +audio_file_path = "voice2face-data/audio/input.m4a" +save_file_path = "voice2face-data/audio/detected_voice.wav" + +# Check if the file extension is ".m4a" for conversion and processing +if audio_file_path.endswith('.m4a'): + # Convert m4a file to wav format + wav_path = audio_file_path[:-4] + ".wav" + audio = AudioSegment.from_file(audio_file_path) + audio.export(wav_path, format="wav") + # Process the converted wav file + save_full_audio_with_detected_voice(wav_path, save_file_path) +else: + # Process the original file without conversion + save_full_audio_with_detected_voice(audio_file_path, save_file_path) diff --git a/data/crawling/crawling_detect.py b/data/crawling/crawling_detect.py new file mode 100644 index 0000000..917859f --- /dev/null +++ b/data/crawling/crawling_detect.py @@ -0,0 +1,139 @@ +import os +import pandas as pd +from moviepy.editor import VideoFileClip +import numpy as np +import face_recognition +import shutil + +''' +Detects faces and audio in video clips and refines them. + +Extracts faces from the video clips and selects segments with audio to rebuild new videos. +New videos are organized in the "processed_videos" folder. + +''' + +# Function to extract audio from video clips with detected faces +def extract_audio_with_face(video_clip, start_time, end_time): + ''' + Extracts audio from a video clip with detected faces within a specified time range. + + Args: + video_clip (VideoFileClip): Input video clip. + start_time (float): Start time of the segment containing the detected faces. + end_time (float): End time of the segment containing the detected faces. + + Returns: + audio (AudioClip): Extracted audio clip. + ''' + audio = video_clip.audio.subclip(start_time, end_time) + return audio + +# Function to extract audio from video clips with detected faces in multiple segments +def extract_audio_with_faces(video_clip, face_detections): + ''' + Extracts audio from a video clip with detected faces in multiple segments. + + Args: + video_clip (VideoFileClip): Input video clip. 
+        face_detections (list): List of tuples containing start and end times of segments with detected faces.
+
+    Returns:
+        final_audio (ndarray): Concatenated audio array from all detected face segments.
+    '''
+    audio_clips = []
+
+    for start_time, end_time in face_detections:
+        audio_clip = extract_audio_with_face(video_clip, start_time, end_time)
+        audio_clips.append(audio_clip)
+
+    final_audio = np.concatenate([clip.to_soundarray() for clip in audio_clips])
+    return final_audio
+
+# Function to detect faces in video clips
+def detect_faces(video_clip):
+    '''
+    Detects faces in a video clip.
+
+    Args:
+        video_clip (VideoFileClip): Input video clip.
+
+    Returns:
+        face_detections (list): List of tuples containing start and end times of segments with detected faces.
+    '''
+    frames = [frame for frame in video_clip.iter_frames()]
+    frame_rate = video_clip.fps
+    frame_times = np.arange(len(frames)) / frame_rate
+    face_detections = []
+
+    for i, frame in enumerate(frames):
+        face_locations = face_recognition.face_locations(frame)
+        if face_locations:
+            start_time = frame_times[max(0, i - 1)]
+            end_time = frame_times[min(len(frames) - 1, i + 1)]
+            face_detections.append((start_time, end_time))
+
+    return face_detections
+
+# Function to create a new video from detected face segments
+def create_new_video(video_clip, face_detections, output_path):
+    '''
+    Creates a new video from detected face segments.
+
+    Args:
+        video_clip (VideoFileClip): Input video clip.
+        face_detections (list): List of tuples containing start and end times of segments with detected faces.
+        output_path (str): Path to save the new video.
+    '''
+    from moviepy.editor import concatenate_videoclips
+
+    # Cut out each detected segment and join them into a single clip
+    # (VideoFileClip has no append method, so concatenate_videoclips is used)
+    subclips = [video_clip.subclip(start_time, end_time) for start_time, end_time in face_detections]
+    new_video_clip = concatenate_videoclips(subclips)
+    new_video_clip.write_videofile(output_path)
+
+# Read data from a CSV file
+csv_file_path = "/Users/imseohyeon/Documents/crawling/data/Youtube_search_df.csv"
+df = pd.read_csv(csv_file_path)
+
+# Paths for input and output folders
+DOWNLOAD_FOLDER = "/Users/imseohyeon/Documents/crawling/download/"
+NEW_FOLDER = "/Users/imseohyeon/Documents/crawling/processed_videos/"
+
+# Create a new folder if it doesn't exist
+if not os.path.exists(NEW_FOLDER):
+    os.makedirs(NEW_FOLDER)
+
+# Process each video to extract audio from segments with detected faces and create new videos
+for idx, row in df.iterrows():
+    video_filename = f"{idx}_video.mp4"
+    video_path = os.path.join(DOWNLOAD_FOLDER, video_filename)
+
+    if os.path.exists(video_path):
+        try:
+            video_clip = VideoFileClip(video_path)
+            face_detections = detect_faces(video_clip)
+
+            if face_detections:
+                final_audio = extract_audio_with_faces(video_clip, face_detections)
+                output_path = os.path.join(NEW_FOLDER, f"{idx}_new_video.mp4")
+                create_new_video(video_clip, face_detections, output_path)
+
+                print(f"Processing complete for {video_filename}")
+            else:
+                print(f"No faces detected in {video_filename}")
+        except Exception as e:
+            print(f"Error processing {video_filename}: {e}")
+    else:
+        print(f"File {video_filename} does not exist.")
+
+# Move the processed videos back into the download folder
+processed_files = os.listdir(NEW_FOLDER)
+for file in processed_files:
+    shutil.move(os.path.join(NEW_FOLDER, file), DOWNLOAD_FOLDER)
+
+print("All videos processed")
diff --git a/data/crawling/crawling_rename_video.py b/data/crawling/crawling_rename_video.py
new file mode 100644
index 0000000..3c005ad
--- /dev/null
+++ 
b/data/crawling/crawling_rename_video.py @@ -0,0 +1,30 @@ +import os +import pandas as pd + +''' +Match the video names in the 'download' folder with the index in the CSV. +This facilitates the subsequent video relabeling task. +''' + +# Read links from the CSV file +csv_file_path = "/Users/imseohyeon/Documents/crawling/data/Youtube_search_df.csv" +df = pd.read_csv(csv_file_path) + +# Path to the folder where downloaded videos are stored +DOWNLOAD_FOLDER = "/Users/imseohyeon/Documents/crawling/download/" + +# Iterate over all files in the folder and rename them +for filename in os.listdir(DOWNLOAD_FOLDER): + # Full path of the file + file_path = os.path.join(DOWNLOAD_FOLDER, filename) + # Check if the file is a .mp4 file + if filename.endswith(".mp4"): + # Extract the index value from the file name (assuming the video title is stored as the index) + idx = filename.split("_")[0] # Example: "0_video.mp4" -> "0" + # Create a new file name + new_filename = f"{idx}_video.mp4" + # Create the new file path + new_file_path = os.path.join(DOWNLOAD_FOLDER, new_filename) + # Rename the file + os.rename(file_path, new_file_path) + print(f"File renamed: {filename} -> {new_filename}") diff --git a/data/crawling/crawling_select_csv.py b/data/crawling/crawling_select_csv.py new file mode 100644 index 0000000..1bab567 --- /dev/null +++ b/data/crawling/crawling_select_csv.py @@ -0,0 +1,42 @@ +import os +import pandas as pd +from argparse import ArgumentParser + +def parse_args(): + parser = ArgumentParser() + + # Conventional args + parser.add_argument('--csv_file', type=str, default='output_test.csv') + parser.add_argument('--data_path', type=str, default='origin/video') + parser.add_argument('--save_csv', type=str, default='new_output.csv') + + args = parser.parse_args() + + return args + + +def list_files_and_folders(data_path): + if os.path.isdir(data_path): + items = os.listdir(data_path) + return items + else: + return None + +def main(csv_file, data_path, save_csv): + + csv_data = pd.read_csv(csv_file, header=None) + youtube_ids = list_files_and_folders(data_path) + + for youtube_id in youtube_ids: + filtered_df = csv_data[csv_data[0].astype(str).str.contains(youtube_id)] + first_row = filtered_df.iloc[0:1] + file_name = list_files_and_folders(os.path.join(data_path, youtube_id))[0] + file_name_list = file_name.split("_") + first_row[4] = file_name_list[0] + first_row[5] = file_name_list[1] + first_row.to_csv(save_csv, mode="a", index=False, header=False) + + +if __name__ == '__main__': + args = parse_args() + main(**args.__dict__) \ No newline at end of file diff --git a/data/crawling/crawling_urlsave.py b/data/crawling/crawling_urlsave.py new file mode 100644 index 0000000..36d9c8d --- /dev/null +++ b/data/crawling/crawling_urlsave.py @@ -0,0 +1,97 @@ +from selenium import webdriver +from selenium.webdriver.common.by import By +from selenium.webdriver.common.keys import Keys + +import requests +from bs4 import BeautifulSoup +import time +import pandas as pd +import os + +''' +YouTube crawling using Selenium + +Saves information of approximately 162 videos including name, title, and URL to Youtube_search_df.csv. 
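+
+The loop below scrolls a fixed 100 times; a sketch of an alternative stopping rule
+that uses the `finish_line` height defined below (illustrative only, not the
+behaviour of this script):
+
+    last_height = browser.execute_script("return document.documentElement.scrollHeight")
+    while last_height < finish_line:
+        browser.execute_script("window.scrollTo(0, document.documentElement.scrollHeight);")
+        time.sleep(2.0)
+        new_height = browser.execute_script("return document.documentElement.scrollHeight")
+        if new_height == last_height:  # no more results are being loaded
+            break
+        last_height = new_height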
+'''
+
+# Initialize WebDriver (executable_path not required as it's added to PATH)
+browser = webdriver.Chrome()
+
+# URL to access
+url = "https://youtube.com/"
+
+# Search keyword
+keyword = "Solo Travel"
+
+# Target page height to scroll to (unused by the fixed-count loop below;
+# finish_line = 40000 corresponds to roughly 162 videos)
+finish_line = 10000
+
+browser.maximize_window()
+browser.get(url)
+time.sleep(2)
+search = browser.find_element(By.NAME, "search_query")
+time.sleep(2)
+search.send_keys(keyword)
+search.send_keys(Keys.ENTER)
+
+# Switch to search result page for parsing
+present_url = browser.current_url
+browser.get(present_url)
+last_page_height = browser.execute_script("return document.documentElement.scrollHeight")
+
+# Scroll 100 times
+scroll_count = 0
+while scroll_count < 100:
+    # Scroll down
+    browser.execute_script("window.scrollTo(0, document.documentElement.scrollHeight);")
+    time.sleep(2.0)  # Waiting for data to load when scrolling down
+    new_page_height = browser.execute_script("return document.documentElement.scrollHeight")
+
+    # Increase scroll count
+    scroll_count += 1
+
+html_source = browser.page_source
+soup = BeautifulSoup(html_source, 'html.parser')
+
+# Retrieve all search results up to the finish line
+# Extract all content-related sections
+elem = soup.find_all("ytd-video-renderer", class_="style-scope ytd-item-section-renderer")
+
+# Retrieve necessary information
+df = []
+for t in elem[:100]:  # Retrieve only the first 100 videos' information
+    title = t.find("yt-formatted-string", class_="style-scope ytd-video-renderer").get_text()
+    name = t.find("a", class_="yt-simple-endpoint style-scope yt-formatted-string").get_text()
+    content_url = t.find("a", class_="yt-simple-endpoint style-scope ytd-video-renderer")["href"]
+    df.append([name, title, 'https://www.youtube.com/' + content_url])
+
+## Save data
+# Create DataFrame
+new = pd.DataFrame(columns=['name', 'title', 'url_link'])
+
+# Insert data
+for i in range(len(df)):
+    new.loc[i] = df[i]
+
+# Create directory to save data
+df_dir = "./data/"
+if not os.path.exists(df_dir):
+    os.makedirs(df_dir)
+
+# Save data
+new.to_csv(os.path.join(df_dir, "Youtube_search_df.csv"), index=True, encoding='utf8')  # Save with index
+
+## Save column information
+# Column description table
+col_names = ['name', 'title', 'url_link']
+col_exp = ['Channel name', 'Video title', 'URL link']
+
+new_exp = pd.DataFrame({'col_names': col_names,
+                        'col_explanation': col_exp})
+
+# Save
+new_exp.to_csv(os.path.join(df_dir, "Youtube_col_exp.csv"), index=False, encoding='utf8')
+
+# Close the browser
+browser.close()
diff --git a/data/crawling/crawling_videosave.py b/data/crawling/crawling_videosave.py
new file mode 100644
index 0000000..8c7c422
--- /dev/null
+++ b/data/crawling/crawling_videosave.py
@@ -0,0 +1,53 @@
+import os
+import pandas as pd
+from pytube import YouTube
+import time
+
+'''
+Download videos from URLs obtained through 'crawling_urlsave.py'.
+Videos are saved in the 'download' folder.
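+
+Note that pytube downloads complete streams, so the "first 5 minutes" mentioned below
+still has to be cut after the download. A minimal sketch with moviepy (already used
+elsewhere in this patch; the file name is illustrative):
+
+    from moviepy.editor import VideoFileClip
+
+    clip = VideoFileClip(os.path.join(DOWNLOAD_FOLDER, "0_video.mp4"))
+    if clip.duration > 5 * 60:
+        clip.subclip(0, 5 * 60).write_videofile(os.path.join(DOWNLOAD_FOLDER, "0_video_5min.mp4"))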
+''' + +# Read links from the CSV file +csv_file_path = "/Users/imseohyeon/Documents/crawling/data/Youtube_search_df.csv" +df = pd.read_csv(csv_file_path) + +# Define the download folder path +DOWNLOAD_FOLDER = "/Users/imseohyeon/Documents/crawling/download/" + +# Create the download folder if it doesn't exist +if not os.path.exists(DOWNLOAD_FOLDER): + os.makedirs(DOWNLOAD_FOLDER) + +# Iterate over each video and download +for idx, row in df.iterrows(): + video_url = row['url_link'] + try: + # Get video information using Pytube + yt = YouTube(video_url) + length_seconds = yt.length + + # Set the filename + filename = f"{idx}_video.mp4" + + # If the video length exceeds 5 minutes, download only the first 5 minutes + if length_seconds > 5 * 60: + print(f"{yt.title} video exceeds 5 minutes. Downloading only the first 5 minutes.") + stream = yt.streams.filter(adaptive=True, file_extension='mp4').first() + if stream: + print(f"Downloading: {yt.title}") + stream.download(output_path=DOWNLOAD_FOLDER, filename=filename) + print(f"{yt.title} download complete") + else: + print(f"No highest quality stream available for {yt.title}.") + else: + # Download the entire video for videos less than 5 minutes long + stream = yt.streams.get_highest_resolution() + if stream: + print(f"Downloading: {yt.title}") + stream.download(output_path=DOWNLOAD_FOLDER, filename=filename) + print(f"{yt.title} download complete") + else: + print(f"No highest quality stream available for {yt.title}.") + except Exception as e: + print(f"Failed to download {yt.title}: {e}") diff --git a/data/image_clipseg2.py b/data/image_clipseg2.py new file mode 100644 index 0000000..26f9521 --- /dev/null +++ b/data/image_clipseg2.py @@ -0,0 +1,104 @@ +from transformers import CLIPSegProcessor, CLIPSegForImageSegmentation +from PIL import Image +import torch +import numpy as np + +''' +This script performs image segmentation using the CLIPSeg model based on provided text prompts. It loads an image, processes it with text prompts, and generates a segmented image based on the identified objects. + +Inputs: +- image: The image to be segmented. +- positive_prompts: Text prompts describing the objects to be identified, separated by commas. +- negative_prompts: Text prompts describing the objects to be ignored, separated by commas. +- threshold: Threshold value for segmentation, between 0 and 1. + +Outputs: +- output_image: Segmented image with identified objects highlighted. +- final_mask: Final mask representing the segmented areas. + +''' + +# load CLIPSeg model & processor +processor = CLIPSegProcessor.from_pretrained("CIDAS/clipseg-rd64-refined") +model = CLIPSegForImageSegmentation.from_pretrained("CIDAS/clipseg-rd64-refined") + +# image path & load +image_path = "/root/project/voice2face-data/file/face_detected_256x256.png" +image = Image.open(image_path) + +def process_image(image, positive_prompts, negative_prompts, threshold): + ''' + This function performs image segmentation based on provided text prompts and threshold. + + Args: + - image: PIL image object. + - positive_prompts: Text prompts describing the objects to be identified, separated by commas. + - negative_prompts: Text prompts describing the objects to be ignored, separated by commas. + - threshold: Threshold value for segmentation, between 0 and 1. + + Returns: + - output_image: Segmented image with identified objects highlighted. + - final_mask: Final mask representing the segmented areas. 
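+
+    Example (illustrative; mirrors the defaults used at the bottom of this script):
+
+        img = Image.open("face_detected_256x256.png")
+        seg, mask = process_image(img, "face", "background", 0.5)
+        seg.save("segmented_image.png")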
+ ''' + + # image segmentation with img & prompt + def get_masks(prompts, img, threshold): + prompts = prompts.split(",") + masks = [] + for prompt in prompts: + inputs = processor( + text=prompt.strip(), images=image, padding="max_length", return_tensors="pt" + ) + with torch.no_grad(): + outputs = model(**inputs) + preds = outputs.logits + + pred = torch.sigmoid(preds) + mat = pred.cpu().numpy() + mask = Image.fromarray(np.uint8(mat * 255), "L") + mask = mask.convert("RGB") + mask = mask.resize(image.size) + mask = np.array(mask)[:, :, 0] + + # normalize the mask + mask_min = mask.min() + mask_max = mask.max() + mask = (mask - mask_min) / (mask_max - mask_min) + mask = mask > threshold + masks.append(mask) + return masks + + # Make mask's Positive prompts, Negative prompts + positive_masks = get_masks(positive_prompts, image, threshold) + negative_masks = get_masks(negative_prompts, image, threshold) + + # Make Result mask combined masks + pos_mask = np.any(np.stack(positive_masks), axis=0) + neg_mask = np.any(np.stack(negative_masks), axis=0) + final_mask = pos_mask & ~neg_mask + + # Result image + final_mask = Image.fromarray(final_mask.astype(np.uint8) * 255, "L") + output_image = Image.new("RGBA", image.size, (0, 0, 0, 0)) + output_image.paste(image, mask=final_mask) + return output_image, final_mask + +# base prompt +positive_prompts = "face" +negative_prompts = "background" +threshold = 0.5 + +# 텍스트 프롬프트 및 임계값 설정 +# positive_prompts = input("what you want to identify (comma separated): ") +# negative_prompts = input("what you want to ignore (comma separated): ") +# threshold = float(input("enter the threshold value (between 0 and 1): ")) + +# process of segmentation +output_image, final_mask = process_image(image, positive_prompts, negative_prompts, threshold) + +# save result img +output_image_path = "/root/project/voice2face-data/file/segmented_image.png" +output_image.save(output_image_path) + +# print success message +print("Segmented image saved successfully at:", output_image_path) diff --git a/data/relabel/relabel_Vox_age.py b/data/relabel/relabel_Vox_age.py new file mode 100644 index 0000000..842afa7 --- /dev/null +++ b/data/relabel/relabel_Vox_age.py @@ -0,0 +1,84 @@ +import cv2 +import argparse +import os +import csv +from collections import Counter + +def predict_age(face): + ''' + Function to predict age from a face image. + + Args: + face: Image to predict age from. + + Returns: + age: Predicted age group index. + ''' + blob = cv2.dnn.blobFromImage(face, 1.0, (227, 227), MODEL_MEAN_VALUES, swapRB=False) + ageNet.setInput(blob) + agePreds = ageNet.forward() + age = agePreds[0].argmax() + return age + +def count_and_print_age(folder_path): + ''' + Function to count and print the most common age group from images in a folder. + + Args: + folder_path: Path to the folder containing images. + + Returns: + most_common_age: Index of the most common age group. 
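+
+    Example (illustrative; the folder path is a placeholder, and the returned index
+    follows the same 8 age brackets, (0-2) through (60-100), listed in
+    relabel_detect_getframe.py):
+
+        idx = count_and_print_age("frames/id10001")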
+    '''
+    age_list = []
+    try:
+        for filename in os.listdir(folder_path):
+            if filename.endswith(".jpg"):
+                image_path = os.path.join(folder_path, filename)
+                frame = cv2.imread(image_path)
+                if frame is not None:
+                    age = predict_age(frame)
+                    age_list.append(age)
+    except FileNotFoundError:
+        print(f"Folder '{folder_path}' not found.")
+        return None
+
+    if age_list:
+        most_common_age = Counter(age_list).most_common(1)[0][0]
+        print("Most common age group index:", most_common_age)
+        return most_common_age
+    else:
+        print("No images detected in the folder.")
+        return None
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--folder', required=True, help='Path to the folder containing images.')
+    parser.add_argument('--csv_source', default="/home/carbox/Desktop/git/dataset/vox1/vox1_meta.csv")
+    parser.add_argument('--output_csv', default="/home/carbox/Desktop/git/dataset/test/csv/test.csv")
+    args = parser.parse_args()
+
+    ageProto = "weights/age_deploy.prototxt"
+    ageModel = "weights/age_net.caffemodel"
+
+    MODEL_MEAN_VALUES = (78.4263377603, 87.7689143744, 114.895847746)
+    ageList = ['0', '1', '2', '3', '4', '5', '6', '7']  # Report the age group as an index rather than a label
+
+    ageNet = cv2.dnn.readNet(ageModel, ageProto)
+
+    with open(args.output_csv, 'w', newline='') as output_csv:
+        csvwriter = csv.writer(output_csv, delimiter='\t')
+
+        with open(args.csv_source, 'r') as file:
+            for idx, line in enumerate(file):
+                if idx == 0:
+                    csvwriter.writerow(line.strip().split("\t") + ["age"])  # Append an "age" column to the header row
+                    continue
+                line = line.strip()  # Remove leading/trailing whitespaces
+                if line:
+                    fields = line.split("\t")
+                    image_name = fields[1]
+                    age_index = count_and_print_age(os.path.join(args.folder, image_name))
+                    if age_index is not None:
+                        fields.append(str(age_index))  # Convert age_index to a string and append it to fields
+                    csvwriter.writerow(fields)
diff --git a/data/relabel/relabel_detect_getframe.py b/data/relabel/relabel_detect_getframe.py
new file mode 100644
index 0000000..73915b4
--- /dev/null
+++ b/data/relabel/relabel_detect_getframe.py
@@ -0,0 +1,141 @@
+import cv2
+import math
+import argparse
+import os
+
+def highlightFace(net, frame, conf_threshold=0.7):
+    '''
+    Function to detect faces in a frame using a pre-trained deep learning model.
+
+    Args:
+        net: Pre-trained deep learning model.
+        frame: Input frame.
+        conf_threshold: Confidence threshold for face detection.
+
+    Returns:
+        frameOpencvDnn: Copy of the input frame with face rectangles drawn.
+        faceInfo: List containing information about detected faces (bbox, center, width, height).
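+
+    Example (illustrative):
+
+        annotated, faces = highlightFace(faceNet, frame)
+        for info in faces:
+            x1, y1, x2, y2 = info['bbox']
+            cx, cy = info['center']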
+ ''' + frameOpencvDnn = frame.copy() + frameHeight = frameOpencvDnn.shape[0] + frameWidth = frameOpencvDnn.shape[1] + blob = cv2.dnn.blobFromImage(frameOpencvDnn, 1.0, (300, 300), [104, 117, 123], True, False) + + net.setInput(blob) + detections = net.forward() + faceInfo = [] + for i in range(detections.shape[2]): + confidence = detections[0, 0, i, 2] + if confidence > conf_threshold: + x1 = int(detections[0, 0, i, 3] * frameWidth) + y1 = int(detections[0, 0, i, 4] * frameHeight) + x2 = int(detections[0, 0, i, 5] * frameWidth) + y2 = int(detections[0, 0, i, 6] * frameHeight) + cx = (x1 + x2) // 2 + cy = (y1 + y2) // 2 + w = x2 - x1 + h = y2 - y1 + faceInfo.append({'bbox': (x1, y1, x2, y2), 'center': (cx, cy), 'width': w, 'height': h}) + cv2.rectangle(frameOpencvDnn, (x1, y1), (x2, y2), (0, 255, 0), int(round(frameHeight / 150)), 8) + return frameOpencvDnn, faceInfo + +def save_frame(frame, output_folder, frame_count, folder_name, gender, age, center): + ''' + Function to save a frame with a specific filename format. + + Args: + frame: Frame to be saved. + output_folder: Folder where frames will be saved. + frame_count: Frame count. + folder_name: Name of the folder containing the video. + gender: Gender of the detected face. + age: Age range of the detected face. + center: Center coordinates of the detected face bbox. + ''' + if not os.path.exists(output_folder): + os.makedirs(output_folder) + subfolder_path = os.path.join(output_folder, folder_name) + if not os.path.exists(subfolder_path): + os.makedirs(subfolder_path) + cv2.imwrite(os.path.join(subfolder_path, f"{gender}_{age}_{center[0]}-{center[1]}.jpg"), frame) + +parser = argparse.ArgumentParser() +parser.add_argument('--folder', default='/Users/imseohyeon/Documents/gad/video') +parser.add_argument('--output_folder', default='frame') +parser.add_argument('--capture_interval', type=int, default=50) # Adjusted frame interval for capturing + +args = parser.parse_args() + +faceProto = "opencv_face_detector.pbtxt" +faceModel = "opencv_face_detector_uint8.pb" +ageProto = "age_deploy.prototxt" +ageModel = "age_net.caffemodel" +genderProto = "gender_deploy.prototxt" +genderModel = "gender_net.caffemodel" + +MODEL_MEAN_VALUES = (78.4263377603, 87.7689143744, 114.895847746) +ageList = ['(0-2)', '(4-6)', '(8-12)', '(15-20)', '(25-32)', '(38-43)', '(48-53)', '(60-100)'] +genderList = ['Male', 'Female'] + +faceNet = cv2.dnn.readNet(faceModel, faceProto) +ageNet = cv2.dnn.readNet(ageModel, ageProto) +genderNet = cv2.dnn.readNet(genderModel, genderProto) + +for root, dirs, files in os.walk(args.folder): + for folder_name in dirs: + folder_path = os.path.join(root, folder_name) + for filename in os.listdir(folder_path): + if filename.endswith(".mp4"): + video_path = os.path.join(folder_path, filename) + break + else: + continue + + video = cv2.VideoCapture(video_path) + padding = 20 + frame_count = 0 + while True: # Infinite loop for processing each frame + hasFrame, frame = video.read() + if not hasFrame: + break + + resultImg, faceInfo = highlightFace(faceNet, frame) + if faceInfo: # Process only if faces are detected + for faceData in faceInfo: + bbox = faceData['bbox'] + center = faceData['center'] + width = faceData['width'] + height = faceData['height'] + + face = frame[max(0, bbox[1] - padding): min(bbox[3] + padding, frame.shape[0] - 1), + max(0, bbox[0] - padding): min(bbox[2] + padding, frame.shape[1] - 1)] + + blob = cv2.dnn.blobFromImage(face, 1.0, (227, 227), MODEL_MEAN_VALUES, swapRB=False) + genderNet.setInput(blob) + genderPreds = 
genderNet.forward()
+                    gender = genderList[genderPreds[0].argmax()]
+                    print(f'Gender: {gender}')
+
+                    ageNet.setInput(blob)
+                    agePreds = ageNet.forward()
+                    age = ageList[agePreds[0].argmax()]
+                    print(f'Age: {age[1:-1]} years')
+
+                    cv2.putText(resultImg, f'{gender}, {age}', (bbox[0], bbox[1] - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 255, 255), 2, cv2.LINE_AA)
+                    cv2.putText(resultImg, f'Center: ({center[0]}, {center[1]})', (bbox[0], bbox[1] - 40), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 255, 255), 2, cv2.LINE_AA)
+                    cv2.putText(resultImg, f'Box: ({bbox[0]}, {bbox[1]}, {width}, {height})', (bbox[0], bbox[1] - 70), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 255, 255), 2, cv2.LINE_AA)
+
+                    cv2.imshow("Detecting age and gender", resultImg)
+
+                    # Capture and save frames at specified intervals
+                    if frame_count % args.capture_interval == 0:
+                        save_frame(frame, args.output_folder, frame_count, folder_name, gender, age, center)
+
+            frame_count += 1
+
+            # Press 'q' to exit
+            if cv2.waitKey(1) & 0xFF == ord('q'):
+                break
+
+        video.release()
+        cv2.destroyAllWindows()
diff --git a/data/relabel/relabel_select_csv.py b/data/relabel/relabel_select_csv.py
new file mode 100644
index 0000000..0ece279
--- /dev/null
+++ b/data/relabel/relabel_select_csv.py
@@ -0,0 +1,57 @@
+import os
+import pandas as pd
+from argparse import ArgumentParser
+
+def parse_args():
+    parser = ArgumentParser()
+
+    # Conventional args
+    parser.add_argument('--csv_file', type=str, default='output_test.csv')
+    parser.add_argument('--data_path', type=str, default='origin/video')
+    parser.add_argument('--save_csv', type=str, default='new_output.csv')
+
+    args = parser.parse_args()
+
+    return args
+
+
+def list_files_and_folders(data_path):
+    if os.path.isdir(data_path):
+        items = os.listdir(data_path)
+        return items
+    else:
+        return None
+
+def main(csv_file, data_path, save_csv):
+    ageList_s = [0, 4, 8, 15, 25, 38, 48, 60]  # Lower bound of each of the 8 age brackets
+
+    csv_data = pd.read_csv(csv_file, header=None)
+    youtube_ids = list_files_and_folders(data_path)
+    count = 0
+    for youtube_id in youtube_ids:
+        filtered_df = csv_data[csv_data[0].astype(str).str.contains(youtube_id)]
+        first_row = filtered_df.iloc[0:1]
+        file_name = list_files_and_folders(os.path.join(data_path, youtube_id))
+        for i in file_name:
+            file_name_list = i.split("_")
+            if file_name_list[-1] == "Store":  # Skip macOS .DS_Store entries
+                continue
+            else:
+                print(youtube_id, file_name_list)
+                first_row[4] = file_name_list[0]
+                first_row[5] = ageList_s.index(int(file_name_list[1]))
+                first_row.to_csv(save_csv, mode="a", index=False, header=False)
+                count += 1
+
+    print(count)  # Number of rows written
+
+
+if __name__ == '__main__':
+    args = parse_args()
+    main(**args.__dict__)
\ No newline at end of file
diff --git a/data/total_audio_video_image.py b/data/total_audio_video_image.py
new file mode 100644
index 0000000..0b6384a
--- /dev/null
+++ b/data/total_audio_video_image.py
@@ -0,0 +1,102 @@
+import os
+import numpy as np
+import librosa
+import soundfile as sf
+import matplotlib.pyplot as plt
+from pydub import AudioSegment
+import cv2
+from facenet_pytorch import MTCNN
+import subprocess
+
+# Initialize MTCNN for face detection
+mtcnn = MTCNN()
+
+from moviepy.editor import VideoFileClip
+
+# 1. Extract audio from the video
+def extract_audio_from_video(video_file, audio_file):
+    # Use ffmpeg to extract the audio track from the video
+    command = f"ffmpeg -i {video_file} -vn -acodec pcm_s16le -ar 44100 -ac 2 {audio_file}"
+    subprocess.call(command, shell=True)
+
+# 2. Extract human voice segments
+def detect_human_voice(audio_file):
+    # Detect human voice segments in the audio file and return their sample indices
+    y, sr = librosa.load(audio_file, sr=None)
+    voice_segments = librosa.effects.split(y, top_db=18)
+    voice_indices = []
+    for start, end in voice_segments:
+        if end - start >= sr * 1:  # keep only segments of at least one second
+            voice_indices.extend(range(start, end))
+    return voice_indices
+
+# 3. Collect only the voice segments and save them again; re-save the video to match this interval
+def save_detected_voice(audio_file, video_file, save_audio_file, save_video_file):
+    # Extract and save the detected human voice segments
+    y, sr = librosa.load(audio_file, sr=None)
+    voice_indices = detect_human_voice(audio_file)
+    combined_audio = y[voice_indices]
+    sf.write(save_audio_file, combined_audio, sr)
+
+    # Trim and save the video so that it matches the extracted audio
+    audio_clip = AudioSegment.from_wav(save_audio_file)
+    video_clip = VideoFileClip(video_file)
+    video_duration = int(video_clip.duration * 1000)  # video length in milliseconds, as an integer
+    if len(audio_clip) > video_duration:
+        audio_clip = audio_clip[:video_duration]
+    else:
+        audio_clip += audio_clip[-1] * (video_duration - len(audio_clip))  # pad by repeating the last millisecond
+
+    audio_clip.export(save_audio_file, format="wav")
+    video_clip.write_videofile(save_video_file, codec='libx264', audio_codec='aac')
+
+# 4. From the new video, extract frames where faces are detected; crop each face to its bbox (target 256x256) and save it as an image
+def extract_frames_with_faces(video_file, output_folder):
+    # Extract frames from the video file, detect faces, and save the face crops
+    cap = cv2.VideoCapture(video_file)
+    frame_rate = cap.get(cv2.CAP_PROP_FPS)
+    frame_interval = max(int(frame_rate * 10), 1)  # sample one frame every 10 seconds
+    frame_count = 0
+    success, frame = cap.read()
+    while success:
+        frame_count += 1
+        if frame_count % frame_interval == 0:
+            try:
+                boxes, _ = mtcnn.detect(frame)
+                if boxes is not None:
+                    for i, box in enumerate(boxes):
+                        # MTCNN returns corner coordinates (x1, y1, x2, y2)
+                        x1, y1, x2, y2 = [int(coord) for coord in box]
+                        face_image = frame[y1:y2, x1:x2]
+                        cv2.imwrite(os.path.join(output_folder, f"frame_{frame_count}_{i}.jpg"), face_image)
+            except Exception as e:
+                print(f"Failed to detect face in frame {frame_count}: {e}")
+        success, frame = cap.read()
+    cap.release()
+
+
+# Define paths
+video_file_path = "/Users/imseohyeon/Documents/voice2face-data/code/file/testvideo.mp4"
+audio_file_path = "/Users/imseohyeon/Documents/voice2face-data/code/file/testaudio.mp3"
+detected_voice_file_path = "/Users/imseohyeon/Documents/voice2face-data/code/file/combined_voice.wav"
+output_frame_folder = "/Users/imseohyeon/Documents/voice2face-data/code/file/images"
+trimmed_video_file_path = "/Users/imseohyeon/Documents/voice2face-data/code/file/trimmed_video.mp4"
+
+# Convert audio file to WAV format
+converted_audio_file_path = os.path.splitext(audio_file_path)[0] + ".wav"
+AudioSegment.from_file(audio_file_path).export(converted_audio_file_path, format="wav")
+
+# Extract audio from video
+extract_audio_from_video(video_file_path, converted_audio_file_path)
+
+# Create necessary folders
+for folder in [output_frame_folder, os.path.dirname(detected_voice_file_path)]:
+    os.makedirs(folder, exist_ok=True)
+
+# Step 2: Save the detected human voice segment and corresponding video
+save_detected_voice(converted_audio_file_path, video_file_path, detected_voice_file_path, trimmed_video_file_path)
+
+# If no human voice segments are detected, delete the corresponding video file
+if not os.path.exists(detected_voice_file_path):
+    os.remove(trimmed_video_file_path)
+else:
+    # Step 3: Extract frames with detected faces every 10 seconds
+    extract_frames_with_faces(trimmed_video_file_path, output_frame_folder)
diff --git a/data/video/video_clipimage.py b/data/video/video_clipimage.py
new file mode 100644
index 0000000..eb4e39c
--- /dev/null
+++ 
b/data/video/video_clipimage.py
@@ -0,0 +1,31 @@
+import cv2
+
+face_cascade = cv2.CascadeClassifier('code/file/haarcascade_frontalface_default.xml')
+
+# Read the image
+img = cv2.imread('code/file/image.png')
+gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+
+# Detect faces
+faces = face_cascade.detectMultiScale(gray, 1.3, 5)
+
+# Iterate over each detected face
+for (x, y, w, h) in faces:
+    # Draw a rectangle around the face region
+    cv2.rectangle(img, (x, y), (x+w, y+h), (255, 0, 0), 2)
+
+    # Resize the face region to 256x256
+    face_crop = cv2.resize(img[y:y+h, x:x+w], (256, 256))
+
+    # Save the face image (each detected face overwrites the previous crop)
+    cv2.imwrite('code/file/face_detected_256x256.png', face_crop)
+
+# Show the image
+cv2.imshow('Image view', img)
+
+# Wait until the 'q' key is pressed
+while cv2.waitKey(0) & 0xFF != ord('q'):
+    pass
+
+cv2.destroyAllWindows()
+
diff --git a/data/video/video_download.py b/data/video/video_download.py
new file mode 100644
index 0000000..faeeb96
--- /dev/null
+++ b/data/video/video_download.py
@@ -0,0 +1,74 @@
+from pytube import YouTube  # module for downloading YouTube videos
+import os.path  # module for handling paths
+import ffmpeg  # ffmpeg-python, module for converting media
+from getpass import getuser  # used to build the default download directory from the user name
+
+class Download:
+    '''
+    To convert files, the ffmpeg program must be installed separately and added to the system PATH.
+    '''
+    def __init__(self, link):
+        # The link argument receives the value entered in the GUI
+        # Use the computer user name to set the default download directory
+        self.parent_dir = f"/Users/{getuser()}/Documents/voice2face-data/code/file"
+        self.yt = YouTube(link)
+
+    def getVideoName(self):
+        '''(GUI version) Returns the video title.'''
+        name = self.yt.title
+        return name
+
+    def downloadMp3(self):
+        '''Downloads the audio as an mp3 file.'''
+        # Download an mp4 stream that contains only audio, no video
+        stream = self.yt.streams.filter(only_audio=True).first()
+        stream.download(self.parent_dir)
+
+        src = stream.default_filename  # title of the downloaded mp4 (same as its file name)
+        dst = "testaudio.mp3"  # new file name
+
+        # Convert from mp4 to mp3
+        ffmpeg.input(os.path.join(self.parent_dir, src)).output(os.path.join(self.parent_dir, dst)).run(overwrite_output=True)
+
+        # Delete the original mp4 file after conversion
+        os.remove(os.path.join(self.parent_dir, src))
+
+        return dst  # return the saved file name
+
+    def downloadMp4(self):
+        '''Downloads the video as an mp4 file.'''
+        audio = self.downloadMp3()  # download the mp3 file
+        video = self.yt.streams.filter(adaptive=True, file_extension='mp4').first()  # get the video stream object
+        print(video)
+        video.download(self.parent_dir)  # download the mp4 file
+
+        # High-resolution mp4 downloads carry only a video codec (no audio)
+        # -> to add sound, a stream with an audio codec is needed and merged with FFmpeg
+        # -> the mp3 obtained from downloadMp3 is used as the audio source
+        inputAudio = ffmpeg.input(os.path.join(self.parent_dir, audio))
+        inputVideo = ffmpeg.input(os.path.join(self.parent_dir, video.default_filename))
+
+        # Merge the audio into the video and export it as "new.mp4"
+        ffmpeg.output(inputAudio, inputVideo, os.path.join(self.parent_dir, "new.mp4"), vcodec='copy', acodec='aac').run(overwrite_output=True)
+
+        # # Delete the mp3 and mp4 files that are no longer needed after conversion
+        # os.remove(os.path.join(self.parent_dir, video.default_filename))
+        # os.remove(os.path.join(self.parent_dir, audio))
+
+        # Rename "new.mp4" to the original video file name
+        os.rename(os.path.join(self.parent_dir, "new.mp4"), os.path.join(self.parent_dir, video.default_filename))
+
+        return video.default_filename  # return the saved file name
+
+# Put the class definition above here.
+
+# Link to download
+link = "https://youtu.be/2DnGKEeRB4g?si=93Cf_Mg2n53kSpGQ"
+
+# Create a Download class instance
+downloader = Download(link)
+
+# Download as an mp4 file
+downloaded_file = downloader.downloadMp4()
+
+print(f"Download complete: {downloaded_file}")
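
The audio-extraction command in total_audio_video_image.py is built as a single shell
string, so paths containing spaces would break it. A sketch of the same ffmpeg call
using an argument list instead (functionally equivalent, assuming ffmpeg is on PATH;
video_file and audio_file are that function's parameters):

    import subprocess

    subprocess.call([
        "ffmpeg", "-i", video_file, "-vn", "-acodec", "pcm_s16le",
        "-ar", "44100", "-ac", "2", audio_file,
    ])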
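
The Haar cascade loaded by video_clipimage.py also ships with OpenCV itself, so the
local copy under code/file/ is optional. A sketch of loading the bundled file instead
(assumes a standard opencv-python install):

    import cv2

    face_cascade = cv2.CascadeClassifier(
        cv2.data.haarcascades + "haarcascade_frontalface_default.xml"
    )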