From a571787248b7994e3c4dd1c7599d28bdd588822d Mon Sep 17 00:00:00 2001 From: carboxaminoo Date: Wed, 27 Mar 2024 12:28:21 +0900 Subject: [PATCH] Feat : version 1 --- data/audio/audio_check_dB.py | 52 +++++++++ data/audio/audio_crop.py | 107 ++++++++++++++++++ data/crawling/crawling_detect.py | 139 +++++++++++++++++++++++ data/crawling/crawling_rename_video.py | 30 +++++ data/crawling/crawling_select_csv.py | 42 +++++++ data/crawling/crawling_urlsave.py | 97 ++++++++++++++++ data/crawling/crawling_videosave.py | 53 +++++++++ data/image_clipseg2.py | 104 +++++++++++++++++ data/relabel/relabel_Vox_age.py | 84 ++++++++++++++ data/relabel/relabel_detect_getframe.py | 141 ++++++++++++++++++++++++ data/relabel/relabel_select_csv.py | 57 ++++++++++ data/total_audio_video_image.py | 102 +++++++++++++++++ data/video/video_clipimage.py | 31 ++++++ data/video/video_download.py | 74 +++++++++++++ 14 files changed, 1113 insertions(+) create mode 100644 data/audio/audio_check_dB.py create mode 100644 data/audio/audio_crop.py create mode 100644 data/crawling/crawling_detect.py create mode 100644 data/crawling/crawling_rename_video.py create mode 100644 data/crawling/crawling_select_csv.py create mode 100644 data/crawling/crawling_urlsave.py create mode 100644 data/crawling/crawling_videosave.py create mode 100644 data/image_clipseg2.py create mode 100644 data/relabel/relabel_Vox_age.py create mode 100644 data/relabel/relabel_detect_getframe.py create mode 100644 data/relabel/relabel_select_csv.py create mode 100644 data/total_audio_video_image.py create mode 100644 data/video/video_clipimage.py create mode 100644 data/video/video_download.py diff --git a/data/audio/audio_check_dB.py b/data/audio/audio_check_dB.py new file mode 100644 index 0000000..a1a4bda --- /dev/null +++ b/data/audio/audio_check_dB.py @@ -0,0 +1,52 @@ +import librosa +import numpy as np +import matplotlib.pyplot as plt + +''' +You can determine the minimum, maximum, and average dB values +to set a threshold for identifying voice regions based on dB levels. +After visually inspecting the waveform and setting a threshold, +adding 80 to it, you can conveniently apply this threshold value to `audio_crop.py`. 
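+
+For illustration, a minimal sketch of turning the numbers reported below into a
+`top_db` value for `librosa.effects.split` (the -60 dB threshold and the file path
+are placeholders, and the "+ 80" simply follows the rule described above):
+
+    import librosa
+
+    y, sr = librosa.load("voice2face-data/audio/input.wav", sr=None)
+    threshold_db = -60              # value read off the spectrogram plot
+    top_db = threshold_db + 80      # e.g. -60 dB -> top_db = 20
+    intervals = librosa.effects.split(y, top_db=top_db)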
+''' + +# Load audio file +audio_path = "voice2face-data/audio/input.wav" +y, sr = librosa.load(audio_path, sr=None) + +# Calculate spectrum and check maximum and minimum dB values +D = librosa.amplitude_to_db(librosa.stft(y), ref=np.max) +max_db = np.max(D) +min_db = np.min(D) + +# Set threshold value +threshold_db = -60 + +# Consider regions with dB values above the threshold as voice regions +voice_indices = np.where(D > threshold_db) + +print("Threshold:", threshold_db) +print("Maximum dB value in regions with voice:", np.max(D[voice_indices])) +print("Minimum dB value in regions with voice:", np.min(D[voice_indices])) + +# Calculate average dB value in regions with voice +average_db = np.mean(D[voice_indices]) +print("Average dB value in regions with voice:", average_db) + +# Plot waveform and spectrum +plt.figure(figsize=(12, 6)) + +# Plot waveform +plt.subplot(2, 1, 1) +plt.plot(y) +plt.title("Waveform") +plt.xlabel("Sample") +plt.ylabel("Amplitude") + +# Plot spectrum +plt.subplot(2, 1, 2) +librosa.display.specshow(D, sr=sr, x_axis='time', y_axis='log') +plt.colorbar(format='%+2.0f dB') +plt.title('Log-frequency power spectrogram') + +plt.tight_layout() +plt.show() diff --git a/data/audio/audio_crop.py b/data/audio/audio_crop.py new file mode 100644 index 0000000..8bb3f58 --- /dev/null +++ b/data/audio/audio_crop.py @@ -0,0 +1,107 @@ +import os +import librosa +import soundfile as sf +import matplotlib.pyplot as plt +from pydub import AudioSegment + +''' +Extracts human voice segments from an audio file and creates a new audio file with the detected voice segments +within a 10-second duration. + +Args: + audio_file (str): Path to the input audio file. If the file format is .m4a, it will be converted to .wav. + +Returns: + save_file (str): Path to the saved audio file with detected voice segments. +''' + +def detect_human_voice(audio_file): + ''' + Detects human voice segments in an audio file. + + Args: + audio_file (str): Path to the input audio file. + + Returns: + voice_indices (list): List containing indices of the detected voice segments. + ''' + # Read the audio file + y, sr = librosa.load(audio_file, sr=None) + + # Detect voice activity + # ----- Need to Modify threshold-----# + voice_segments = librosa.effects.split(y, top_db=18) + + # Generate indices of voice segments + voice_indices = [] + for start, end in voice_segments: + voice_indices.extend(range(start, end)) + + return voice_indices + +def save_full_audio_with_detected_voice(audio_file, save_file): + ''' + Saves the full audio file with detected voice segments. + + Args: + audio_file (str): Path to the input audio file. + save_file (str): Path to save the audio file with detected voice segments. 
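+
+    Example (illustrative; matches the paths used at the bottom of this script, after
+    the .m4a input has been converted to .wav):
+
+        save_full_audio_with_detected_voice(
+            "voice2face-data/audio/input.wav",
+            "voice2face-data/audio/detected_voice.wav",
+        )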
+ ''' + # Read the entire audio file + y, sr = librosa.load(audio_file, sr=None) + + # Detect human voice segments and get their indices + voice_indices = detect_human_voice(audio_file) + + # Extract human voice segments using the indices + combined_audio = y[voice_indices] + + # Save the extracted audio segments to a file + sf.write(save_file, combined_audio, sr) + + # Visualize and save the waveform of the original and detected voice segments + plt.figure(figsize=(12, 6)) + + # Original audio waveform + plt.subplot(2, 1, 1) + plt.plot(y) + plt.title("Original Audio Waveform") + plt.xlabel("Sample") + plt.ylabel("Amplitude") + + # Waveform of detected voice segments + plt.subplot(2, 1, 2) + plt.plot(combined_audio) + plt.title("Detected Voice Waveform") + plt.xlabel("Sample") + plt.ylabel("Amplitude") + + plt.tight_layout() + save_path = os.path.join(os.path.dirname(save_file), 'result') + if not os.path.exists(save_path): + os.makedirs(save_path) + save_file_path = os.path.join(save_path, os.path.basename(save_file[:-4] + "_waveform_comparison.png")) + plt.savefig(save_file_path) + + # Save the extracted audio segments to a file + audio_save_file_path = os.path.join(save_path, os.path.basename(save_file)) + sf.write(audio_save_file_path, combined_audio, sr) + + plt.show() + +# Define paths for the original file and the file to save with detected voice segments +# ------Need to modify path------ # +audio_file_path = "voice2face-data/audio/input.m4a" +save_file_path = "voice2face-data/audio/detected_voice.wav" + +# Check if the file extension is ".m4a" for conversion and processing +if audio_file_path.endswith('.m4a'): + # Convert m4a file to wav format + wav_path = audio_file_path[:-4] + ".wav" + audio = AudioSegment.from_file(audio_file_path) + audio.export(wav_path, format="wav") + # Process the converted wav file + save_full_audio_with_detected_voice(wav_path, save_file_path) +else: + # Process the original file without conversion + save_full_audio_with_detected_voice(audio_file_path, save_file_path) diff --git a/data/crawling/crawling_detect.py b/data/crawling/crawling_detect.py new file mode 100644 index 0000000..917859f --- /dev/null +++ b/data/crawling/crawling_detect.py @@ -0,0 +1,139 @@ +import os +import pandas as pd +from moviepy.editor import VideoFileClip +import numpy as np +import face_recognition +import shutil + +''' +Detects faces and audio in video clips and refines them. + +Extracts faces from the video clips and selects segments with audio to rebuild new videos. +New videos are organized in the "processed_videos" folder. + +''' + +# Function to extract audio from video clips with detected faces +def extract_audio_with_face(video_clip, start_time, end_time): + ''' + Extracts audio from a video clip with detected faces within a specified time range. + + Args: + video_clip (VideoFileClip): Input video clip. + start_time (float): Start time of the segment containing the detected faces. + end_time (float): End time of the segment containing the detected faces. + + Returns: + audio (AudioClip): Extracted audio clip. + ''' + audio = video_clip.audio.subclip(start_time, end_time) + return audio + +# Function to extract audio from video clips with detected faces in multiple segments +def extract_audio_with_faces(video_clip, face_detections): + ''' + Extracts audio from a video clip with detected faces in multiple segments. + + Args: + video_clip (VideoFileClip): Input video clip. 
+        face_detections (list): List of tuples containing start and end times of segments with detected faces.
+
+    Returns:
+        final_audio (ndarray): Concatenated audio array from all detected face segments.
+    '''
+    audio_clips = []
+
+    for start_time, end_time in face_detections:
+        audio_clip = extract_audio_with_face(video_clip, start_time, end_time)
+        audio_clips.append(audio_clip)
+
+    final_audio = np.concatenate([clip.to_soundarray() for clip in audio_clips])
+    return final_audio
+
+# Function to detect faces in video clips
+def detect_faces(video_clip):
+    '''
+    Detects faces in a video clip.
+
+    Args:
+        video_clip (VideoFileClip): Input video clip.
+
+    Returns:
+        face_detections (list): List of tuples containing start and end times of segments with detected faces.
+    '''
+    frames = [frame for frame in video_clip.iter_frames()]
+    frame_rate = video_clip.fps
+    frame_times = np.arange(len(frames)) / frame_rate
+    face_detections = []
+
+    for i, frame in enumerate(frames):
+        face_locations = face_recognition.face_locations(frame)
+        if face_locations:
+            start_time = frame_times[max(0, i - 1)]
+            end_time = frame_times[min(len(frames) - 1, i + 1)]
+            face_detections.append((start_time, end_time))
+
+    return face_detections
+
+# Function to create a new video from detected face segments
+def create_new_video(video_clip, face_detections, output_path):
+    '''
+    Creates a new video from detected face segments.
+
+    Args:
+        video_clip (VideoFileClip): Input video clip.
+        face_detections (list): List of tuples containing start and end times of segments with detected faces.
+        output_path (str): Path to save the new video.
+    '''
+    from moviepy.editor import concatenate_videoclips
+
+    # Cut out each detected segment and join them into a single clip
+    # (VideoFileClip has no append method, so concatenate_videoclips is used)
+    subclips = [video_clip.subclip(start_time, end_time) for start_time, end_time in face_detections]
+    new_video_clip = concatenate_videoclips(subclips)
+    new_video_clip.write_videofile(output_path)
+
+# Read data from a CSV file
+csv_file_path = "/Users/imseohyeon/Documents/crawling/data/Youtube_search_df.csv"
+df = pd.read_csv(csv_file_path)
+
+# Paths for input and output folders
+DOWNLOAD_FOLDER = "/Users/imseohyeon/Documents/crawling/download/"
+NEW_FOLDER = "/Users/imseohyeon/Documents/crawling/processed_videos/"
+
+# Create a new folder if it doesn't exist
+if not os.path.exists(NEW_FOLDER):
+    os.makedirs(NEW_FOLDER)
+
+# Process each video to extract audio from segments with detected faces and create new videos
+for idx, row in df.iterrows():
+    video_filename = f"{idx}_video.mp4"
+    video_path = os.path.join(DOWNLOAD_FOLDER, video_filename)
+
+    if os.path.exists(video_path):
+        try:
+            video_clip = VideoFileClip(video_path)
+            face_detections = detect_faces(video_clip)
+
+            if face_detections:
+                final_audio = extract_audio_with_faces(video_clip, face_detections)
+                output_path = os.path.join(NEW_FOLDER, f"{idx}_new_video.mp4")
+                create_new_video(video_clip, face_detections, output_path)
+
+                print(f"Processing complete for {video_filename}")
+            else:
+                print(f"No faces detected in {video_filename}")
+        except Exception as e:
+            print(f"Error processing {video_filename}: {e}")
+    else:
+        print(f"File {video_filename} does not exist.")
+
+# Move the processed videos back into the download folder
+processed_files = os.listdir(NEW_FOLDER)
+for file in processed_files:
+    shutil.move(os.path.join(NEW_FOLDER, file), DOWNLOAD_FOLDER)
+
+print("All videos processed")
diff --git a/data/crawling/crawling_rename_video.py b/data/crawling/crawling_rename_video.py
new file mode 100644
index 0000000..3c005ad
--- /dev/null
+++ 
b/data/crawling/crawling_rename_video.py @@ -0,0 +1,30 @@ +import os +import pandas as pd + +''' +Match the video names in the 'download' folder with the index in the CSV. +This facilitates the subsequent video relabeling task. +''' + +# Read links from the CSV file +csv_file_path = "/Users/imseohyeon/Documents/crawling/data/Youtube_search_df.csv" +df = pd.read_csv(csv_file_path) + +# Path to the folder where downloaded videos are stored +DOWNLOAD_FOLDER = "/Users/imseohyeon/Documents/crawling/download/" + +# Iterate over all files in the folder and rename them +for filename in os.listdir(DOWNLOAD_FOLDER): + # Full path of the file + file_path = os.path.join(DOWNLOAD_FOLDER, filename) + # Check if the file is a .mp4 file + if filename.endswith(".mp4"): + # Extract the index value from the file name (assuming the video title is stored as the index) + idx = filename.split("_")[0] # Example: "0_video.mp4" -> "0" + # Create a new file name + new_filename = f"{idx}_video.mp4" + # Create the new file path + new_file_path = os.path.join(DOWNLOAD_FOLDER, new_filename) + # Rename the file + os.rename(file_path, new_file_path) + print(f"File renamed: {filename} -> {new_filename}") diff --git a/data/crawling/crawling_select_csv.py b/data/crawling/crawling_select_csv.py new file mode 100644 index 0000000..1bab567 --- /dev/null +++ b/data/crawling/crawling_select_csv.py @@ -0,0 +1,42 @@ +import os +import pandas as pd +from argparse import ArgumentParser + +def parse_args(): + parser = ArgumentParser() + + # Conventional args + parser.add_argument('--csv_file', type=str, default='output_test.csv') + parser.add_argument('--data_path', type=str, default='origin/video') + parser.add_argument('--save_csv', type=str, default='new_output.csv') + + args = parser.parse_args() + + return args + + +def list_files_and_folders(data_path): + if os.path.isdir(data_path): + items = os.listdir(data_path) + return items + else: + return None + +def main(csv_file, data_path, save_csv): + + csv_data = pd.read_csv(csv_file, header=None) + youtube_ids = list_files_and_folders(data_path) + + for youtube_id in youtube_ids: + filtered_df = csv_data[csv_data[0].astype(str).str.contains(youtube_id)] + first_row = filtered_df.iloc[0:1] + file_name = list_files_and_folders(os.path.join(data_path, youtube_id))[0] + file_name_list = file_name.split("_") + first_row[4] = file_name_list[0] + first_row[5] = file_name_list[1] + first_row.to_csv(save_csv, mode="a", index=False, header=False) + + +if __name__ == '__main__': + args = parse_args() + main(**args.__dict__) \ No newline at end of file diff --git a/data/crawling/crawling_urlsave.py b/data/crawling/crawling_urlsave.py new file mode 100644 index 0000000..36d9c8d --- /dev/null +++ b/data/crawling/crawling_urlsave.py @@ -0,0 +1,97 @@ +from selenium import webdriver +from selenium.webdriver.common.by import By +from selenium.webdriver.common.keys import Keys + +import requests +from bs4 import BeautifulSoup +import time +import pandas as pd +import os + +''' +YouTube crawling using Selenium + +Saves information of approximately 162 videos including name, title, and URL to Youtube_search_df.csv. 
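+
+The loop below scrolls a fixed 100 times; a sketch of an alternative stopping rule
+that uses the `finish_line` height defined below (illustrative only, not the
+behaviour of this script):
+
+    last_height = browser.execute_script("return document.documentElement.scrollHeight")
+    while last_height < finish_line:
+        browser.execute_script("window.scrollTo(0, document.documentElement.scrollHeight);")
+        time.sleep(2.0)
+        new_height = browser.execute_script("return document.documentElement.scrollHeight")
+        if new_height == last_height:  # no more results are being loaded
+            break
+        last_height = new_height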
+'''
+
+# Initialize WebDriver (executable_path not required as it's added to PATH)
+browser = webdriver.Chrome()
+
+# URL to access
+url = "https://youtube.com/"
+
+# Search keyword
+keyword = "Solo Travel"
+
+# Target page height to scroll to (unused by the fixed-count loop below;
+# finish_line = 40000 corresponds to roughly 162 videos)
+finish_line = 10000
+
+browser.maximize_window()
+browser.get(url)
+time.sleep(2)
+search = browser.find_element(By.NAME, "search_query")
+time.sleep(2)
+search.send_keys(keyword)
+search.send_keys(Keys.ENTER)
+
+# Switch to search result page for parsing
+present_url = browser.current_url
+browser.get(present_url)
+last_page_height = browser.execute_script("return document.documentElement.scrollHeight")
+
+# Scroll 100 times
+scroll_count = 0
+while scroll_count < 100:
+    # Scroll down
+    browser.execute_script("window.scrollTo(0, document.documentElement.scrollHeight);")
+    time.sleep(2.0)  # Waiting for data to load when scrolling down
+    new_page_height = browser.execute_script("return document.documentElement.scrollHeight")
+
+    # Increase scroll count
+    scroll_count += 1
+
+html_source = browser.page_source
+soup = BeautifulSoup(html_source, 'html.parser')
+
+# Retrieve all search results up to the finish line
+# Extract all content-related sections
+elem = soup.find_all("ytd-video-renderer", class_="style-scope ytd-item-section-renderer")
+
+# Retrieve necessary information
+df = []
+for t in elem[:100]:  # Retrieve only the first 100 videos' information
+    title = t.find("yt-formatted-string", class_="style-scope ytd-video-renderer").get_text()
+    name = t.find("a", class_="yt-simple-endpoint style-scope yt-formatted-string").get_text()
+    content_url = t.find("a", class_="yt-simple-endpoint style-scope ytd-video-renderer")["href"]
+    df.append([name, title, 'https://www.youtube.com/' + content_url])
+
+## Save data
+# Create DataFrame
+new = pd.DataFrame(columns=['name', 'title', 'url_link'])
+
+# Insert data
+for i in range(len(df)):
+    new.loc[i] = df[i]
+
+# Create directory to save data
+df_dir = "./data/"
+if not os.path.exists(df_dir):
+    os.makedirs(df_dir)
+
+# Save data
+new.to_csv(os.path.join(df_dir, "Youtube_search_df.csv"), index=True, encoding='utf8')  # Save with index
+
+## Save column information
+# Column description table
+col_names = ['name', 'title', 'url_link']
+col_exp = ['Channel name', 'Video title', 'URL link']
+
+new_exp = pd.DataFrame({'col_names': col_names,
+                        'col_explanation': col_exp})
+
+# Save
+new_exp.to_csv(os.path.join(df_dir, "Youtube_col_exp.csv"), index=False, encoding='utf8')
+
+# Close the browser
+browser.close()
diff --git a/data/crawling/crawling_videosave.py b/data/crawling/crawling_videosave.py
new file mode 100644
index 0000000..8c7c422
--- /dev/null
+++ b/data/crawling/crawling_videosave.py
@@ -0,0 +1,53 @@
+import os
+import pandas as pd
+from pytube import YouTube
+import time
+
+'''
+Download videos from URLs obtained through 'crawling_urlsave.py'.
+Videos are saved in the 'download' folder.
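+
+Note that pytube downloads complete streams, so the "first 5 minutes" mentioned below
+still has to be cut after the download. A minimal sketch with moviepy (already used
+elsewhere in this patch; the file name is illustrative):
+
+    from moviepy.editor import VideoFileClip
+
+    clip = VideoFileClip(os.path.join(DOWNLOAD_FOLDER, "0_video.mp4"))
+    if clip.duration > 5 * 60:
+        clip.subclip(0, 5 * 60).write_videofile(os.path.join(DOWNLOAD_FOLDER, "0_video_5min.mp4"))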
+''' + +# Read links from the CSV file +csv_file_path = "/Users/imseohyeon/Documents/crawling/data/Youtube_search_df.csv" +df = pd.read_csv(csv_file_path) + +# Define the download folder path +DOWNLOAD_FOLDER = "/Users/imseohyeon/Documents/crawling/download/" + +# Create the download folder if it doesn't exist +if not os.path.exists(DOWNLOAD_FOLDER): + os.makedirs(DOWNLOAD_FOLDER) + +# Iterate over each video and download +for idx, row in df.iterrows(): + video_url = row['url_link'] + try: + # Get video information using Pytube + yt = YouTube(video_url) + length_seconds = yt.length + + # Set the filename + filename = f"{idx}_video.mp4" + + # If the video length exceeds 5 minutes, download only the first 5 minutes + if length_seconds > 5 * 60: + print(f"{yt.title} video exceeds 5 minutes. Downloading only the first 5 minutes.") + stream = yt.streams.filter(adaptive=True, file_extension='mp4').first() + if stream: + print(f"Downloading: {yt.title}") + stream.download(output_path=DOWNLOAD_FOLDER, filename=filename) + print(f"{yt.title} download complete") + else: + print(f"No highest quality stream available for {yt.title}.") + else: + # Download the entire video for videos less than 5 minutes long + stream = yt.streams.get_highest_resolution() + if stream: + print(f"Downloading: {yt.title}") + stream.download(output_path=DOWNLOAD_FOLDER, filename=filename) + print(f"{yt.title} download complete") + else: + print(f"No highest quality stream available for {yt.title}.") + except Exception as e: + print(f"Failed to download {yt.title}: {e}") diff --git a/data/image_clipseg2.py b/data/image_clipseg2.py new file mode 100644 index 0000000..26f9521 --- /dev/null +++ b/data/image_clipseg2.py @@ -0,0 +1,104 @@ +from transformers import CLIPSegProcessor, CLIPSegForImageSegmentation +from PIL import Image +import torch +import numpy as np + +''' +This script performs image segmentation using the CLIPSeg model based on provided text prompts. It loads an image, processes it with text prompts, and generates a segmented image based on the identified objects. + +Inputs: +- image: The image to be segmented. +- positive_prompts: Text prompts describing the objects to be identified, separated by commas. +- negative_prompts: Text prompts describing the objects to be ignored, separated by commas. +- threshold: Threshold value for segmentation, between 0 and 1. + +Outputs: +- output_image: Segmented image with identified objects highlighted. +- final_mask: Final mask representing the segmented areas. + +''' + +# load CLIPSeg model & processor +processor = CLIPSegProcessor.from_pretrained("CIDAS/clipseg-rd64-refined") +model = CLIPSegForImageSegmentation.from_pretrained("CIDAS/clipseg-rd64-refined") + +# image path & load +image_path = "/root/project/voice2face-data/file/face_detected_256x256.png" +image = Image.open(image_path) + +def process_image(image, positive_prompts, negative_prompts, threshold): + ''' + This function performs image segmentation based on provided text prompts and threshold. + + Args: + - image: PIL image object. + - positive_prompts: Text prompts describing the objects to be identified, separated by commas. + - negative_prompts: Text prompts describing the objects to be ignored, separated by commas. + - threshold: Threshold value for segmentation, between 0 and 1. + + Returns: + - output_image: Segmented image with identified objects highlighted. + - final_mask: Final mask representing the segmented areas. 
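+
+    Example (illustrative; mirrors the defaults used at the bottom of this script):
+
+        img = Image.open("face_detected_256x256.png")
+        seg, mask = process_image(img, "face", "background", 0.5)
+        seg.save("segmented_image.png")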
+ ''' + + # image segmentation with img & prompt + def get_masks(prompts, img, threshold): + prompts = prompts.split(",") + masks = [] + for prompt in prompts: + inputs = processor( + text=prompt.strip(), images=image, padding="max_length", return_tensors="pt" + ) + with torch.no_grad(): + outputs = model(**inputs) + preds = outputs.logits + + pred = torch.sigmoid(preds) + mat = pred.cpu().numpy() + mask = Image.fromarray(np.uint8(mat * 255), "L") + mask = mask.convert("RGB") + mask = mask.resize(image.size) + mask = np.array(mask)[:, :, 0] + + # normalize the mask + mask_min = mask.min() + mask_max = mask.max() + mask = (mask - mask_min) / (mask_max - mask_min) + mask = mask > threshold + masks.append(mask) + return masks + + # Make mask's Positive prompts, Negative prompts + positive_masks = get_masks(positive_prompts, image, threshold) + negative_masks = get_masks(negative_prompts, image, threshold) + + # Make Result mask combined masks + pos_mask = np.any(np.stack(positive_masks), axis=0) + neg_mask = np.any(np.stack(negative_masks), axis=0) + final_mask = pos_mask & ~neg_mask + + # Result image + final_mask = Image.fromarray(final_mask.astype(np.uint8) * 255, "L") + output_image = Image.new("RGBA", image.size, (0, 0, 0, 0)) + output_image.paste(image, mask=final_mask) + return output_image, final_mask + +# base prompt +positive_prompts = "face" +negative_prompts = "background" +threshold = 0.5 + +# 텍스트 프롬프트 및 임계값 설정 +# positive_prompts = input("what you want to identify (comma separated): ") +# negative_prompts = input("what you want to ignore (comma separated): ") +# threshold = float(input("enter the threshold value (between 0 and 1): ")) + +# process of segmentation +output_image, final_mask = process_image(image, positive_prompts, negative_prompts, threshold) + +# save result img +output_image_path = "/root/project/voice2face-data/file/segmented_image.png" +output_image.save(output_image_path) + +# print success message +print("Segmented image saved successfully at:", output_image_path) diff --git a/data/relabel/relabel_Vox_age.py b/data/relabel/relabel_Vox_age.py new file mode 100644 index 0000000..842afa7 --- /dev/null +++ b/data/relabel/relabel_Vox_age.py @@ -0,0 +1,84 @@ +import cv2 +import argparse +import os +import csv +from collections import Counter + +def predict_age(face): + ''' + Function to predict age from a face image. + + Args: + face: Image to predict age from. + + Returns: + age: Predicted age group index. + ''' + blob = cv2.dnn.blobFromImage(face, 1.0, (227, 227), MODEL_MEAN_VALUES, swapRB=False) + ageNet.setInput(blob) + agePreds = ageNet.forward() + age = agePreds[0].argmax() + return age + +def count_and_print_age(folder_path): + ''' + Function to count and print the most common age group from images in a folder. + + Args: + folder_path: Path to the folder containing images. + + Returns: + most_common_age: Index of the most common age group. 
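+
+    Example (illustrative; the folder path is a placeholder, and the returned index
+    follows the same 8 age brackets, (0-2) through (60-100), listed in
+    relabel_detect_getframe.py):
+
+        idx = count_and_print_age("frames/id10001")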
+    '''
+    age_list = []
+    try:
+        for filename in os.listdir(folder_path):
+            if filename.endswith(".jpg"):
+                image_path = os.path.join(folder_path, filename)
+                frame = cv2.imread(image_path)
+                if frame is not None:
+                    age = predict_age(frame)
+                    age_list.append(age)
+    except FileNotFoundError:
+        print(f"Folder '{folder_path}' not found.")
+        return None
+
+    if age_list:
+        most_common_age = Counter(age_list).most_common(1)[0][0]
+        print("Most common age group index:", most_common_age)
+        return most_common_age
+    else:
+        print("No images detected in the folder.")
+        return None
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--folder', required=True, help='Path to the folder containing images.')
+    parser.add_argument('--csv_source', default="/home/carbox/Desktop/git/dataset/vox1/vox1_meta.csv")
+    parser.add_argument('--output_csv', default="/home/carbox/Desktop/git/dataset/test/csv/test.csv")
+    args = parser.parse_args()
+
+    ageProto = "weights/age_deploy.prototxt"
+    ageModel = "weights/age_net.caffemodel"
+
+    MODEL_MEAN_VALUES = (78.4263377603, 87.7689143744, 114.895847746)
+    ageList = ['0', '1', '2', '3', '4', '5', '6', '7']  # Report the age group as an index rather than a label
+
+    ageNet = cv2.dnn.readNet(ageModel, ageProto)
+
+    with open(args.output_csv, 'w', newline='') as output_csv:
+        csvwriter = csv.writer(output_csv, delimiter='\t')
+
+        with open(args.csv_source, 'r') as file:
+            for idx, line in enumerate(file):
+                if idx == 0:
+                    csvwriter.writerow(line.strip().split("\t") + ["age"])  # Append an "age" column to the header row
+                    continue
+                line = line.strip()  # Remove leading/trailing whitespaces
+                if line:
+                    fields = line.split("\t")
+                    image_name = fields[1]
+                    age_index = count_and_print_age(os.path.join(args.folder, image_name))
+                    if age_index is not None:
+                        fields.append(str(age_index))  # Convert age_index to a string and append it to fields
+                    csvwriter.writerow(fields)
diff --git a/data/relabel/relabel_detect_getframe.py b/data/relabel/relabel_detect_getframe.py
new file mode 100644
index 0000000..73915b4
--- /dev/null
+++ b/data/relabel/relabel_detect_getframe.py
@@ -0,0 +1,141 @@
+import cv2
+import math
+import argparse
+import os
+
+def highlightFace(net, frame, conf_threshold=0.7):
+    '''
+    Function to detect faces in a frame using a pre-trained deep learning model.
+
+    Args:
+        net: Pre-trained deep learning model.
+        frame: Input frame.
+        conf_threshold: Confidence threshold for face detection.
+
+    Returns:
+        frameOpencvDnn: Copy of the input frame with face rectangles drawn.
+        faceInfo: List containing information about detected faces (bbox, center, width, height).
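+
+    Example (illustrative):
+
+        annotated, faces = highlightFace(faceNet, frame)
+        for info in faces:
+            x1, y1, x2, y2 = info['bbox']
+            cx, cy = info['center']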
+ ''' + frameOpencvDnn = frame.copy() + frameHeight = frameOpencvDnn.shape[0] + frameWidth = frameOpencvDnn.shape[1] + blob = cv2.dnn.blobFromImage(frameOpencvDnn, 1.0, (300, 300), [104, 117, 123], True, False) + + net.setInput(blob) + detections = net.forward() + faceInfo = [] + for i in range(detections.shape[2]): + confidence = detections[0, 0, i, 2] + if confidence > conf_threshold: + x1 = int(detections[0, 0, i, 3] * frameWidth) + y1 = int(detections[0, 0, i, 4] * frameHeight) + x2 = int(detections[0, 0, i, 5] * frameWidth) + y2 = int(detections[0, 0, i, 6] * frameHeight) + cx = (x1 + x2) // 2 + cy = (y1 + y2) // 2 + w = x2 - x1 + h = y2 - y1 + faceInfo.append({'bbox': (x1, y1, x2, y2), 'center': (cx, cy), 'width': w, 'height': h}) + cv2.rectangle(frameOpencvDnn, (x1, y1), (x2, y2), (0, 255, 0), int(round(frameHeight / 150)), 8) + return frameOpencvDnn, faceInfo + +def save_frame(frame, output_folder, frame_count, folder_name, gender, age, center): + ''' + Function to save a frame with a specific filename format. + + Args: + frame: Frame to be saved. + output_folder: Folder where frames will be saved. + frame_count: Frame count. + folder_name: Name of the folder containing the video. + gender: Gender of the detected face. + age: Age range of the detected face. + center: Center coordinates of the detected face bbox. + ''' + if not os.path.exists(output_folder): + os.makedirs(output_folder) + subfolder_path = os.path.join(output_folder, folder_name) + if not os.path.exists(subfolder_path): + os.makedirs(subfolder_path) + cv2.imwrite(os.path.join(subfolder_path, f"{gender}_{age}_{center[0]}-{center[1]}.jpg"), frame) + +parser = argparse.ArgumentParser() +parser.add_argument('--folder', default='/Users/imseohyeon/Documents/gad/video') +parser.add_argument('--output_folder', default='frame') +parser.add_argument('--capture_interval', type=int, default=50) # Adjusted frame interval for capturing + +args = parser.parse_args() + +faceProto = "opencv_face_detector.pbtxt" +faceModel = "opencv_face_detector_uint8.pb" +ageProto = "age_deploy.prototxt" +ageModel = "age_net.caffemodel" +genderProto = "gender_deploy.prototxt" +genderModel = "gender_net.caffemodel" + +MODEL_MEAN_VALUES = (78.4263377603, 87.7689143744, 114.895847746) +ageList = ['(0-2)', '(4-6)', '(8-12)', '(15-20)', '(25-32)', '(38-43)', '(48-53)', '(60-100)'] +genderList = ['Male', 'Female'] + +faceNet = cv2.dnn.readNet(faceModel, faceProto) +ageNet = cv2.dnn.readNet(ageModel, ageProto) +genderNet = cv2.dnn.readNet(genderModel, genderProto) + +for root, dirs, files in os.walk(args.folder): + for folder_name in dirs: + folder_path = os.path.join(root, folder_name) + for filename in os.listdir(folder_path): + if filename.endswith(".mp4"): + video_path = os.path.join(folder_path, filename) + break + else: + continue + + video = cv2.VideoCapture(video_path) + padding = 20 + frame_count = 0 + while True: # Infinite loop for processing each frame + hasFrame, frame = video.read() + if not hasFrame: + break + + resultImg, faceInfo = highlightFace(faceNet, frame) + if faceInfo: # Process only if faces are detected + for faceData in faceInfo: + bbox = faceData['bbox'] + center = faceData['center'] + width = faceData['width'] + height = faceData['height'] + + face = frame[max(0, bbox[1] - padding): min(bbox[3] + padding, frame.shape[0] - 1), + max(0, bbox[0] - padding): min(bbox[2] + padding, frame.shape[1] - 1)] + + blob = cv2.dnn.blobFromImage(face, 1.0, (227, 227), MODEL_MEAN_VALUES, swapRB=False) + genderNet.setInput(blob) + genderPreds = 
genderNet.forward()
+                    gender = genderList[genderPreds[0].argmax()]
+                    print(f'Gender: {gender}')
+
+                    ageNet.setInput(blob)
+                    agePreds = ageNet.forward()
+                    age = ageList[agePreds[0].argmax()]
+                    print(f'Age: {age[1:-1]} years')
+
+                    cv2.putText(resultImg, f'{gender}, {age}', (bbox[0], bbox[1] - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 255, 255), 2, cv2.LINE_AA)
+                    cv2.putText(resultImg, f'Center: ({center[0]}, {center[1]})', (bbox[0], bbox[1] - 40), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 255, 255), 2, cv2.LINE_AA)
+                    cv2.putText(resultImg, f'Box: ({bbox[0]}, {bbox[1]}, {width}, {height})', (bbox[0], bbox[1] - 70), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 255, 255), 2, cv2.LINE_AA)
+
+                    cv2.imshow("Detecting age and gender", resultImg)
+
+                    # Capture and save frames at specified intervals
+                    if frame_count % args.capture_interval == 0:
+                        save_frame(frame, args.output_folder, frame_count, folder_name, gender, age, center)
+
+            frame_count += 1
+
+            # Press 'q' to exit
+            if cv2.waitKey(1) & 0xFF == ord('q'):
+                break
+
+        video.release()
+        cv2.destroyAllWindows()
diff --git a/data/relabel/relabel_select_csv.py b/data/relabel/relabel_select_csv.py
new file mode 100644
index 0000000..0ece279
--- /dev/null
+++ b/data/relabel/relabel_select_csv.py
@@ -0,0 +1,57 @@
+import os
+import pandas as pd
+from argparse import ArgumentParser
+
+def parse_args():
+    parser = ArgumentParser()
+
+    # Conventional args
+    parser.add_argument('--csv_file', type=str, default='output_test.csv')
+    parser.add_argument('--data_path', type=str, default='origin/video')
+    parser.add_argument('--save_csv', type=str, default='new_output.csv')
+
+    args = parser.parse_args()
+
+    return args
+
+
+def list_files_and_folders(data_path):
+    if os.path.isdir(data_path):
+        items = os.listdir(data_path)
+        return items
+    else:
+        return None
+
+def main(csv_file, data_path, save_csv):
+    ageList_s = [0, 4, 8, 15, 25, 38, 48, 60]  # Lower bound of each of the 8 age brackets
+
+    csv_data = pd.read_csv(csv_file, header=None)
+    youtube_ids = list_files_and_folders(data_path)
+    count = 0
+    for youtube_id in youtube_ids:
+        filtered_df = csv_data[csv_data[0].astype(str).str.contains(youtube_id)]
+        first_row = filtered_df.iloc[0:1]
+        file_name = list_files_and_folders(os.path.join(data_path, youtube_id))
+        for i in file_name:
+            file_name_list = i.split("_")
+            if file_name_list[-1] == "Store":  # Skip macOS .DS_Store entries
+                continue
+            else:
+                print(youtube_id, file_name_list)
+                first_row[4] = file_name_list[0]
+                first_row[5] = ageList_s.index(int(file_name_list[1]))
+                first_row.to_csv(save_csv, mode="a", index=False, header=False)
+                count += 1
+
+    print(count)  # Number of rows written
+
+
+if __name__ == '__main__':
+    args = parse_args()
+    main(**args.__dict__)
\ No newline at end of file
diff --git a/data/total_audio_video_image.py b/data/total_audio_video_image.py
new file mode 100644
index 0000000..0b6384a
--- /dev/null
+++ b/data/total_audio_video_image.py
@@ -0,0 +1,102 @@
+import os
+import numpy as np
+import librosa
+import soundfile as sf
+import matplotlib.pyplot as plt
+from pydub import AudioSegment
+import cv2
+from facenet_pytorch import MTCNN
+import subprocess
+
+# Initialize MTCNN for face detection
+mtcnn = MTCNN()
+
+from moviepy.editor import VideoFileClip
+
+# 1. Extract audio from the video
+def extract_audio_from_video(video_file, audio_file):
+    # Use ffmpeg to extract the audio track from the video
+    command = f"ffmpeg -i {video_file} -vn -acodec pcm_s16le -ar 44100 -ac 2 {audio_file}"
+    subprocess.call(command, shell=True)
+
+# 2. Extract human voice segments
+def detect_human_voice(audio_file):
+    # Detect human voice segments in the audio file and return their sample indices
+    y, sr = librosa.load(audio_file, sr=None)
+    voice_segments = librosa.effects.split(y, top_db=18)
+    voice_indices = []
+    for start, end in voice_segments:
+        if end - start >= sr * 1:  # keep only segments of at least one second
+            voice_indices.extend(range(start, end))
+    return voice_indices
+
+# 3. Collect only the voice segments and save them again; re-save the video to match this interval
+def save_detected_voice(audio_file, video_file, save_audio_file, save_video_file):
+    # Extract and save the detected human voice segments
+    y, sr = librosa.load(audio_file, sr=None)
+    voice_indices = detect_human_voice(audio_file)
+    combined_audio = y[voice_indices]
+    sf.write(save_audio_file, combined_audio, sr)
+
+    # Trim and save the video so that it matches the extracted audio
+    audio_clip = AudioSegment.from_wav(save_audio_file)
+    video_clip = VideoFileClip(video_file)
+    video_duration = int(video_clip.duration * 1000)  # video length in milliseconds, as an integer
+    if len(audio_clip) > video_duration:
+        audio_clip = audio_clip[:video_duration]
+    else:
+        audio_clip += audio_clip[-1] * (video_duration - len(audio_clip))  # pad by repeating the last millisecond
+
+    audio_clip.export(save_audio_file, format="wav")
+    video_clip.write_videofile(save_video_file, codec='libx264', audio_codec='aac')
+
+# 4. From the new video, extract frames where faces are detected; crop each face to its bbox (target 256x256) and save it as an image
+def extract_frames_with_faces(video_file, output_folder):
+    # Extract frames from the video file, detect faces, and save the face crops
+    cap = cv2.VideoCapture(video_file)
+    frame_rate = cap.get(cv2.CAP_PROP_FPS)
+    frame_interval = max(int(frame_rate * 10), 1)  # sample one frame every 10 seconds
+    frame_count = 0
+    success, frame = cap.read()
+    while success:
+        frame_count += 1
+        if frame_count % frame_interval == 0:
+            try:
+                boxes, _ = mtcnn.detect(frame)
+                if boxes is not None:
+                    for i, box in enumerate(boxes):
+                        # MTCNN returns corner coordinates (x1, y1, x2, y2)
+                        x1, y1, x2, y2 = [int(coord) for coord in box]
+                        face_image = frame[y1:y2, x1:x2]
+                        cv2.imwrite(os.path.join(output_folder, f"frame_{frame_count}_{i}.jpg"), face_image)
+            except Exception as e:
+                print(f"Failed to detect face in frame {frame_count}: {e}")
+        success, frame = cap.read()
+    cap.release()
+
+
+# Define paths
+video_file_path = "/Users/imseohyeon/Documents/voice2face-data/code/file/testvideo.mp4"
+audio_file_path = "/Users/imseohyeon/Documents/voice2face-data/code/file/testaudio.mp3"
+detected_voice_file_path = "/Users/imseohyeon/Documents/voice2face-data/code/file/combined_voice.wav"
+output_frame_folder = "/Users/imseohyeon/Documents/voice2face-data/code/file/images"
+trimmed_video_file_path = "/Users/imseohyeon/Documents/voice2face-data/code/file/trimmed_video.mp4"
+
+# Convert audio file to WAV format
+converted_audio_file_path = os.path.splitext(audio_file_path)[0] + ".wav"
+AudioSegment.from_file(audio_file_path).export(converted_audio_file_path, format="wav")
+
+# Extract audio from video
+extract_audio_from_video(video_file_path, converted_audio_file_path)
+
+# Create necessary folders
+for folder in [output_frame_folder, os.path.dirname(detected_voice_file_path)]:
+    os.makedirs(folder, exist_ok=True)
+
+# Step 2: Save the detected human voice segment and corresponding video
+save_detected_voice(converted_audio_file_path, video_file_path, detected_voice_file_path, trimmed_video_file_path)
+
+# If no human voice segments are detected, delete the corresponding video file
+if not os.path.exists(detected_voice_file_path):
+    os.remove(trimmed_video_file_path)
+else:
+    # Step 3: Extract frames with detected faces every 10 seconds
+    extract_frames_with_faces(trimmed_video_file_path, output_frame_folder)
diff --git a/data/video/video_clipimage.py b/data/video/video_clipimage.py
new file mode 100644
index 0000000..eb4e39c
--- /dev/null
+++ 
b/data/video/video_clipimage.py
@@ -0,0 +1,31 @@
+import cv2
+
+face_cascade = cv2.CascadeClassifier('code/file/haarcascade_frontalface_default.xml')
+
+# Read the image
+img = cv2.imread('code/file/image.png')
+gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+
+# Detect faces
+faces = face_cascade.detectMultiScale(gray, 1.3, 5)
+
+# Iterate over each detected face
+for (x, y, w, h) in faces:
+    # Draw a rectangle around the face region
+    cv2.rectangle(img, (x, y), (x+w, y+h), (255, 0, 0), 2)
+
+    # Resize the face region to 256x256
+    face_crop = cv2.resize(img[y:y+h, x:x+w], (256, 256))
+
+    # Save the face image (each detected face overwrites the previous crop)
+    cv2.imwrite('code/file/face_detected_256x256.png', face_crop)
+
+# Show the image
+cv2.imshow('Image view', img)
+
+# Wait until the 'q' key is pressed
+while cv2.waitKey(0) & 0xFF != ord('q'):
+    pass
+
+cv2.destroyAllWindows()
+
diff --git a/data/video/video_download.py b/data/video/video_download.py
new file mode 100644
index 0000000..faeeb96
--- /dev/null
+++ b/data/video/video_download.py
@@ -0,0 +1,74 @@
+from pytube import YouTube  # module for downloading YouTube videos
+import os.path  # module for handling paths
+import ffmpeg  # ffmpeg-python, module for converting media
+from getpass import getuser  # used to build the default download directory from the user name
+
+class Download:
+    '''
+    To convert files, the ffmpeg program must be installed separately and added to the system PATH.
+    '''
+    def __init__(self, link):
+        # The link argument receives the value entered in the GUI
+        # Use the computer user name to set the default download directory
+        self.parent_dir = f"/Users/{getuser()}/Documents/voice2face-data/code/file"
+        self.yt = YouTube(link)
+
+    def getVideoName(self):
+        '''(GUI version) Returns the video title.'''
+        name = self.yt.title
+        return name
+
+    def downloadMp3(self):
+        '''Downloads the audio as an mp3 file.'''
+        # Download an mp4 stream that contains only audio, no video
+        stream = self.yt.streams.filter(only_audio=True).first()
+        stream.download(self.parent_dir)
+
+        src = stream.default_filename  # title of the downloaded mp4 (same as its file name)
+        dst = "testaudio.mp3"  # new file name
+
+        # Convert from mp4 to mp3
+        ffmpeg.input(os.path.join(self.parent_dir, src)).output(os.path.join(self.parent_dir, dst)).run(overwrite_output=True)
+
+        # Delete the original mp4 file after conversion
+        os.remove(os.path.join(self.parent_dir, src))
+
+        return dst  # return the saved file name
+
+    def downloadMp4(self):
+        '''Downloads the video as an mp4 file.'''
+        audio = self.downloadMp3()  # download the mp3 file
+        video = self.yt.streams.filter(adaptive=True, file_extension='mp4').first()  # get the video stream object
+        print(video)
+        video.download(self.parent_dir)  # download the mp4 file
+
+        # High-resolution mp4 downloads carry only a video codec (no audio)
+        # -> to add sound, a stream with an audio codec is needed and merged with FFmpeg
+        # -> the mp3 obtained from downloadMp3 is used as the audio source
+        inputAudio = ffmpeg.input(os.path.join(self.parent_dir, audio))
+        inputVideo = ffmpeg.input(os.path.join(self.parent_dir, video.default_filename))
+
+        # Merge the audio into the video and export it as "new.mp4"
+        ffmpeg.output(inputAudio, inputVideo, os.path.join(self.parent_dir, "new.mp4"), vcodec='copy', acodec='aac').run(overwrite_output=True)
+
+        # # Delete the mp3 and mp4 files that are no longer needed after conversion
+        # os.remove(os.path.join(self.parent_dir, video.default_filename))
+        # os.remove(os.path.join(self.parent_dir, audio))
+
+        # Rename "new.mp4" to the original video file name
+        os.rename(os.path.join(self.parent_dir, "new.mp4"), os.path.join(self.parent_dir, video.default_filename))
+
+        return video.default_filename  # return the saved file name
+
+# Put the class definition above here.
+
+# Link to download
+link = "https://youtu.be/2DnGKEeRB4g?si=93Cf_Mg2n53kSpGQ"
+
+# Create a Download class instance
+downloader = Download(link)
+
+# Download as an mp4 file
+downloaded_file = downloader.downloadMp4()
+
+print(f"Download complete: {downloaded_file}")
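
The audio-extraction command in total_audio_video_image.py is built as a single shell
string, so paths containing spaces would break it. A sketch of the same ffmpeg call
using an argument list instead (functionally equivalent, assuming ffmpeg is on PATH;
video_file and audio_file are that function's parameters):

    import subprocess

    subprocess.call([
        "ffmpeg", "-i", video_file, "-vn", "-acodec", "pcm_s16le",
        "-ar", "44100", "-ac", "2", audio_file,
    ])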
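
The Haar cascade loaded by video_clipimage.py also ships with OpenCV itself, so the
local copy under code/file/ is optional. A sketch of loading the bundled file instead
(assumes a standard opencv-python install):

    import cv2

    face_cascade = cv2.CascadeClassifier(
        cv2.data.haarcascades + "haarcascade_frontalface_default.xml"
    )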