Adjust directory structure, optimize path compatibility for Linux and Windows #37

Open · wants to merge 5 commits into master
1 change: 1 addition & 0 deletions .gitignore
@@ -9,3 +9,4 @@ syncnet_checkpoint
*.jpg
data_utils/encoder.onnx
__MACOSX
/datasets/
29 changes: 8 additions & 21 deletions README.md
@@ -32,12 +32,7 @@ conda create -n dh python=3.10
conda activate dh
conda install pytorch==1.13.1 torchvision==0.14.1 torchaudio==0.13.1 pytorch-cuda=11.7 -c pytorch -c nvidia
conda install mkl=2024.0
pip install opencv-python
pip install transformers
pip install numpy==1.23.5
pip install soundfile
pip install librosa
pip install onnxruntime
pip install -r requirements.txt
```

I only ran it on pytorch==1.13.1; other versions should also work.
@@ -56,6 +51,8 @@ Prepare your video, 3~5min is good. Make sure that every frame of the video has

First of all, we need to extract the audio features. I'm using two different extractors, from wenet and hubert; thanks to them for their great work.

The wenet code and pretrained model come from: https://github.com/Tzenthin/wenet_mnn

First we need to extract the audio features. I used two different feature extractors, wenet and hubert; thanks to them for their great work.

When using wenet, you need to ensure that your video frame rate is 20; for hubert, your video frame rate should be 25.
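
If your source video is at a different frame rate, one common way to resample it first is ffmpeg; this is a hypothetical helper step rather than part of this repo's tooling, and the file names are placeholders:

``` bash
# Re-encode to 25 fps for hubert (use -r 20 for wenet); the audio track is copied unchanged
ffmpeg -i your_video.mp4 -r 25 -c:v libx264 -crf 18 -c:a copy your_video_25fps.mp4
```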
@@ -90,24 +87,22 @@ Train a syncnet first for better results.
Train a syncnet first; the results will be better.

``` bash
cd ..
python syncnet.py --save_dir ./syncnet_ckpt/ --dataset_dir ./data_dir/ --asr hubert
python train_syncnet_model.py --save_dir ./checkpoint/syncnet_ckpt/ --dataset_dir ./datasets/ --epochs 200 --batchsize 16 --num_workers 4 --lr 0.001 --asr hubert
```

Then find the best one (lowest loss) to train the digital human model.

Then pick the checkpoint with the lowest loss to train the digital human model.

``` bash
cd ..
python train.py --dataset_dir ./data_dir/ --save_dir ./checkpoint/ --asr hubert --use_syncnet --syncnet_checkpoint syncnet_ckpt
python train_render_model.py --dataset_dir ./datasets/ --save_dir ./checkpoint/render_ckpt/ --epochs 200 --batchsize 16 --lr 0.001 --asr hubert --use_syncnet --syncnet_checkpoint ./checkpoint/syncnet.pth
```
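
Note that the syncnet command above saves its checkpoints under ./checkpoint/syncnet_ckpt/, while the render training command points --syncnet_checkpoint at ./checkpoint/syncnet.pth. If your best checkpoint lives in syncnet_ckpt/, one way to bridge the two is to copy it into place; this is only a sketch and the checkpoint file name is a placeholder:

``` bash
# Copy the lowest-loss syncnet checkpoint to the path expected by train_render_model.py
cp ./checkpoint/syncnet_ckpt/your_best_epoch.pth ./checkpoint/syncnet.pth
```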

## inference

Before running inference, you need to extract the test audio feature (I will merge this step with the inference step); run this
Before running inference, you need to extract the test audio feature; run this (the following is no longer necessary, as it has been merged into inference)

Before inference, you need to extract the features of the test audio first (this step will be merged with inference later); run
Before inference, you need to extract the features of the test audio first; run (the following is no longer needed, as it has been merged into inference)

``` bash
python data_utils/hubert.py --wav your_test_audio.wav # when using hubert
@@ -121,7 +116,7 @@ then you get your_test_audio_hu.npy or your_test_audio_wenet.npy

then run
``` bash
python inference.py --asr hubert --dataset ./your_data_dir/ --audio_feat your_test_audio_hu.npy --save_path xxx.mp4 --checkpoint your_trained_ckpt.pth
python inference.py --asr hubert --dataset ./your_data_dir/ --wav your_test_audio_hu.wav --save_path xxx.mp4 --checkpoint your_trained_ckpt.pth
```

To merge the audio and the video, run
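
A typical ffmpeg invocation for this is shown below as an assumption; the repo may use different flags, and the file names are placeholders:

``` bash
# Mux the generated video with the test audio; -shortest trims to the shorter stream
ffmpeg -i xxx.mp4 -i your_test_audio.wav -map 0:v -map 1:a -c:v copy -c:a aac -shortest result.mp4
```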
@@ -143,11 +138,3 @@ if you have some advice, open an issue or PR.
If you think this repo is useful to you, please give me a star.

If you think this repo is useful to you, remember to give me a star.

BUY ME A CUP OF COFFEE⬇️⬇️⬇️
<table>
<tr>
<td><img src="demo/15bef5a6d08434c0d70f0ba39bb14fc0.JPG" width="180"/></td>
<td><img src="demo/36d2896f13bee68247de6ccc89b17a94.JPG" width="180"/></td>
</tr>
</table>
2 changes: 1 addition & 1 deletion data_utils/get_landmark.py
@@ -1,5 +1,5 @@
import argparse
from os import wait3
# from os import wait3

import numpy as np
import cv2
56 changes: 29 additions & 27 deletions data_utils/hubert.py
@@ -1,25 +1,30 @@
from transformers import Wav2Vec2Processor, HubertModel
import argparse
import soundfile as sf
import numpy as np
import torch
import librosa

from transformers import Wav2Vec2Processor, HubertModel

print("Loading the Wav2Vec2 Processor...")
wav2vec2_processor = Wav2Vec2Processor.from_pretrained("facebook/hubert-large-ls960-ft")
print("Loading the HuBERT Model...")
hubert_model = HubertModel.from_pretrained("facebook/hubert-large-ls960-ft")


def get_hubert_from_16k_wav(wav_16k_name):
speech_16k, _ = sf.read(wav_16k_name)
hubert = get_hubert_from_16k_speech(speech_16k)
return hubert


@torch.no_grad()
def get_hubert_from_16k_speech(speech, device="cuda:0"):
global hubert_model
hubert_model = hubert_model.to(device)
if speech.ndim ==2:
speech = speech[:, 0] # [T, 2] ==> [T,]
input_values_all = wav2vec2_processor(speech, return_tensors="pt", sampling_rate=16000).input_values # [1, T]
if speech.ndim == 2:
speech = speech[:, 0] # [T, 2] ==> [T,]
input_values_all = wav2vec2_processor(speech, return_tensors="pt", sampling_rate=16000).input_values # [1, T]
input_values_all = input_values_all.to(device)
# For long audio sequence, due to the memory limitation, we cannot process them in one run
# HuBERT process the wav with a CNN of stride [5,2,2,2,2,2], making a stride of 320
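# i.e. every 320 input samples at 16 kHz yield one hidden frame (~50 frames per second),
# so expected_T below is roughly len(speech) // 320; the loop splits the waveform into
# clips of stride*1000 samples with a (kernel - stride) sample overlap so that no output
# frame is lost at clip boundaries.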
@@ -32,7 +37,7 @@ def get_hubert_from_16k_speech(speech, device="cuda:0"):
stride = 320
clip_length = stride * 1000
num_iter = input_values_all.shape[1] // clip_length
expected_T = (input_values_all.shape[1] - (kernel-stride)) // stride
expected_T = (input_values_all.shape[1] - (kernel - stride)) // stride
res_lst = []
for i in range(num_iter):
if i == 0:
@@ -42,50 +47,47 @@ def get_hubert_from_16k_speech(speech, device="cuda:0"):
start_idx = clip_length * i
end_idx = start_idx + (clip_length - stride + kernel)
input_values = input_values_all[:, start_idx: end_idx]
hidden_states = hubert_model.forward(input_values).last_hidden_state # [B=1, T=pts//320, hid=1024]
hidden_states = hubert_model.forward(input_values).last_hidden_state # [B=1, T=pts//320, hid=1024]
res_lst.append(hidden_states[0])
if num_iter > 0:
input_values = input_values_all[:, clip_length * num_iter:]
else:
input_values = input_values_all
# if input_values.shape[1] != 0:
if input_values.shape[1] >= kernel: # if the last batch is shorter than kernel_size, skip it
hidden_states = hubert_model(input_values).last_hidden_state # [B=1, T=pts//320, hid=1024]
if input_values.shape[1] >= kernel: # if the last batch is shorter than kernel_size, skip it
hidden_states = hubert_model(input_values).last_hidden_state # [B=1, T=pts//320, hid=1024]
res_lst.append(hidden_states[0])
ret = torch.cat(res_lst, dim=0).cpu() # [T, 1024]
ret = torch.cat(res_lst, dim=0).cpu() # [T, 1024]
# assert ret.shape[0] == expected_T
assert abs(ret.shape[0] - expected_T) <= 1
if ret.shape[0] < expected_T:
ret = torch.nn.functional.pad(ret, (0,0,0,expected_T-ret.shape[0]))
ret = torch.nn.functional.pad(ret, (0, 0, 0, expected_T - ret.shape[0]))
else:
ret = ret[:expected_T]
return ret


def make_even_first_dim(tensor):
size = list(tensor.size())
if size[0] % 2 == 1:
size[0] -= 1
return tensor[:size[0]]
return tensor

import soundfile as sf
import numpy as np
import torch
from argparse import ArgumentParser
import librosa

parser = ArgumentParser()
parser.add_argument('--wav', type=str, help='')
args = parser.parse_args()
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='hubert', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--wav', type=str, help='')
args = parser.parse_args()

wav_name = args.wav
wav_name = args.wav

speech, sr = sf.read(wav_name)
speech_16k = librosa.resample(speech, orig_sr=sr, target_sr=16000)
print("SR: {} to {}".format(sr, 16000))
# print(speech.shape, speech_16k.shape)
speech, sr = sf.read(wav_name)
speech_16k = librosa.resample(speech, orig_sr=sr, target_sr=16000)
print("SR: {} to {}".format(sr, 16000))
# print(speech.shape, speech_16k.shape)

hubert_hidden = get_hubert_from_16k_speech(speech_16k)
hubert_hidden = make_even_first_dim(hubert_hidden).reshape(-1, 2, 1024)
np.save(wav_name.replace('.wav', '_hu.npy'), hubert_hidden.detach().numpy())
print(hubert_hidden.detach().numpy().shape)
hubert_hidden = get_hubert_from_16k_speech(speech_16k)
hubert_hidden = make_even_first_dim(hubert_hidden).reshape(-1, 2, 1024)
np.save(wav_name.replace('.wav', '_hu.npy'), hubert_hidden.detach().numpy())
print(hubert_hidden.detach().numpy().shape)
33 changes: 16 additions & 17 deletions data_utils/process.py
@@ -2,21 +2,15 @@
import cv2
import argparse
import numpy as np
from tqdm import tqdm

def extract_audio(path, out_path, sample_rate=16000):

print(f'[INFO] ===== extract audio from {path} to {out_path} =====')
cmd = f'ffmpeg -i {path} -f wav -ar {sample_rate} {out_path}'
os.system(cmd)
print(f'[INFO] ===== extracted audio =====')

def extract_images(path, mode):


full_body_dir = path.replace(path.split("/")[-1], "full_body_img")
if not os.path.exists(full_body_dir):
os.mkdir(full_body_dir)

def extract_images(path, full_body_dir, mode):
counter = 0
cap = cv2.VideoCapture(path)
fps = cap.get(cv2.CAP_PROP_FPS)
@@ -30,26 +24,24 @@ def extract_images(path, mode):
ret, frame = cap.read()
if not ret:
break
cv2.imwrite(full_body_dir+"/"+str(counter)+'.jpg', frame)
cv2.imwrite(os.path.join(full_body_dir, str(counter)+'.jpg'), frame)
counter += 1

def get_audio_feature(wav_path, mode):

print("extracting audio feature...")

if mode == "wenet":
os.system("python wenet_infer.py "+wav_path)
if mode == "hubert":
os.system("python hubert.py --wav "+wav_path)

def get_landmark(path, landmarks_dir):
def get_landmark(full_img_dir, landmarks_dir):
print("detecting landmarks...")
full_img_dir = path.replace(path.split("/")[-1], "full_body_img")


from get_landmark import Landmark
landmark = Landmark()

for img_name in os.listdir(full_img_dir):
for img_name in tqdm(os.listdir(full_img_dir)):
if not img_name.endswith(".jpg"):
continue
img_path = os.path.join(full_img_dir, img_name)
@@ -68,18 +60,25 @@ def get_landmark(path, landmarks_dir):
parser = argparse.ArgumentParser()
parser.add_argument('path', type=str, help="path to video file")
parser.add_argument('--asr', type=str, default='hubert', help="wenet or hubert")
parser.add_argument('--device_id', type=int, default=0, help="gpu id")
opt = parser.parse_args()
asr_mode = opt.asr

print('Using gpu id: {}'.format(opt.device_id))
os.environ['CUDA_VISIBLE_DEVICES'] = str(opt.device_id)

base_dir = os.path.dirname(opt.path)
wav_path = os.path.join(base_dir, 'aud.wav')
landmarks_dir = os.path.join(base_dir, 'landmarks')

full_body_dir = os.path.join(base_dir, "full_body_img")
os.makedirs(full_body_dir, exist_ok=True)

landmarks_dir = os.path.join(base_dir, 'landmarks')
os.makedirs(landmarks_dir, exist_ok=True)

extract_audio(opt.path, wav_path)
extract_images(opt.path, asr_mode)
get_landmark(opt.path, landmarks_dir)
extract_images(opt.path, full_body_dir, asr_mode)
get_landmark(full_body_dir, landmarks_dir)
get_audio_feature(wav_path, asr_mode)
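
For reference, given the argument parser above (a positional video path plus the --asr and --device_id options), a preprocessing run might look like the sketch below; the video path is a placeholder, and running from inside data_utils is an assumption based on the script's relative imports:

``` bash
# Sketch only: process one video end-to-end (audio, frames, landmarks, audio features)
cd data_utils
python process.py path/to/your_video.mp4 --asr hubert --device_id 0
# Outputs land next to the video: aud.wav, full_body_img/, landmarks/, and the audio feature .npy
```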


29 changes: 14 additions & 15 deletions data_utils/wenet_infer.py
@@ -1,14 +1,17 @@
import numpy as np
import yaml
import time
from collections import defaultdict
import argparse
import soundfile as sf
import torch
import onnxruntime as ort

from collections import defaultdict
from FeaturePipeline import Feature_Pipeline
from wenet.utils.common import (IGNORE_ID, add_sos_eos, log_add,
remove_duplicates_and_blank, th_accuracy,
reverse_pad_list)
from torch.nn.utils.rnn import pad_sequence
import onnxruntime as ort

frames_stride = 67

@@ -469,30 +472,26 @@ def to_numpy(tensor):


if __name__ == '__main__':
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('audio_path', type=str)
opt = parser.parse_args()

import time

audio_path = opt.audio_path



with open('conf/decode_engine_V4.yaml', 'r') as fin:
configs = yaml.load(fin, Loader=yaml.FullLoader)

asr = ASR_Model(configs)



# with open(audio_path, 'rb') as f:
# audio_byte = f.read()
# waveform = np.frombuffer(audio_byte, dtype=np.int16)


import soundfile as sf
stream, sample_rate = sf.read(audio_path) # [T*sample_rate,] float64


if stream.ndim == 2:
stream = stream[:, 0]

# stream = stream[:,0]
waveform = stream.astype(np.float32)*32767
waveform = waveform.astype(np.int16)
@@ -502,7 +501,7 @@ def to_numpy(tensor):
wav_duration = len(waveform)/16000 #self.configs['engine_sample_rate_hertz']
waveform = torch.from_numpy(waveform).float().unsqueeze(0)
print("waveform shape", waveform.shape)

t1 = time.time()
waveform_feat, feat_length = asr.feat_pipeline._extract_feature(waveform)
print(waveform_feat.shape)
Expand All @@ -513,7 +512,7 @@ def to_numpy(tensor):
# print(waveform_feat.size())
# asd
#assert 0==1

#encoder_model = F.load_as_dict("/data/kzx/work/MNN/build/MNN_Models/encoder_encoder.mnn")
#encoder_in_chunk = encoder_model['chunk']
#encoder_in_offset = encoder_model['offset']