Merge branch 'docs/esp_sr' into 'master'

Docs/esp sr See merge request speech-recognition-framework/esp-sr!139
espressif · Feb 14, 2025 · f38167f · f38167f
2 parents f4245e1 + 27a234d
commit f38167f
Show file tree

Hide file tree

Showing 52 changed files with 699 additions and 228 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -111,7 +111,9 @@ elseif(${IDF_TARGET} STREQUAL "esp32c5")
 
     component_compile_options(-ffast-math -O3 -Wno-error=format=-Wno-format)
     add_prebuilt_library(esp_audio_processor "${CMAKE_CURRENT_SOURCE_DIR}/lib/${IDF_TARGET}/libesp_audio_processor.a")
+    add_prebuilt_library(esp_audio_front_end "${CMAKE_CURRENT_SOURCE_DIR}/lib/${IDF_TARGET}/libesp_audio_front_end.a")
     target_link_libraries(${COMPONENT_LIB} PRIVATE esp_audio_processor)
+    target_link_libraries(${COMPONENT_LIB} PRIVATE esp_audio_front_end)
 
 elseif((${IDF_TARGET} STREQUAL "esp32s2") OR (${IDF_TARGET} STREQUAL "esp32c3") OR (${IDF_TARGET} STREQUAL "esp32c6"))
 #Only support TTS on esp32s2, esp32c3 and esp32c6

diff --git a/docs/en/audio_front_end/README.rst b/docs/en/audio_front_end/README.rst
@@ -78,7 +78,7 @@ Using the AFE Framework
 
 Based on the ``menuconfig`` -> ``ESP Speech Recognition``, select the required AFE (Audio Front-End) models, such as the WakeNet model, VAD (Voice Activity Detection) model, NS (Noise Suppression) model, etc., and then call the AFE framework in the code using the following steps.
 
-For reference, you can check the code in :project_file:`test_apps/esp-sr/main/test_afe.cpp`.
+For reference, you can check the code in :project_file:`test_apps/esp-sr/main/test_afe.cpp` or `esp-skainet/examples <https://github.com/espressif/esp-skainet/tree/master/examples>`__.
 
 Step 1: Initialize AFE Configuration
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

diff --git a/docs/en/audio_front_end/migration_guide.rst b/docs/en/audio_front_end/migration_guide.rst
@@ -6,11 +6,12 @@ Migration from V1.* to V2.*
 Configuration and Initialization
 --------------------------------
 
-- 1. The legacy configuration initialization method AFE_CONFIG_DEFAULT() has been removed. Please use ``afe_config_init`` to initialize configurations. Modifications can still be made after initialization:
+- 1. The legacy configuration initialization method AFE_CONFIG_DEFAULT() has been removed. Please use ``afe_config_init`` to initialize configurations:
 
    .. code-block:: c
 
       afe_config_t *afe_config = afe_config_init("MMNR", models, AFE_TYPE_SR, AFE_MODE_HIGH_PERF);
+      afe_config_print(afe_config); // print all configurations
 
 - 2. ESP_AFE_SR_HANDLE and ESP_AFE_VC_HANDLE have been removed. Use ``esp_afe_handle_from_config`` to create instances:
 

diff --git a/docs/en/vadnet/README.rst b/docs/en/vadnet/README.rst
@@ -19,6 +19,7 @@ Use VADNet
 -  Select VADNet model
 
     ::
+        
         idf.py menuconfig
         ESP Speech Recognition -> Select voice activity detection -> voice activity detection (vadnet1 medium).
 
@@ -44,7 +45,7 @@ Use VADNet
         afe_handle->enable_vad(afe_data);   // enable VADNet
         afe_handle->reset_vad(afe_data);    // reset VADNet status
 
-- VAD Cache and Detection
+- VAD Cache
 
     There are two issues in the VAD settings that can cause a delay in the first frame trigger of speech.
 

diff --git a/docs/zh_CN/audio_front_end/README.rst b/docs/zh_CN/audio_front_end/README.rst
@@ -73,7 +73,7 @@ AFE 声学前端算法框架
 使用AFE框架
 ----------------------------
 根据 ``menuconfig`` -> ``ESP Speech Recognition`` 选择需要的AFE的模型，比如WakeNet模型，VAD模型， NS模型等，然后在代码中使用以下步骤调用AFE框架。
-代码可以参考 :project_file:`test_apps/esp-sr/main/test_afe.cpp`。
+代码可以参考 :project_file:`test_apps/esp-sr/main/test_afe.cpp` 或 `esp-skainet/examples <https://github.com/espressif/esp-skainet/tree/master/examples>`__。
 
 步骤1：初始化AFE配置
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

diff --git a/docs/zh_CN/audio_front_end/migration_guide.rst b/docs/zh_CN/audio_front_end/migration_guide.rst
@@ -7,11 +7,12 @@ rstCopy
 配置和初始化
 --------------------------------
 
-- 1. 旧的配置初始化方法 AFE_CONFIG_DEFAULT() 已被移除。请使用 ``afe_config_init`` 来初始化配置。初始化后仍可进行修改：
+- 1. 旧的配置初始化方法 AFE_CONFIG_DEFAULT() 已被移除。请使用 ``afe_config_init`` 来初始化配置：
 
    .. code-block:: c
 
       afe_config_t *afe_config = afe_config_init("MMNR", models, AFE_TYPE_SR, AFE_MODE_HIGH_PERF);
+      afe_config_print(afe_config); // print all configurations
 
 - 2. ESP_AFE_SR_HANDLE 和 ESP_AFE_VC_HANDLE 已被移除。使用 ``esp_afe_handle_from_config`` 来创建实例：
 

diff --git a/docs/zh_CN/vadnet/README.rst b/docs/zh_CN/vadnet/README.rst
@@ -18,6 +18,7 @@ VADNet 训练数据包括了大约5000小时中文数据， 5000 小时英文数
 -  选择VADNet模型
 
     ::
+        
         idf.py menuconfig
         ESP Speech Recognition -> Select voice activity detection -> voice activity detection (vadnet1 medium).
 

diff --git a/idf_component.yml b/idf_component.yml
@@ -1,4 +1,4 @@
-version: "2.0.0~1-rc.3"
+version: "2.0.0"
 description: esp_sr provides basic algorithms for Speech Recognition applications
 url: https://github.com/espressif/esp-sr
 dependencies:

diff --git a/include/esp32/esp_afe_aec.h b/include/esp32/esp_afe_aec.h
@@ -0,0 +1,82 @@
+
+#ifndef _ESP_AFE_AEC_H_
+#define _ESP_AFE_AEC_H_
+
+
+#include "esp_afe_config.h"
+#include "esp_aec.h"
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct {
+    aec_handle_t* handle;
+    aec_mode_t mode;
+    afe_pcm_config_t pcm_config;
+    int frame_size;
+    int16_t  *data;
+}afe_aec_handle_t;
+
+
+/**
+ * @brief Creates an instance of the AEC structure.
+ *
+ * @warning Currently only supports 1 microphone channel and 1 playback channel.
+ * If the input has multiple microphone channels and playback channels, only the first microphone channel and playback channel will be selected.
+ *
+ * The input format, same as afe config:
+ * M to represent the microphone channel
+ * R to represent the playback reference channel
+ * N to represent an unknown or unused channel
+ *
+ * For example, input_format="MMNR" indicates that the input data consists of four channels,
+ * which are the microphone channel, the microphone channel, an unused channel, and the playback channel
+ *
+ * @param input_format     The input format
+ * @param filter_length    The length of filter. The larger the filter, the higher the CPU loading.
+ *                         Recommended filter_length = 4 for esp32s3 and esp32p4. Recommended filter_length = 2 for esp32c5.
+ * @param type             The type of afe, AFE_TYPE_SR or AFE_TYPE_VC
+ * @param mode             The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF
+ *
+ * @return afe_aec_handle_t*  The created AEC instance
+ */
+afe_aec_handle_t *afe_aec_create(const char *input_format, int filter_length, afe_type_t type, afe_mode_t mode);
+
+
+/**
+ * @brief Performs echo cancellation on one frame, based on the audio sent to the speaker and the frame from the mic.
+ *
+ * @param inst        The instance of AEC.
+ * @param indata      Input audio data, format is defined by input_format. Note indata will be modified in function call.
+ * @param outdata     Returns near-end signal with echo removed.
+ *
+ * @return The size of outdata in bytes.
+ */
+size_t afe_aec_process(afe_aec_handle_t *handel, int16_t *indata, int16_t *outdata);
+
+/**
+ * @brief Get frame size of AEC (the samples of one frame)
+ * @param handle The instance of AEC.
+ * @return Frame size
+ */
+int afe_aec_get_chunksize(afe_aec_handle_t *handle);
+
+
+/**
+ * @brief Free the AEC instance
+ *
+ * @param inst The instance of AEC.
+ *
+ * @return None
+ *
+ */
+void afe_aec_destroy(afe_aec_handle_t *handel);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif //_ESP_AFE_AEC_H_
diff --git a/include/esp32/esp_afe_config.h b/include/esp32/esp_afe_config.h
@@ -110,6 +110,8 @@ typedef struct {
     int vad_min_speech_ms;  // The minimum duration of speech in ms. It should be bigger than 32 ms, default: 128 ms
     int vad_min_noise_ms;   // The minimum duration of noise or silence in ms. It should be bigger than 64 ms, default:
                             // 1000 ms
+    int vad_delay_ms;       // The delay of the first speech frame in ms, default: 128 ms
+                            // If you find vad cache can not cover all speech, please increase this value.
     bool vad_mute_playback; // If true, the playback will be muted for vad detection. default: false
     bool vad_enable_channel_trigger; // If true, the vad will be used to choose the channel id. default: false
 

diff --git a/include/esp32/esp_afe_sr_iface.h b/include/esp32/esp_afe_sr_iface.h
@@ -141,12 +141,12 @@ typedef int (*esp_afe_sr_iface_op_reset_buffer_t)(esp_afe_sr_data_t *afe);
 typedef int (*esp_afe_sr_iface_op_set_wakenet_t)(esp_afe_sr_data_t *afe, char *model_name);
 
 /**
- * @brief Enable VAD algorithm.
+ * @brief Reset one function/module/algorithm.
  *
  * @param afe          The AFE_SR object to query
- * @return             -1: fail, 0: disabled, 1: enabled
+ * @return             -1: fail, 1: success
  */
-typedef int (*esp_afe_sr_iface_op_enable_vad_t)(esp_afe_sr_data_t *afe);
+typedef int (*esp_afe_sr_iface_op_reset_op_t)(esp_afe_sr_data_t *afe);
 
 /**
  * @brief Disable one function/module/algorithm.
@@ -204,6 +204,7 @@ typedef struct {
     esp_afe_sr_iface_op_enable_func_t enable_se;
     esp_afe_sr_iface_op_disable_func_t disable_vad;
     esp_afe_sr_iface_op_enable_func_t enable_vad;
+    esp_afe_sr_iface_op_reset_op_t reset_vad;
     esp_afe_sr_iface_op_disable_func_t disable_ns;
     esp_afe_sr_iface_op_enable_func_t enable_ns;
     esp_afe_sr_iface_op_disable_func_t disable_agc;

diff --git a/include/esp32/esp_mfcc_iface.h b/include/esp32/esp_mfcc_iface.h
@@ -1,40 +1,37 @@
 #pragma once
-#include <stdint.h>
 #include "esp_speech_features.h"
+#include <stdint.h>
 
 /*
 This describes an interface for a MFCC runner, that is, some kind of implementation that can be
 fed sample chunks and returns the MFCC cepstrum of those samples. This is an abstracted interface so
 multiple implementations can be used.
 */
 
-
 typedef struct esp_mfcc_data_t esp_mfcc_data_t;
 
-
-//Options for the mfcc algorithm itself. These more-or-less match the parameters of csf_mfcc (from c_speech_features), please
-//refer to its documentation for details.
+// Options for the mfcc algorithm itself. These more-or-less match the parameters of csf_mfcc (from c_speech_features),
+// please refer to its documentation for details.
 typedef struct {
-    int winstep_ms;     // The step between successive windows in ms. (10)
-    int winlen_ms;      // The length of the analysis window in ms. (25)
-    int nch;            // The number of input channel
-    int numcep;         // The number of cepstrum to return
-    int nfilter;        // The number of filters in the filterbank
-    int nfft;           // The FFT size
-    int samp_freq;      // The sample-rate of the signal.
-    int low_freq;       // The lowest band edge of mel filters, in hz. (e.g. 0)
-    int high_freq;      // The highest band edge of mel filters, in hz. Must not be higher than samp_freq
-    float preemph;      // Preemphasis filter coefficient. 0 is no filter. (e.g. 0.97)
-    char *win_type;     // Analysis window type to apply to each frame， "hanning","hamming","sine","rectangular","povey"
-    bool append_energy; //　If true, the zeroth cepstral coefficient is replaced with the log of the total frame energy
+    int winstep_ms; // The step between successive windows in ms. (10)
+    int winlen_ms;  // The length of the analysis window in ms. (25)
+    int nch;        // The number of input channel
+    int numcep;     // The number of cepstrum to return
+    int nfilter;    // The number of filters in the filterbank
+    int nfft;       // The FFT size
+    int samp_freq;  // The sample-rate of the signal.
+    int low_freq;   // The lowest band edge of mel filters, in hz. (e.g. 0)
+    int high_freq;  // The highest band edge of mel filters, in hz. Must not be higher than samp_freq
+    float preemph;  // Preemphasis filter coefficient. 0 is no filter. (e.g. 0.97)
+    char *win_type; // Analysis window type to apply to each frame, "hanning","hamming","sine","rectangular","povey"
+    bool append_energy; // If true, the zeroth cepstral coefficient is replaced with the log of the total frame energy
     bool use_power;     // If true, use power of fft spectrum, else use magnitude of fft spectrum
     int use_log_fbank;  // 0: return fbank, 1:  return log(x+log_epsilon), 2: return log(max(x, log_epsilon))
-    float log_epsilon;  // log epsilon. (e.g. 1e-7) 
+    float log_epsilon;  // log epsilon. (e.g. 1e-7)
     bool psram_first;   // Alloc memory from PSRAM first
-    bool remove_dc_offset;  // Whether to subtract mean of wave before FFT
+    bool remove_dc_offset; // Whether to subtract mean of wave before FFT
 } esp_mfcc_opts_t;
 
-
 /**
  * @brief Un-initialize and free a mfcc runner
  *
@@ -54,13 +51,13 @@ typedef void (*esp_mfcc_op_destroy_t)(esp_mfcc_data_t *r);
  * @param opt Options for the mfcc process
  * @return True if success, false on error.
  */
-typedef esp_mfcc_data_t* (*esp_mfcc_op_create_t)(const esp_mfcc_opts_t *opt);
+typedef esp_mfcc_data_t *(*esp_mfcc_op_create_t)(const esp_mfcc_opts_t *opt);
 
 /**
  * @brief Run a mfcc iteration on frame by frame
  *
  * This will take a set of samples and return a cepstrum. Note that this may be pipelined:
- * an initial call to this function may return NULL and subsequent calls may return the 
+ * an initial call to this function may return NULL and subsequent calls may return the
  * cepstrum of previous calls.
  *
  * @param r The mfcc runner
@@ -69,7 +66,7 @@ typedef esp_mfcc_data_t* (*esp_mfcc_op_create_t)(const esp_mfcc_opts_t *opt);
  *         when done with this buffer. Note that some implementations require the buffer to be freed before another call
  *         to this function is done.
  */
-typedef float* (*esp_mfcc_op_run_step_t)(esp_mfcc_data_t *r, int16_t *samp, int16_t nch);
+typedef float *(*esp_mfcc_op_run_step_t)(esp_mfcc_data_t *r, int16_t *samp, int16_t nch);
 
 /**
  * @brief Clean all state of mfcc handle

diff --git a/include/esp32/esp_mfcc_models.h b/include/esp32/esp_mfcc_models.h
@@ -1,18 +1,16 @@
 #pragma once
 #include "esp_mfcc_iface.h"
 
-
 extern const esp_mfcc_iface_t esp_fbank_f32; // float32-fbank handle
 
-
 /**
  * @brief Return basic opts used in wakenet9 & multinet5
  **/
 esp_mfcc_opts_t *get_mfcc_opts_wn9();
 
 /**
- * @brief Return basic opts for default kaldifeat 
- * 
+ * @brief Return basic opts for default kaldifeat
+ *
     opts->psram_first = true;
     opts->use_power = true;
     opts->use_log_fbank = 2;  // log(max(x, log_epsilon))
@@ -37,4 +35,4 @@ esp_mfcc_opts_t *get_mfcc_opts_kaldi();
 /**
  * @brief Print mfcc opts
  **/
-void print_mfcc_opts(esp_mfcc_opts_t *opts);
+void print_mfcc_opts(esp_mfcc_opts_t *opts);
diff --git a/include/esp32/esp_speech_features.h b/include/esp32/esp_speech_features.h
@@ -8,57 +8,55 @@
 #define M_2PI 6.283185307179586476925286766559005
 #endif
 
-typedef struct 
-{
+typedef struct {
     float *coeff;
     int *bank_pos;
     int nfilter;
 } esp_mel_filter_t;
 
-float* esp_mfcc_malloc(size_t size, bool from_psram);
+float *esp_mfcc_malloc(size_t size, bool from_psram);
 
 void esp_mfcc_free(void *ptr);
 
 /**
  * @brief Initialize FFT table
  * @warning For ESP-PLATFORM, use esp-dsp fft
- *          For Other platform, use kiss fft  
- * 
- * @param nfft  The input samples number 
+ *          For Other platform, use kiss fft
+ *
+ * @param nfft  The input samples number
  * @return fft-table
  **/
-void* esp_fft_init(int nfft);
+void *esp_fft_init(int nfft);
 
 /**
  * @brief Free FFT table
  * @warning For ESP-PLATFORM, use esp-dsp fft
- *          For Other platform, use kiss fft  
- * 
+ *          For Other platform, use kiss fft
+ *
  * @param fft_table  The fft table initialized by esp_fft_init
- * @param nfft       The input samples number 
+ * @param nfft       The input samples number
  * @return fft-table
  **/
 void esp_fft_deinit(void *fft_table, int nfft);
 
 /**
  * @brief Initial window function
- *        Currently support hanning, hamming, sine, povey, rectangular, 
+ *        Currently support hanning, hamming, sine, povey, rectangular,
  *        wn9(512-hanning to get wakenet9& multinet5 compatible)
  **/
-float *esp_win_func_init(char *win_type, float* window_data, int frame_length);
+float *esp_win_func_init(char *win_type, float *window_data, int frame_length);
 
-float* esp_fftr(float* x, int nfft, void *fft_table);
+float *esp_fftr(float *x, int nfft, void *fft_table);
 
 float *esp_spectrum_step(float *x, int nfft, bool use_power, void *fft_table);
 
 void esp_audio_short_to_float(short *samples, float *x, int len, int remove_dc);
 
 float *esp_preemphasis_step(float *x, unsigned int len, float coeff, float last);
 
-esp_mel_filter_t *esp_mel_filter_init(int nfft, int nfilter, int low_freq, int high_freq, int samp_freq, 
-                                      bool from_psram);
+esp_mel_filter_t *esp_mel_filter_init(
+    int nfft, int nfilter, int low_freq, int high_freq, int samp_freq, bool from_psram);
 
 void esp_mel_filter_deinit(esp_mel_filter_t *mel_filter);
 
-float* esp_mel_dotprod_step(float* x, float *out, esp_mel_filter_t *mel_filter, int use_log_fbank, 
-                                float epsilon);
+float *esp_mel_dotprod_step(float *x, float *out, esp_mel_filter_t *mel_filter, int use_log_fbank, float epsilon);
-Original file line number
+Diff line change
@@ Expand Up @@
     -  选择VADNet模型
         ::
             idf.py menuconfig
             ESP Speech Recognition -> Select voice activity detection -> voice activity detection (vadnet1 medium).
@@ Expand Down @@