diff --git a/CMakeLists.txt b/CMakeLists.txt index 530c6b4..830c8a2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -111,7 +111,9 @@ elseif(${IDF_TARGET} STREQUAL "esp32c5") component_compile_options(-ffast-math -O3 -Wno-error=format=-Wno-format) add_prebuilt_library(esp_audio_processor "${CMAKE_CURRENT_SOURCE_DIR}/lib/${IDF_TARGET}/libesp_audio_processor.a") + add_prebuilt_library(esp_audio_front_end "${CMAKE_CURRENT_SOURCE_DIR}/lib/${IDF_TARGET}/libesp_audio_front_end.a") target_link_libraries(${COMPONENT_LIB} PRIVATE esp_audio_processor) + target_link_libraries(${COMPONENT_LIB} PRIVATE esp_audio_front_end) elseif((${IDF_TARGET} STREQUAL "esp32s2") OR (${IDF_TARGET} STREQUAL "esp32c3") OR (${IDF_TARGET} STREQUAL "esp32c6")) #Only support TTS on esp32s2, esp32c3 and esp32c6 diff --git a/docs/en/audio_front_end/README.rst b/docs/en/audio_front_end/README.rst index eadd4c8..8f091be 100644 --- a/docs/en/audio_front_end/README.rst +++ b/docs/en/audio_front_end/README.rst @@ -78,7 +78,7 @@ Using the AFE Framework Based on the ``menuconfig`` -> ``ESP Speech Recognition``, select the required AFE (Analog Front End) models, such as the WakeNet model, VAD (Voice Activity Detection) model, NS (Noise Suppression) model, etc., and then call the AFE framework in the code using the following steps. -For reference, you can check the code in :project_file:`test_apps/esp-sr/main/test_afe.cpp`. +For reference, you can check the code in :project_file:`test_apps/esp-sr/main/test_afe.cpp` or `esp-skainet/examples `__. Step 1: Initialize AFE Configuration ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/docs/en/audio_front_end/migration_guide.rst b/docs/en/audio_front_end/migration_guide.rst index 6a84b47..5bdc0fe 100644 --- a/docs/en/audio_front_end/migration_guide.rst +++ b/docs/en/audio_front_end/migration_guide.rst @@ -6,11 +6,12 @@ Migration from V1.* to V2.* Configuration and Initialization -------------------------------- -- 1. 
The legacy configuration initialization method AFE_CONFIG_DEFAULT() has been removed. Please use ``afe_config_init`` to initialize configurations. Modifications can still be made after initialization: +- 1. The legacy configuration initialization method AFE_CONFIG_DEFAULT() has been removed. Please use ``afe_config_init`` to initialize configurations: .. code-block:: c afe_config_t *afe_config = afe_config_init("MMNR", models, AFE_TYPE_SR, AFE_MODE_HIGH_PERF); + afe_config_print(afe_config); // print all configurations - 2. ESP_AFE_SR_HANDLE and ESP_AFE_VC_HANDLE have been removed. Use ``esp_afe_handle_from_config`` to create instances: diff --git a/docs/en/vadnet/README.rst b/docs/en/vadnet/README.rst index afd2194..41a430b 100644 --- a/docs/en/vadnet/README.rst +++ b/docs/en/vadnet/README.rst @@ -19,6 +19,7 @@ Use VADNet - Select VADNet model :: + idf.py menuconfig ESP Speech Recognition -> Select voice activity detection -> voice activity detection (vadnet1 medium). @@ -44,7 +45,7 @@ Use VADNet afe_handle->enable_vad(afe_data); // enable VADNet afe_handle->reset_vad(afe_data); // reset VADNet status -- VAD Cache and Detection +- VAD Cache There are two issues in the VAD settings that can cause a delay in the first frame trigger of speech. 
diff --git a/docs/zh_CN/audio_front_end/README.rst b/docs/zh_CN/audio_front_end/README.rst index 83dc8ff..0decc43 100644 --- a/docs/zh_CN/audio_front_end/README.rst +++ b/docs/zh_CN/audio_front_end/README.rst @@ -73,7 +73,7 @@ AFE 声学前端算法框架 使用AFE框架 ---------------------------- 根据 ``menuconfig`` -> ``ESP Speech Recognition`` 选择需要的AFE的模型,比如WakeNet模型,VAD模型, NS模型等,然后在代码中使用以下步骤调用AFE框架。 -代码可以参考 :project_file:`test_apps/esp-sr/main/test_afe.cpp`。 +代码可以参考 :project_file:`test_apps/esp-sr/main/test_afe.cpp` 或是 `esp-skainet/examples `__。 步骤1:初始化AFE配置 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/docs/zh_CN/audio_front_end/migration_guide.rst b/docs/zh_CN/audio_front_end/migration_guide.rst index c51b9f1..e248d75 100644 --- a/docs/zh_CN/audio_front_end/migration_guide.rst +++ b/docs/zh_CN/audio_front_end/migration_guide.rst @@ -7,11 +7,12 @@ rstCopy 配置和初始化 -------------------------------- -- 1. 旧的配置初始化方法 AFE_CONFIG_DEFAULT() 已被移除。请使用 ``afe_config_init`` 来初始化配置。初始化后仍可进行修改: +- 1. 旧的配置初始化方法 AFE_CONFIG_DEFAULT() 已被移除。请使用 ``afe_config_init`` 来初始化配置: .. code-block:: c afe_config_t *afe_config = afe_config_init("MMNR", models, AFE_TYPE_SR, AFE_MODE_HIGH_PERF); + afe_config_print(afe_config); // print all configurations - 2. ESP_AFE_SR_HANDLE 和 ESP_AFE_VC_HANDLE 已被移除。使用 ``esp_afe_handle_from_config`` 来创建实例: diff --git a/docs/zh_CN/vadnet/README.rst b/docs/zh_CN/vadnet/README.rst index ec6535c..7416a09 100644 --- a/docs/zh_CN/vadnet/README.rst +++ b/docs/zh_CN/vadnet/README.rst @@ -18,6 +18,7 @@ VADNet 训练数据包括了大约5000小时中文数据, 5000 小时英文数 - 选择VADNet模型 :: + idf.py menuconfig ESP Speech Recognition -> Select voice activity detection -> voice activity detection (vadnet1 medium). 
diff --git a/idf_component.yml b/idf_component.yml index d8586a3..11ecdd9 100644 --- a/idf_component.yml +++ b/idf_component.yml @@ -1,4 +1,4 @@ -version: "2.0.0~1-rc.3" +version: "2.0.0" description: esp_sr provides basic algorithms for Speech Recognition applications url: https://github.com/espressif/esp-sr dependencies: diff --git a/include/esp32/esp_afe_aec.h b/include/esp32/esp_afe_aec.h new file mode 100644 index 0000000..9d60588 --- /dev/null +++ b/include/esp32/esp_afe_aec.h @@ -0,0 +1,82 @@ + +#ifndef _ESP_AFE_AEC_H_ +#define _ESP_AFE_AEC_H_ + + +#include "esp_afe_config.h" +#include "esp_aec.h" + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct { + aec_handle_t* handle; + aec_mode_t mode; + afe_pcm_config_t pcm_config; + int frame_size; + int16_t *data; +}afe_aec_handle_t; + + +/** + * @brief Creates an instance to the AEC structure. + * + * @warning Currently only supports 1 microphone channel and 1 playback channel. + * If input has multiple microphone channels and playback channels, just the first microphone channel and playback channel will be selected. + * + * The input format, same as afe config: + * M to represent the microphone channel + * R to represent the playback reference channel + * N to represent an unknown or unused channel + * + * For example, input_format="MMNR" indicates that the input data consists of four channels, + * which are the microphone channel, the microphone channel, an unused channel, and the playback channel + * + * @param input_format The input format + * @param filter_length The length of filter. The larger the filter, the higher the CPU loading. + * Recommended filter_length = 4 for esp32s3 and esp32p4. Recommended filter_length = 2 for esp32c5. 
+ * @param type The type of afe, AFE_TYPE_SR or AFE_TYPE_VC + * @param mode The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF + * + * @return afe_aec_handle_t* The instance of AEC + */ +afe_aec_handle_t *afe_aec_create(const char *input_format, int filter_length, afe_type_t type, afe_mode_t mode); + + +/** + * @brief Performs echo cancellation on a frame, based on the audio sent to the speaker and frame from mic. + * + * @param handel The instance of AEC. + * @param indata Input audio data, format is defined by input_format. Note indata will be modified in function call. + * @param outdata Returns near-end signal with echo removed. + + * @return The bytes of outdata. + */ +size_t afe_aec_process(afe_aec_handle_t *handel, int16_t *indata, int16_t *outdata); + +/** + * @brief Get frame size of AEC (the samples of one frame) + * @param handle The instance of AEC. + * @return Frame size + */ +int afe_aec_get_chunksize(afe_aec_handle_t *handle); + + +/** + * @brief Free the AEC instance + * + * @param handel The instance of AEC. + * + * @return None + * + */ +void afe_aec_destroy(afe_aec_handle_t *handel); + +#ifdef __cplusplus +} +#endif + +#endif //_ESP_AEC_H_ diff --git a/include/esp32/esp_afe_config.h b/include/esp32/esp_afe_config.h index 16906bd..00ac15b 100644 --- a/include/esp32/esp_afe_config.h +++ b/include/esp32/esp_afe_config.h @@ -110,6 +110,8 @@ typedef struct { int vad_min_speech_ms; // The minimum duration of speech in ms. It should be bigger than 32 ms, default: 128 ms int vad_min_noise_ms; // The minimum duration of noise or silence in ms. It should be bigger than 64 ms, default: // 1000 ms + int vad_delay_ms; // The delay of the first speech frame in ms, default: 128 ms + // If you find vad cache can not cover all speech, please increase this value. bool vad_mute_playback; // If true, the playback will be muted for vad detection. default: false bool vad_enable_channel_trigger; // If true, the vad will be used to choose the channel id. 
default: false diff --git a/include/esp32/esp_afe_sr_iface.h b/include/esp32/esp_afe_sr_iface.h index 580eed9..ffc6ce2 100644 --- a/include/esp32/esp_afe_sr_iface.h +++ b/include/esp32/esp_afe_sr_iface.h @@ -141,12 +141,12 @@ typedef int (*esp_afe_sr_iface_op_reset_buffer_t)(esp_afe_sr_data_t *afe); typedef int (*esp_afe_sr_iface_op_set_wakenet_t)(esp_afe_sr_data_t *afe, char *model_name); /** - * @brief Enable VAD algorithm. + * @brief Reset one function/module/algorithm. * * @param afe The AFE_SR object to query - * @return -1: fail, 0: disabled, 1: enabled + * @return -1: fail, 1: success */ -typedef int (*esp_afe_sr_iface_op_enable_vad_t)(esp_afe_sr_data_t *afe); +typedef int (*esp_afe_sr_iface_op_reset_op_t)(esp_afe_sr_data_t *afe); /** * @brief Disable one function/module/algorithm. @@ -204,6 +204,7 @@ typedef struct { esp_afe_sr_iface_op_enable_func_t enable_se; esp_afe_sr_iface_op_disable_func_t disable_vad; esp_afe_sr_iface_op_enable_func_t enable_vad; + esp_afe_sr_iface_op_reset_op_t reset_vad; esp_afe_sr_iface_op_disable_func_t disable_ns; esp_afe_sr_iface_op_enable_func_t enable_ns; esp_afe_sr_iface_op_disable_func_t disable_agc; diff --git a/include/esp32/esp_mfcc_iface.h b/include/esp32/esp_mfcc_iface.h index 95e287b..22a5f2c 100644 --- a/include/esp32/esp_mfcc_iface.h +++ b/include/esp32/esp_mfcc_iface.h @@ -1,6 +1,6 @@ #pragma once -#include #include "esp_speech_features.h" +#include /* This describes an interface for a MFCC runner, that is, some kind of implementation that can be @@ -8,33 +8,30 @@ fed sample chunks and returns the MFCC cepstrum of those samples. This is an abs multiple implementations can be used. */ - typedef struct esp_mfcc_data_t esp_mfcc_data_t; - -//Options for the mfcc algorithm itself. These more-or-less match the parameters of csf_mfcc (from c_speech_features), please -//refer to its documentation for details. +// Options for the mfcc algorithm itself. 
These more-or-less match the parameters of csf_mfcc (from c_speech_features), +// please refer to its documentation for details. typedef struct { - int winstep_ms; // The step between successive windows in ms. (10) - int winlen_ms; // The length of the analysis window in ms. (25) - int nch; // The number of input channel - int numcep; // The number of cepstrum to return - int nfilter; // The number of filters in the filterbank - int nfft; // The FFT size - int samp_freq; // The sample-rate of the signal. - int low_freq; // The lowest band edge of mel filters, in hz. (e.g. 0) - int high_freq; // The highest band edge of mel filters, in hz. Must not be higher than samp_freq - float preemph; // Preemphasis filter coefficient. 0 is no filter. (e.g. 0.97) - char *win_type; // Analysis window type to apply to each frame, "hanning","hamming","sine","rectangular","povey" - bool append_energy; // If true, the zeroth cepstral coefficient is replaced with the log of the total frame energy + int winstep_ms; // The step between successive windows in ms. (10) + int winlen_ms; // The length of the analysis window in ms. (25) + int nch; // The number of input channel + int numcep; // The number of cepstrum to return + int nfilter; // The number of filters in the filterbank + int nfft; // The FFT size + int samp_freq; // The sample-rate of the signal. + int low_freq; // The lowest band edge of mel filters, in hz. (e.g. 0) + int high_freq; // The highest band edge of mel filters, in hz. Must not be higher than samp_freq + float preemph; // Preemphasis filter coefficient. 0 is no filter. (e.g. 
0.97) + char *win_type; // Analysis window type to apply to each frame, "hanning","hamming","sine","rectangular","povey" + bool append_energy; //  If true, the zeroth cepstral coefficient is replaced with the log of the total frame energy bool use_power; // If true, use power of fft spectrum, else use magnitude of fft spectrum int use_log_fbank; // 0: return fbank, 1: return log(x+log_epsilon), 2: return log(max(x, log_epsilon)) - float log_epsilon; // log epsilon. (e.g. 1e-7) + float log_epsilon; // log epsilon. (e.g. 1e-7) bool psram_first; // Alloc memory from PSRAM first - bool remove_dc_offset; // Whether to subtract mean of wave before FFT + bool remove_dc_offset; // Whether to subtract mean of wave before FFT } esp_mfcc_opts_t; - /** * @brief Un-initialize and free a mfcc runner * @@ -54,13 +51,13 @@ typedef void (*esp_mfcc_op_destroy_t)(esp_mfcc_data_t *r); * @param opt Options for the mfcc process * @return True if success, false on error. */ -typedef esp_mfcc_data_t* (*esp_mfcc_op_create_t)(const esp_mfcc_opts_t *opt); +typedef esp_mfcc_data_t *(*esp_mfcc_op_create_t)(const esp_mfcc_opts_t *opt); /** * @brief Run a mfcc iteration on frame by frame * * This will take a set of samples and return a ceptrum. Note that this may be pipelined: - * an initial call to this function may return NULL and subsequent calls may return the + * an initial call to this function may return NULL and subsequent calls may return the * cepstrum of previous calls. * * @param r The mfcc runner @@ -69,7 +66,7 @@ typedef esp_mfcc_data_t* (*esp_mfcc_op_create_t)(const esp_mfcc_opts_t *opt); * when done with this buffer. Note that some implementations require the buffer to be freed before another call * to this function is done. 
*/ -typedef float* (*esp_mfcc_op_run_step_t)(esp_mfcc_data_t *r, int16_t *samp, int16_t nch); +typedef float *(*esp_mfcc_op_run_step_t)(esp_mfcc_data_t *r, int16_t *samp, int16_t nch); /** * @brief Clean all state of mfcc handle diff --git a/include/esp32/esp_mfcc_models.h b/include/esp32/esp_mfcc_models.h index f8e9119..231603b 100644 --- a/include/esp32/esp_mfcc_models.h +++ b/include/esp32/esp_mfcc_models.h @@ -1,18 +1,16 @@ #pragma once #include "esp_mfcc_iface.h" - extern const esp_mfcc_iface_t esp_fbank_f32; // float32-fbank handle - /** * @brief Return basic opts used in wakenet9 & multinet5 **/ esp_mfcc_opts_t *get_mfcc_opts_wn9(); /** - * @brief Return basic opts for default kaldifeat - * + * @brief Return basic opts for default kaldifeat + * opts->psram_first = true; opts->use_power = true; opts->use_log_fbank = 2; // log(max(x, log_epsilon)) @@ -37,4 +35,4 @@ esp_mfcc_opts_t *get_mfcc_opts_kaldi(); /** * @brief Print mfcc opts **/ -void print_mfcc_opts(esp_mfcc_opts_t *opts); \ No newline at end of file +void print_mfcc_opts(esp_mfcc_opts_t *opts); diff --git a/include/esp32/esp_speech_features.h b/include/esp32/esp_speech_features.h index 3552f4a..c1659f9 100644 --- a/include/esp32/esp_speech_features.h +++ b/include/esp32/esp_speech_features.h @@ -8,46 +8,45 @@ #define M_2PI 6.283185307179586476925286766559005 #endif -typedef struct -{ +typedef struct { float *coeff; int *bank_pos; int nfilter; } esp_mel_filter_t; -float* esp_mfcc_malloc(size_t size, bool from_psram); +float *esp_mfcc_malloc(size_t size, bool from_psram); void esp_mfcc_free(void *ptr); /** * @brief Initialize FFT table * @warning For ESP-PLATFORM, use esp-dsp fft - * For Other platform, use kiss fft - * - * @param nfft The input samples number + * For Other platform, use kiss fft + * + * @param nfft The input samples number * @return fft-table **/ -void* esp_fft_init(int nfft); +void *esp_fft_init(int nfft); /** * @brief Free FFT table * @warning For ESP-PLATFORM, use esp-dsp fft - * 
For Other platform, use kiss fft - * + * For Other platform, use kiss fft + * * @param fft_table The fft table initialized by esp_fft_init - * @param nfft The input samples number + * @param nfft The input samples number * @return fft-table **/ void esp_fft_deinit(void *fft_table, int nfft); /** * @brief Initial window function - * Currently support hanning, hamming, sine, povey, rectangular, + * Currently support hanning, hamming, sine, povey, rectangular, * wn9(512-hanning to get wakenet9& multinet5 compatible) **/ -float *esp_win_func_init(char *win_type, float* window_data, int frame_length); +float *esp_win_func_init(char *win_type, float *window_data, int frame_length); -float* esp_fftr(float* x, int nfft, void *fft_table); +float *esp_fftr(float *x, int nfft, void *fft_table); float *esp_spectrum_step(float *x, int nfft, bool use_power, void *fft_table); @@ -55,10 +54,9 @@ void esp_audio_short_to_float(short *samples, float *x, int len, int remove_dc); float *esp_preemphasis_step(float *x, unsigned int len, float coeff, float last); -esp_mel_filter_t *esp_mel_filter_init(int nfft, int nfilter, int low_freq, int high_freq, int samp_freq, - bool from_psram); +esp_mel_filter_t *esp_mel_filter_init( + int nfft, int nfilter, int low_freq, int high_freq, int samp_freq, bool from_psram); void esp_mel_filter_deinit(esp_mel_filter_t *mel_filter); -float* esp_mel_dotprod_step(float* x, float *out, esp_mel_filter_t *mel_filter, int use_log_fbank, - float epsilon); +float *esp_mel_dotprod_step(float *x, float *out, esp_mel_filter_t *mel_filter, int use_log_fbank, float epsilon); diff --git a/include/esp32/esp_vad.h b/include/esp32/esp_vad.h index f3c5dd4..7e0b144 100644 --- a/include/esp32/esp_vad.h +++ b/include/esp32/esp_vad.h @@ -20,19 +20,19 @@ extern "C" { #endif -#define SAMPLE_RATE_HZ 16000 //Supports 32000, 16000, 8000 -#define VAD_FRAME_LENGTH_MS 30 //Supports 10ms, 20ms, 30ms +#define SAMPLE_RATE_HZ 16000 // Supports 32000, 16000, 8000 +#define 
VAD_FRAME_LENGTH_MS 30 // Supports 10ms, 20ms, 30ms /** * @brief Sets the VAD operating mode. A more aggressive (higher mode) VAD is more * restrictive in reporting speech. So If you want trigger more speech, please select lower mode. */ typedef enum { - VAD_MODE_0 = 0, // Normal - VAD_MODE_1, // Aggressive - VAD_MODE_2, // Very Aggressive - VAD_MODE_3, // Very Very Aggressive - VAD_MODE_4 // Very Very Very Aggressive + VAD_MODE_0 = 0, // Normal + VAD_MODE_1, // Aggressive + VAD_MODE_2, // Very Aggressive + VAD_MODE_3, // Very Very Aggressive + VAD_MODE_4 // Very Very Very Aggressive } vad_mode_t; typedef enum { @@ -51,10 +51,10 @@ typedef struct vad_trigger_tag { #define vad_MAX_LEN INT32_MAX - 1 /** * @brief Allocate wakenet trigger - * + * * @param min_speech_len Minimum frame number of speech duration * @param min_noise_len Minimum frame number of noise duration - * + * * @return Trigger pointer **/ vad_trigger_t *vad_trigger_alloc(int min_speech_len, int min_noise_len); @@ -74,19 +74,17 @@ void vad_trigger_reset(vad_trigger_t *trigger); **/ vad_state_t vad_trigger_detect(vad_trigger_t *trigger, vad_state_t state); - typedef struct { vad_trigger_t *trigger; void *vad_inst; int sample_rate; int frame_size; -}vad_handle_with_trigger_t; +} vad_handle_with_trigger_t; -typedef vad_handle_with_trigger_t* vad_handle_t; +typedef vad_handle_with_trigger_t *vad_handle_t; // typedef vad_handle_tag * vad_handle_t; - /** * @brief Creates an instance to the VAD structure. * @@ -110,7 +108,8 @@ vad_handle_t vad_create(vad_mode_t vad_mode); * - NULL: Create failed * - Others: The instance of VAD */ -vad_handle_t vad_create_with_param(vad_mode_t vad_mode, int sample_rate, int one_frame_ms, int min_speech_len, int min_noise_len); +vad_handle_t vad_create_with_param( + vad_mode_t vad_mode, int sample_rate, int one_frame_ms, int min_speech_ms, int min_noise_ms); /** * @brief Feed samples of an audio stream to the VAD and check if there is someone speaking. 
@@ -138,6 +137,13 @@ vad_state_t vad_process(vad_handle_t handle, int16_t *data, int sample_rate_hz, */ vad_state_t vad_process_with_trigger(vad_handle_t handle, int16_t *data); +/** + * @brief Reset trigger state as Silence + * + * @param handle The instance of VAD. + */ +void vad_reset_trigger(vad_handle_t handle); + /** * @brief Free the VAD instance * @@ -149,20 +155,21 @@ vad_state_t vad_process_with_trigger(vad_handle_t handle, int16_t *data); void vad_destroy(vad_handle_t inst); /* -* Programming Guide: -* -* @code{c} -* vad_handle_t vad_inst = vad_create(VAD_MODE_3, SAMPLE_RATE_HZ, VAD_FRAME_LENGTH_MS); // Creates an instance to the VAD structure. -* -* while (1) { -* //Use buffer to receive the audio data from MIC. -* vad_state_t vad_state = vad_process(vad_inst, buffer); // Feed samples to the VAD process and get the result. -* } -* -* vad_destroy(vad_inst); // Free the VAD instance at the end of whole VAD process -* -* @endcode -*/ + * Programming Guide: + * + * @code{c} + * vad_handle_t vad_inst = vad_create(VAD_MODE_3, SAMPLE_RATE_HZ, VAD_FRAME_LENGTH_MS); // Creates an instance to + * the VAD structure. + * + * while (1) { + * //Use buffer to receive the audio data from MIC. + * vad_state_t vad_state = vad_process(vad_inst, buffer); // Feed samples to the VAD process and get the result. 
+ * } + * + * vad_destroy(vad_inst); // Free the VAD instance at the end of whole VAD process + * + * @endcode + */ #ifdef __cplusplus } diff --git a/include/esp32c5/esp_afe_aec.h b/include/esp32c5/esp_afe_aec.h new file mode 100644 index 0000000..9d60588 --- /dev/null +++ b/include/esp32c5/esp_afe_aec.h @@ -0,0 +1,82 @@ + +#ifndef _ESP_AFE_AEC_H_ +#define _ESP_AFE_AEC_H_ + + +#include "esp_afe_config.h" +#include "esp_aec.h" + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct { + aec_handle_t* handle; + aec_mode_t mode; + afe_pcm_config_t pcm_config; + int frame_size; + int16_t *data; +}afe_aec_handle_t; + + +/** + * @brief Creates an instance to the AEC structure. + * + * @warning Currently only supports 1 microphone channel and 1 playback channel. + * If input has multiple microphone channels and playback channels, just the first microphone channel and playback channel will be selected. + * + * The input format, same as afe config: + * M to represent the microphone channel + * R to represent the playback reference channel + * N to represent an unknown or unused channel + * + * For example, input_format="MMNR" indicates that the input data consists of four channels, + * which are the microphone channel, the microphone channel, an unused channel, and the playback channel + * + * @param input_format The input format + * @param filter_length The length of filter. The larger the filter, the higher the CPU loading. + * Recommended filter_length = 4 for esp32s3 and esp32p4. Recommended filter_length = 2 for esp32c5. + * @param type The type of afe, AFE_TYPE_SR or AFE_TYPE_VC + * @param mode The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF + * + * @return afe_aec_handle_t* The instance of AEC + */ +afe_aec_handle_t *afe_aec_create(const char *input_format, int filter_length, afe_type_t type, afe_mode_t mode); + + +/** + * @brief Performs echo cancellation on a frame, based on the audio sent to the speaker and frame from mic. 
+ * + * @param handel The instance of AEC. + * @param indata Input audio data, format is defined by input_format. Note indata will be modified in function call. + * @param outdata Returns near-end signal with echo removed. + + * @return The bytes of outdata. + */ +size_t afe_aec_process(afe_aec_handle_t *handel, int16_t *indata, int16_t *outdata); + +/** + * @brief Get frame size of AEC (the samples of one frame) + * @param handle The instance of AEC. + * @return Frame size + */ +int afe_aec_get_chunksize(afe_aec_handle_t *handle); + + +/** + * @brief Free the AEC instance + * + * @param handel The instance of AEC. + * + * @return None + * + */ +void afe_aec_destroy(afe_aec_handle_t *handel); + +#ifdef __cplusplus +} +#endif + +#endif //_ESP_AEC_H_ diff --git a/include/esp32c5/esp_afe_config.h b/include/esp32c5/esp_afe_config.h new file mode 100644 index 0000000..f9de6fe --- /dev/null +++ b/include/esp32c5/esp_afe_config.h @@ -0,0 +1,69 @@ +#pragma once +#include "esp_aec.h" +#include "stdbool.h" +#include "stdint.h" +#include "stdlib.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// AFE: Audio Front-End +// SR: Speech Recognition +// VC: Voice Communication + +// Set AFE_SR mode +typedef enum { + SR_MODE_LOW_COST = 0, // Deprecated, please use afe_mode_t, AFE mode: low cost mode + SR_MODE_HIGH_PERF = 1, // Deprecated, please use afe_mode_t, AFE mode: high performance mode +} afe_sr_mode_t; + +// Set AFE mode +typedef enum { + AFE_MODE_LOW_COST = 0, // AFE mode: low cost mode + AFE_MODE_HIGH_PERF = 1, // AFE mode: high performance mode +} afe_mode_t; + +// Set AFE type +typedef enum { + AFE_TYPE_SR = 0, // Speech recognition scenarios, excluding nonlinear noise suppression + AFE_TYPE_VC = 1, // Voice communication scenarios, including nonlinear noise suppression +} afe_type_t; + +typedef enum { + AFE_MEMORY_ALLOC_MORE_INTERNAL = 1, // malloc with more internal ram + AFE_MEMORY_ALLOC_INTERNAL_PSRAM_BALANCE = 2, // malloc with internal ram and psram in balance + 
AFE_MEMORY_ALLOC_MORE_PSRAM = 3 // malloc with more psram +} afe_memory_alloc_mode_t; + +typedef enum { + AFE_MN_PEAK_AGC_MODE_1 = -9, // The peak amplitude of fetch audio is -9dB + AFE_MN_PEAK_AGC_MODE_2 = -6, // The peak amplitude of fetch audio is -6dB + AFE_MN_PEAK_AGC_MODE_3 = -3, // The peak amplitude of fetch audio is -3dB + AFE_MN_PEAK_NO_AGC = 0, // There is no agc gain +} afe_mn_peak_agc_mode_t; + +typedef struct { + int total_ch_num; // total channel num, include microphone channel, playback channel and unknown channel + int mic_num; // microphone channel number + uint8_t *mic_ids; // microphone channel indices + int ref_num; // playback reference channel number + uint8_t *ref_ids; // playback reference channel indices + int sample_rate; // sample rate of audio +} afe_pcm_config_t; + +typedef enum { + AFE_NS_MODE_WEBRTC = 0, // please use model name of NS, SSP: "WEBRTC" + AFE_NS_MODE_NET = 1, // please use model name of NSNET +} afe_ns_mode_t; + +typedef enum { + AFE_AGC_MODE_WEBRTC = 0, // WEBRTC AGC + AFE_AGC_MODE_WAKENET = 1, // AGC gain is calculated by wakenet model if wakenet is activated +} afe_agc_mode_t; + + +#ifdef __cplusplus +} +#endif + diff --git a/include/esp32p4/esp_afe_aec.h b/include/esp32p4/esp_afe_aec.h new file mode 100644 index 0000000..9d60588 --- /dev/null +++ b/include/esp32p4/esp_afe_aec.h @@ -0,0 +1,82 @@ + +#ifndef _ESP_AFE_AEC_H_ +#define _ESP_AFE_AEC_H_ + + +#include "esp_afe_config.h" +#include "esp_aec.h" + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct { + aec_handle_t* handle; + aec_mode_t mode; + afe_pcm_config_t pcm_config; + int frame_size; + int16_t *data; +}afe_aec_handle_t; + + +/** + * @brief Creates an instance to the AEC structure. + * + * @warning Currently only supports 1 microphone channel and 1 playback channel. + * If input has multiple microphone channels and playback channels, just the first microphone channel and playback channel will be selected. 
+ * + * The input format, same as afe config: + * M to represent the microphone channel + * R to represent the playback reference channel + * N to represent an unknown or unused channel + * + * For example, input_format="MMNR" indicates that the input data consists of four channels, + * which are the microphone channel, the microphone channel, an unused channel, and the playback channel + * + * @param input_format The input format + * @param filter_length The length of filter. The larger the filter, the higher the CPU loading. + * Recommended filter_length = 4 for esp32s3 and esp32p4. Recommended filter_length = 2 for esp32c5. + * @param type The type of afe, AFE_TYPE_SR or AFE_TYPE_VC + * @param mode The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF + * + * @return afe_aec_handle_t* The instance of AEC + */ +afe_aec_handle_t *afe_aec_create(const char *input_format, int filter_length, afe_type_t type, afe_mode_t mode); + + +/** + * @brief Performs echo cancellation on a frame, based on the audio sent to the speaker and frame from mic. + * + * @param handel The instance of AEC. + * @param indata Input audio data, format is defined by input_format. Note indata will be modified in function call. + * @param outdata Returns near-end signal with echo removed. + + * @return The bytes of outdata. + */ +size_t afe_aec_process(afe_aec_handle_t *handel, int16_t *indata, int16_t *outdata); + +/** + * @brief Get frame size of AEC (the samples of one frame) + * @param handle The instance of AEC. + * @return Frame size + */ +int afe_aec_get_chunksize(afe_aec_handle_t *handle); + + +/** + * @brief Free the AEC instance + * + * @param handel The instance of AEC. 
+ * + * @return None + * + */ +void afe_aec_destroy(afe_aec_handle_t *handel); + +#ifdef __cplusplus +} +#endif + +#endif //_ESP_AEC_H_ diff --git a/include/esp32p4/esp_mfcc_iface.h b/include/esp32p4/esp_mfcc_iface.h index 95e287b..22a5f2c 100644 --- a/include/esp32p4/esp_mfcc_iface.h +++ b/include/esp32p4/esp_mfcc_iface.h @@ -1,6 +1,6 @@ #pragma once -#include #include "esp_speech_features.h" +#include /* This describes an interface for a MFCC runner, that is, some kind of implementation that can be @@ -8,33 +8,30 @@ fed sample chunks and returns the MFCC cepstrum of those samples. This is an abs multiple implementations can be used. */ - typedef struct esp_mfcc_data_t esp_mfcc_data_t; - -//Options for the mfcc algorithm itself. These more-or-less match the parameters of csf_mfcc (from c_speech_features), please -//refer to its documentation for details. +// Options for the mfcc algorithm itself. These more-or-less match the parameters of csf_mfcc (from c_speech_features), +// please refer to its documentation for details. typedef struct { - int winstep_ms; // The step between successive windows in ms. (10) - int winlen_ms; // The length of the analysis window in ms. (25) - int nch; // The number of input channel - int numcep; // The number of cepstrum to return - int nfilter; // The number of filters in the filterbank - int nfft; // The FFT size - int samp_freq; // The sample-rate of the signal. - int low_freq; // The lowest band edge of mel filters, in hz. (e.g. 0) - int high_freq; // The highest band edge of mel filters, in hz. Must not be higher than samp_freq - float preemph; // Preemphasis filter coefficient. 0 is no filter. (e.g. 0.97) - char *win_type; // Analysis window type to apply to each frame, "hanning","hamming","sine","rectangular","povey" - bool append_energy; // If true, the zeroth cepstral coefficient is replaced with the log of the total frame energy + int winstep_ms; // The step between successive windows in ms. 
(10) + int winlen_ms; // The length of the analysis window in ms. (25) + int nch; // The number of input channel + int numcep; // The number of cepstrum to return + int nfilter; // The number of filters in the filterbank + int nfft; // The FFT size + int samp_freq; // The sample-rate of the signal. + int low_freq; // The lowest band edge of mel filters, in hz. (e.g. 0) + int high_freq; // The highest band edge of mel filters, in hz. Must not be higher than samp_freq + float preemph; // Preemphasis filter coefficient. 0 is no filter. (e.g. 0.97) + char *win_type; // Analysis window type to apply to each frame, "hanning","hamming","sine","rectangular","povey" + bool append_energy; //  If true, the zeroth cepstral coefficient is replaced with the log of the total frame energy bool use_power; // If true, use power of fft spectrum, else use magnitude of fft spectrum int use_log_fbank; // 0: return fbank, 1: return log(x+log_epsilon), 2: return log(max(x, log_epsilon)) - float log_epsilon; // log epsilon. (e.g. 1e-7) + float log_epsilon; // log epsilon. (e.g. 1e-7) bool psram_first; // Alloc memory from PSRAM first - bool remove_dc_offset; // Whether to subtract mean of wave before FFT + bool remove_dc_offset; // Whether to subtract mean of wave before FFT } esp_mfcc_opts_t; - /** * @brief Un-initialize and free a mfcc runner * @@ -54,13 +51,13 @@ typedef void (*esp_mfcc_op_destroy_t)(esp_mfcc_data_t *r); * @param opt Options for the mfcc process * @return True if success, false on error. */ -typedef esp_mfcc_data_t* (*esp_mfcc_op_create_t)(const esp_mfcc_opts_t *opt); +typedef esp_mfcc_data_t *(*esp_mfcc_op_create_t)(const esp_mfcc_opts_t *opt); /** * @brief Run a mfcc iteration on frame by frame * * This will take a set of samples and return a ceptrum. 
Note that this may be pipelined: - * an initial call to this function may return NULL and subsequent calls may return the + * an initial call to this function may return NULL and subsequent calls may return the * cepstrum of previous calls. * * @param r The mfcc runner @@ -69,7 +66,7 @@ typedef esp_mfcc_data_t* (*esp_mfcc_op_create_t)(const esp_mfcc_opts_t *opt); * when done with this buffer. Note that some implementations require the buffer to be freed before another call * to this function is done. */ -typedef float* (*esp_mfcc_op_run_step_t)(esp_mfcc_data_t *r, int16_t *samp, int16_t nch); +typedef float *(*esp_mfcc_op_run_step_t)(esp_mfcc_data_t *r, int16_t *samp, int16_t nch); /** * @brief Clean all state of mfcc handle diff --git a/include/esp32p4/esp_mfcc_models.h b/include/esp32p4/esp_mfcc_models.h index f8e9119..231603b 100644 --- a/include/esp32p4/esp_mfcc_models.h +++ b/include/esp32p4/esp_mfcc_models.h @@ -1,18 +1,16 @@ #pragma once #include "esp_mfcc_iface.h" - extern const esp_mfcc_iface_t esp_fbank_f32; // float32-fbank handle - /** * @brief Return basic opts used in wakenet9 & multinet5 **/ esp_mfcc_opts_t *get_mfcc_opts_wn9(); /** - * @brief Return basic opts for default kaldifeat - * + * @brief Return basic opts for default kaldifeat + * opts->psram_first = true; opts->use_power = true; opts->use_log_fbank = 2; // log(max(x, log_epsilon)) @@ -37,4 +35,4 @@ esp_mfcc_opts_t *get_mfcc_opts_kaldi(); /** * @brief Print mfcc opts **/ -void print_mfcc_opts(esp_mfcc_opts_t *opts); \ No newline at end of file +void print_mfcc_opts(esp_mfcc_opts_t *opts); diff --git a/include/esp32p4/esp_speech_features.h b/include/esp32p4/esp_speech_features.h index 3552f4a..c1659f9 100644 --- a/include/esp32p4/esp_speech_features.h +++ b/include/esp32p4/esp_speech_features.h @@ -8,46 +8,45 @@ #define M_2PI 6.283185307179586476925286766559005 #endif -typedef struct -{ +typedef struct { float *coeff; int *bank_pos; int nfilter; } esp_mel_filter_t; -float* 
esp_mfcc_malloc(size_t size, bool from_psram); +float *esp_mfcc_malloc(size_t size, bool from_psram); void esp_mfcc_free(void *ptr); /** * @brief Initialize FFT table * @warning For ESP-PLATFORM, use esp-dsp fft - * For Other platform, use kiss fft - * - * @param nfft The input samples number + * For Other platform, use kiss fft + * + * @param nfft The input samples number * @return fft-table **/ -void* esp_fft_init(int nfft); +void *esp_fft_init(int nfft); /** * @brief Free FFT table * @warning For ESP-PLATFORM, use esp-dsp fft - * For Other platform, use kiss fft - * + * For Other platform, use kiss fft + * * @param fft_table The fft table initialized by esp_fft_init - * @param nfft The input samples number + * @param nfft The input samples number * @return fft-table **/ void esp_fft_deinit(void *fft_table, int nfft); /** * @brief Initial window function - * Currently support hanning, hamming, sine, povey, rectangular, + * Currently support hanning, hamming, sine, povey, rectangular, * wn9(512-hanning to get wakenet9& multinet5 compatible) **/ -float *esp_win_func_init(char *win_type, float* window_data, int frame_length); +float *esp_win_func_init(char *win_type, float *window_data, int frame_length); -float* esp_fftr(float* x, int nfft, void *fft_table); +float *esp_fftr(float *x, int nfft, void *fft_table); float *esp_spectrum_step(float *x, int nfft, bool use_power, void *fft_table); @@ -55,10 +54,9 @@ void esp_audio_short_to_float(short *samples, float *x, int len, int remove_dc); float *esp_preemphasis_step(float *x, unsigned int len, float coeff, float last); -esp_mel_filter_t *esp_mel_filter_init(int nfft, int nfilter, int low_freq, int high_freq, int samp_freq, - bool from_psram); +esp_mel_filter_t *esp_mel_filter_init( + int nfft, int nfilter, int low_freq, int high_freq, int samp_freq, bool from_psram); void esp_mel_filter_deinit(esp_mel_filter_t *mel_filter); -float* esp_mel_dotprod_step(float* x, float *out, esp_mel_filter_t *mel_filter, int 
use_log_fbank, - float epsilon); +float *esp_mel_dotprod_step(float *x, float *out, esp_mel_filter_t *mel_filter, int use_log_fbank, float epsilon); diff --git a/include/esp32p4/esp_vad.h b/include/esp32p4/esp_vad.h index 0c7f734..7e0b144 100644 --- a/include/esp32p4/esp_vad.h +++ b/include/esp32p4/esp_vad.h @@ -20,19 +20,19 @@ extern "C" { #endif -#define SAMPLE_RATE_HZ 16000 //Supports 32000, 16000, 8000 -#define VAD_FRAME_LENGTH_MS 30 //Supports 10ms, 20ms, 30ms +#define SAMPLE_RATE_HZ 16000 // Supports 32000, 16000, 8000 +#define VAD_FRAME_LENGTH_MS 30 // Supports 10ms, 20ms, 30ms /** * @brief Sets the VAD operating mode. A more aggressive (higher mode) VAD is more * restrictive in reporting speech. So If you want trigger more speech, please select lower mode. */ typedef enum { - VAD_MODE_0 = 0, // Normal - VAD_MODE_1, // Aggressive - VAD_MODE_2, // Very Aggressive - VAD_MODE_3, // Very Very Aggressive - VAD_MODE_4 // Very Very Very Aggressive + VAD_MODE_0 = 0, // Normal + VAD_MODE_1, // Aggressive + VAD_MODE_2, // Very Aggressive + VAD_MODE_3, // Very Very Aggressive + VAD_MODE_4 // Very Very Very Aggressive } vad_mode_t; typedef enum { @@ -51,10 +51,10 @@ typedef struct vad_trigger_tag { #define vad_MAX_LEN INT32_MAX - 1 /** * @brief Allocate wakenet trigger - * + * * @param min_speech_len Minimum frame number of speech duration * @param min_noise_len Minimum frame number of noise duration - * + * * @return Trigger pointer **/ vad_trigger_t *vad_trigger_alloc(int min_speech_len, int min_noise_len); @@ -74,19 +74,17 @@ void vad_trigger_reset(vad_trigger_t *trigger); **/ vad_state_t vad_trigger_detect(vad_trigger_t *trigger, vad_state_t state); - typedef struct { vad_trigger_t *trigger; void *vad_inst; int sample_rate; int frame_size; -}vad_handle_with_trigger_t; +} vad_handle_with_trigger_t; -typedef vad_handle_with_trigger_t* vad_handle_t; +typedef vad_handle_with_trigger_t *vad_handle_t; // typedef vad_handle_tag * vad_handle_t; - /** * @brief Creates an 
instance to the VAD structure. * @@ -110,7 +108,8 @@ vad_handle_t vad_create(vad_mode_t vad_mode); * - NULL: Create failed * - Others: The instance of VAD */ -vad_handle_t vad_create_with_param(vad_mode_t vad_mode, int sample_rate, int one_frame_ms, int min_speech_ms, int min_noise_ms); +vad_handle_t vad_create_with_param( + vad_mode_t vad_mode, int sample_rate, int one_frame_ms, int min_speech_ms, int min_noise_ms); /** * @brief Feed samples of an audio stream to the VAD and check if there is someone speaking. @@ -156,20 +155,21 @@ void vad_reset_trigger(vad_handle_t handle); void vad_destroy(vad_handle_t inst); /* -* Programming Guide: -* -* @code{c} -* vad_handle_t vad_inst = vad_create(VAD_MODE_3, SAMPLE_RATE_HZ, VAD_FRAME_LENGTH_MS); // Creates an instance to the VAD structure. -* -* while (1) { -* //Use buffer to receive the audio data from MIC. -* vad_state_t vad_state = vad_process(vad_inst, buffer); // Feed samples to the VAD process and get the result. -* } -* -* vad_destroy(vad_inst); // Free the VAD instance at the end of whole VAD process -* -* @endcode -*/ + * Programming Guide: + * + * @code{c} + * vad_handle_t vad_inst = vad_create(VAD_MODE_3, SAMPLE_RATE_HZ, VAD_FRAME_LENGTH_MS); // Creates an instance to + * the VAD structure. + * + * while (1) { + * //Use buffer to receive the audio data from MIC. + * vad_state_t vad_state = vad_process(vad_inst, buffer); // Feed samples to the VAD process and get the result. 
+ * } + * + * vad_destroy(vad_inst); // Free the VAD instance at the end of whole VAD process + * + * @endcode + */ #ifdef __cplusplus } diff --git a/include/esp32s3/esp_afe_aec.h b/include/esp32s3/esp_afe_aec.h new file mode 100644 index 0000000..9d60588 --- /dev/null +++ b/include/esp32s3/esp_afe_aec.h @@ -0,0 +1,82 @@ + +#ifndef _ESP_AFE_AEC_H_ +#define _ESP_AFE_AEC_H_ + + +#include "esp_afe_config.h" +#include "esp_aec.h" + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct { + aec_handle_t* handle; + aec_mode_t mode; + afe_pcm_config_t pcm_config; + int frame_size; + int16_t *data; +}afe_aec_handle_t; + + +/** + * @brief Creates an instance of the AEC structure. + * + * @warning Currently only supports 1 microphone channel and 1 playback channel. + * If input has multiple microphone channels and playback channels, just the first microphone channel and playback channel will be selected. + * + * The input format, same as afe config: + * M to represent the microphone channel + * R to represent the playback reference channel + * N to represent an unknown or unused channel + * + * For example, input_format="MMNR" indicates that the input data consists of four channels, + * which are the microphone channel, the microphone channel, an unused channel, and the playback channel + * + * @param input_format The input format + * @param filter_length The length of filter. The larger the filter, the higher the CPU loading. + * Recommended filter_length = 4 for esp32s3 and esp32p4. Recommended filter_length = 2 for esp32c5. + * @param type The type of afe, AFE_TYPE_SR or AFE_TYPE_VC + * @param mode The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF + * + * @return afe_aec_handle_t* The created AEC instance + */ +afe_aec_handle_t *afe_aec_create(const char *input_format, int filter_length, afe_type_t type, afe_mode_t mode); + + +/** + * @brief Performs echo cancellation on a frame, based on the audio sent to the speaker and frame from mic. 
+ * + * @param handel The instance of AEC. + * @param indata Input audio data, format is defined by input_format. Note indata will be modified in function call. + * @param outdata Returns near-end signal with echo removed. + * + * @return The bytes of outdata. + */ +size_t afe_aec_process(afe_aec_handle_t *handel, int16_t *indata, int16_t *outdata); + +/** + * @brief Get frame size of AEC (the samples of one frame) + * @param handle The instance of AEC. + * @return Frame size + */ +int afe_aec_get_chunksize(afe_aec_handle_t *handle); + + +/** + * @brief Free the AEC instance + * + * @param handel The instance of AEC. + * + * @return None + * + */ +void afe_aec_destroy(afe_aec_handle_t *handel); + +#ifdef __cplusplus +} +#endif + +#endif //_ESP_AFE_AEC_H_ diff --git a/include/esp32s3/esp_mfcc_iface.h b/include/esp32s3/esp_mfcc_iface.h index 95e287b..22a5f2c 100644 --- a/include/esp32s3/esp_mfcc_iface.h +++ b/include/esp32s3/esp_mfcc_iface.h @@ -1,6 +1,6 @@ #pragma once -#include #include "esp_speech_features.h" +#include /* This describes an interface for a MFCC runner, that is, some kind of implementation that can be @@ -8,33 +8,30 @@ fed sample chunks and returns the MFCC cepstrum of those samples. This is an abs multiple implementations can be used. */ - typedef struct esp_mfcc_data_t esp_mfcc_data_t; - -//Options for the mfcc algorithm itself. These more-or-less match the parameters of csf_mfcc (from c_speech_features), please -//refer to its documentation for details. +// Options for the mfcc algorithm itself. These more-or-less match the parameters of csf_mfcc (from c_speech_features), +// please refer to its documentation for details. typedef struct { - int winstep_ms; // The step between successive windows in ms. (10) - int winlen_ms; // The length of the analysis window in ms. 
(25) - int nch; // The number of input channel - int numcep; // The number of cepstrum to return - int nfilter; // The number of filters in the filterbank - int nfft; // The FFT size - int samp_freq; // The sample-rate of the signal. - int low_freq; // The lowest band edge of mel filters, in hz. (e.g. 0) - int high_freq; // The highest band edge of mel filters, in hz. Must not be higher than samp_freq - float preemph; // Preemphasis filter coefficient. 0 is no filter. (e.g. 0.97) - char *win_type; // Analysis window type to apply to each frame, "hanning","hamming","sine","rectangular","povey" - bool append_energy; // If true, the zeroth cepstral coefficient is replaced with the log of the total frame energy + int winstep_ms; // The step between successive windows in ms. (10) + int winlen_ms; // The length of the analysis window in ms. (25) + int nch; // The number of input channel + int numcep; // The number of cepstrum to return + int nfilter; // The number of filters in the filterbank + int nfft; // The FFT size + int samp_freq; // The sample-rate of the signal. + int low_freq; // The lowest band edge of mel filters, in hz. (e.g. 0) + int high_freq; // The highest band edge of mel filters, in hz. Must not be higher than samp_freq + float preemph; // Preemphasis filter coefficient. 0 is no filter. (e.g. 0.97) + char *win_type; // Analysis window type to apply to each frame, "hanning","hamming","sine","rectangular","povey" + bool append_energy; //  If true, the zeroth cepstral coefficient is replaced with the log of the total frame energy bool use_power; // If true, use power of fft spectrum, else use magnitude of fft spectrum int use_log_fbank; // 0: return fbank, 1: return log(x+log_epsilon), 2: return log(max(x, log_epsilon)) - float log_epsilon; // log epsilon. (e.g. 1e-7) + float log_epsilon; // log epsilon. (e.g. 
1e-7) bool psram_first; // Alloc memory from PSRAM first - bool remove_dc_offset; // Whether to subtract mean of wave before FFT + bool remove_dc_offset; // Whether to subtract mean of wave before FFT } esp_mfcc_opts_t; - /** * @brief Un-initialize and free a mfcc runner * @@ -54,13 +51,13 @@ typedef void (*esp_mfcc_op_destroy_t)(esp_mfcc_data_t *r); * @param opt Options for the mfcc process * @return True if success, false on error. */ -typedef esp_mfcc_data_t* (*esp_mfcc_op_create_t)(const esp_mfcc_opts_t *opt); +typedef esp_mfcc_data_t *(*esp_mfcc_op_create_t)(const esp_mfcc_opts_t *opt); /** * @brief Run a mfcc iteration on frame by frame * * This will take a set of samples and return a ceptrum. Note that this may be pipelined: - * an initial call to this function may return NULL and subsequent calls may return the + * an initial call to this function may return NULL and subsequent calls may return the * cepstrum of previous calls. * * @param r The mfcc runner @@ -69,7 +66,7 @@ typedef esp_mfcc_data_t* (*esp_mfcc_op_create_t)(const esp_mfcc_opts_t *opt); * when done with this buffer. Note that some implementations require the buffer to be freed before another call * to this function is done. 
*/ -typedef float* (*esp_mfcc_op_run_step_t)(esp_mfcc_data_t *r, int16_t *samp, int16_t nch); +typedef float *(*esp_mfcc_op_run_step_t)(esp_mfcc_data_t *r, int16_t *samp, int16_t nch); /** * @brief Clean all state of mfcc handle diff --git a/include/esp32s3/esp_mfcc_models.h b/include/esp32s3/esp_mfcc_models.h index f8e9119..231603b 100644 --- a/include/esp32s3/esp_mfcc_models.h +++ b/include/esp32s3/esp_mfcc_models.h @@ -1,18 +1,16 @@ #pragma once #include "esp_mfcc_iface.h" - extern const esp_mfcc_iface_t esp_fbank_f32; // float32-fbank handle - /** * @brief Return basic opts used in wakenet9 & multinet5 **/ esp_mfcc_opts_t *get_mfcc_opts_wn9(); /** - * @brief Return basic opts for default kaldifeat - * + * @brief Return basic opts for default kaldifeat + * opts->psram_first = true; opts->use_power = true; opts->use_log_fbank = 2; // log(max(x, log_epsilon)) @@ -37,4 +35,4 @@ esp_mfcc_opts_t *get_mfcc_opts_kaldi(); /** * @brief Print mfcc opts **/ -void print_mfcc_opts(esp_mfcc_opts_t *opts); \ No newline at end of file +void print_mfcc_opts(esp_mfcc_opts_t *opts); diff --git a/include/esp32s3/esp_speech_features.h b/include/esp32s3/esp_speech_features.h index 3552f4a..c1659f9 100644 --- a/include/esp32s3/esp_speech_features.h +++ b/include/esp32s3/esp_speech_features.h @@ -8,46 +8,45 @@ #define M_2PI 6.283185307179586476925286766559005 #endif -typedef struct -{ +typedef struct { float *coeff; int *bank_pos; int nfilter; } esp_mel_filter_t; -float* esp_mfcc_malloc(size_t size, bool from_psram); +float *esp_mfcc_malloc(size_t size, bool from_psram); void esp_mfcc_free(void *ptr); /** * @brief Initialize FFT table * @warning For ESP-PLATFORM, use esp-dsp fft - * For Other platform, use kiss fft - * - * @param nfft The input samples number + * For Other platform, use kiss fft + * + * @param nfft The input samples number * @return fft-table **/ -void* esp_fft_init(int nfft); +void *esp_fft_init(int nfft); /** * @brief Free FFT table * @warning For ESP-PLATFORM, use 
esp-dsp fft - * For Other platform, use kiss fft - * + * For Other platform, use kiss fft + * * @param fft_table The fft table initialized by esp_fft_init - * @param nfft The input samples number + * @param nfft The input samples number * @return fft-table **/ void esp_fft_deinit(void *fft_table, int nfft); /** * @brief Initial window function - * Currently support hanning, hamming, sine, povey, rectangular, + * Currently support hanning, hamming, sine, povey, rectangular, * wn9(512-hanning to get wakenet9& multinet5 compatible) **/ -float *esp_win_func_init(char *win_type, float* window_data, int frame_length); +float *esp_win_func_init(char *win_type, float *window_data, int frame_length); -float* esp_fftr(float* x, int nfft, void *fft_table); +float *esp_fftr(float *x, int nfft, void *fft_table); float *esp_spectrum_step(float *x, int nfft, bool use_power, void *fft_table); @@ -55,10 +54,9 @@ void esp_audio_short_to_float(short *samples, float *x, int len, int remove_dc); float *esp_preemphasis_step(float *x, unsigned int len, float coeff, float last); -esp_mel_filter_t *esp_mel_filter_init(int nfft, int nfilter, int low_freq, int high_freq, int samp_freq, - bool from_psram); +esp_mel_filter_t *esp_mel_filter_init( + int nfft, int nfilter, int low_freq, int high_freq, int samp_freq, bool from_psram); void esp_mel_filter_deinit(esp_mel_filter_t *mel_filter); -float* esp_mel_dotprod_step(float* x, float *out, esp_mel_filter_t *mel_filter, int use_log_fbank, - float epsilon); +float *esp_mel_dotprod_step(float *x, float *out, esp_mel_filter_t *mel_filter, int use_log_fbank, float epsilon); diff --git a/include/esp32s3/esp_vad.h b/include/esp32s3/esp_vad.h index 0c7f734..7e0b144 100644 --- a/include/esp32s3/esp_vad.h +++ b/include/esp32s3/esp_vad.h @@ -20,19 +20,19 @@ extern "C" { #endif -#define SAMPLE_RATE_HZ 16000 //Supports 32000, 16000, 8000 -#define VAD_FRAME_LENGTH_MS 30 //Supports 10ms, 20ms, 30ms +#define SAMPLE_RATE_HZ 16000 // Supports 32000, 16000, 8000 
+#define VAD_FRAME_LENGTH_MS 30 // Supports 10ms, 20ms, 30ms /** * @brief Sets the VAD operating mode. A more aggressive (higher mode) VAD is more * restrictive in reporting speech. So If you want trigger more speech, please select lower mode. */ typedef enum { - VAD_MODE_0 = 0, // Normal - VAD_MODE_1, // Aggressive - VAD_MODE_2, // Very Aggressive - VAD_MODE_3, // Very Very Aggressive - VAD_MODE_4 // Very Very Very Aggressive + VAD_MODE_0 = 0, // Normal + VAD_MODE_1, // Aggressive + VAD_MODE_2, // Very Aggressive + VAD_MODE_3, // Very Very Aggressive + VAD_MODE_4 // Very Very Very Aggressive } vad_mode_t; typedef enum { @@ -51,10 +51,10 @@ typedef struct vad_trigger_tag { #define vad_MAX_LEN INT32_MAX - 1 /** * @brief Allocate wakenet trigger - * + * * @param min_speech_len Minimum frame number of speech duration * @param min_noise_len Minimum frame number of noise duration - * + * * @return Trigger pointer **/ vad_trigger_t *vad_trigger_alloc(int min_speech_len, int min_noise_len); @@ -74,19 +74,17 @@ void vad_trigger_reset(vad_trigger_t *trigger); **/ vad_state_t vad_trigger_detect(vad_trigger_t *trigger, vad_state_t state); - typedef struct { vad_trigger_t *trigger; void *vad_inst; int sample_rate; int frame_size; -}vad_handle_with_trigger_t; +} vad_handle_with_trigger_t; -typedef vad_handle_with_trigger_t* vad_handle_t; +typedef vad_handle_with_trigger_t *vad_handle_t; // typedef vad_handle_tag * vad_handle_t; - /** * @brief Creates an instance to the VAD structure. * @@ -110,7 +108,8 @@ vad_handle_t vad_create(vad_mode_t vad_mode); * - NULL: Create failed * - Others: The instance of VAD */ -vad_handle_t vad_create_with_param(vad_mode_t vad_mode, int sample_rate, int one_frame_ms, int min_speech_ms, int min_noise_ms); +vad_handle_t vad_create_with_param( + vad_mode_t vad_mode, int sample_rate, int one_frame_ms, int min_speech_ms, int min_noise_ms); /** * @brief Feed samples of an audio stream to the VAD and check if there is someone speaking. 
@@ -156,20 +155,21 @@ void vad_reset_trigger(vad_handle_t handle); void vad_destroy(vad_handle_t inst); /* -* Programming Guide: -* -* @code{c} -* vad_handle_t vad_inst = vad_create(VAD_MODE_3, SAMPLE_RATE_HZ, VAD_FRAME_LENGTH_MS); // Creates an instance to the VAD structure. -* -* while (1) { -* //Use buffer to receive the audio data from MIC. -* vad_state_t vad_state = vad_process(vad_inst, buffer); // Feed samples to the VAD process and get the result. -* } -* -* vad_destroy(vad_inst); // Free the VAD instance at the end of whole VAD process -* -* @endcode -*/ + * Programming Guide: + * + * @code{c} + * vad_handle_t vad_inst = vad_create(VAD_MODE_3, SAMPLE_RATE_HZ, VAD_FRAME_LENGTH_MS); // Creates an instance to + * the VAD structure. + * + * while (1) { + * //Use buffer to receive the audio data from MIC. + * vad_state_t vad_state = vad_process(vad_inst, buffer); // Feed samples to the VAD process and get the result. + * } + * + * vad_destroy(vad_inst); // Free the VAD instance at the end of whole VAD process + * + * @endcode + */ #ifdef __cplusplus } diff --git a/lib/esp32/libc_speech_features.a b/lib/esp32/libc_speech_features.a index f3c0381..8cc82ff 100644 Binary files a/lib/esp32/libc_speech_features.a and b/lib/esp32/libc_speech_features.a differ diff --git a/lib/esp32/libesp_audio_front_end.a b/lib/esp32/libesp_audio_front_end.a index 3c2b9d7..f8e42cb 100644 Binary files a/lib/esp32/libesp_audio_front_end.a and b/lib/esp32/libesp_audio_front_end.a differ diff --git a/lib/esp32/libesp_audio_processor.a b/lib/esp32/libesp_audio_processor.a index bfaa367..6dc8766 100644 Binary files a/lib/esp32/libesp_audio_processor.a and b/lib/esp32/libesp_audio_processor.a differ diff --git a/lib/esp32/libmultinet.a b/lib/esp32/libmultinet.a index d5c18d9..8d154c4 100644 Binary files a/lib/esp32/libmultinet.a and b/lib/esp32/libmultinet.a differ diff --git a/lib/esp32/libwakenet.a b/lib/esp32/libwakenet.a index 5b90657..7222a23 100644 Binary files 
a/lib/esp32/libwakenet.a and b/lib/esp32/libwakenet.a differ diff --git a/lib/esp32c5/libesp_audio_front_end.a b/lib/esp32c5/libesp_audio_front_end.a new file mode 100644 index 0000000..450f76c Binary files /dev/null and b/lib/esp32c5/libesp_audio_front_end.a differ diff --git a/lib/esp32p4/libc_speech_features.a b/lib/esp32p4/libc_speech_features.a index dee49de..c913ab1 100644 Binary files a/lib/esp32p4/libc_speech_features.a and b/lib/esp32p4/libc_speech_features.a differ diff --git a/lib/esp32p4/libesp_audio_front_end.a b/lib/esp32p4/libesp_audio_front_end.a index cce5d29..07aca44 100644 Binary files a/lib/esp32p4/libesp_audio_front_end.a and b/lib/esp32p4/libesp_audio_front_end.a differ diff --git a/lib/esp32p4/libesp_audio_processor.a b/lib/esp32p4/libesp_audio_processor.a index f32dd0f..b8b7cc0 100644 Binary files a/lib/esp32p4/libesp_audio_processor.a and b/lib/esp32p4/libesp_audio_processor.a differ diff --git a/lib/esp32p4/libmultinet.a b/lib/esp32p4/libmultinet.a index 16dca4f..18c8743 100644 Binary files a/lib/esp32p4/libmultinet.a and b/lib/esp32p4/libmultinet.a differ diff --git a/lib/esp32p4/libvadnet.a b/lib/esp32p4/libvadnet.a index 8c3424e..5d1b4f6 100644 Binary files a/lib/esp32p4/libvadnet.a and b/lib/esp32p4/libvadnet.a differ diff --git a/lib/esp32p4/libwakenet.a b/lib/esp32p4/libwakenet.a index 6eba4cd..6e78226 100644 Binary files a/lib/esp32p4/libwakenet.a and b/lib/esp32p4/libwakenet.a differ diff --git a/lib/esp32s3/libc_speech_features.a b/lib/esp32s3/libc_speech_features.a index 3c4f69c..924d26d 100644 Binary files a/lib/esp32s3/libc_speech_features.a and b/lib/esp32s3/libc_speech_features.a differ diff --git a/lib/esp32s3/libdl_lib.a b/lib/esp32s3/libdl_lib.a index f27412e..fb42317 100644 Binary files a/lib/esp32s3/libdl_lib.a and b/lib/esp32s3/libdl_lib.a differ diff --git a/lib/esp32s3/libesp_audio_front_end.a b/lib/esp32s3/libesp_audio_front_end.a index 4089104..885518c 100644 Binary files a/lib/esp32s3/libesp_audio_front_end.a and 
b/lib/esp32s3/libesp_audio_front_end.a differ diff --git a/lib/esp32s3/libesp_audio_processor.a b/lib/esp32s3/libesp_audio_processor.a index d113daf..6676ed4 100644 Binary files a/lib/esp32s3/libesp_audio_processor.a and b/lib/esp32s3/libesp_audio_processor.a differ diff --git a/lib/esp32s3/libflite_g2p.a b/lib/esp32s3/libflite_g2p.a index 6a99a57..44fbb79 100644 Binary files a/lib/esp32s3/libflite_g2p.a and b/lib/esp32s3/libflite_g2p.a differ diff --git a/lib/esp32s3/libfst.a b/lib/esp32s3/libfst.a index a2dd373..3164c92 100644 Binary files a/lib/esp32s3/libfst.a and b/lib/esp32s3/libfst.a differ diff --git a/lib/esp32s3/libhufzip.a b/lib/esp32s3/libhufzip.a index c0465b1..b9751f5 100644 Binary files a/lib/esp32s3/libhufzip.a and b/lib/esp32s3/libhufzip.a differ diff --git a/lib/esp32s3/libmultinet.a b/lib/esp32s3/libmultinet.a index 62e7576..5f81799 100644 Binary files a/lib/esp32s3/libmultinet.a and b/lib/esp32s3/libmultinet.a differ diff --git a/lib/esp32s3/libnsnet.a b/lib/esp32s3/libnsnet.a index f396b67..44c9c88 100644 Binary files a/lib/esp32s3/libnsnet.a and b/lib/esp32s3/libnsnet.a differ diff --git a/lib/esp32s3/libvadnet.a b/lib/esp32s3/libvadnet.a index 533e9bd..bd2a26d 100644 Binary files a/lib/esp32s3/libvadnet.a and b/lib/esp32s3/libvadnet.a differ diff --git a/lib/esp32s3/libwakenet.a b/lib/esp32s3/libwakenet.a index 0ff21b1..bb79b70 100644 Binary files a/lib/esp32s3/libwakenet.a and b/lib/esp32s3/libwakenet.a differ diff --git a/test_apps/esp-sr/main/test_afe.cpp b/test_apps/esp-sr/main/test_afe.cpp index ff20efe..edc5ace 100644 --- a/test_apps/esp-sr/main/test_afe.cpp +++ b/test_apps/esp-sr/main/test_afe.cpp @@ -18,6 +18,7 @@ #include "esp_wn_models.h" #include "esp_afe_sr_models.h" #include "dl_lib_convq_queue.h" +#include "esp_afe_aec.h" #include #if (CONFIG_IDF_TARGET_ESP32S3 || CONFIG_IDF_TARGET_ESP32P4) @@ -297,4 +298,25 @@ TEST_CASE("afe performance test (2ch)", "[afe_perf]") afe_config_free(afe_config); } esp_srmodel_deinit(models); +} + + 
+TEST_CASE("test afe aec interface", "[afe]") +{ + int start_size = heap_caps_get_free_size(MALLOC_CAP_8BIT); // baseline for the leak check at the end + + afe_aec_handle_t *handle = afe_aec_create("MNR", 4, AFE_TYPE_SR, AFE_MODE_HIGH_PERF); + TEST_ASSERT_NOT_NULL(handle); // fail the test instead of dereferencing NULL below + int frame_bytes = handle->frame_size * sizeof(int16_t); + int16_t *indata = (int16_t *) calloc(handle->pcm_config.total_ch_num, frame_bytes); // zeroed: feed silence, not uninitialized heap + int16_t *outdata = (int16_t *) malloc(frame_bytes); + + afe_aec_process(handle, indata, outdata); + afe_aec_process(handle, indata, outdata); + afe_aec_process(handle, indata, outdata); + afe_aec_destroy(handle); + free(indata); + free(outdata); + int end_size = heap_caps_get_free_size(MALLOC_CAP_8BIT); + TEST_ASSERT_EQUAL(true, end_size == start_size); } \ No newline at end of file diff --git a/test_apps/esp32c5/main/test_aec.cpp b/test_apps/esp32c5/main/test_aec.cpp index df1a937..ee18a0b 100644 --- a/test_apps/esp32c5/main/test_aec.cpp +++ b/test_apps/esp32c5/main/test_aec.cpp @@ -12,10 +12,64 @@ #include "freertos/FreeRTOS.h" #include "freertos/task.h" #include "esp_aec.h" +#include "esp_afe_aec.h" #include "audio_test_file.h" #include "unity.h" #include "esp_timer.h" + +TEST_CASE("test esp32c5 afe aec interface", "[aec]") +{ + // Leak check: create/destroy once, then run AEC over audio_mic_file/audio_ref_file and compare free heap. 
+ heap_caps_print_heap_info(MALLOC_CAP_8BIT); + int start_size = heap_caps_get_free_size(MALLOC_CAP_8BIT); + int start_internal_size = heap_caps_get_free_size(MALLOC_CAP_INTERNAL); + int sample_rate = 16000; + + afe_aec_handle_t *aec_handle = afe_aec_create("MR", 2, AFE_TYPE_SR, AFE_MODE_LOW_COST); + afe_aec_destroy(aec_handle); + int first_end_size = heap_caps_get_free_size(MALLOC_CAP_8BIT); + printf("memory leak for first init: %d\n", start_size - first_end_size); + + aec_handle = afe_aec_create("MR", 2, AFE_TYPE_SR, AFE_MODE_LOW_COST); + TEST_ASSERT_NOT_NULL(aec_handle); // the handle is used by every call below + int audio_chunksize = afe_aec_get_chunksize(aec_handle); + printf("audio chunksize:%d\n", audio_chunksize); //512 + int16_t *buffer = (int16_t *)malloc(audio_chunksize * sizeof(int16_t)*2); + int16_t *out_buffer = 
(int16_t *)malloc(audio_chunksize * sizeof(int16_t)); + + int chunks = 0; + uint32_t c0, c1, c_res = 0; + while (1) { + if ((chunks + 1)*audio_chunksize * sizeof(int16_t) <= sizeof(audio_mic_file)) { + memcpy(buffer, audio_mic_file + chunks * audio_chunksize , audio_chunksize * sizeof(int16_t)); + memcpy(buffer+audio_chunksize, audio_ref_file + chunks * audio_chunksize , audio_chunksize * sizeof(int16_t)); + } else { + break; + } + + c0 = esp_timer_get_time(); + afe_aec_process(aec_handle, buffer, out_buffer); + c1 = esp_timer_get_time(); + + c_res += c1 - c0; + chunks++; + } + + free(buffer); + free(out_buffer); + printf("RAM size after aec processing: total:%d, internal:%d\n", + start_size - heap_caps_get_free_size(MALLOC_CAP_8BIT), + start_internal_size - heap_caps_get_free_size(MALLOC_CAP_INTERNAL)); + printf("Done! Took %ld ms to parse %d ms worth of samples in %d iterations.\n", + c_res/1000, chunks*audio_chunksize*1000/sample_rate, chunks); + afe_aec_destroy(aec_handle); + + int end_size = heap_caps_get_free_size(MALLOC_CAP_8BIT); + printf("memory leak:%d\n", start_size-end_size); + TEST_ASSERT_EQUAL(true, end_size == start_size); +} + TEST_CASE("test esp32c5 aec", "[aec]") { // vad_handle_t vad_handle = (vad_handle_t)arg; @@ -68,3 +122,6 @@ TEST_CASE("test esp32c5 aec", "[aec]") printf("memory leak:%d\n", start_size-end_size); TEST_ASSERT_EQUAL(true, end_size == start_size); } + + +