diff --git a/.env b/.env
index e152f994b..8d349a073 100644
--- a/.env
+++ b/.env
@@ -12,6 +12,12 @@ PUBLIC_SERVICE_PORT=8081
DOCKER_BUILDKIT=1
COMPOSE_DOCKER_CLI_BUILD=1
+# ONNX_MODEL_FOLDER_PATH specifies the directory where ONNX models are stored.
+# These models are loaded dynamically at runtime. The path is derived from
+# ${PWD}, so it resolves to the repository's model folder only when commands
+# are run from the project root.
+ONNX_MODEL_FOLDER_PATH=${PWD}/pkg/component/resources/onnx
+
# test
# TEST_DBHOST and TEST_DBNAME are used to initialize a separate database for
diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md
index ad681058e..3dde06506 100644
--- a/.github/CONTRIBUTING.md
+++ b/.github/CONTRIBUTING.md
@@ -85,11 +85,9 @@ will create and migrate a test database to keep these queries isolated from the
main DB. You can set the database host and name by overriding the `TEST_DBHOST`
and `TEST_DBNAME` values.
-Certain tests depend on the [`docconv`](https://github.com/sajari/docconv)
-package and aren't run by default. You can trigger them by adding the `OCR=true`
-flag to the coverage command. Make sure to install the [package
-dependencies](https://github.com/sajari/docconv?tab=readme-ov-file#dependencies)
-first.
+Certain tests depend on external packages and aren't run by default:
+- For [`docconv`](https://github.com/sajari/docconv) tests, add the `OCR=true` flag and install its [dependencies](https://github.com/sajari/docconv?tab=readme-ov-file#dependencies).
+- For [`onnxruntime`](https://github.com/microsoft/onnxruntime) tests, add the `ONNX=true` flag. Follow the [guideline](#set-up-onnx-runtime-linux-only) to set up ONNX Runtime (Linux only).
#### Run the integration tests
@@ -111,6 +109,22 @@ If empty, tests will try to connect to `localhost:5432`.
$ make rm
```
+### Set up ONNX Runtime (Linux only)
+
+1. Download the latest [ONNX Runtime release](https://github.com/microsoft/onnxruntime/releases) for your system.
+
+2. Install ONNX Runtime:
+ ```bash
+ sudo mkdir -p /usr/local/onnxruntime
+ sudo tar -xzf onnxruntime-*-*-*.tgz -C /usr/local/onnxruntime --strip-components=1
+ export ONNXRUNTIME_ROOT_PATH=/usr/local/onnxruntime
+ export LD_RUN_PATH=$ONNXRUNTIME_ROOT_PATH/lib
+ export LIBRARY_PATH=$ONNXRUNTIME_ROOT_PATH/lib
+ export C_INCLUDE_PATH=$ONNXRUNTIME_ROOT_PATH/include
+ ```
+
+**Note:** If you don't have sudo access, extract to a user-writeable location (e.g., `~/onnxruntime`), set `ONNXRUNTIME_ROOT_PATH` accordingly, and adjust the environment variables as shown above. No need to create symlinks in this case.
+
## Codebase contribution
### Pre-commit hooks
diff --git a/.github/workflows/coverage.yml b/.github/workflows/coverage.yml
index d4a85df7e..2c2c710b6 100644
--- a/.github/workflows/coverage.yml
+++ b/.github/workflows/coverage.yml
@@ -38,10 +38,23 @@ jobs:
- uses: actions/checkout@v3
+ - name: Install onnxruntime library and headers
+ run: |
+ export ONNXRUNTIME_ROOT_PATH=$GITHUB_WORKSPACE/onnxruntime
+ LATEST_VERSION=$(wget -qO- https://api.github.com/repos/microsoft/onnxruntime/releases/latest | jq -r .tag_name)
+ ONNX_ARCH=$([ "$TARGETARCH" = "arm64" ] && echo "aarch64" || echo "x64")
+ wget https://github.com/microsoft/onnxruntime/releases/download/${LATEST_VERSION}/onnxruntime-linux-${ONNX_ARCH}-${LATEST_VERSION#v}.tgz
+ tar -xzf onnxruntime-linux-${ONNX_ARCH}-${LATEST_VERSION#v}.tgz
+ mv onnxruntime-linux-${ONNX_ARCH}-${LATEST_VERSION#v} ${ONNXRUNTIME_ROOT_PATH}
+ rm onnxruntime-linux-${ONNX_ARCH}-${LATEST_VERSION#v}.tgz
+ echo "LD_RUN_PATH=${ONNXRUNTIME_ROOT_PATH}/lib" >> $GITHUB_ENV
+ echo "LIBRARY_PATH=${ONNXRUNTIME_ROOT_PATH}/lib" >> $GITHUB_ENV
+ echo "C_INCLUDE_PATH=${ONNXRUNTIME_ROOT_PATH}/include" >> $GITHUB_ENV
+
- name: Generate coverage report
run: |
make build-dev
- make coverage DBTEST=true OCR=true
+ make coverage DBTEST=true OCR=true ONNX=true
- name: Upload coverage report
uses: codecov/codecov-action@v2
diff --git a/.github/workflows/golangci-lint.yml b/.github/workflows/golangci-lint.yml
index 2483e2547..0f3f6372b 100644
--- a/.github/workflows/golangci-lint.yml
+++ b/.github/workflows/golangci-lint.yml
@@ -21,8 +21,24 @@ jobs:
with:
go-version: ${{ env.GOLANG_VERSION }}
cache: false
+ - name: Install soxr
+ run: |
+ sudo apt-get update
+ sudo apt-get install -y libsoxr-dev
+ - name: Install onnxruntime library and headers
+ run: |
+ export ONNXRUNTIME_ROOT_PATH=$GITHUB_WORKSPACE/onnxruntime
+ LATEST_VERSION=$(wget -qO- https://api.github.com/repos/microsoft/onnxruntime/releases/latest | jq -r .tag_name)
+ ONNX_ARCH=$([ "$TARGETARCH" = "arm64" ] && echo "aarch64" || echo "x64")
+ wget https://github.com/microsoft/onnxruntime/releases/download/${LATEST_VERSION}/onnxruntime-linux-${ONNX_ARCH}-${LATEST_VERSION#v}.tgz
+ tar -xzf onnxruntime-linux-${ONNX_ARCH}-${LATEST_VERSION#v}.tgz
+ mv onnxruntime-linux-${ONNX_ARCH}-${LATEST_VERSION#v} ${ONNXRUNTIME_ROOT_PATH}
+ rm onnxruntime-linux-${ONNX_ARCH}-${LATEST_VERSION#v}.tgz
+ echo "C_INCLUDE_PATH=${ONNXRUNTIME_ROOT_PATH}/include" >> $GITHUB_ENV
+ echo "LD_RUN_PATH=${ONNXRUNTIME_ROOT_PATH}/lib" >> $GITHUB_ENV
+ echo "LIBRARY_PATH=${ONNXRUNTIME_ROOT_PATH}/lib" >> $GITHUB_ENV
- name: golangci-lint
uses: golangci/golangci-lint-action@v6
with:
- version: v1.59
- args: --timeout=10m
+ version: v1.61
+ args: --timeout=10m --build-tags onnx
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 9faea3c61..3de7cbc40 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -12,6 +12,7 @@ repos:
rev: v0.5.1
hooks:
- id: golangci-lint
+ args: ["--build-tags", "onnx"]
- id: go-mod-tidy
- repo: https://github.com/pinglin/conventional-pre-commit
rev: v1.1.0
diff --git a/Dockerfile b/Dockerfile
index 21088b6d6..12f91bbdf 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,7 +1,33 @@
ARG GOLANG_VERSION=1.22.5
-FROM golang:${GOLANG_VERSION}-alpine3.19 AS build
-
-RUN apk add --no-cache build-base leptonica-dev tesseract-ocr-dev musl-dev
+FROM golang:${GOLANG_VERSION}-bullseye AS build
+
+ARG TARGETOS TARGETARCH K6_VERSION XK6_VERSION
+
+RUN apt-get update && apt-get install -y \
+ build-essential \
+ libleptonica-dev \
+ libtesseract-dev \
+ libsoxr-dev \
+ && rm -rf /var/lib/apt/lists/*
+
+# Install ONNX Runtime (latest release)
+ENV ONNXRUNTIME_ROOT_PATH=/usr/local/onnxruntime
+RUN apt update && \
+ apt install -y wget jq && \
+ LATEST_VERSION=$(wget -qO- https://api.github.com/repos/microsoft/onnxruntime/releases/latest | jq -r .tag_name) && \
+ ONNX_ARCH=$([ "$TARGETARCH" = "arm64" ] && echo "aarch64" || echo "x64") && \
+ wget https://github.com/microsoft/onnxruntime/releases/download/${LATEST_VERSION}/onnxruntime-linux-${ONNX_ARCH}-${LATEST_VERSION#v}.tgz && \
+ tar -xzf onnxruntime-linux-${ONNX_ARCH}-${LATEST_VERSION#v}.tgz && \
+ mv onnxruntime-linux-${ONNX_ARCH}-${LATEST_VERSION#v} ${ONNXRUNTIME_ROOT_PATH} && \
+ rm onnxruntime-linux-${ONNX_ARCH}-${LATEST_VERSION#v}.tgz && \
+ apt remove -y wget jq && \
+ apt autoremove -y && \
+ rm -rf /var/lib/apt/lists/*
+
+# Set compiler and linker search paths for ONNX Runtime
+ENV C_INCLUDE_PATH=${ONNXRUNTIME_ROOT_PATH}/include
+ENV LD_RUN_PATH=${ONNXRUNTIME_ROOT_PATH}/lib
+ENV LIBRARY_PATH=${ONNXRUNTIME_ROOT_PATH}/lib
WORKDIR /src
@@ -9,51 +35,28 @@ COPY go.mod go.sum ./
RUN go mod download
COPY . .
-RUN go get github.com/otiai10/gosseract/v2
-
ARG SERVICE_NAME TARGETOS TARGETARCH
+RUN --mount=target=. --mount=type=cache,target=/root/.cache/go-build --mount=type=cache,target=/go/pkg GOOS=$TARGETOS GOARCH=$TARGETARCH go build -tags=ocr,onnx -o /${SERVICE_NAME} ./cmd/main
+RUN --mount=target=. --mount=type=cache,target=/root/.cache/go-build --mount=type=cache,target=/go/pkg GOOS=$TARGETOS GOARCH=$TARGETARCH go build -o /${SERVICE_NAME}-worker ./cmd/worker
+RUN --mount=target=. --mount=type=cache,target=/root/.cache/go-build --mount=type=cache,target=/go/pkg GOOS=$TARGETOS GOARCH=$TARGETARCH go build -o /${SERVICE_NAME}-migrate ./cmd/migration
+RUN --mount=target=. --mount=type=cache,target=/root/.cache/go-build --mount=type=cache,target=/go/pkg GOOS=$TARGETOS GOARCH=$TARGETARCH go build -o /${SERVICE_NAME}-init ./cmd/init
+
+FROM debian:bullseye-slim
-RUN --mount=target=. --mount=type=cache,target=/root/.cache/go-build --mount=type=cache,target=/go/pkg GOOS=$TARGETOS GOARCH=$TARGETARCH CGO_ENABLED=1 go build -tags=ocr,musl -o /${SERVICE_NAME} ./cmd/main
-RUN --mount=target=. --mount=type=cache,target=/root/.cache/go-build --mount=type=cache,target=/go/pkg GOOS=$TARGETOS GOARCH=$TARGETARCH CGO_ENABLED=1 go build -tags=ocr,musl -o /${SERVICE_NAME}-worker ./cmd/worker
-RUN --mount=target=. --mount=type=cache,target=/root/.cache/go-build --mount=type=cache,target=/go/pkg GOOS=$TARGETOS GOARCH=$TARGETARCH go build -tags=musl -o /${SERVICE_NAME}-migrate ./cmd/migration
-RUN --mount=target=. --mount=type=cache,target=/root/.cache/go-build --mount=type=cache,target=/go/pkg GOOS=$TARGETOS GOARCH=$TARGETARCH go build -tags=musl -o /${SERVICE_NAME}-init ./cmd/init
-
-FROM alpine:3.19
-
-RUN apk add --no-cache \
- curl \
- poppler-utils \
- wv \
- tidyhtml \
- libc6-compat \
- tesseract-ocr \
- python3 \
- py3-pip \
- build-base \
- python3-dev \
- libffi-dev \
- libreoffice \
- qpdf \
- msttcorefonts-installer \
- font-noto \
- font-noto-cjk \
- ffmpeg \
- chromium \
- && update-ms-fonts \
- && fc-cache -f \
- && python3 -m venv /opt/venv \
- && /opt/venv/bin/pip install --upgrade pip \
- && /opt/venv/bin/pip install pdfplumber tokenizers \
- && rm -rf /var/cache/apk/* /var/cache/fontconfig/*
-
-# Download tesseract data
-RUN curl -L https://github.com/tesseract-ocr/tessdata_best/raw/main/eng.traineddata \
- -o /usr/share/tessdata/eng.traineddata
-
-ARG TARGETARCH
-ARG BUILDARCH
-RUN apk add unrtf --repository=http://dl-cdn.alpinelinux.org/alpine/edge/community
+# Install Python, create virtual environment, and install pdfplumber
+RUN apt update && \
+ apt install -y curl python3 python3-venv poppler-utils wv unrtf tidy tesseract-ocr libtesseract-dev libreoffice ffmpeg libsoxr-dev chromium qpdf && \
+ python3 -m venv /opt/venv && \
+ /opt/venv/bin/pip install pdfplumber mistral-common tokenizers && \
+ rm -rf /var/lib/apt/lists/*
+# Copy ONNX Runtime from the build stage
+ENV ONNXRUNTIME_ROOT_PATH=/usr/local/onnxruntime
+COPY --from=build --chown=nobody:nogroup /usr/local/onnxruntime ${ONNXRUNTIME_ROOT_PATH}
+
+# Set environment variables and create symlinks for ONNX Runtime
+ENV C_INCLUDE_PATH=${ONNXRUNTIME_ROOT_PATH}/include
+RUN ln -s ${ONNXRUNTIME_ROOT_PATH}/lib/libonnxruntime.so* /usr/lib/
USER nobody:nogroup
@@ -71,3 +74,7 @@ COPY --from=build --chown=nobody:nogroup /${SERVICE_NAME}-migrate ./
COPY --from=build --chown=nobody:nogroup /${SERVICE_NAME}-init ./
COPY --from=build --chown=nobody:nogroup /${SERVICE_NAME}-worker ./
COPY --from=build --chown=nobody:nogroup /${SERVICE_NAME} ./
+
+# Set up ONNX model and environment variable
+COPY --chown=nobody:nogroup ./pkg/component/resources/onnx/silero_vad.onnx /${SERVICE_NAME}/pkg/component/resources/onnx/silero_vad.onnx
+ENV ONNX_MODEL_FOLDER_PATH=/${SERVICE_NAME}/pkg/component/resources/onnx
diff --git a/Dockerfile.dev b/Dockerfile.dev
index a34325e65..6e5987dc0 100644
--- a/Dockerfile.dev
+++ b/Dockerfile.dev
@@ -1,5 +1,5 @@
ARG GOLANG_VERSION=1.22.5
-FROM golang:${GOLANG_VERSION}
+FROM golang:${GOLANG_VERSION}-bullseye
ARG SERVICE_NAME
@@ -11,18 +11,39 @@ ARG TARGETOS TARGETARCH K6_VERSION XK6_VERSION
# Install Python, create virtual environment, and install pdfplumber
RUN apt update && \
- apt install -y python3 python3-venv poppler-utils wv unrtf tidy tesseract-ocr libtesseract-dev libreoffice ffmpeg chromium qpdf && \
+ apt install -y python3 python3-venv poppler-utils wv unrtf tidy tesseract-ocr libtesseract-dev libreoffice ffmpeg libsoxr-dev chromium qpdf && \
python3 -m venv /opt/venv && \
/opt/venv/bin/pip install pdfplumber mistral-common tokenizers && \
rm -rf /var/lib/apt/lists/*
+# Install ONNX Runtime (latest release)
+ENV ONNXRUNTIME_ROOT_PATH=/usr/local/onnxruntime
+RUN apt update && \
+ apt install -y wget jq && \
+ LATEST_VERSION=$(wget -qO- https://api.github.com/repos/microsoft/onnxruntime/releases/latest | jq -r .tag_name) && \
+ ONNX_ARCH=$([ "$TARGETARCH" = "arm64" ] && echo "aarch64" || echo "x64") && \
+ wget https://github.com/microsoft/onnxruntime/releases/download/${LATEST_VERSION}/onnxruntime-linux-${ONNX_ARCH}-${LATEST_VERSION#v}.tgz && \
+ tar -xzf onnxruntime-linux-${ONNX_ARCH}-${LATEST_VERSION#v}.tgz && \
+ mv onnxruntime-linux-${ONNX_ARCH}-${LATEST_VERSION#v} ${ONNXRUNTIME_ROOT_PATH} && \
+ rm onnxruntime-linux-${ONNX_ARCH}-${LATEST_VERSION#v}.tgz && \
+ apt remove -y wget jq && \
+ apt autoremove -y && \
+ rm -rf /var/lib/apt/lists/*
+
+# Set compiler and linker search paths for ONNX Runtime
+ENV C_INCLUDE_PATH=${ONNXRUNTIME_ROOT_PATH}/include
+ENV LD_RUN_PATH=${ONNXRUNTIME_ROOT_PATH}/lib
+ENV LIBRARY_PATH=${ONNXRUNTIME_ROOT_PATH}/lib
+
+# tparse
+RUN --mount=target=. --mount=type=cache,target=/root/.cache/go-build --mount=type=cache,target=/go/pkg GOOS=$TARGETOS GOARCH=$TARGETARCH go install github.com/mfridman/tparse@latest
+
# air
RUN --mount=target=. --mount=type=cache,target=/root/.cache/go-build --mount=type=cache,target=/go/pkg GOOS=$TARGETOS GOARCH=$TARGETARCH go install github.com/cosmtrek/air@v1.49
# k6
-RUN go install go.k6.io/xk6/cmd/xk6@v${XK6_VERSION}
-RUN go install github.com/mfridman/tparse@v0.15.0
-RUN xk6 build v${K6_VERSION} --with github.com/grafana/xk6-sql --output /usr/bin/k6
+RUN --mount=target=. --mount=type=cache,target=/root/.cache/go-build --mount=type=cache,target=/go/pkg GOOS=$TARGETOS GOARCH=$TARGETARCH go install go.k6.io/xk6/cmd/xk6@v${XK6_VERSION}
+RUN --mount=target=. --mount=type=cache,target=/root/.cache/go-build --mount=type=cache,target=/go/pkg GOOS=$TARGETOS GOARCH=$TARGETARCH xk6 build v${K6_VERSION} --with github.com/grafana/xk6-sql --output /usr/bin/k6
# -- set up Go
@@ -38,6 +59,10 @@ ENV GOENV=/go/.config/go/env
# required to restore compatibility with those versions.
ENV GODEBUG=tlsrsakex=1
+# Set up ONNX model and environment variable
+COPY ./pkg/component/resources/onnx/silero_vad.onnx /${SERVICE_NAME}/pkg/component/resources/onnx/silero_vad.onnx
+ENV ONNX_MODEL_FOLDER_PATH=/${SERVICE_NAME}/pkg/component/resources/onnx
+
USER nobody:nogroup
ENTRYPOINT ["tail", "-f", "/dev/null"]
diff --git a/Makefile b/Makefile
index 7e92334d0..a4827beb2 100644
--- a/Makefile
+++ b/Makefile
@@ -7,13 +7,6 @@ include .env
export
GOTEST_FLAGS := CFG_DATABASE_HOST=${TEST_DBHOST} CFG_DATABASE_NAME=${TEST_DBNAME}
-ifeq (${DBTEST}, true)
- GOTEST_TAGS := -tags=dbtest
-endif
-ifeq (${OCR}, true)
- GOTEST_TAGS := -tags=ocr
-endif
-
#============================================================================
@@ -40,10 +33,10 @@ latest: ## Run latest container
echo "Run latest container ${SERVICE_NAME} and ${SERVICE_NAME}-worker. To stop it, run \"make stop\"."
@docker run --network=instill-network \
--name ${SERVICE_NAME} \
- -d ${SERVICE_NAME}:latest ./${SERVICE_NAME}
+ -d instill/${SERVICE_NAME}:latest ./${SERVICE_NAME}
@docker run --network=instill-network \
--name ${SERVICE_NAME}-worker \
- -d ${SERVICE_NAME}:latest ./${SERVICE_NAME}-worker
+ -d instill/${SERVICE_NAME}:latest ./${SERVICE_NAME}-worker
.PHONY: rm
rm: ## Remove all running containers
@@ -60,10 +53,10 @@ build-dev: ## Build dev docker image
.PHONY: build-latest
build-latest: ## Build latest docker image
- @docker buildx build \
+ @docker build \
--build-arg GOLANG_VERSION=${GOLANG_VERSION} \
--build-arg SERVICE_NAME=${SERVICE_NAME} \
- -t pipeline-backend:latest .
+ -t instill/pipeline-backend:latest .
.PHONY: go-gen
go-gen: ## Generate codes
@@ -94,40 +87,32 @@ coverage:
rm coverage.out; \
fi
+# Tests should run in container without local tparse installation.
+# If you encounter container test issues, install tparse locally:
+# go install github.com/mfridman/tparse/cmd/tparse@latest
.PHONY: test
test:
-# Ideally, it should be ok to run without installing tparse locally.
-# However, there may be some issues that arise from running the tests
-# in the container. If you encounter any issues, please install tparse
-# locally via `go install github.com/mfridman/tparse/cmd/tparse@latest`
-# and run the tests locally.
- @if [ "${OCR}" = "true" ]; then \
- docker run --rm \
- -v $(PWD):/${SERVICE_NAME} \
- --user $(id -u):$(id -g) \
- --entrypoint= \
- instill/${SERVICE_NAME}:dev \
- make test-ocr; \
+ @TAGS=""; \
+ if [ "$${OCR}" = "true" ]; then \
+ TAGS="$$TAGS,ocr"; \
+ [ "$$(uname)" = "Darwin" ] && export TESSDATA_PREFIX=$$(dirname $$(brew list tesseract | grep share/tessdata/eng.traineddata)); \
+ fi; \
+ if [ "$${ONNX}" = "true" ]; then \
+ if [ "$$(uname)" = "Darwin" ]; then \
+ echo "ONNX Runtime test is not supported on Darwin (macOS)."; \
+ else \
+ TAGS="$$TAGS,onnx"; \
+ fi; \
+ fi; \
+ TAGS=$${TAGS#,}; \
+ if [ -n "$$TAGS" ]; then \
+ echo "Running tests with tags: $$TAGS"; \
+ go test -v -tags="$$TAGS" ./... -json | tparse --notests --all; \
else \
- docker run --rm \
- -v $(PWD):/${SERVICE_NAME} \
- --user $(id -u):$(id -g) \
- --entrypoint= \
- instill/${SERVICE_NAME}:dev \
- go test -v ./... -json | tparse --notests --all; \
+ echo "Running standard tests"; \
+ go test -v ./... -json | tparse --notests --all; \
fi
-.PHONY: test-ocr
-test-ocr:
-# Certain component tests require additional dependencies.
-# Install tesseract via `brew install tesseract`
-# Setup `export LIBRARY_PATH="/opt/homebrew/lib"` `export CPATH="/opt/homebrew/include"`
-ifeq ($(shell uname), Darwin)
- @TESSDATA_PREFIX=$(shell dirname $(shell brew list tesseract | grep share/tessdata/eng.traineddata)) ${GOTEST_FLAGS} go test -v ./... -json | tparse --notests --all
-else
- @echo "This target can only be executed on Darwin (macOS)."
-endif
-
.PHONY: integration-test
integration-test: ## Run integration test
@ # DB_HOST points to localhost by default. Override this variable if
diff --git a/go.mod b/go.mod
index 70b7d8c3a..5c1adb8f8 100644
--- a/go.mod
+++ b/go.mod
@@ -22,6 +22,7 @@ require (
github.com/frankban/quicktest v1.14.6
github.com/gabriel-vasile/mimetype v1.4.3
github.com/gage-technologies/mistral-go v1.1.0
+ github.com/go-audio/audio v1.0.0
github.com/go-chi/chi/v5 v5.1.0
github.com/go-openapi/strfmt v0.23.0
github.com/go-redis/redismock/v9 v9.2.0
@@ -35,7 +36,6 @@ require (
github.com/grpc-ecosystem/go-grpc-middleware v1.3.0
github.com/grpc-ecosystem/grpc-gateway/v2 v2.20.0
github.com/h2non/filetype v1.1.3
- github.com/iFaceless/godub v0.0.0-20200728093528-a30bb4d1a0f1
github.com/iancoleman/strcase v0.3.0
github.com/influxdata/influxdb-client-go/v2 v2.12.3
github.com/instill-ai/protogen-go v0.3.3-alpha.0.20241029162707-1398399a24ee
@@ -99,15 +99,21 @@ require (
gorm.io/plugin/dbresolver v1.5.1
)
+require github.com/dh1tw/gosamplerate v0.1.2 // indirect
+
require (
cloud.google.com/go v0.115.0 // indirect
cloud.google.com/go/auth v0.7.2 // indirect
cloud.google.com/go/auth/oauth2adapt v0.2.3 // indirect
cloud.google.com/go/compute/metadata v0.5.0 // indirect
filippo.io/edwards25519 v1.1.0 // indirect
+ github.com/JalfResi/justext v0.0.0-20221106200834-be571e3e3052 // indirect
github.com/PaesslerAG/gval v1.0.0 // indirect
+ github.com/PuerkitoBio/goquery v1.9.1
github.com/PuerkitoBio/purell v1.1.1 // indirect
github.com/PuerkitoBio/urlesc v0.0.0-20170810143723-de5bf2ad4578 // indirect
+ github.com/advancedlogic/GoOse v0.0.0-20191112112754-e742535969c1 // indirect
+ github.com/andybalholm/cascadia v1.3.2 // indirect
github.com/antchfx/htmlquery v1.3.0 // indirect
github.com/antchfx/xmlquery v1.3.17 // indirect
github.com/antchfx/xpath v1.2.4 // indirect
@@ -116,16 +122,30 @@ require (
github.com/aws/aws-sdk-go v1.55.1 // indirect
github.com/aws/aws-sdk-go-v2 v1.30.3 // indirect
github.com/aws/smithy-go v1.20.3 // indirect
+ github.com/catalinc/hashcash v0.0.0-20220723060415-5e3ec3e24f67 // indirect
+ github.com/cenkalti/backoff/v4 v4.2.1 // indirect
+ github.com/cespare/xxhash/v2 v2.3.0 // indirect
github.com/chromedp/cdproto v0.0.0-20240801214329-3f85d328b335 // indirect
github.com/chromedp/sysutil v1.0.0 // indirect
+ github.com/davecgh/go-spew v1.1.1 // indirect
+ github.com/deepmap/oapi-codegen v1.8.2 // indirect
+ github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f // indirect
github.com/dlclark/regexp2 v1.10.0 // indirect
github.com/dustin/go-humanize v1.0.1 // indirect
github.com/elastic/elastic-transport-go/v8 v8.6.0 // indirect
github.com/emersion/go-sasl v0.0.0-20231106173351-e73c9f7bad43 // indirect
github.com/envoyproxy/protoc-gen-validate v1.0.4 // indirect
github.com/extrame/ole2 v0.0.0-20160812065207-d69429661ad7 // indirect
+ github.com/facebookgo/clock v0.0.0-20150410010913-600d898af40a // indirect
+ github.com/fatih/set v0.2.1 // indirect
github.com/felixge/httpsnoop v1.0.4 // indirect
+ github.com/fsnotify/fsnotify v1.6.0 // indirect
+ github.com/gigawattio/window v0.0.0-20180317192513-0f5467e35573 // indirect
+ github.com/go-audio/riff v1.0.0 // indirect
+ github.com/go-audio/wav v1.1.0
github.com/go-ini/ini v1.67.0 // indirect
+ github.com/go-logr/logr v1.4.2 // indirect
+ github.com/go-logr/stdr v1.2.2 // indirect
github.com/go-openapi/analysis v0.21.2 // indirect
github.com/go-openapi/errors v0.22.0 // indirect
github.com/go-openapi/jsonpointer v0.19.6 // indirect
@@ -134,23 +154,41 @@ require (
github.com/go-openapi/spec v0.20.4 // indirect
github.com/go-openapi/swag v0.22.4 // indirect
github.com/go-openapi/validate v0.21.0 // indirect
+ github.com/go-resty/resty/v2 v2.12.0
+ github.com/go-sql-driver/mysql v1.8.1
github.com/gobwas/glob v0.2.3 // indirect
github.com/gobwas/httphead v0.1.0 // indirect
github.com/gobwas/pool v0.2.1 // indirect
github.com/gobwas/ws v1.4.0 // indirect
github.com/goccy/go-json v0.10.3 // indirect
+ github.com/gogo/googleapis v1.4.1 // indirect
+ github.com/gogo/protobuf v1.3.2 // indirect
github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0 // indirect
github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect
github.com/golang/mock v1.6.0 // indirect
+ github.com/golang/protobuf v1.5.4 // indirect
github.com/golang/snappy v0.0.4 // indirect
github.com/google/flatbuffers v23.5.26+incompatible // indirect
github.com/google/go-querystring v1.1.0 // indirect
github.com/google/s2a-go v0.1.7 // indirect
+ github.com/google/uuid v1.6.0
github.com/googleapis/enterprise-certificate-proxy v0.3.2 // indirect
github.com/googleapis/gax-go/v2 v2.12.5 // indirect
github.com/gorilla/websocket v1.5.1 // indirect
+ github.com/hashicorp/errwrap v1.1.0 // indirect
+ github.com/hashicorp/go-multierror v1.1.1 // indirect
+ github.com/influxdata/line-protocol v0.0.0-20200327222509-2487e7298839 // indirect
github.com/itchyny/timefmt-go v0.1.5 // indirect
+ github.com/jackc/chunkreader/v2 v2.0.1 // indirect
+ github.com/jackc/pgconn v1.14.3
+ github.com/jackc/pgio v1.0.0 // indirect
+ github.com/jackc/pgpassfile v1.0.0 // indirect
+ github.com/jackc/pgproto3/v2 v2.3.3 // indirect
+ github.com/jackc/pgservicefile v0.0.0-20221227161230-091c0ba34f0a // indirect
github.com/jackc/puddle/v2 v2.2.1 // indirect
+ github.com/jaytaylor/html2text v0.0.0-20230321000545-74c2419ad056 // indirect
+ github.com/jinzhu/inflection v1.0.0 // indirect
+ github.com/jinzhu/now v1.1.5 // indirect
github.com/jmespath/go-jmespath v0.4.0 // indirect
github.com/josharian/intern v1.0.0 // indirect
github.com/kardianos/osext v0.0.0-20190222173326-2bc1f35cddc0 // indirect
@@ -159,19 +197,40 @@ require (
github.com/klauspost/cpuid/v2 v2.2.8 // indirect
github.com/kr/pretty v0.3.1 // indirect
github.com/kr/text v0.2.0 // indirect
+ github.com/levigross/exp-html v0.0.0-20120902181939-8df60c69a8f5 // indirect
+ github.com/lib/pq v1.10.9
github.com/mailru/easyjson v0.7.7 // indirect
+ github.com/mattn/go-runewidth v0.0.15 // indirect
github.com/minio/md5-simd v1.1.2 // indirect
+ github.com/mitchellh/copystructure v1.2.0 // indirect
+ github.com/mitchellh/mapstructure v1.5.0 // indirect
+ github.com/mitchellh/reflectwalk v1.0.2 // indirect
+ github.com/mjibson/go-dsp v0.0.0-20180508042940-11479a337f12
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
github.com/modern-go/reflect2 v1.0.2 // indirect
github.com/mohae/deepcopy v0.0.0-20170929034955-c48cc78d4826 // indirect
github.com/montanaflynn/stats v0.7.1 // indirect
github.com/oklog/ulid v1.3.1 // indirect
+ github.com/olekukonko/tablewriter v0.0.4 // indirect
+ github.com/otiai10/gosseract/v2 v2.4.1 // indirect
+ github.com/pborman/uuid v1.2.1 // indirect
github.com/pierrec/lz4/v4 v4.1.18 // indirect
+ github.com/pkg/errors v0.9.1
+ github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect
+ github.com/richardlehane/mscfb v1.0.4 // indirect
+ github.com/richardlehane/msoleps v1.0.3 // indirect
github.com/rivo/uniseg v0.4.4 // indirect
+ github.com/robfig/cron v1.2.0 // indirect
+ github.com/rogpeppe/go-internal v1.11.0 // indirect
github.com/rs/xid v1.6.0 // indirect
github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d // indirect
+ github.com/santhosh-tekuri/jsonschema/v5 v5.3.1
github.com/shopspring/decimal v1.2.0 // indirect
+ github.com/ssor/bom v0.0.0-20170718123548-6386211fdfcf // indirect
+ github.com/streamer45/silero-vad-go v0.2.1
+ github.com/stretchr/objx v0.5.2 // indirect
+ github.com/stretchr/testify v1.9.0
github.com/temoto/robotstxt v1.1.2 // indirect
github.com/tidwall/gjson v1.14.4 // indirect
github.com/tidwall/match v1.1.1 // indirect
@@ -185,6 +244,7 @@ require (
github.com/xuri/efp v0.0.0-20231025114914-d1ff6096ae53 // indirect
github.com/xuri/nfp v0.0.0-20230919160717-d98342af3f05 // indirect
github.com/youmark/pkcs8 v0.0.0-20181117223130-1be2e3e5546d // indirect
+ github.com/zaf/resample v1.5.0
github.com/zeebo/xxh3 v1.0.2 // indirect
gitlab.com/golang-commonmark/html v0.0.0-20191124015941-a22733972181 // indirect
gitlab.com/golang-commonmark/linkify v0.0.0-20191026162114-a0c2df6c8f82 // indirect
@@ -264,10 +324,12 @@ require (
go.uber.org/atomic v1.10.0 // indirect
go.uber.org/multierr v1.10.0 // indirect
golang.org/x/crypto v0.26.0
+ golang.org/x/sync v0.8.0 // indirect
golang.org/x/sys v0.24.0 // indirect
golang.org/x/text v0.17.0
golang.org/x/time v0.5.0 // indirect
google.golang.org/genproto v0.0.0-20240722135656-d784300faade // indirect
gopkg.in/yaml.v3 v3.0.1
gorm.io/driver/mysql v1.4.7 // indirect
+ modernc.org/mathutil v1.5.0 // indirect
)
diff --git a/go.sum b/go.sum
index b5892a129..ce630fe4d 100644
--- a/go.sum
+++ b/go.sum
@@ -783,6 +783,8 @@ github.com/dgrijalva/jwt-go v3.2.0+incompatible/go.mod h1:E3ru+11k8xSBh+hMPgOLZm
github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f h1:lO4WD4F/rVNCu3HqELle0jiPLLBs70cWOduZpkS1E78=
github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f/go.mod h1:cuUVRXasLTGF7a8hSLbxyZXjz+1KgoB3wDUb6vlszIc=
github.com/dgryski/go-sip13 v0.0.0-20181026042036-e10d5fee7954/go.mod h1:vAd38F8PWV+bWy6jNmig1y/TA+kYO4g3RSRF0IAv0no=
+github.com/dh1tw/gosamplerate v0.1.2 h1:oyqtZk67xB9B4l+vIZCZ3F0RYV/z66W58VOah11/ktI=
+github.com/dh1tw/gosamplerate v0.1.2/go.mod h1:zooTyHpoR7hE+FLfdE3yjLHb2QA2NpMusNfuaZqEACM=
github.com/dhui/dktest v0.3.10 h1:0frpeeoM9pHouHjhLeZDuDTJ0PqjDTrycaHaMmkJAo8=
github.com/dhui/dktest v0.3.10/go.mod h1:h5Enh0nG3Qbo9WjNFRrwmKUaePEBhXMOygbz3Ww7Sz0=
github.com/disintegration/imaging v1.6.2/go.mod h1:44/5580QXChDfwIclfc/PCwrr44amcmDAg8hxG0Ewe4=
@@ -890,6 +892,12 @@ github.com/ghodss/yaml v0.0.0-20150909031657-73d445a93680/go.mod h1:4dBDuWmgqj2H
github.com/ghodss/yaml v1.0.0/go.mod h1:4dBDuWmgqj2HViK6kFavaiC9ZROes6MMH2rRYeMEF04=
github.com/gigawattio/window v0.0.0-20180317192513-0f5467e35573 h1:u8AQ9bPa9oC+8/A/jlWouakhIvkFfuxgIIRjiy8av7I=
github.com/gigawattio/window v0.0.0-20180317192513-0f5467e35573/go.mod h1:eBvb3i++NHDH4Ugo9qCvMw8t0mTSctaEa5blJbWcNxs=
+github.com/go-audio/audio v1.0.0 h1:zS9vebldgbQqktK4H0lUqWrG8P0NxCJVqcj7ZpNnwd4=
+github.com/go-audio/audio v1.0.0/go.mod h1:6uAu0+H2lHkwdGsAY+j2wHPNPpPoeg5AaEFh9FlA+Zs=
+github.com/go-audio/riff v1.0.0 h1:d8iCGbDvox9BfLagY94fBynxSPHO80LmZCaOsmKxokA=
+github.com/go-audio/riff v1.0.0/go.mod h1:l3cQwc85y79NQFCRB7TiPoNiaijp6q8Z0Uv38rVG498=
+github.com/go-audio/wav v1.1.0 h1:jQgLtbqBzY7G+BM8fXF7AHUk1uHUviWS4X39d5rsL2g=
+github.com/go-audio/wav v1.1.0/go.mod h1:mpe9qfwbScEbkd8uybLuIpTgHyrISw/OTuvjUW2iGtE=
github.com/go-chi/chi/v5 v5.0.0/go.mod h1:BBug9lr0cqtdAhsu6R4AAdvufI0/XBzAQSsUqJpoZOs=
github.com/go-chi/chi/v5 v5.1.0 h1:acVI1TYaD+hhedDJ3r54HyA6sExp3HfXq7QWEEY/xMw=
github.com/go-chi/chi/v5 v5.1.0/go.mod h1:DslCQbL2OYiznFReuXYUmQ2hGd1aDpCnlMNITLSKoi8=
@@ -1260,8 +1268,6 @@ github.com/hashicorp/yamux v0.0.0-20181012175058-2f1d1f20f75d/go.mod h1:+NfK9FKe
github.com/hjson/hjson-go/v4 v4.0.0 h1:wlm6IYYqHjOdXH1gHev4VoXCaW20HdQAGCxdOEEg2cs=
github.com/hjson/hjson-go/v4 v4.0.0/go.mod h1:KaYt3bTw3zhBjYqnXkYywcYctk0A2nxeEFTse3rH13E=
github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU=
-github.com/iFaceless/godub v0.0.0-20200728093528-a30bb4d1a0f1 h1:oqeURuHQrImMykykqJgFbStlaDXyY7JpXXrwXyjr9ls=
-github.com/iFaceless/godub v0.0.0-20200728093528-a30bb4d1a0f1/go.mod h1:tKRg0K9YmfD3eD6KFos+YHIVMouKMzxDSK5XpdxdCUI=
github.com/iancoleman/strcase v0.2.0/go.mod h1:iwCmte+B7n89clKwxIoIXy/HfoL7AsD47ZCWhYzw7ho=
github.com/iancoleman/strcase v0.3.0 h1:nTXanmYxhfFAMjZL34Ov6gkzEsSJZ5DbhxWjvSASxEI=
github.com/iancoleman/strcase v0.3.0/go.mod h1:iwCmte+B7n89clKwxIoIXy/HfoL7AsD47ZCWhYzw7ho=
@@ -1548,6 +1554,8 @@ github.com/mitchellh/osext v0.0.0-20151018003038-5e2d6d41470f/go.mod h1:OkQIRizQ
github.com/mitchellh/reflectwalk v1.0.0/go.mod h1:mSTlrgnPZtwu0c4WaC2kGObEpuNDbx0jmZXqmk4esnw=
github.com/mitchellh/reflectwalk v1.0.2 h1:G2LzWKi524PWgd3mLHV8Y5k7s6XUvT0Gef6zxSIeXaQ=
github.com/mitchellh/reflectwalk v1.0.2/go.mod h1:mSTlrgnPZtwu0c4WaC2kGObEpuNDbx0jmZXqmk4esnw=
+github.com/mjibson/go-dsp v0.0.0-20180508042940-11479a337f12 h1:dd7vnTDfjtwCETZDrRe+GPYNLA1jBtbZeyfyE8eZCyk=
+github.com/mjibson/go-dsp v0.0.0-20180508042940-11479a337f12/go.mod h1:i/KKcxEWEO8Yyl11DYafRPKOPVYTrhxiTRigjtEEXZU=
github.com/moby/docker-image-spec v1.3.1 h1:jMKff3w6PgbfSa69GfNg+zN/XLhfXJGnEx3Nl2EsFP0=
github.com/moby/docker-image-spec v1.3.1/go.mod h1:eKmb5VW8vQEh/BAr2yvVNvuiJuY6UIocYsFu/DxxRpo=
github.com/moby/locker v1.0.1/go.mod h1:S7SDdo5zpBK84bzzVlKr2V0hz+7x9hWbYC/kq7oQppc=
@@ -1839,6 +1847,8 @@ github.com/ssor/bom v0.0.0-20170718123548-6386211fdfcf h1:pvbZ0lM0XWPBqUKqFU8cma
github.com/ssor/bom v0.0.0-20170718123548-6386211fdfcf/go.mod h1:RJID2RhlZKId02nZ62WenDCkgHFerpIOmW0iT7GKmXM=
github.com/stefanberger/go-pkcs11uri v0.0.0-20201008174630-78d3cae3a980/go.mod h1:AO3tvPzVZ/ayst6UlUKUv6rcPQInYe3IknH3jYhAKu8=
github.com/stoewer/go-strcase v1.2.0/go.mod h1:IBiWB2sKIp3wVVQ3Y035++gc+knqhUQag1KpM8ahLw8=
+github.com/streamer45/silero-vad-go v0.2.1 h1:Li1/tTC4H/3cyw6q4weX+U8GWwEL3lTekK/nYa1Cvuk=
+github.com/streamer45/silero-vad-go v0.2.1/go.mod h1:B+2FXs/5fZ6pzl6unUZYhZqkYdOB+3saBVzjOzdZnUs=
github.com/stretchr/objx v0.0.0-20180129172003-8a3f7159479f/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
@@ -1947,6 +1957,8 @@ github.com/yuin/goldmark v1.6.0/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5ta
github.com/yvasiyarov/go-metrics v0.0.0-20140926110328-57bccd1ccd43/go.mod h1:aX5oPXxHm3bOH+xeAttToC8pqch2ScQN/JoXYupl6xs=
github.com/yvasiyarov/gorelic v0.0.0-20141212073537-a9bba5b9ab50/go.mod h1:NUSPSUX/bi6SeDMUh6brw0nXpxHnc96TguQh0+r/ssA=
github.com/yvasiyarov/newrelic_platform_go v0.0.0-20140908184405-b21fdbd4370f/go.mod h1:GlGEuHIJweS1mbCqG+7vt2nvWLzLLnRHbXz5JKd/Qbg=
+github.com/zaf/resample v1.5.0 h1:c3yumHrV1cJoED8ZY2Ai3cehS8s0mJSroA9/vMaUcho=
+github.com/zaf/resample v1.5.0/go.mod h1:e4yWalfgRccQrnZSrkIxTqmMCOPhTi1xvYpNpRIB13k=
github.com/zeebo/assert v1.3.0 h1:g7C04CbJuIDKNPFHmsk4hwZDO5O+kntRxzaUoNXj+IQ=
github.com/zeebo/assert v1.3.0/go.mod h1:Pq9JiuJQpG8JLJdtkwrJESF0Foym2/D9XMU5ciN/wJ0=
github.com/zeebo/xxh3 v1.0.2 h1:xZmwmqxHZA8AI603jOQ0tMqmBr9lPeFwGg6d+xy9DC0=
diff --git a/pkg/component/operator/audio/v0/.compogen/bottom.mdx b/pkg/component/operator/audio/v0/.compogen/bottom.mdx
index 1dee42721..31e5d6f97 100644
--- a/pkg/component/operator/audio/v0/.compogen/bottom.mdx
+++ b/pkg/component/operator/audio/v0/.compogen/bottom.mdx
@@ -5,36 +5,27 @@ Recipe for the [Audio Transcription Generator](https://instill.tech/instill-ai/p
```yaml
version: v1beta
component:
- audio-spliter:
+ audio-vad:
type: audio
- task: TASK_SLICE_AUDIO
input:
audio: ${variable.audio}
- end-time: ${variable.end_time}
- start-time: ${variable.start_time}
- get-transcription:
- type: openai
- task: TASK_SPEECH_RECOGNITION
+ min-silence-duration: 300
+ speech-pad: 10
+ task: TASK_DETECT_ACTIVITY
+ audio-segment:
+ type: audio
input:
- audio: ${audio-spliter.output.audio}
- model: whisper-1
- setup:
- api-key: ${secret.INSTILL_SECRET}
+ audio: ${variable.audio}
+ segments: ${audio-vad.output.segments}
+ task: TASK_SEGMENT
variable:
audio:
- title: audio
- description: the audio you want to get the transcription from
- instill-format: audio/*
- end_time:
- title: end-time
- description: the end time you want to extract in seconds i.e. 2 mins is 120 seconds
- instill-format: number
- start_time:
- title: start-time
- description: the start time you want to extract in seconds i.e. 2 mins is 120 seconds
- instill-format: number
+ title: Audio to test
+ description: Audio to test VAD and extraction
+ instill-format: audio
output:
- result:
- title: result
- value: ${get-transcription.output.text}
+ samples:
+ title: Output audio segments
+ description: Output extracted audio segments
+ value: ${audio-segment.output.audio-segments}
```
diff --git a/pkg/component/operator/audio/v0/README.mdx b/pkg/component/operator/audio/v0/README.mdx
index 12b170085..89afa080d 100644
--- a/pkg/component/operator/audio/v0/README.mdx
+++ b/pkg/component/operator/audio/v0/README.mdx
@@ -5,10 +5,10 @@ draft: false
description: "Learn about how to set up a VDP Audio component https://github.com/instill-ai/instill-core"
---
-The Audio component is an operator component that allows users to extract and manipulate audio from different sources.
+The Audio component is an operator component that allows users to operate on audio data.
It can carry out the following tasks:
-- [Chunk Audios](#chunk-audios)
-- [Slice Audio](#slice-audio)
+- [Detect Activity](#detect-activity)
+- [Segment](#segment)
@@ -29,17 +29,18 @@ The component definition and tasks are defined in the [definition.json](https://
## Supported Tasks
-### Chunk Audios
+### Detect Activity
-Split audio file into chunks
+Detect speech segments in audio data using Voice Activity Detection (VAD). This task processes the input audio to 16kHz mono format, identifies periods of human speech, and outputs time segments for each detected speech activity.
| Input | ID | Type | Description |
| :--- | :--- | :--- | :--- |
-| Task ID (required) | `task` | string | `TASK_CHUNK_AUDIOS` |
-| Audio (required) | `audio` | string | Base64 encoded audio file to be split |
-| Chunk Count (required) | `chunk-count` | integer | Number of chunks to equally split the audio into |
+| Task ID (required) | `task` | string | `TASK_DETECT_ACTIVITY` |
+| Audio (required) | `audio` | string | Audio file to analyze for speech activity. |
+| Minimum Silence Duration | `min-silence-duration` | integer | Minimum duration of silence (in milliseconds) required to split speech segments. Longer values result in fewer, longer segments. |
+| Speech Pad | `speech-pad` | integer | Additional padding (in milliseconds) added to the start and end of each detected speech segment to prevent cutting off speech. |
@@ -51,25 +52,52 @@ Split audio file into chunks
| Output | ID | Type | Description |
| :--- | :--- | :--- | :--- |
-| Audios | `audios` | array[string] | A list of base64 encoded audios |
+| [Segments](#detect-activity-segments) | `segments` | array[object] | Array of time segments representing detected speech activity. Each segment contains start and end times in seconds. |
-### Slice Audio
+
+ Output Objects in Detect Activity
-Specify a time range to slice an audio file
+Segments
+
+
+
+| Field | Field ID | Type | Note |
+| :--- | :--- | :--- | :--- |
+| End Time | `end-time` | number | The number of seconds from the beginning of the audio file to the end of this segment. |
+| Start Time | `start-time` | number | The number of seconds from the beginning of the audio file to the start of this segment. |
+
+
+
+### Segment
+
+Segment audio data into pieces based on the provided time segments.
| Input | ID | Type | Description |
| :--- | :--- | :--- | :--- |
-| Task ID (required) | `task` | string | `TASK_SLICE_AUDIO` |
-| Audio (required) | `audio` | string | Base64 encoded audio file to be sliced |
-| Start Time (required) | `start-time` | integer | Start time of the slice in seconds |
-| End Time (required) | `end-time` | integer | End time of the slice in seconds |
+| Task ID (required) | `task` | string | `TASK_SEGMENT` |
+| Audio (required) | `audio` | string | Audio data to segment. |
+| [Segments](#segment-segments) (required) | `segments` | array[object] | A list of time segments of audio data. |
+
+ Input Objects in Segment
+Segments
+
+A list of time segments of audio data.
+
+
+
+| Field | Field ID | Type | Note |
+| :--- | :--- | :--- | :--- |
+| End Time | `end-time` | number | The number of seconds from the beginning of the audio file to the end of this segment. |
+| Start Time | `start-time` | number | The number of seconds from the beginning of the audio file to the start of this segment. |
+
+
@@ -77,7 +105,7 @@ Specify a time range to slice an audio file
| Output | ID | Type | Description |
| :--- | :--- | :--- | :--- |
-| Audio | `audio` | string | Base64 encoded audio slice |
+| Audios | `audio-segments` | array[string] | A list of segmented audio data. |
@@ -88,36 +116,27 @@ Recipe for the [Audio Transcription Generator](https://instill.tech/instill-ai/p
```yaml
version: v1beta
component:
- audio-spliter:
+ audio-vad:
type: audio
- task: TASK_SLICE_AUDIO
input:
audio: ${variable.audio}
- end-time: ${variable.end_time}
- start-time: ${variable.start_time}
- get-transcription:
- type: openai
- task: TASK_SPEECH_RECOGNITION
+ min-silence-duration: 300
+ speech-pad: 10
+ task: TASK_DETECT_ACTIVITY
+ audio-segment:
+ type: audio
input:
- audio: ${audio-spliter.output.audio}
- model: whisper-1
- setup:
- api-key: ${secret.INSTILL_SECRET}
+ audio: ${variable.audio}
+ segments: ${audio-vad.output.segments}
+ task: TASK_SEGMENT
variable:
audio:
- title: audio
- description: the audio you want to get the transcription from
- instill-format: audio/*
- end_time:
- title: end-time
- description: the end time you want to extract in seconds i.e. 2 mins is 120 seconds
- instill-format: number
- start_time:
- title: start-time
- description: the start time you want to extract in seconds i.e. 2 mins is 120 seconds
- instill-format: number
+ title: Audio to test
+ description: Audio to test VAD and extraction
+ instill-format: audio
output:
- result:
- title: result
- value: ${get-transcription.output.text}
+ samples:
+ title: Output audio segments
+ description: Output extracted audio segments
+ value: ${audio-segment.output.audio-segments}
```
diff --git a/pkg/component/operator/audio/v0/audio.go b/pkg/component/operator/audio/v0/audio.go
new file mode 100644
index 000000000..52eb337b1
--- /dev/null
+++ b/pkg/component/operator/audio/v0/audio.go
@@ -0,0 +1,53 @@
+package audio
+
+import (
+ "bytes"
+ "fmt"
+
+ "github.com/go-audio/audio"
+ "github.com/go-audio/wav"
+ "github.com/instill-ai/pipeline-backend/pkg/data"
+ "github.com/instill-ai/pipeline-backend/pkg/data/format"
+)
+
+const (
+ sampleRate = 16000
+ numChannel = 1
+)
+
+func decodeAudioWAV(audioData format.Audio) (*audio.IntBuffer, *wav.Decoder, error) {
+
+ wavAudioData := audioData
+ var err error
+ if audioData.ContentType().String() != data.WAV {
+ wavAudioData, err = audioData.Convert(data.WAV)
+ if err != nil {
+ return nil, nil, fmt.Errorf("error converting audio data to WAV: %v", err)
+ }
+ }
+
+ binary, err := wavAudioData.Binary()
+ if err != nil {
+ return nil, nil, fmt.Errorf("error getting binary data for image: %v", err)
+ }
+
+ dec := wav.NewDecoder(bytes.NewReader(binary.ByteArray()))
+ if !dec.IsValidFile() {
+ return nil, nil, fmt.Errorf("invalid WAV file")
+ }
+
+ audioBuf := &audio.IntBuffer{
+ Format: &audio.Format{
+ NumChannels: int(dec.NumChans),
+ SampleRate: int(dec.SampleRate),
+ },
+ Data: make([]int, len(binary.ByteArray())),
+ SourceBitDepth: int(dec.BitDepth),
+ }
+
+ if _, err := dec.PCMBuffer(audioBuf); err != nil {
+ return nil, nil, fmt.Errorf("reading audio data: %w", err)
+ }
+
+ return audioBuf, dec, nil
+}
diff --git a/pkg/component/operator/audio/v0/audio_operation.go b/pkg/component/operator/audio/v0/audio_operation.go
deleted file mode 100644
index 6559b6cbe..000000000
--- a/pkg/component/operator/audio/v0/audio_operation.go
+++ /dev/null
@@ -1,155 +0,0 @@
-package audio
-
-import (
- "bytes"
- "encoding/base64"
- "fmt"
- "time"
-
- "github.com/iFaceless/godub"
- "github.com/iFaceless/godub/wav"
- "google.golang.org/protobuf/types/known/structpb"
-
- "github.com/instill-ai/pipeline-backend/pkg/component/base"
- "github.com/instill-ai/pipeline-backend/pkg/component/internal/util"
-)
-
-type ChunkAudiosInput struct {
- Audio Audio `json:"audio"`
- ChunkCount int `json:"chunk-count"`
-}
-
-type ChunkAudiosOutput struct {
- Audios []Audio `json:"audios"`
-}
-
-type SliceAudioInput struct {
- Audio Audio `json:"audio"`
- StartTime int `json:"start-time"`
- EndTime int `json:"end-time"`
-}
-
-type SliceAudioOutput struct {
- Audio Audio `json:"audio"`
-}
-
-type ConcatenateInput struct {
- Audios []Audio `json:"audios"`
-}
-
-type ConcatenateOutput struct {
- Audio Audio `json:"audio"`
-}
-
-// Base64 encoded audio
-type Audio string
-
-func chunkAudios(input *structpb.Struct) (*structpb.Struct, error) {
-
- var inputStruct ChunkAudiosInput
-
- err := base.ConvertFromStructpb(input, &inputStruct)
- if err != nil {
- return nil, err
- }
-
- buf, err := base64.StdEncoding.DecodeString(util.TrimBase64Mime(string(inputStruct.Audio)))
- if err != nil {
- return nil, err
- }
-
- segment, err := godub.NewLoader().Load(bytes.NewReader(buf))
-
- if err != nil {
- return nil, fmt.Errorf("failed to load audio: %w", err)
- }
-
- duration := segment.Duration()
-
- chunkSeconds := float64(duration) / float64(inputStruct.ChunkCount)
-
- var audioSegments []*godub.AudioSegment
-
- var startTime time.Duration
- for i := 0; i < inputStruct.ChunkCount; i++ {
- startTime = getStartTime(chunkSeconds, i)
- endTime := getEndTime(chunkSeconds, i, inputStruct.ChunkCount, duration)
-
- slicedSegment, err := segment.Slice(startTime, endTime)
- if err != nil {
- return nil, fmt.Errorf("failed to slice audio: %w in chunk %v", err, i)
- }
- audioSegments = append(audioSegments, slicedSegment)
- }
-
- var audios []Audio
- prefix := "data:audio/wav;base64,"
- for _, segment := range audioSegments {
- var wavBuf bytes.Buffer
- err = wav.Encode(&wavBuf, segment.AsWaveAudio())
-
- if err != nil {
- return nil, fmt.Errorf("failed to encode audio to wav: %w", err)
- }
-
- audios = append(audios, Audio(prefix+base64.StdEncoding.EncodeToString(wavBuf.Bytes())))
- }
-
- output := ChunkAudiosOutput{
- Audios: audios,
- }
-
- return base.ConvertToStructpb(output)
-}
-
-func sliceAudio(input *structpb.Struct) (*structpb.Struct, error) {
-
- var inputStruct SliceAudioInput
-
- err := base.ConvertFromStructpb(input, &inputStruct)
- if err != nil {
- return nil, err
- }
-
- buf, err := base64.StdEncoding.DecodeString(util.TrimBase64Mime(string(inputStruct.Audio)))
- if err != nil {
- return nil, err
- }
-
- segment, err := godub.NewLoader().Load(bytes.NewReader(buf))
-
- if err != nil {
- return nil, fmt.Errorf("failed to load audio: %w", err)
- }
-
- startTime := time.Duration(inputStruct.StartTime) * time.Second
- endTime := time.Duration(inputStruct.EndTime) * time.Second
-
- slicedSegment, err := segment.Slice(startTime, endTime)
- if err != nil {
- return nil, fmt.Errorf("failed to slice audio: %w", err)
- }
-
- var wavBuf bytes.Buffer
- err = wav.Encode(&wavBuf, slicedSegment.AsWaveAudio())
- if err != nil {
- return nil, fmt.Errorf("failed to encode audio to wav: %w", err)
- }
-
- output := SliceAudioOutput{
- Audio: Audio("data:audio/wav;base64," + base64.StdEncoding.EncodeToString(wavBuf.Bytes())),
- }
-
- return base.ConvertToStructpb(output)
-}
-
-func getStartTime(chunkSeconds float64, i int) time.Duration {
- return time.Duration(chunkSeconds * float64(i))
-}
-
-func getEndTime(chunkSeconds float64, i, totalCount int, duration time.Duration) time.Duration {
- if i == totalCount-1 {
- return duration
- }
- return time.Duration(chunkSeconds * float64(i+1))
-}
diff --git a/pkg/component/operator/audio/v0/config/definition.json b/pkg/component/operator/audio/v0/config/definition.json
index 6e77ddad5..15f83127a 100644
--- a/pkg/component/operator/audio/v0/config/definition.json
+++ b/pkg/component/operator/audio/v0/config/definition.json
@@ -1,7 +1,7 @@
{
"availableTasks": [
- "TASK_CHUNK_AUDIOS",
- "TASK_SLICE_AUDIO"
+ "TASK_DETECT_ACTIVITY",
+ "TASK_SEGMENT"
],
"documentationUrl": "https://www.instill.tech/docs/component/operator/audio",
"icon": "assets/audio.svg",
@@ -13,6 +13,6 @@
"uid": "b5c75caa-9261-4757-bfbf-12e908f59f16",
"version": "0.1.0",
"sourceUrl": "https://github.com/instill-ai/pipeline-backend/blob/main/pkg/component/operator/audio/v0",
- "description": "Extract and manipulate audio from different sources",
+ "description": "Operate on audio data.",
"releaseStage": "RELEASE_STAGE_ALPHA"
}
diff --git a/pkg/component/operator/audio/v0/config/tasks.json b/pkg/component/operator/audio/v0/config/tasks.json
index b2fa6de39..3eedbc0c3 100644
--- a/pkg/component/operator/audio/v0/config/tasks.json
+++ b/pkg/component/operator/audio/v0/config/tasks.json
@@ -1,18 +1,41 @@
{
- "TASK_CHUNK_AUDIOS": {
- "instillShortDescription": "Split audio file into chunks",
- "input": {
- "description": "Audio file to split",
- "instillEditOnNodeFields": [
- "audio",
- "chunk-count"
+ "$defs": {
+ "segment": {
+ "properties": {
+ "start-time": {
+ "title": "Start Time",
+ "type": "number",
+ "description": "The number of seconds from the beginning of the audio file to the start of this segment.",
+ "instillFormat": "number",
+ "instillUIOrder": 0
+ },
+ "end-time": {
+ "title": "End Time",
+ "type": "number",
+ "description": "The number of seconds from the beginning of the audio file to the end of this segment.",
+ "instillFormat": "number",
+ "instillUIOrder": 1
+ }
+ },
+ "required": [
+ "start-time",
+ "end-time"
],
+ "title": "Segment",
+ "type": "object",
+ "description": "A time segment of audio data, defined by its start and end times in seconds."
+ }
+ },
+ "TASK_DETECT_ACTIVITY": {
+ "instillShortDescription": "Detect speech segments in audio data using Voice Activity Detection (VAD). This task processes the input audio to 16kHz mono format, identifies periods of human speech, and outputs time segments for each detected speech activity.",
+ "input": {
+ "description": "Input",
"instillUIOrder": 0,
"properties": {
"audio": {
- "description": "Base64 encoded audio file to be split",
+ "description": "Audio file to analyze for speech activity.",
"instillAcceptFormats": [
- "audio/*",
+ "audio/wav",
"application/octet-stream"
],
"instillUIOrder": 0,
@@ -22,63 +45,67 @@
"title": "Audio",
"type": "string"
},
- "chunk-count": {
- "description": "Number of chunks to equally split the audio into",
+ "min-silence-duration": {
+ "description": "Minimum duration of silence (in milliseconds) required to split speech segments. Longer values result in fewer, longer segments.",
"instillAcceptFormats": [
"integer",
"number"
],
- "instillUpstreamTypes": [
- "reference",
- "value"
- ],
"instillUIOrder": 1,
- "title": "Chunk count",
- "type": "integer"
+ "type": "integer",
+ "minimum": 0,
+ "title": "Minimum Silence Duration",
+ "default": 100
+ },
+ "speech-pad": {
+ "description": "Additional padding (in milliseconds) added to the start and end of each detected speech segment to prevent cutting off speech.",
+ "instillAcceptFormats": [
+ "integer",
+ "number"
+ ],
+ "instillUIOrder": 2,
+ "type": "integer",
+ "minimum": 0,
+ "title": "Speech Pad",
+ "default": 30
}
},
"required": [
- "audio",
- "chunk-count"
+ "audio"
],
"title": "Input",
"type": "object"
},
"output": {
+ "description": "Output",
"instillUIOrder": 0,
"properties": {
- "audios": {
- "description": "A list of base64 encoded audios",
- "instillFormat": "array:audio/wav",
+ "segments": {
+ "description": "Array of time segments representing detected speech activity. Each segment contains start and end times in seconds.",
+ "instillFormat": "array:object",
"instillUIOrder": 0,
"items": {
- "type": "string",
- "title": "Audio"
+ "$ref": "#/$defs/segment"
},
- "title": "Audios",
+ "title": "Segments",
"type": "array"
}
},
"required": [
- "audios"
+ "segments"
],
"title": "Output",
"type": "object"
}
},
- "TASK_SLICE_AUDIO": {
- "instillShortDescription": "Specify a time range to slice an audio file",
+ "TASK_SEGMENT": {
+ "instillShortDescription": "Segment audio data into pieces based on the provided time segments.",
"input": {
- "description": "Audio file to slice",
- "instillEditOnNodeFields": [
- "audio",
- "start-time",
- "end-time"
- ],
+ "description": "Input",
"instillUIOrder": 0,
"properties": {
"audio": {
- "description": "Base64 encoded audio file to be sliced",
+ "description": "Audio data to segment.",
"instillAcceptFormats": [
"audio/*",
"application/octet-stream"
@@ -90,56 +117,42 @@
"title": "Audio",
"type": "string"
},
- "start-time": {
- "description": "Start time of the slice in seconds",
- "instillAcceptFormats": [
- "integer",
- "number"
- ],
- "instillUpstreamTypes": [
- "reference",
- "value"
- ],
+ "segments": {
+ "description": "A list of time segments of audio data.",
+ "instillFormat": "array:object",
"instillUIOrder": 1,
- "title": "Start time",
- "type": "integer"
- },
- "end-time": {
- "description": "End time of the slice in seconds",
- "instillAcceptFormats": [
- "integer",
- "number"
- ],
- "instillUpstreamTypes": [
- "reference",
- "value"
- ],
- "instillUIOrder": 2,
- "title": "End time",
- "type": "integer"
+ "items": {
+ "$ref": "#/$defs/segment"
+ },
+ "title": "Segments",
+ "type": "array"
}
},
"required": [
"audio",
- "start-time",
- "end-time"
+ "segments"
],
"title": "Input",
"type": "object"
},
"output": {
+ "description": "Output",
"instillUIOrder": 0,
"properties": {
- "audio": {
- "description": "Base64 encoded audio slice",
- "instillFormat": "audio/wav",
+ "audio-segments": {
+ "description": "A list of segmented audio data.",
+ "instillFormat": "array:audio/*",
"instillUIOrder": 0,
- "title": "Audio",
- "type": "string"
+ "items": {
+ "type": "string",
+ "title": "Audio"
+ },
+ "title": "Audios",
+ "type": "array"
}
},
"required": [
- "audio"
+ "audio-segments"
],
"title": "Output",
"type": "object"
diff --git a/pkg/component/operator/audio/v0/io.go b/pkg/component/operator/audio/v0/io.go
new file mode 100644
index 000000000..5dffc7e63
--- /dev/null
+++ b/pkg/component/operator/audio/v0/io.go
@@ -0,0 +1,29 @@
+package audio
+
+import (
+ "github.com/instill-ai/pipeline-backend/pkg/data/format"
+)
+
+type segmentData struct {
+ StartTime float64 `instill:"start-time"`
+ EndTime float64 `instill:"end-time"`
+}
+
+type detectActivityInput struct {
+ Audio format.Audio `instill:"audio"`
+ MinSilenceDuration int `instill:"min-silence-duration"`
+ SpeechPad int `instill:"speech-pad"`
+}
+
+type detectActivityOutput struct {
+ Segments []segmentData `instill:"segments"`
+}
+
+type segmentInput struct {
+ Audio format.Audio `instill:"audio"`
+ Segments []segmentData `instill:"segments"`
+}
+
+type segmentOutput struct {
+ AudioSegments []format.Audio `instill:"audio-segments"`
+}
diff --git a/pkg/component/operator/audio/v0/main.go b/pkg/component/operator/audio/v0/main.go
index bb4a6d0d7..ba92308eb 100644
--- a/pkg/component/operator/audio/v0/main.go
+++ b/pkg/component/operator/audio/v0/main.go
@@ -8,14 +8,12 @@ import (
_ "embed"
- "google.golang.org/protobuf/types/known/structpb"
-
"github.com/instill-ai/pipeline-backend/pkg/component/base"
)
const (
- taskChunkAudios string = "TASK_CHUNK_AUDIOS"
- taskSliceAudio string = "TASK_SLICE_AUDIO"
+ taskDetectActivity = "TASK_DETECT_ACTIVITY"
+ taskSegment = "TASK_SEGMENT"
)
var (
@@ -33,8 +31,7 @@ type component struct {
type execution struct {
base.ComponentExecution
-
- execute func(*structpb.Struct) (*structpb.Struct, error)
+ execute func(context.Context, *base.Job) error
}
func Init(bc base.Component) *component {
@@ -54,10 +51,10 @@ func (c *component) CreateExecution(x base.ComponentExecution) (base.IExecution,
e := &execution{ComponentExecution: x}
switch x.Task {
- case taskChunkAudios:
- e.execute = chunkAudios
- case taskSliceAudio:
- e.execute = sliceAudio
+ case taskDetectActivity:
+ e.execute = detectActivity
+ case taskSegment:
+ e.execute = segment
default:
return nil, fmt.Errorf("%s task is not supported", x.Task)
}
@@ -66,5 +63,5 @@ func (c *component) CreateExecution(x base.ComponentExecution) (base.IExecution,
}
func (e *execution) Execute(ctx context.Context, jobs []*base.Job) error {
- return base.SequentialExecutor(ctx, jobs, e.execute)
+ return base.ConcurrentExecutor(ctx, jobs, e.execute)
}
diff --git a/pkg/component/operator/audio/v0/main_test.go b/pkg/component/operator/audio/v0/main_test.go
deleted file mode 100644
index 9aead4749..000000000
--- a/pkg/component/operator/audio/v0/main_test.go
+++ /dev/null
@@ -1,4 +0,0 @@
-package audio
-
-// TODO chuang8511 Investigate how to run test case with installing ffmpeg in test env
-// It will be arranged according to the product schedule
diff --git a/pkg/component/operator/audio/v0/task_detect_activity.go b/pkg/component/operator/audio/v0/task_detect_activity.go
new file mode 100644
index 000000000..604294f6a
--- /dev/null
+++ b/pkg/component/operator/audio/v0/task_detect_activity.go
@@ -0,0 +1,137 @@
+//go:build onnx
+// +build onnx
+
+// This task requires ONNX Runtime to be installed. Follow these steps to set it up:
+//
+// 1. Download ONNX Runtime:
+// - Visit the official repository: https://github.com/microsoft/onnxruntime/releases
+// - Choose the latest version compatible with your OS architecture
+//
+// 2. Install ONNX Runtime:
+// - Extract the downloaded tar file to a directory (referred to as ONNXRUNTIME_ROOT_PATH)
+// - Set up the environment:
+// export C_INCLUDE_PATH=$ONNXRUNTIME_ROOT_PATH/include
+// export LD_RUN_PATH=$ONNXRUNTIME_ROOT_PATH/lib
+// export LIBRARY_PATH=$ONNXRUNTIME_ROOT_PATH/lib
+
+// This task requires the following libraries to be installed:
+// - libsoxr-dev (required for github.com/zaf/resample)
+
+package audio
+
+import (
+ "bytes"
+ "context"
+ "encoding/binary"
+ "fmt"
+ "math"
+ "os"
+ "path/filepath"
+
+ "github.com/go-audio/audio"
+ "github.com/streamer45/silero-vad-go/speech"
+ "github.com/zaf/resample"
+
+ "github.com/instill-ai/pipeline-backend/pkg/component/base"
+)
+
+// detectActivity reads a detectActivityInput from the job, decodes the audio,
+// downmixes it to mono and resamples it to sampleRate when needed, runs the
+// speech detector over it, and writes the detected segments as a
+// detectActivityOutput.
+//
+// The result is a named return so that the deferred detector clean-up can
+// surface a Destroy failure when no earlier error occurred.
+func detectActivity(ctx context.Context, job *base.Job) (err error) {
+	var input detectActivityInput
+	if err := job.Input.ReadData(ctx, &input); err != nil {
+		return err
+	}
+
+	audioBuf, dec, err := decodeAudioWAV(input.Audio)
+	if err != nil {
+		return err
+	}
+
+	if audioBuf.Format.NumChannels > numChannel {
+		audioBuf = toMono(audioBuf)
+	}
+
+	if audioBuf.Format.SampleRate != sampleRate {
+		resampledData, resampleErr := resampleAudio(audioBuf.AsFloatBuffer().Data, float64(dec.SampleRate), float64(sampleRate), audioBuf.Format.NumChannels)
+		if resampleErr != nil {
+			return fmt.Errorf("resampling audio: %w", resampleErr)
+		}
+		audioBuf.Format.SampleRate = sampleRate
+		audioBuf.Data = resampledData
+	}
+
+	// The model file is looked up in the folder named by ONNX_MODEL_FOLDER_PATH.
+	sd, err := speech.NewDetector(speech.DetectorConfig{
+		ModelPath:            filepath.Join(os.Getenv("ONNX_MODEL_FOLDER_PATH"), "silero_vad.onnx"),
+		SampleRate:           sampleRate,
+		Threshold:            0.5,
+		MinSilenceDurationMs: input.MinSilenceDuration,
+		SpeechPadMs:          input.SpeechPad,
+	})
+	if err != nil {
+		return fmt.Errorf("creating voice activity detector: %w", err)
+	}
+
+	// Always release the detector. A clean-up failure is reported only when
+	// the function would otherwise succeed, so it never masks an earlier error.
+	defer func() {
+		if destroyErr := sd.Destroy(); destroyErr != nil && err == nil {
+			err = fmt.Errorf("destroy speech detector: %w", destroyErr)
+		}
+	}()
+
+	segments, err := sd.Detect(audioBuf.AsFloat32Buffer().Data)
+	if err != nil {
+		return fmt.Errorf("detect voice activity: %w", err)
+	}
+
+	dao := detectActivityOutput{
+		Segments: make([]segmentData, len(segments)),
+	}
+	for i, s := range segments {
+		dao.Segments[i] = segmentData{StartTime: s.SpeechStartAt, EndTime: s.SpeechEndAt}
+	}
+
+	if err := job.Output.WriteData(ctx, dao); err != nil {
+		return err
+	}
+
+	return nil
+}
+
+// toMono downmixes an interleaved multi-channel buffer to a single channel in
+// place by averaging the samples of each frame, and returns the same buffer.
+//
+// The channel count is taken from buffer.Format.NumChannels, so layouts with
+// more than two channels are averaged per frame as well. A buffer that is
+// already mono (or reports no channels) is returned unchanged.
+func toMono(buffer *audio.IntBuffer) *audio.IntBuffer {
+	channels := buffer.Format.NumChannels
+	if channels <= 1 {
+		return buffer
+	}
+
+	frames := len(buffer.Data) / channels
+	for i := 0; i < frames; i++ {
+		// Frame i is read from indices >= i, so writing the average to index i
+		// never overwrites samples that are still to be read.
+		sum := 0
+		for _, sample := range buffer.Data[i*channels : (i+1)*channels] {
+			sum += sample
+		}
+		buffer.Data[i] = sum / channels
+	}
+
+	buffer.Data = buffer.Data[:frames]
+	buffer.Format.NumChannels = 1
+	return buffer
+}
+
+// resampleAudio resamples the given float64 samples from inputRate to
+// outputRate for the given number of channels and returns the result with
+// each sample truncated to an int.
+//
+// An error is returned when the resampler cannot be created, written to, or
+// closed.
+func resampleAudio(input []float64, inputRate, outputRate float64, channels int) ([]int, error) {
+	var buf bytes.Buffer
+	resampler, err := resample.New(&buf, inputRate, outputRate, channels, resample.F64, resample.HighQ)
+	if err != nil {
+		return nil, fmt.Errorf("creating resampler: %w", err)
+	}
+
+	// Convert []float64 to little-endian bytes for the resampler, which was
+	// created with the resample.F64 format.
+	inputBytes := make([]byte, len(input)*8)
+	for i, v := range input {
+		binary.LittleEndian.PutUint64(inputBytes[i*8:], math.Float64bits(v))
+	}
+
+	if _, err := resampler.Write(inputBytes); err != nil {
+		// Best-effort release; the write error is the one worth reporting.
+		_ = resampler.Close()
+		return nil, fmt.Errorf("writing to resampler: %w", err)
+	}
+
+	// Close before reading buf: per the zaf/resample documentation, Close
+	// flushes pending output, so reading earlier can miss the resampled tail.
+	if err := resampler.Close(); err != nil {
+		return nil, fmt.Errorf("closing resampler: %w", err)
+	}
+
+	// Convert resampled []byte back to []int
+	resampledBytes := buf.Bytes()
+	resampledData := make([]int, len(resampledBytes)/8)
+	for i := range resampledData {
+		resampledFloat := math.Float64frombits(binary.LittleEndian.Uint64(resampledBytes[i*8:]))
+		resampledData[i] = int(resampledFloat)
+	}
+
+	return resampledData, nil
+}
diff --git a/pkg/component/operator/audio/v0/task_detect_activity_nontag.go b/pkg/component/operator/audio/v0/task_detect_activity_nontag.go
new file mode 100644
index 000000000..c20f2325b
--- /dev/null
+++ b/pkg/component/operator/audio/v0/task_detect_activity_nontag.go
@@ -0,0 +1,15 @@
+//go:build !onnx
+// +build !onnx
+
+package audio
+
+import (
+ "context"
+ "fmt"
+
+ "github.com/instill-ai/pipeline-backend/pkg/component/base"
+)
+
+// detectActivity is the stand-in used when the package is built without the
+// onnx build tag. It ignores its arguments and always returns an error saying
+// the operator was built without onnxruntime.
+func detectActivity(ctx context.Context, job *base.Job) error {
+	errNotBuilt := fmt.Errorf("the Audio operator wasn't built with onnxruntime")
+	return errNotBuilt
+}
diff --git a/pkg/component/operator/audio/v0/task_detect_activity_test.go b/pkg/component/operator/audio/v0/task_detect_activity_test.go
new file mode 100644
index 000000000..6b84d67e4
--- /dev/null
+++ b/pkg/component/operator/audio/v0/task_detect_activity_test.go
@@ -0,0 +1,159 @@
+//go:build onnx
+// +build onnx
+
+package audio
+
+import (
+ "context"
+ "encoding/json"
+ "io"
+ "math"
+ "os"
+ "testing"
+
+ "github.com/go-audio/audio"
+ "github.com/google/go-cmp/cmp"
+
+ qt "github.com/frankban/quicktest"
+
+ "github.com/instill-ai/pipeline-backend/pkg/component/base"
+ "github.com/instill-ai/pipeline-backend/pkg/component/internal/mock"
+ "github.com/instill-ai/pipeline-backend/pkg/data"
+)
+
+// TestDetectActivity runs the TASK_DETECT_ACTIVITY execution on WAV fixtures
+// and checks the detected segments against the expected JSON fixtures, with a
+// 0.1 tolerance on every start and end time.
+func TestDetectActivity(t *testing.T) {
+ c := qt.New(t)
+
+ // NOTE(review): sampleRate and threshold are set in every case but are not
+ // read anywhere in the test body below.
+ testCases := []struct {
+ name string
+ audioFile string
+ sampleRate int
+ threshold float64
+ silenceDuration int
+ speechPad int
+ wantSegments string
+ expectedError string
+ }{
+ {
+ name: "ok - detect voice activity (voice1)",
+ audioFile: "testdata/voice1.wav",
+ sampleRate: 16000,
+ threshold: 0.5,
+ silenceDuration: 500,
+ speechPad: 100,
+ wantSegments: "testdata/voice1-activity-segments.json",
+ },
+ {
+ name: "ok - detect voice activity (voice2)",
+ audioFile: "testdata/voice2.wav",
+ sampleRate: 16000,
+ threshold: 0.5,
+ silenceDuration: 500,
+ speechPad: 30,
+ wantSegments: "testdata/voice2-activity-segments.json",
+ },
+ }
+
+ for _, tc := range testCases {
+ c.Run(tc.name, func(c *qt.C) {
+ component := Init(base.Component{})
+ c.Assert(component, qt.IsNotNil)
+
+ execution, err := component.CreateExecution(base.ComponentExecution{
+ Component: component,
+ Task: taskDetectActivity,
+ })
+ c.Assert(err, qt.IsNil)
+ c.Assert(execution, qt.IsNotNil)
+
+ ir, ow, eh, job := mock.GenerateMockJob(c)
+
+ // Load audio data
+ audioFile, err := os.Open(tc.audioFile)
+ c.Assert(err, qt.IsNil)
+ defer audioFile.Close()
+ audioData, err := io.ReadAll(audioFile)
+ c.Assert(err, qt.IsNil)
+
+ // Feed the task its input: the fixture audio plus the per-case
+ // silence duration and speech padding.
+ ir.ReadDataMock.Set(func(ctx context.Context, input any) error {
+ switch input := input.(type) {
+ case *detectActivityInput:
+ audio, err := data.NewAudioFromBytes(audioData, "audio/wav", "input.wav")
+ c.Assert(err, qt.IsNil)
+ *input = detectActivityInput{
+ Audio: audio,
+ MinSilenceDuration: tc.silenceDuration,
+ SpeechPad: tc.speechPad,
+ }
+ }
+ return nil
+ })
+
+ // Capture whatever the task writes so it can be compared below.
+ var capturedOutput detectActivityOutput
+ ow.WriteDataMock.Set(func(ctx context.Context, output any) error {
+ capturedOutput = output.(detectActivityOutput)
+ return nil
+ })
+
+ eh.ErrorMock.Set(func(ctx context.Context, err error) {
+ c.Assert(err, qt.ErrorMatches, tc.expectedError)
+ })
+
+ // Mark as optional the mock that is not expected to fire: the output
+ // writer when an error is expected, the error handler otherwise.
+ if tc.expectedError != "" {
+ ow.WriteDataMock.Optional()
+ } else {
+ eh.ErrorMock.Optional()
+ }
+
+ err = execution.Execute(context.Background(), []*base.Job{job})
+
+ if tc.expectedError == "" {
+ c.Assert(err, qt.IsNil)
+
+ // Load expected segments
+ expectedSegmentsJSONData, err := os.ReadFile(tc.wantSegments)
+ c.Assert(err, qt.IsNil)
+ var expectedSegmentsStruct struct {
+ Segments []segmentData `instill:"segments"`
+ }
+
+ // Decode the fixture into a generic map, then go through the data
+ // package so the `instill` struct tags drive the field mapping.
+ var segmentsMap map[string]interface{}
+ err = json.Unmarshal(expectedSegmentsJSONData, &segmentsMap)
+ c.Assert(err, qt.IsNil)
+
+ jsonValue, err := data.NewJSONValue(segmentsMap)
+ c.Assert(err, qt.IsNil)
+
+ c.Assert(data.Unmarshal(jsonValue, &expectedSegmentsStruct), qt.IsNil)
+ expectedSegments := expectedSegmentsStruct.Segments
+
+ c.Assert(capturedOutput.Segments, qt.HasLen, len(expectedSegments))
+
+ // Compare segment by segment, allowing a 0.1 difference per bound.
+ for i, actual := range capturedOutput.Segments {
+ expected := expectedSegments[i]
+ c.Assert(actual.StartTime, floatEquals(0.1), expected.StartTime)
+ c.Assert(actual.EndTime, floatEquals(0.1), expected.EndTime)
+ }
+ }
+ })
+ }
+}
+
+// floatEquals builds a quicktest checker that treats two float64 values as
+// equal when their absolute difference does not exceed epsilon.
+func floatEquals(epsilon float64) qt.Checker {
+	withinEpsilon := func(a, b float64) bool {
+		delta := math.Abs(a - b)
+		return delta <= epsilon
+	}
+	return qt.CmpEquals(cmp.Comparer(withinEpsilon))
+}
+
+// TestToMono checks that a two-channel buffer is reduced to one channel whose
+// samples are the integer average of each left/right pair.
+func TestToMono(t *testing.T) {
+	c := qt.New(t)
+
+	input := &audio.IntBuffer{
+		Format: &audio.Format{NumChannels: 2},
+		Data:   []int{1, 2, 3, 4, 5, 6},
+	}
+	wantData := []int{1, 3, 5}
+
+	got := toMono(input)
+
+	c.Assert(got.Format.NumChannels, qt.Equals, 1)
+	c.Assert(got.Data, qt.DeepEquals, wantData)
+}
diff --git a/pkg/component/operator/audio/v0/task_segment.go b/pkg/component/operator/audio/v0/task_segment.go
new file mode 100644
index 000000000..a7a54f605
--- /dev/null
+++ b/pkg/component/operator/audio/v0/task_segment.go
@@ -0,0 +1,107 @@
+package audio
+
+import (
+ "context"
+ "fmt"
+ "io"
+ "os"
+
+ "github.com/go-audio/audio"
+ "github.com/go-audio/wav"
+
+ "github.com/instill-ai/pipeline-backend/pkg/component/base"
+ "github.com/instill-ai/pipeline-backend/pkg/data"
+ "github.com/instill-ai/pipeline-backend/pkg/data/format"
+)
+
+// segment reads a segmentInput from the job, decodes its audio with
+// decodeAudioWAV, cuts one clip per requested segment, and writes the clips out
+// as a segmentOutput.
+//
+// Every clip is re-encoded by encodeSegment and wrapped via
+// data.NewAudioFromBytes under the name "audio-segment-<index>.wav". The first
+// failure aborts the task and is returned to the caller unchanged.
+func segment(ctx context.Context, job *base.Job) error {
+	var in segmentInput
+	err := job.Input.ReadData(ctx, &in)
+	if err != nil {
+		return err
+	}
+
+	buf, decoder, err := decodeAudioWAV(in.Audio)
+	if err != nil {
+		return err
+	}
+
+	// Pre-size the result so each clip lands at the index of its segment.
+	clips := make([]format.Audio, len(in.Segments))
+	for idx := range in.Segments {
+		samples, err := extractSegment(buf, in.Segments[idx])
+		if err != nil {
+			return err
+		}
+		wavBytes, err := encodeSegment(samples, buf.Format, decoder)
+		if err != nil {
+			return err
+		}
+		clip, err := data.NewAudioFromBytes(wavBytes, "audio/wav", fmt.Sprintf("audio-segment-%d.wav", idx))
+		if err != nil {
+			return err
+		}
+		clips[idx] = clip
+	}
+
+	out := segmentOutput{AudioSegments: clips}
+	return job.Output.WriteData(ctx, out)
+}
+
+// extractSegment returns the part of audioBuf.Data that covers the time range
+// from seg.StartTime to seg.EndTime, both expressed in seconds.
+//
+// audioBuf.Data is treated as interleaved samples (one value per channel per
+// frame), so each boundary is computed in whole frames and then scaled by the
+// channel count. This keeps a multi-channel slice from starting or ending in
+// the middle of a frame, which would shift the channel order of the result.
+// Boundaries that fall outside the buffer are clamped to it. The returned slice
+// aliases audioBuf.Data; it is not a copy.
+//
+// An error is returned when the segment ends before it starts; slicing with
+// such bounds would otherwise panic.
+func extractSegment(audioBuf *audio.IntBuffer, seg segmentData) ([]int, error) {
+	if seg.EndTime < seg.StartTime {
+		return nil, fmt.Errorf("invalid segment: end time %v precedes start time %v", seg.EndTime, seg.StartTime)
+	}
+
+	sampleRate := float64(audioBuf.Format.SampleRate)
+	numChannels := audioBuf.Format.NumChannels
+	total := len(audioBuf.Data)
+
+	// clamp confines a sample index to the valid slicing range [0, total].
+	clamp := func(idx int) int {
+		if idx < 0 {
+			return 0
+		}
+		if idx > total {
+			return total
+		}
+		return idx
+	}
+
+	// Frame index first, then samples: int(t*rate) * channels is always a
+	// multiple of the channel count, unlike int(t*rate*channels).
+	startSample := clamp(int(seg.StartTime*sampleRate) * numChannels)
+	endSample := clamp(int(seg.EndTime*sampleRate) * numChannels)
+
+	return audioBuf.Data[startSample:endSample], nil
+}
+
+// encodeSegment serialises segment as a WAV file and returns the file's bytes.
+//
+// The sample rate and channel count come from format; the bit depth and WAV
+// audio format code come from dec, the decoder of the source audio, so the
+// segment is written with the same encoding parameters as its source.
+//
+// A temporary file is used as the encoder's destination instead of an
+// in-memory buffer. Once created, the file is closed and removed before the
+// function returns, on success and on failure alike.
+func encodeSegment(segment []int, format *audio.Format, dec *wav.Decoder) ([]byte, error) {
+	tempFile, err := os.CreateTemp("", "audio_segment_*.wav")
+	if err != nil {
+		return nil, fmt.Errorf("failed to create temp file: %w", err)
+	}
+	// Deferred calls run last-in-first-out: the handle is closed first, then
+	// the file is deleted. Removing without closing leaked one file descriptor
+	// per encoded segment.
+	defer os.Remove(tempFile.Name())
+	defer tempFile.Close()
+
+	bitDepth := int(dec.BitDepth)
+	encoder := wav.NewEncoder(tempFile, format.SampleRate, bitDepth, format.NumChannels, int(dec.WavAudioFormat))
+
+	segmentBuf := &audio.IntBuffer{
+		Format:         format,
+		Data:           segment,
+		SourceBitDepth: bitDepth,
+	}
+
+	if err := encoder.Write(segmentBuf); err != nil {
+		return nil, fmt.Errorf("failed to write segment to buffer: %w", err)
+	}
+
+	// The encoder is closed before the file is read back so that everything it
+	// has to write is on disk by then.
+	if err := encoder.Close(); err != nil {
+		return nil, fmt.Errorf("failed to close the encoder: %w", err)
+	}
+
+	// Rewind: the write position is at the end of the file after encoding.
+	if _, err := tempFile.Seek(0, io.SeekStart); err != nil {
+		return nil, fmt.Errorf("failed to seek to the beginning of temp file: %w", err)
+	}
+
+	fileContents, err := io.ReadAll(tempFile)
+	if err != nil {
+		return nil, fmt.Errorf("failed to read temp file: %w", err)
+	}
+
+	return fileContents, nil
+}
diff --git a/pkg/component/operator/audio/v0/task_segment_test.go b/pkg/component/operator/audio/v0/task_segment_test.go
new file mode 100644
index 000000000..bc9a0221f
--- /dev/null
+++ b/pkg/component/operator/audio/v0/task_segment_test.go
@@ -0,0 +1,115 @@
+package audio
+
+import (
+ "context"
+ "encoding/json"
+ "io"
+ "os"
+ "testing"
+
+ qt "github.com/frankban/quicktest"
+
+ "github.com/instill-ai/pipeline-backend/pkg/component/base"
+ "github.com/instill-ai/pipeline-backend/pkg/component/internal/mock"
+ "github.com/instill-ai/pipeline-backend/pkg/data"
+)
+
+// TestSegment runs the segment task through a component execution, feeding it
+// a WAV fixture and a JSON list of time ranges, and checks the number and the
+// content type of the audio clips written to the output.
+func TestSegment(t *testing.T) {
+	c := qt.New(t)
+
+	type segmentCase struct {
+		name          string
+		audioFile     string
+		segmentsFile  string
+		expectedCount int
+		expectedError string
+	}
+
+	testCases := []segmentCase{
+		{
+			name:          "ok - valid segmentation",
+			audioFile:     "testdata/voice1.wav",
+			segmentsFile:  "testdata/voice1-activity-segments.json",
+			expectedCount: 5,
+		},
+	}
+
+	for _, tc := range testCases {
+		c.Run(tc.name, func(c *qt.C) {
+			component := Init(base.Component{})
+			c.Assert(component, qt.IsNotNil)
+
+			execution, err := component.CreateExecution(base.ComponentExecution{
+				Component: component,
+				Task:      taskSegment,
+			})
+			c.Assert(err, qt.IsNil)
+			c.Assert(execution, qt.IsNotNil)
+
+			ir, ow, eh, job := mock.GenerateMockJob(c)
+
+			// Audio fixture, read fully into memory.
+			wavFile, err := os.Open(tc.audioFile)
+			c.Assert(err, qt.IsNil)
+			defer wavFile.Close()
+			wavBytes, err := io.ReadAll(wavFile)
+			c.Assert(err, qt.IsNil)
+
+			// Segment fixture: raw JSON -> generic map -> data value -> typed
+			// struct, decoded through the `instill` struct tags.
+			rawSegments, err := os.ReadFile(tc.segmentsFile)
+			c.Assert(err, qt.IsNil)
+
+			var generic map[string]interface{}
+			c.Assert(json.Unmarshal(rawSegments, &generic), qt.IsNil)
+
+			segmentsValue, err := data.NewJSONValue(generic)
+			c.Assert(err, qt.IsNil)
+
+			var fixture struct {
+				Segments []segmentData `instill:"segments"`
+			}
+			c.Assert(data.Unmarshal(segmentsValue, &fixture), qt.IsNil)
+
+			// The input reader fills in the task input when asked for one.
+			ir.ReadDataMock.Set(func(ctx context.Context, input any) error {
+				if in, ok := input.(*segmentInput); ok {
+					clip, err := data.NewAudioFromBytes(wavBytes, "audio/wav", "input.wav")
+					c.Assert(err, qt.IsNil)
+					*in = segmentInput{Audio: clip, Segments: fixture.Segments}
+				}
+				return nil
+			})
+
+			// The output writer records whatever the task emits.
+			var got segmentOutput
+			ow.WriteDataMock.Set(func(ctx context.Context, output any) error {
+				got = output.(segmentOutput)
+				return nil
+			})
+
+			eh.ErrorMock.Set(func(ctx context.Context, err error) {
+				c.Assert(err, qt.ErrorMatches, tc.expectedError)
+			})
+
+			// Only one of the two sinks is expected to be hit per case, so the
+			// other mock is marked optional.
+			wantErr := tc.expectedError != ""
+			if wantErr {
+				ow.WriteDataMock.Optional()
+			} else {
+				eh.ErrorMock.Optional()
+			}
+
+			err = execution.Execute(context.Background(), []*base.Job{job})
+			if wantErr {
+				return
+			}
+
+			c.Assert(err, qt.IsNil)
+			c.Assert(got.AudioSegments, qt.HasLen, tc.expectedCount)
+
+			for i, clip := range got.AudioSegments {
+				c.Assert(clip, qt.Not(qt.IsNil), qt.Commentf("Segment %d is nil", i))
+				c.Assert(clip.ContentType().String(), qt.Equals, "audio/ogg", qt.Commentf("Segment %d has incorrect MIME type", i))
+			}
+		})
+	}
+}
diff --git a/pkg/component/operator/audio/v0/testdata/voice1-activity-segments.json b/pkg/component/operator/audio/v0/testdata/voice1-activity-segments.json
new file mode 100644
index 000000000..d72f2c729
--- /dev/null
+++ b/pkg/component/operator/audio/v0/testdata/voice1-activity-segments.json
@@ -0,0 +1,24 @@
+{
+ "segments": [
+ {
+ "start-time": 1.5,
+ "end-time": 2.404
+ },
+ {
+ "start-time": 3.196,
+ "end-time": 4.068
+ },
+ {
+ "start-time": 4.604,
+ "end-time": 5.764
+ },
+ {
+ "start-time": 6.62,
+ "end-time": 14.948
+ },
+ {
+ "start-time": 15.836,
+ "end-time": 18.564
+ }
+ ]
+}
diff --git a/pkg/component/operator/audio/v0/testdata/voice1.wav b/pkg/component/operator/audio/v0/testdata/voice1.wav
new file mode 100644
index 000000000..cfc16301e
Binary files /dev/null and b/pkg/component/operator/audio/v0/testdata/voice1.wav differ
diff --git a/pkg/component/operator/audio/v0/testdata/voice2-activity-segments.json b/pkg/component/operator/audio/v0/testdata/voice2-activity-segments.json
new file mode 100644
index 000000000..e6f7d5f57
--- /dev/null
+++ b/pkg/component/operator/audio/v0/testdata/voice2-activity-segments.json
@@ -0,0 +1,16 @@
+{
+ "segments": [
+ {
+ "start-time": 0.002,
+ "end-time": 9.406
+ },
+ {
+ "start-time": 10.146,
+ "end-time": 18.782
+ },
+ {
+ "start-time": 19.234,
+ "end-time": 30.878
+ }
+ ]
+}
diff --git a/pkg/component/operator/audio/v0/testdata/voice2.wav b/pkg/component/operator/audio/v0/testdata/voice2.wav
new file mode 100644
index 000000000..0d5dc63de
Binary files /dev/null and b/pkg/component/operator/audio/v0/testdata/voice2.wav differ
diff --git a/pkg/component/operator/document/v0/convert_test.go b/pkg/component/operator/document/v0/convert_test.go
index 10b5985d1..8afe651a2 100644
--- a/pkg/component/operator/document/v0/convert_test.go
+++ b/pkg/component/operator/document/v0/convert_test.go
@@ -28,22 +28,20 @@ func TestConvertToText(t *testing.T) {
expected: ConvertToTextOutput{
Body: "This is test file for markdown",
Meta: map[string]string{
- "Custom Metadata": "no",
- "Encrypted": "no",
- "File size": "15489 bytes",
- "Form": "none",
- "JavaScript": "no",
- "Metadata Stream": "no",
- "Optimized": "no",
- "PDF version": "1.4",
- "Page rot": "0",
- "Page size": "596 x 842 pts (A4)",
- "Pages": "1",
- "Producer": "Skia/PDF m128 Google Docs Renderer",
- "Suspects": "no",
- "Tagged": "no",
- "Title": "Untitled document",
- "UserProperties": "no",
+ "Encrypted": "no",
+ "File size": "15489 bytes",
+ "Form": "none",
+ "JavaScript": "no",
+ "Optimized": "no",
+ "PDF version": "1.4",
+ "Page rot": "0",
+ "Page size": "596 x 842 pts (A4)",
+ "Pages": "1",
+ "Producer": "Skia/PDF m128 Google Docs Renderer",
+ "Suspects": "no",
+ "Tagged": "no",
+ "Title": "Untitled document",
+ "UserProperties": "no",
},
MSecs: 3,
},
diff --git a/pkg/component/resources/onnx/silero_vad.onnx b/pkg/component/resources/onnx/silero_vad.onnx
new file mode 100644
index 000000000..d0ccd9d7f
Binary files /dev/null and b/pkg/component/resources/onnx/silero_vad.onnx differ