diff --git a/.env b/.env index e152f994b..8d349a073 100644 --- a/.env +++ b/.env @@ -12,6 +12,12 @@ PUBLIC_SERVICE_PORT=8081 DOCKER_BUILDKIT=1 COMPOSE_DOCKER_CLI_BUILD=1 +# ONNX_MODEL_FOLDER_PATH specifies the directory where ONNX models are stored. +# These models are loaded dynamically at runtime. The path is set relative to +# the project root, allowing for consistent model loading across different +# deployment environments. +ONNX_MODEL_FOLDER_PATH=${PWD}/pkg/component/resources/onnx + # test # TEST_DBHOST and TEST_DBNAME are used to initialize a separate database for diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md index ad681058e..3dde06506 100644 --- a/.github/CONTRIBUTING.md +++ b/.github/CONTRIBUTING.md @@ -85,11 +85,9 @@ will create and migrate a test database to keep these queries isolated from the main DB. You can set the database host and name by overriding the `TEST_DBHOST` and `TEST_DBNAME` values. -Certain tests depend on the [`docconv`](https://github.com/sajari/docconv) -package and aren't run by default. You can trigger them by adding the `OCR=true` -flag to the coverage command. Make sure to install the [package -dependencies](https://github.com/sajari/docconv?tab=readme-ov-file#dependencies) -first. +Certain tests depend on external packages and aren't run by default: +- For [`docconv`](https://github.com/sajari/docconv) tests, add the `OCR=true` flag and install its [dependencies](https://github.com/sajari/docconv?tab=readme-ov-file#dependencies). +- For [`onnxruntime`](https://github.com/microsoft/onnxruntime) tests, add the `ONNX=true` flag. Follow the [guide](#set-up-onnx-runtime-linux-only) to set up ONNX Runtime (Linux only). #### Run the integration tests @@ -111,6 +109,22 @@ If empty, tests will try to connect to `localhost:5432`. $ make rm ``` +### Set up ONNX Runtime (Linux only) + +1. Download the latest [ONNX Runtime release](https://github.com/microsoft/onnxruntime/releases) for your system. + +2. 
Install ONNX Runtime: + ```bash + sudo mkdir -p /usr/local/onnxruntime + sudo tar -xzf onnxruntime-*-*-*.tgz -C /usr/local/onnxruntime --strip-components=1 + export ONNXRUNTIME_ROOT_PATH=/usr/local/onnxruntime + export LD_RUN_PATH=$ONNXRUNTIME_ROOT_PATH/lib + export LIBRARY_PATH=$ONNXRUNTIME_ROOT_PATH/lib + export C_INCLUDE_PATH=$ONNXRUNTIME_ROOT_PATH/include + ``` + +**Note:** If you don't have sudo access, extract to a user-writable location (e.g., `~/onnxruntime`), set `ONNXRUNTIME_ROOT_PATH` accordingly, and adjust the environment variables as shown above. You do not need to create any symlinks in this case. + ## Codebase contribution ### Pre-commit hooks diff --git a/.github/workflows/coverage.yml b/.github/workflows/coverage.yml index d4a85df7e..2c2c710b6 100644 --- a/.github/workflows/coverage.yml +++ b/.github/workflows/coverage.yml @@ -38,10 +38,23 @@ jobs: - uses: actions/checkout@v3 + - name: Install onnxruntime library and headers + run: | + export ONNXRUNTIME_ROOT_PATH=$GITHUB_WORKSPACE/onnxruntime + LATEST_VERSION=$(wget -qO- https://api.github.com/repos/microsoft/onnxruntime/releases/latest | jq -r .tag_name) + ONNX_ARCH=$([ "$TARGETARCH" = "arm64" ] && echo "aarch64" || echo "x64") + wget https://github.com/microsoft/onnxruntime/releases/download/${LATEST_VERSION}/onnxruntime-linux-${ONNX_ARCH}-${LATEST_VERSION#v}.tgz + tar -xzf onnxruntime-linux-${ONNX_ARCH}-${LATEST_VERSION#v}.tgz + mv onnxruntime-linux-${ONNX_ARCH}-${LATEST_VERSION#v} ${ONNXRUNTIME_ROOT_PATH} + rm onnxruntime-linux-${ONNX_ARCH}-${LATEST_VERSION#v}.tgz + echo "LD_RUN_PATH=${ONNXRUNTIME_ROOT_PATH}/lib" >> $GITHUB_ENV + echo "LIBRARY_PATH=${ONNXRUNTIME_ROOT_PATH}/lib" >> $GITHUB_ENV + echo "C_INCLUDE_PATH=${ONNXRUNTIME_ROOT_PATH}/include" >> $GITHUB_ENV + - name: Generate coverage report run: | make build-dev - make coverage DBTEST=true OCR=true + make coverage DBTEST=true OCR=true ONNX=true - name: Upload coverage report uses: codecov/codecov-action@v2 diff --git 
a/.github/workflows/golangci-lint.yml b/.github/workflows/golangci-lint.yml index 2483e2547..0f3f6372b 100644 --- a/.github/workflows/golangci-lint.yml +++ b/.github/workflows/golangci-lint.yml @@ -21,8 +21,24 @@ jobs: with: go-version: ${{ env.GOLANG_VERSION }} cache: false + - name: Install soxr + run: | + sudo apt-get update + sudo apt-get install -y libsoxr-dev + - name: Install onnxruntime library and headers + run: | + export ONNXRUNTIME_ROOT_PATH=$GITHUB_WORKSPACE/onnxruntime + LATEST_VERSION=$(wget -qO- https://api.github.com/repos/microsoft/onnxruntime/releases/latest | jq -r .tag_name) + ONNX_ARCH=$([ "$TARGETARCH" = "arm64" ] && echo "aarch64" || echo "x64") + wget https://github.com/microsoft/onnxruntime/releases/download/${LATEST_VERSION}/onnxruntime-linux-${ONNX_ARCH}-${LATEST_VERSION#v}.tgz + tar -xzf onnxruntime-linux-${ONNX_ARCH}-${LATEST_VERSION#v}.tgz + mv onnxruntime-linux-${ONNX_ARCH}-${LATEST_VERSION#v} ${ONNXRUNTIME_ROOT_PATH} + rm onnxruntime-linux-${ONNX_ARCH}-${LATEST_VERSION#v}.tgz + echo "C_INCLUDE_PATH=${ONNXRUNTIME_ROOT_PATH}/include" >> $GITHUB_ENV + echo "LD_RUN_PATH=${ONNXRUNTIME_ROOT_PATH}/lib" >> $GITHUB_ENV + echo "LIBRARY_PATH=${ONNXRUNTIME_ROOT_PATH}/lib" >> $GITHUB_ENV - name: golangci-lint uses: golangci/golangci-lint-action@v6 with: - version: v1.59 - args: --timeout=10m + version: v1.61 + args: --timeout=10m --build-tags onnx diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 9faea3c61..3de7cbc40 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -12,6 +12,7 @@ repos: rev: v0.5.1 hooks: - id: golangci-lint + args: ["--build-tags", "onnx"] - id: go-mod-tidy - repo: https://github.com/pinglin/conventional-pre-commit rev: v1.1.0 diff --git a/Dockerfile b/Dockerfile index 21088b6d6..12f91bbdf 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,7 +1,33 @@ ARG GOLANG_VERSION=1.22.5 -FROM golang:${GOLANG_VERSION}-alpine3.19 AS build - -RUN apk add --no-cache build-base leptonica-dev 
tesseract-ocr-dev musl-dev +FROM golang:${GOLANG_VERSION}-bullseye AS build + +ARG TARGETOS TARGETARCH K6_VERSION XK6_VERSION + +RUN apt-get update && apt-get install -y \ + build-essential \ + libleptonica-dev \ + libtesseract-dev \ + libsoxr-dev \ + && rm -rf /var/lib/apt/lists/* + +# Install ONNX Runtime (latest release) +ENV ONNXRUNTIME_ROOT_PATH=/usr/local/onnxruntime +RUN apt update && \ + apt install -y wget jq && \ + LATEST_VERSION=$(wget -qO- https://api.github.com/repos/microsoft/onnxruntime/releases/latest | jq -r .tag_name) && \ + ONNX_ARCH=$([ "$TARGETARCH" = "arm64" ] && echo "aarch64" || echo "x64") && \ + wget https://github.com/microsoft/onnxruntime/releases/download/${LATEST_VERSION}/onnxruntime-linux-${ONNX_ARCH}-${LATEST_VERSION#v}.tgz && \ + tar -xzf onnxruntime-linux-${ONNX_ARCH}-${LATEST_VERSION#v}.tgz && \ + mv onnxruntime-linux-${ONNX_ARCH}-${LATEST_VERSION#v} ${ONNXRUNTIME_ROOT_PATH} && \ + rm onnxruntime-linux-${ONNX_ARCH}-${LATEST_VERSION#v}.tgz && \ + apt remove -y wget jq && \ + apt autoremove -y && \ + rm -rf /var/lib/apt/lists/* + +# Set environment variables for building against ONNX Runtime +ENV C_INCLUDE_PATH=${ONNXRUNTIME_ROOT_PATH}/include +ENV LD_RUN_PATH=${ONNXRUNTIME_ROOT_PATH}/lib +ENV LIBRARY_PATH=${ONNXRUNTIME_ROOT_PATH}/lib WORKDIR /src @@ -9,51 +35,28 @@ COPY go.mod go.sum ./ RUN go mod download COPY . . -RUN go get github.com/otiai10/gosseract/v2 - ARG SERVICE_NAME TARGETOS TARGETARCH +RUN --mount=target=. --mount=type=cache,target=/root/.cache/go-build --mount=type=cache,target=/go/pkg GOOS=$TARGETOS GOARCH=$TARGETARCH go build -tags=ocr,onnx -o /${SERVICE_NAME} ./cmd/main +RUN --mount=target=. --mount=type=cache,target=/root/.cache/go-build --mount=type=cache,target=/go/pkg GOOS=$TARGETOS GOARCH=$TARGETARCH go build -o /${SERVICE_NAME}-worker ./cmd/worker +RUN --mount=target=. 
--mount=type=cache,target=/root/.cache/go-build --mount=type=cache,target=/go/pkg GOOS=$TARGETOS GOARCH=$TARGETARCH go build -o /${SERVICE_NAME}-migrate ./cmd/migration +RUN --mount=target=. --mount=type=cache,target=/root/.cache/go-build --mount=type=cache,target=/go/pkg GOOS=$TARGETOS GOARCH=$TARGETARCH go build -o /${SERVICE_NAME}-init ./cmd/init + +FROM debian:bullseye-slim -RUN --mount=target=. --mount=type=cache,target=/root/.cache/go-build --mount=type=cache,target=/go/pkg GOOS=$TARGETOS GOARCH=$TARGETARCH CGO_ENABLED=1 go build -tags=ocr,musl -o /${SERVICE_NAME} ./cmd/main -RUN --mount=target=. --mount=type=cache,target=/root/.cache/go-build --mount=type=cache,target=/go/pkg GOOS=$TARGETOS GOARCH=$TARGETARCH CGO_ENABLED=1 go build -tags=ocr,musl -o /${SERVICE_NAME}-worker ./cmd/worker -RUN --mount=target=. --mount=type=cache,target=/root/.cache/go-build --mount=type=cache,target=/go/pkg GOOS=$TARGETOS GOARCH=$TARGETARCH go build -tags=musl -o /${SERVICE_NAME}-migrate ./cmd/migration -RUN --mount=target=. 
--mount=type=cache,target=/root/.cache/go-build --mount=type=cache,target=/go/pkg GOOS=$TARGETOS GOARCH=$TARGETARCH go build -tags=musl -o /${SERVICE_NAME}-init ./cmd/init - -FROM alpine:3.19 - -RUN apk add --no-cache \ - curl \ - poppler-utils \ - wv \ - tidyhtml \ - libc6-compat \ - tesseract-ocr \ - python3 \ - py3-pip \ - build-base \ - python3-dev \ - libffi-dev \ - libreoffice \ - qpdf \ - msttcorefonts-installer \ - font-noto \ - font-noto-cjk \ - ffmpeg \ - chromium \ - && update-ms-fonts \ - && fc-cache -f \ - && python3 -m venv /opt/venv \ - && /opt/venv/bin/pip install --upgrade pip \ - && /opt/venv/bin/pip install pdfplumber tokenizers \ - && rm -rf /var/cache/apk/* /var/cache/fontconfig/* - -# Download tesseract data -RUN curl -L https://github.com/tesseract-ocr/tessdata_best/raw/main/eng.traineddata \ - -o /usr/share/tessdata/eng.traineddata - -ARG TARGETARCH -ARG BUILDARCH -RUN apk add unrtf --repository=http://dl-cdn.alpinelinux.org/alpine/edge/community +# Install Python, create virtual environment, and install pdfplumber +RUN apt update && \ + apt install -y curl python3 python3-venv poppler-utils wv unrtf tidy tesseract-ocr libtesseract-dev libreoffice ffmpeg libsoxr-dev chromium qpdf && \ + python3 -m venv /opt/venv && \ + /opt/venv/bin/pip install pdfplumber mistral-common tokenizers && \ + rm -rf /var/lib/apt/lists/* +# copy ONNX runtime from build stage +ENV ONNXRUNTIME_ROOT_PATH=/usr/local/onnxruntime +COPY --from=build --chown=nobody:nogroup /usr/local/onnxruntime ${ONNXRUNTIME_ROOT_PATH} + +# Set environment variables and create symlinks for ONNX Runtime +ENV C_INCLUDE_PATH=${ONNXRUNTIME_ROOT_PATH}/include +RUN ln -s ${ONNXRUNTIME_ROOT_PATH}/lib/libonnxruntime.so* /usr/lib/ USER nobody:nogroup @@ -71,3 +74,7 @@ COPY --from=build --chown=nobody:nogroup /${SERVICE_NAME}-migrate ./ COPY --from=build --chown=nobody:nogroup /${SERVICE_NAME}-init ./ COPY --from=build --chown=nobody:nogroup /${SERVICE_NAME}-worker ./ COPY --from=build 
--chown=nobody:nogroup /${SERVICE_NAME} ./ + +# Set up ONNX model and environment variable +COPY --chown=nobody:nogroup ./pkg/component/resources/onnx/silero_vad.onnx /${SERVICE_NAME}/pkg/component/resources/onnx/silero_vad.onnx +ENV ONNX_MODEL_FOLDER_PATH=/${SERVICE_NAME}/pkg/component/resources/onnx diff --git a/Dockerfile.dev b/Dockerfile.dev index a34325e65..6e5987dc0 100644 --- a/Dockerfile.dev +++ b/Dockerfile.dev @@ -1,5 +1,5 @@ ARG GOLANG_VERSION=1.22.5 -FROM golang:${GOLANG_VERSION} +FROM golang:${GOLANG_VERSION}-bullseye ARG SERVICE_NAME @@ -11,18 +11,39 @@ ARG TARGETOS TARGETARCH K6_VERSION XK6_VERSION # Install Python, create virtual environment, and install pdfplumber RUN apt update && \ - apt install -y python3 python3-venv poppler-utils wv unrtf tidy tesseract-ocr libtesseract-dev libreoffice ffmpeg chromium qpdf && \ + apt install -y python3 python3-venv poppler-utils wv unrtf tidy tesseract-ocr libtesseract-dev libreoffice ffmpeg libsoxr-dev chromium qpdf && \ python3 -m venv /opt/venv && \ /opt/venv/bin/pip install pdfplumber mistral-common tokenizers && \ rm -rf /var/lib/apt/lists/* +# Install ONNX Runtime (latest release) +ENV ONNXRUNTIME_ROOT_PATH=/usr/local/onnxruntime +RUN apt update && \ + apt install -y wget jq && \ + LATEST_VERSION=$(wget -qO- https://api.github.com/repos/microsoft/onnxruntime/releases/latest | jq -r .tag_name) && \ + ONNX_ARCH=$([ "$TARGETARCH" = "arm64" ] && echo "aarch64" || echo "x64") && \ + wget https://github.com/microsoft/onnxruntime/releases/download/${LATEST_VERSION}/onnxruntime-linux-${ONNX_ARCH}-${LATEST_VERSION#v}.tgz && \ + tar -xzf onnxruntime-linux-${ONNX_ARCH}-${LATEST_VERSION#v}.tgz && \ + mv onnxruntime-linux-${ONNX_ARCH}-${LATEST_VERSION#v} ${ONNXRUNTIME_ROOT_PATH} && \ + rm onnxruntime-linux-${ONNX_ARCH}-${LATEST_VERSION#v}.tgz && \ + apt remove -y wget jq && \ + apt autoremove -y && \ + rm -rf /var/lib/apt/lists/* + +# Set environment variables for building against ONNX Runtime +ENV 
C_INCLUDE_PATH=${ONNXRUNTIME_ROOT_PATH}/include +ENV LD_RUN_PATH=${ONNXRUNTIME_ROOT_PATH}/lib +ENV LIBRARY_PATH=${ONNXRUNTIME_ROOT_PATH}/lib + +# tparse +RUN --mount=target=. --mount=type=cache,target=/root/.cache/go-build --mount=type=cache,target=/go/pkg GOOS=$TARGETOS GOARCH=$TARGETARCH go install github.com/mfridman/tparse@latest + # air RUN --mount=target=. --mount=type=cache,target=/root/.cache/go-build --mount=type=cache,target=/go/pkg GOOS=$TARGETOS GOARCH=$TARGETARCH go install github.com/cosmtrek/air@v1.49 # k6 -RUN go install go.k6.io/xk6/cmd/xk6@v${XK6_VERSION} -RUN go install github.com/mfridman/tparse@v0.15.0 -RUN xk6 build v${K6_VERSION} --with github.com/grafana/xk6-sql --output /usr/bin/k6 +RUN --mount=target=. --mount=type=cache,target=/root/.cache/go-build --mount=type=cache,target=/go/pkg GOOS=$TARGETOS GOARCH=$TARGETARCH go install go.k6.io/xk6/cmd/xk6@v${XK6_VERSION} +RUN --mount=target=. --mount=type=cache,target=/root/.cache/go-build --mount=type=cache,target=/go/pkg GOOS=$TARGETOS GOARCH=$TARGETARCH xk6 build v${K6_VERSION} --with github.com/grafana/xk6-sql --output /usr/bin/k6 # -- set up Go @@ -38,6 +59,10 @@ ENV GOENV=/go/.config/go/env # required to restore compatibility with those versions. 
ENV GODEBUG=tlsrsakex=1 +# Set up ONNX model and environment variable +COPY ./pkg/component/resources/onnx/silero_vad.onnx /${SERVICE_NAME}/pkg/component/resources/onnx/silero_vad.onnx +ENV ONNX_MODEL_FOLDER_PATH=/${SERVICE_NAME}/pkg/component/resources/onnx + USER nobody:nogroup ENTRYPOINT ["tail", "-f", "/dev/null"] diff --git a/Makefile b/Makefile index 7e92334d0..a4827beb2 100644 --- a/Makefile +++ b/Makefile @@ -7,13 +7,6 @@ include .env export GOTEST_FLAGS := CFG_DATABASE_HOST=${TEST_DBHOST} CFG_DATABASE_NAME=${TEST_DBNAME} -ifeq (${DBTEST}, true) - GOTEST_TAGS := -tags=dbtest -endif -ifeq (${OCR}, true) - GOTEST_TAGS := -tags=ocr -endif - #============================================================================ @@ -40,10 +33,10 @@ latest: ## Run latest container echo "Run latest container ${SERVICE_NAME} and ${SERVICE_NAME}-worker. To stop it, run \"make stop\"." @docker run --network=instill-network \ --name ${SERVICE_NAME} \ - -d ${SERVICE_NAME}:latest ./${SERVICE_NAME} + -d instill/${SERVICE_NAME}:latest ./${SERVICE_NAME} @docker run --network=instill-network \ --name ${SERVICE_NAME}-worker \ - -d ${SERVICE_NAME}:latest ./${SERVICE_NAME}-worker + -d instill/${SERVICE_NAME}:latest ./${SERVICE_NAME}-worker .PHONY: rm rm: ## Remove all running containers @@ -60,10 +53,10 @@ build-dev: ## Build dev docker image .PHONY: build-latest build-latest: ## Build latest docker image - @docker buildx build \ + @docker build \ --build-arg GOLANG_VERSION=${GOLANG_VERSION} \ --build-arg SERVICE_NAME=${SERVICE_NAME} \ - -t pipeline-backend:latest . + -t instill/pipeline-backend:latest . .PHONY: go-gen go-gen: ## Generate codes @@ -94,40 +87,32 @@ coverage: rm coverage.out; \ fi +# Tests should run in container without local tparse installation. +# If you encounter container test issues, install tparse locally: +# go install github.com/mfridman/tparse/cmd/tparse@latest .PHONY: test test: -# Ideally, it should be ok to run without installing tparse locally. 
-# However, there may be some issues that arise from running the tests -# in the container. If you encounter any issues, please install tparse -# locally via `go install github.com/mfridman/tparse/cmd/tparse@latest` -# and run the tests locally. - @if [ "${OCR}" = "true" ]; then \ - docker run --rm \ - -v $(PWD):/${SERVICE_NAME} \ - --user $(id -u):$(id -g) \ - --entrypoint= \ - instill/${SERVICE_NAME}:dev \ - make test-ocr; \ + @TAGS=""; \ + if [ "$${OCR}" = "true" ]; then \ + TAGS="$$TAGS,ocr"; \ + [ "$$(uname)" = "Darwin" ] && export TESSDATA_PREFIX=$$(dirname $$(brew list tesseract | grep share/tessdata/eng.traineddata)); \ + fi; \ + if [ "$${ONNX}" = "true" ]; then \ + if [ "$$(uname)" = "Darwin" ]; then \ + echo "ONNX Runtime test is not supported on Darwin (macOS)."; \ + else \ + TAGS="$$TAGS,onnx"; \ + fi; \ + fi; \ + TAGS=$${TAGS#,}; \ + if [ -n "$$TAGS" ]; then \ + echo "Running tests with tags: $$TAGS"; \ + go test -v -tags="$$TAGS" ./... -json | tparse --notests --all; \ else \ - docker run --rm \ - -v $(PWD):/${SERVICE_NAME} \ - --user $(id -u):$(id -g) \ - --entrypoint= \ - instill/${SERVICE_NAME}:dev \ - go test -v ./... -json | tparse --notests --all; \ + echo "Running standard tests"; \ + go test -v ./... -json | tparse --notests --all; \ fi -.PHONY: test-ocr -test-ocr: -# Certain component tests require additional dependencies. -# Install tesseract via `brew install tesseract` -# Setup `export LIBRARY_PATH="/opt/homebrew/lib"` `export CPATH="/opt/homebrew/include"` -ifeq ($(shell uname), Darwin) - @TESSDATA_PREFIX=$(shell dirname $(shell brew list tesseract | grep share/tessdata/eng.traineddata)) ${GOTEST_FLAGS} go test -v ./... -json | tparse --notests --all -else - @echo "This target can only be executed on Darwin (macOS)." -endif - .PHONY: integration-test integration-test: ## Run integration test @ # DB_HOST points to localhost by default. 
Override this variable if diff --git a/go.mod b/go.mod index 70b7d8c3a..5c1adb8f8 100644 --- a/go.mod +++ b/go.mod @@ -22,6 +22,7 @@ require ( github.com/frankban/quicktest v1.14.6 github.com/gabriel-vasile/mimetype v1.4.3 github.com/gage-technologies/mistral-go v1.1.0 + github.com/go-audio/audio v1.0.0 github.com/go-chi/chi/v5 v5.1.0 github.com/go-openapi/strfmt v0.23.0 github.com/go-redis/redismock/v9 v9.2.0 @@ -35,7 +36,6 @@ require ( github.com/grpc-ecosystem/go-grpc-middleware v1.3.0 github.com/grpc-ecosystem/grpc-gateway/v2 v2.20.0 github.com/h2non/filetype v1.1.3 - github.com/iFaceless/godub v0.0.0-20200728093528-a30bb4d1a0f1 github.com/iancoleman/strcase v0.3.0 github.com/influxdata/influxdb-client-go/v2 v2.12.3 github.com/instill-ai/protogen-go v0.3.3-alpha.0.20241029162707-1398399a24ee @@ -99,15 +99,21 @@ require ( gorm.io/plugin/dbresolver v1.5.1 ) +require github.com/dh1tw/gosamplerate v0.1.2 // indirect + require ( cloud.google.com/go v0.115.0 // indirect cloud.google.com/go/auth v0.7.2 // indirect cloud.google.com/go/auth/oauth2adapt v0.2.3 // indirect cloud.google.com/go/compute/metadata v0.5.0 // indirect filippo.io/edwards25519 v1.1.0 // indirect + github.com/JalfResi/justext v0.0.0-20221106200834-be571e3e3052 // indirect github.com/PaesslerAG/gval v1.0.0 // indirect + github.com/PuerkitoBio/goquery v1.9.1 github.com/PuerkitoBio/purell v1.1.1 // indirect github.com/PuerkitoBio/urlesc v0.0.0-20170810143723-de5bf2ad4578 // indirect + github.com/advancedlogic/GoOse v0.0.0-20191112112754-e742535969c1 // indirect + github.com/andybalholm/cascadia v1.3.2 // indirect github.com/antchfx/htmlquery v1.3.0 // indirect github.com/antchfx/xmlquery v1.3.17 // indirect github.com/antchfx/xpath v1.2.4 // indirect @@ -116,16 +122,30 @@ require ( github.com/aws/aws-sdk-go v1.55.1 // indirect github.com/aws/aws-sdk-go-v2 v1.30.3 // indirect github.com/aws/smithy-go v1.20.3 // indirect + github.com/catalinc/hashcash v0.0.0-20220723060415-5e3ec3e24f67 // indirect + 
github.com/cenkalti/backoff/v4 v4.2.1 // indirect + github.com/cespare/xxhash/v2 v2.3.0 // indirect github.com/chromedp/cdproto v0.0.0-20240801214329-3f85d328b335 // indirect github.com/chromedp/sysutil v1.0.0 // indirect + github.com/davecgh/go-spew v1.1.1 // indirect + github.com/deepmap/oapi-codegen v1.8.2 // indirect + github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f // indirect github.com/dlclark/regexp2 v1.10.0 // indirect github.com/dustin/go-humanize v1.0.1 // indirect github.com/elastic/elastic-transport-go/v8 v8.6.0 // indirect github.com/emersion/go-sasl v0.0.0-20231106173351-e73c9f7bad43 // indirect github.com/envoyproxy/protoc-gen-validate v1.0.4 // indirect github.com/extrame/ole2 v0.0.0-20160812065207-d69429661ad7 // indirect + github.com/facebookgo/clock v0.0.0-20150410010913-600d898af40a // indirect + github.com/fatih/set v0.2.1 // indirect github.com/felixge/httpsnoop v1.0.4 // indirect + github.com/fsnotify/fsnotify v1.6.0 // indirect + github.com/gigawattio/window v0.0.0-20180317192513-0f5467e35573 // indirect + github.com/go-audio/riff v1.0.0 // indirect + github.com/go-audio/wav v1.1.0 github.com/go-ini/ini v1.67.0 // indirect + github.com/go-logr/logr v1.4.2 // indirect + github.com/go-logr/stdr v1.2.2 // indirect github.com/go-openapi/analysis v0.21.2 // indirect github.com/go-openapi/errors v0.22.0 // indirect github.com/go-openapi/jsonpointer v0.19.6 // indirect @@ -134,23 +154,41 @@ require ( github.com/go-openapi/spec v0.20.4 // indirect github.com/go-openapi/swag v0.22.4 // indirect github.com/go-openapi/validate v0.21.0 // indirect + github.com/go-resty/resty/v2 v2.12.0 + github.com/go-sql-driver/mysql v1.8.1 github.com/gobwas/glob v0.2.3 // indirect github.com/gobwas/httphead v0.1.0 // indirect github.com/gobwas/pool v0.2.1 // indirect github.com/gobwas/ws v1.4.0 // indirect github.com/goccy/go-json v0.10.3 // indirect + github.com/gogo/googleapis v1.4.1 // indirect + github.com/gogo/protobuf v1.3.2 // indirect 
github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0 // indirect github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect github.com/golang/mock v1.6.0 // indirect + github.com/golang/protobuf v1.5.4 // indirect github.com/golang/snappy v0.0.4 // indirect github.com/google/flatbuffers v23.5.26+incompatible // indirect github.com/google/go-querystring v1.1.0 // indirect github.com/google/s2a-go v0.1.7 // indirect + github.com/google/uuid v1.6.0 github.com/googleapis/enterprise-certificate-proxy v0.3.2 // indirect github.com/googleapis/gax-go/v2 v2.12.5 // indirect github.com/gorilla/websocket v1.5.1 // indirect + github.com/hashicorp/errwrap v1.1.0 // indirect + github.com/hashicorp/go-multierror v1.1.1 // indirect + github.com/influxdata/line-protocol v0.0.0-20200327222509-2487e7298839 // indirect github.com/itchyny/timefmt-go v0.1.5 // indirect + github.com/jackc/chunkreader/v2 v2.0.1 // indirect + github.com/jackc/pgconn v1.14.3 + github.com/jackc/pgio v1.0.0 // indirect + github.com/jackc/pgpassfile v1.0.0 // indirect + github.com/jackc/pgproto3/v2 v2.3.3 // indirect + github.com/jackc/pgservicefile v0.0.0-20221227161230-091c0ba34f0a // indirect github.com/jackc/puddle/v2 v2.2.1 // indirect + github.com/jaytaylor/html2text v0.0.0-20230321000545-74c2419ad056 // indirect + github.com/jinzhu/inflection v1.0.0 // indirect + github.com/jinzhu/now v1.1.5 // indirect github.com/jmespath/go-jmespath v0.4.0 // indirect github.com/josharian/intern v1.0.0 // indirect github.com/kardianos/osext v0.0.0-20190222173326-2bc1f35cddc0 // indirect @@ -159,19 +197,40 @@ require ( github.com/klauspost/cpuid/v2 v2.2.8 // indirect github.com/kr/pretty v0.3.1 // indirect github.com/kr/text v0.2.0 // indirect + github.com/levigross/exp-html v0.0.0-20120902181939-8df60c69a8f5 // indirect + github.com/lib/pq v1.10.9 github.com/mailru/easyjson v0.7.7 // indirect + github.com/mattn/go-runewidth v0.0.15 // indirect github.com/minio/md5-simd v1.1.2 // indirect + 
github.com/mitchellh/copystructure v1.2.0 // indirect + github.com/mitchellh/mapstructure v1.5.0 // indirect + github.com/mitchellh/reflectwalk v1.0.2 // indirect + github.com/mjibson/go-dsp v0.0.0-20180508042940-11479a337f12 github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect github.com/modern-go/reflect2 v1.0.2 // indirect github.com/mohae/deepcopy v0.0.0-20170929034955-c48cc78d4826 // indirect github.com/montanaflynn/stats v0.7.1 // indirect github.com/oklog/ulid v1.3.1 // indirect + github.com/olekukonko/tablewriter v0.0.4 // indirect + github.com/otiai10/gosseract/v2 v2.4.1 // indirect + github.com/pborman/uuid v1.2.1 // indirect github.com/pierrec/lz4/v4 v4.1.18 // indirect + github.com/pkg/errors v0.9.1 + github.com/pmezard/go-difflib v1.0.0 // indirect github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect + github.com/richardlehane/mscfb v1.0.4 // indirect + github.com/richardlehane/msoleps v1.0.3 // indirect github.com/rivo/uniseg v0.4.4 // indirect + github.com/robfig/cron v1.2.0 // indirect + github.com/rogpeppe/go-internal v1.11.0 // indirect github.com/rs/xid v1.6.0 // indirect github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d // indirect + github.com/santhosh-tekuri/jsonschema/v5 v5.3.1 github.com/shopspring/decimal v1.2.0 // indirect + github.com/ssor/bom v0.0.0-20170718123548-6386211fdfcf // indirect + github.com/streamer45/silero-vad-go v0.2.1 + github.com/stretchr/objx v0.5.2 // indirect + github.com/stretchr/testify v1.9.0 github.com/temoto/robotstxt v1.1.2 // indirect github.com/tidwall/gjson v1.14.4 // indirect github.com/tidwall/match v1.1.1 // indirect @@ -185,6 +244,7 @@ require ( github.com/xuri/efp v0.0.0-20231025114914-d1ff6096ae53 // indirect github.com/xuri/nfp v0.0.0-20230919160717-d98342af3f05 // indirect github.com/youmark/pkcs8 v0.0.0-20181117223130-1be2e3e5546d // indirect + github.com/zaf/resample v1.5.0 github.com/zeebo/xxh3 v1.0.2 // indirect 
gitlab.com/golang-commonmark/html v0.0.0-20191124015941-a22733972181 // indirect gitlab.com/golang-commonmark/linkify v0.0.0-20191026162114-a0c2df6c8f82 // indirect @@ -264,10 +324,12 @@ require ( go.uber.org/atomic v1.10.0 // indirect go.uber.org/multierr v1.10.0 // indirect golang.org/x/crypto v0.26.0 + golang.org/x/sync v0.8.0 // indirect golang.org/x/sys v0.24.0 // indirect golang.org/x/text v0.17.0 golang.org/x/time v0.5.0 // indirect google.golang.org/genproto v0.0.0-20240722135656-d784300faade // indirect gopkg.in/yaml.v3 v3.0.1 gorm.io/driver/mysql v1.4.7 // indirect + modernc.org/mathutil v1.5.0 // indirect ) diff --git a/go.sum b/go.sum index b5892a129..ce630fe4d 100644 --- a/go.sum +++ b/go.sum @@ -783,6 +783,8 @@ github.com/dgrijalva/jwt-go v3.2.0+incompatible/go.mod h1:E3ru+11k8xSBh+hMPgOLZm github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f h1:lO4WD4F/rVNCu3HqELle0jiPLLBs70cWOduZpkS1E78= github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f/go.mod h1:cuUVRXasLTGF7a8hSLbxyZXjz+1KgoB3wDUb6vlszIc= github.com/dgryski/go-sip13 v0.0.0-20181026042036-e10d5fee7954/go.mod h1:vAd38F8PWV+bWy6jNmig1y/TA+kYO4g3RSRF0IAv0no= +github.com/dh1tw/gosamplerate v0.1.2 h1:oyqtZk67xB9B4l+vIZCZ3F0RYV/z66W58VOah11/ktI= +github.com/dh1tw/gosamplerate v0.1.2/go.mod h1:zooTyHpoR7hE+FLfdE3yjLHb2QA2NpMusNfuaZqEACM= github.com/dhui/dktest v0.3.10 h1:0frpeeoM9pHouHjhLeZDuDTJ0PqjDTrycaHaMmkJAo8= github.com/dhui/dktest v0.3.10/go.mod h1:h5Enh0nG3Qbo9WjNFRrwmKUaePEBhXMOygbz3Ww7Sz0= github.com/disintegration/imaging v1.6.2/go.mod h1:44/5580QXChDfwIclfc/PCwrr44amcmDAg8hxG0Ewe4= @@ -890,6 +892,12 @@ github.com/ghodss/yaml v0.0.0-20150909031657-73d445a93680/go.mod h1:4dBDuWmgqj2H github.com/ghodss/yaml v1.0.0/go.mod h1:4dBDuWmgqj2HViK6kFavaiC9ZROes6MMH2rRYeMEF04= github.com/gigawattio/window v0.0.0-20180317192513-0f5467e35573 h1:u8AQ9bPa9oC+8/A/jlWouakhIvkFfuxgIIRjiy8av7I= github.com/gigawattio/window v0.0.0-20180317192513-0f5467e35573/go.mod 
h1:eBvb3i++NHDH4Ugo9qCvMw8t0mTSctaEa5blJbWcNxs= +github.com/go-audio/audio v1.0.0 h1:zS9vebldgbQqktK4H0lUqWrG8P0NxCJVqcj7ZpNnwd4= +github.com/go-audio/audio v1.0.0/go.mod h1:6uAu0+H2lHkwdGsAY+j2wHPNPpPoeg5AaEFh9FlA+Zs= +github.com/go-audio/riff v1.0.0 h1:d8iCGbDvox9BfLagY94fBynxSPHO80LmZCaOsmKxokA= +github.com/go-audio/riff v1.0.0/go.mod h1:l3cQwc85y79NQFCRB7TiPoNiaijp6q8Z0Uv38rVG498= +github.com/go-audio/wav v1.1.0 h1:jQgLtbqBzY7G+BM8fXF7AHUk1uHUviWS4X39d5rsL2g= +github.com/go-audio/wav v1.1.0/go.mod h1:mpe9qfwbScEbkd8uybLuIpTgHyrISw/OTuvjUW2iGtE= github.com/go-chi/chi/v5 v5.0.0/go.mod h1:BBug9lr0cqtdAhsu6R4AAdvufI0/XBzAQSsUqJpoZOs= github.com/go-chi/chi/v5 v5.1.0 h1:acVI1TYaD+hhedDJ3r54HyA6sExp3HfXq7QWEEY/xMw= github.com/go-chi/chi/v5 v5.1.0/go.mod h1:DslCQbL2OYiznFReuXYUmQ2hGd1aDpCnlMNITLSKoi8= @@ -1260,8 +1268,6 @@ github.com/hashicorp/yamux v0.0.0-20181012175058-2f1d1f20f75d/go.mod h1:+NfK9FKe github.com/hjson/hjson-go/v4 v4.0.0 h1:wlm6IYYqHjOdXH1gHev4VoXCaW20HdQAGCxdOEEg2cs= github.com/hjson/hjson-go/v4 v4.0.0/go.mod h1:KaYt3bTw3zhBjYqnXkYywcYctk0A2nxeEFTse3rH13E= github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU= -github.com/iFaceless/godub v0.0.0-20200728093528-a30bb4d1a0f1 h1:oqeURuHQrImMykykqJgFbStlaDXyY7JpXXrwXyjr9ls= -github.com/iFaceless/godub v0.0.0-20200728093528-a30bb4d1a0f1/go.mod h1:tKRg0K9YmfD3eD6KFos+YHIVMouKMzxDSK5XpdxdCUI= github.com/iancoleman/strcase v0.2.0/go.mod h1:iwCmte+B7n89clKwxIoIXy/HfoL7AsD47ZCWhYzw7ho= github.com/iancoleman/strcase v0.3.0 h1:nTXanmYxhfFAMjZL34Ov6gkzEsSJZ5DbhxWjvSASxEI= github.com/iancoleman/strcase v0.3.0/go.mod h1:iwCmte+B7n89clKwxIoIXy/HfoL7AsD47ZCWhYzw7ho= @@ -1548,6 +1554,8 @@ github.com/mitchellh/osext v0.0.0-20151018003038-5e2d6d41470f/go.mod h1:OkQIRizQ github.com/mitchellh/reflectwalk v1.0.0/go.mod h1:mSTlrgnPZtwu0c4WaC2kGObEpuNDbx0jmZXqmk4esnw= github.com/mitchellh/reflectwalk v1.0.2 h1:G2LzWKi524PWgd3mLHV8Y5k7s6XUvT0Gef6zxSIeXaQ= github.com/mitchellh/reflectwalk 
v1.0.2/go.mod h1:mSTlrgnPZtwu0c4WaC2kGObEpuNDbx0jmZXqmk4esnw= +github.com/mjibson/go-dsp v0.0.0-20180508042940-11479a337f12 h1:dd7vnTDfjtwCETZDrRe+GPYNLA1jBtbZeyfyE8eZCyk= +github.com/mjibson/go-dsp v0.0.0-20180508042940-11479a337f12/go.mod h1:i/KKcxEWEO8Yyl11DYafRPKOPVYTrhxiTRigjtEEXZU= github.com/moby/docker-image-spec v1.3.1 h1:jMKff3w6PgbfSa69GfNg+zN/XLhfXJGnEx3Nl2EsFP0= github.com/moby/docker-image-spec v1.3.1/go.mod h1:eKmb5VW8vQEh/BAr2yvVNvuiJuY6UIocYsFu/DxxRpo= github.com/moby/locker v1.0.1/go.mod h1:S7SDdo5zpBK84bzzVlKr2V0hz+7x9hWbYC/kq7oQppc= @@ -1839,6 +1847,8 @@ github.com/ssor/bom v0.0.0-20170718123548-6386211fdfcf h1:pvbZ0lM0XWPBqUKqFU8cma github.com/ssor/bom v0.0.0-20170718123548-6386211fdfcf/go.mod h1:RJID2RhlZKId02nZ62WenDCkgHFerpIOmW0iT7GKmXM= github.com/stefanberger/go-pkcs11uri v0.0.0-20201008174630-78d3cae3a980/go.mod h1:AO3tvPzVZ/ayst6UlUKUv6rcPQInYe3IknH3jYhAKu8= github.com/stoewer/go-strcase v1.2.0/go.mod h1:IBiWB2sKIp3wVVQ3Y035++gc+knqhUQag1KpM8ahLw8= +github.com/streamer45/silero-vad-go v0.2.1 h1:Li1/tTC4H/3cyw6q4weX+U8GWwEL3lTekK/nYa1Cvuk= +github.com/streamer45/silero-vad-go v0.2.1/go.mod h1:B+2FXs/5fZ6pzl6unUZYhZqkYdOB+3saBVzjOzdZnUs= github.com/stretchr/objx v0.0.0-20180129172003-8a3f7159479f/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= @@ -1947,6 +1957,8 @@ github.com/yuin/goldmark v1.6.0/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5ta github.com/yvasiyarov/go-metrics v0.0.0-20140926110328-57bccd1ccd43/go.mod h1:aX5oPXxHm3bOH+xeAttToC8pqch2ScQN/JoXYupl6xs= github.com/yvasiyarov/gorelic v0.0.0-20141212073537-a9bba5b9ab50/go.mod h1:NUSPSUX/bi6SeDMUh6brw0nXpxHnc96TguQh0+r/ssA= github.com/yvasiyarov/newrelic_platform_go v0.0.0-20140908184405-b21fdbd4370f/go.mod h1:GlGEuHIJweS1mbCqG+7vt2nvWLzLLnRHbXz5JKd/Qbg= +github.com/zaf/resample v1.5.0 
h1:c3yumHrV1cJoED8ZY2Ai3cehS8s0mJSroA9/vMaUcho= +github.com/zaf/resample v1.5.0/go.mod h1:e4yWalfgRccQrnZSrkIxTqmMCOPhTi1xvYpNpRIB13k= github.com/zeebo/assert v1.3.0 h1:g7C04CbJuIDKNPFHmsk4hwZDO5O+kntRxzaUoNXj+IQ= github.com/zeebo/assert v1.3.0/go.mod h1:Pq9JiuJQpG8JLJdtkwrJESF0Foym2/D9XMU5ciN/wJ0= github.com/zeebo/xxh3 v1.0.2 h1:xZmwmqxHZA8AI603jOQ0tMqmBr9lPeFwGg6d+xy9DC0= diff --git a/pkg/component/operator/audio/v0/.compogen/bottom.mdx b/pkg/component/operator/audio/v0/.compogen/bottom.mdx index 1dee42721..31e5d6f97 100644 --- a/pkg/component/operator/audio/v0/.compogen/bottom.mdx +++ b/pkg/component/operator/audio/v0/.compogen/bottom.mdx @@ -5,36 +5,27 @@ Recipe for the [Audio Transcription Generator](https://instill.tech/instill-ai/p ```yaml version: v1beta component: - audio-spliter: + audio-vad: type: audio - task: TASK_SLICE_AUDIO input: audio: ${variable.audio} - end-time: ${variable.end_time} - start-time: ${variable.start_time} - get-transcription: - type: openai - task: TASK_SPEECH_RECOGNITION + min-silence-duration: 300 + speech-pad: 10 + task: TASK_DETECT_ACTIVITY + audio-segment: + type: audio input: - audio: ${audio-spliter.output.audio} - model: whisper-1 - setup: - api-key: ${secret.INSTILL_SECRET} + audio: ${variable.audio} + segments: ${audio-vad.output.segments} + task: TASK_SEGMENT variable: audio: - title: audio - description: the audio you want to get the transcription from - instill-format: audio/* - end_time: - title: end-time - description: the end time you want to extract in seconds i.e. 2 mins is 120 seconds - instill-format: number - start_time: - title: start-time - description: the start time you want to extract in seconds i.e. 
2 mins is 120 seconds - instill-format: number + instill-format: audio output: - result: - title: result - value: ${get-transcription.output.text} + samples: + title: Output audio segments + description: Output extracted audio segments + value: ${audio-segment.output.audio-segments} ``` diff --git a/pkg/component/operator/audio/v0/README.mdx b/pkg/component/operator/audio/v0/README.mdx index 12b170085..89afa080d 100644 --- a/pkg/component/operator/audio/v0/README.mdx +++ b/pkg/component/operator/audio/v0/README.mdx @@ -5,10 +5,10 @@ draft: false description: "Learn about how to set up a VDP Audio component https://github.com/instill-ai/instill-core" --- -The Audio component is an operator component that allows users to extract and manipulate audio from different sources. +The Audio component is an operator component that allows users to operate on audio data. It can carry out the following tasks: -- [Chunk Audios](#chunk-audios) -- [Slice Audio](#slice-audio) +- [Detect Activity](#detect-activity) +- [Segment](#segment) @@ -29,17 +29,18 @@ The component definition and tasks are defined in the [definition.json](https:// ## Supported Tasks -### Chunk Audios +### Detect Activity -Split audio file into chunks +Detect speech segments in audio data using Voice Activity Detection (VAD). This task converts the input audio to 16kHz mono format, identifies periods of human speech, and outputs time segments for each detected speech activity.
| Input | ID | Type | Description | | :--- | :--- | :--- | :--- | -| Task ID (required) | `task` | string | `TASK_CHUNK_AUDIOS` | -| Audio (required) | `audio` | string | Base64 encoded audio file to be split | -| Chunk Count (required) | `chunk-count` | integer | Number of chunks to equally split the audio into | +| Task ID (required) | `task` | string | `TASK_DETECT_ACTIVITY` | +| Audio (required) | `audio` | string | Audio file to analyze for speech activity. | +| Minimum Silence Duration | `min-silence-duration` | integer | Minimum duration of silence (in milliseconds) required to split speech segments. Longer values result in fewer, longer segments. | +| Speech Pad | `speech-pad` | integer | Additional padding (in milliseconds) added to the start and end of each detected speech segment to prevent cutting off speech. |
@@ -51,25 +52,52 @@ Split audio file into chunks | Output | ID | Type | Description | | :--- | :--- | :--- | :--- | -| Audios | `audios` | array[string] | A list of base64 encoded audios | +| [Segments](#detect-activity-segments) | `segments` | array[object] | Array of time segments representing detected speech activity. Each segment contains start and end times in seconds. | -### Slice Audio +
+ Output Objects in Detect Activity -Specify a time range to slice an audio file +

Segments

+ +
+ +| Field | Field ID | Type | Note | +| :--- | :--- | :--- | :--- | +| End Time | `end-time` | number | The number of seconds from the beginning of the audio file to the end of this segment. | +| Start Time | `start-time` | number | The number of seconds from the beginning of the audio file to the start of this segment. | +
+
+ +### Segment + +Segment audio data into pieces based on the provided time segments.
| Input | ID | Type | Description | | :--- | :--- | :--- | :--- | -| Task ID (required) | `task` | string | `TASK_SLICE_AUDIO` | -| Audio (required) | `audio` | string | Base64 encoded audio file to be sliced | -| Start Time (required) | `start-time` | integer | Start time of the slice in seconds | -| End Time (required) | `end-time` | integer | End time of the slice in seconds | +| Task ID (required) | `task` | string | `TASK_SEGMENT` | +| Audio (required) | `audio` | string | Audio data to segment. | +| [Segments](#segment-segments) (required) | `segments` | array[object] | A list of time segments of audio data. |
+
+ Input Objects in Segment +

Segments

+ +A list of time segments of audio data. + +
+ +| Field | Field ID | Type | Note | +| :--- | :--- | :--- | :--- | +| End Time | `end-time` | number | The number of seconds from the beginning of the audio file to the end of this segment. | +| Start Time | `start-time` | number | The number of seconds from the beginning of the audio file to the start of this segment. | +
+
@@ -77,7 +105,7 @@ Specify a time range to slice an audio file | Output | ID | Type | Description | | :--- | :--- | :--- | :--- | -| Audio | `audio` | string | Base64 encoded audio slice | +| Audios | `audio-segments` | array[string] | A list of segmented audio data. | @@ -88,36 +116,27 @@ Recipe for the [Audio Transcription Generator](https://instill.tech/instill-ai/p ```yaml version: v1beta component: - audio-spliter: + audio-vad: type: audio - task: TASK_SLICE_AUDIO input: audio: ${variable.audio} - end-time: ${variable.end_time} - start-time: ${variable.start_time} - get-transcription: - type: openai - task: TASK_SPEECH_RECOGNITION + min-silence-duration: 300 + speech-pad: 10 + task: TASK_DETECT_ACTIVITY + audio-segment: + type: audio input: - audio: ${audio-spliter.output.audio} - model: whisper-1 - setup: - api-key: ${secret.INSTILL_SECRET} + audio: ${variable.audio} + segments: ${audio-vad.output.segments} + task: TASK_SEGMENT variable: audio: - title: audio - description: the audio you want to get the transcription from - instill-format: audio/* - end_time: - title: end-time - description: the end time you want to extract in seconds i.e. 2 mins is 120 seconds - instill-format: number - start_time: - title: start-time - description: the start time you want to extract in seconds i.e. 
2 mins is 120 seconds - instill-format: number + title: Audio to test + description: Audio to test VAD and extraction + instill-format: audio output: - result: - title: result - value: ${get-transcription.output.text} + samples: + title: Output audio segments + description: Output extracted audio segments + value: ${audio-segment.output.audio-segments} ``` diff --git a/pkg/component/operator/audio/v0/audio.go b/pkg/component/operator/audio/v0/audio.go new file mode 100644 index 000000000..52eb337b1 --- /dev/null +++ b/pkg/component/operator/audio/v0/audio.go @@ -0,0 +1,53 @@ +package audio + +import ( + "bytes" + "fmt" + + "github.com/go-audio/audio" + "github.com/go-audio/wav" + "github.com/instill-ai/pipeline-backend/pkg/data" + "github.com/instill-ai/pipeline-backend/pkg/data/format" +) + +const ( + sampleRate = 16000 + numChannel = 1 +) + +func decodeAudioWAV(audioData format.Audio) (*audio.IntBuffer, *wav.Decoder, error) { + + wavAudioData := audioData + var err error + if audioData.ContentType().String() != data.WAV { + wavAudioData, err = audioData.Convert(data.WAV) + if err != nil { + return nil, nil, fmt.Errorf("error converting audio data to WAV: %v", err) + } + } + + binary, err := wavAudioData.Binary() + if err != nil { + return nil, nil, fmt.Errorf("error getting binary data for image: %v", err) + } + + dec := wav.NewDecoder(bytes.NewReader(binary.ByteArray())) + if !dec.IsValidFile() { + return nil, nil, fmt.Errorf("invalid WAV file") + } + + audioBuf := &audio.IntBuffer{ + Format: &audio.Format{ + NumChannels: int(dec.NumChans), + SampleRate: int(dec.SampleRate), + }, + Data: make([]int, len(binary.ByteArray())), + SourceBitDepth: int(dec.BitDepth), + } + + if _, err := dec.PCMBuffer(audioBuf); err != nil { + return nil, nil, fmt.Errorf("reading audio data: %w", err) + } + + return audioBuf, dec, nil +} diff --git a/pkg/component/operator/audio/v0/audio_operation.go b/pkg/component/operator/audio/v0/audio_operation.go deleted file mode 100644 index 
6559b6cbe..000000000 --- a/pkg/component/operator/audio/v0/audio_operation.go +++ /dev/null @@ -1,155 +0,0 @@ -package audio - -import ( - "bytes" - "encoding/base64" - "fmt" - "time" - - "github.com/iFaceless/godub" - "github.com/iFaceless/godub/wav" - "google.golang.org/protobuf/types/known/structpb" - - "github.com/instill-ai/pipeline-backend/pkg/component/base" - "github.com/instill-ai/pipeline-backend/pkg/component/internal/util" -) - -type ChunkAudiosInput struct { - Audio Audio `json:"audio"` - ChunkCount int `json:"chunk-count"` -} - -type ChunkAudiosOutput struct { - Audios []Audio `json:"audios"` -} - -type SliceAudioInput struct { - Audio Audio `json:"audio"` - StartTime int `json:"start-time"` - EndTime int `json:"end-time"` -} - -type SliceAudioOutput struct { - Audio Audio `json:"audio"` -} - -type ConcatenateInput struct { - Audios []Audio `json:"audios"` -} - -type ConcatenateOutput struct { - Audio Audio `json:"audio"` -} - -// Base64 encoded audio -type Audio string - -func chunkAudios(input *structpb.Struct) (*structpb.Struct, error) { - - var inputStruct ChunkAudiosInput - - err := base.ConvertFromStructpb(input, &inputStruct) - if err != nil { - return nil, err - } - - buf, err := base64.StdEncoding.DecodeString(util.TrimBase64Mime(string(inputStruct.Audio))) - if err != nil { - return nil, err - } - - segment, err := godub.NewLoader().Load(bytes.NewReader(buf)) - - if err != nil { - return nil, fmt.Errorf("failed to load audio: %w", err) - } - - duration := segment.Duration() - - chunkSeconds := float64(duration) / float64(inputStruct.ChunkCount) - - var audioSegments []*godub.AudioSegment - - var startTime time.Duration - for i := 0; i < inputStruct.ChunkCount; i++ { - startTime = getStartTime(chunkSeconds, i) - endTime := getEndTime(chunkSeconds, i, inputStruct.ChunkCount, duration) - - slicedSegment, err := segment.Slice(startTime, endTime) - if err != nil { - return nil, fmt.Errorf("failed to slice audio: %w in chunk %v", err, i) - } - 
audioSegments = append(audioSegments, slicedSegment) - } - - var audios []Audio - prefix := "data:audio/wav;base64," - for _, segment := range audioSegments { - var wavBuf bytes.Buffer - err = wav.Encode(&wavBuf, segment.AsWaveAudio()) - - if err != nil { - return nil, fmt.Errorf("failed to encode audio to wav: %w", err) - } - - audios = append(audios, Audio(prefix+base64.StdEncoding.EncodeToString(wavBuf.Bytes()))) - } - - output := ChunkAudiosOutput{ - Audios: audios, - } - - return base.ConvertToStructpb(output) -} - -func sliceAudio(input *structpb.Struct) (*structpb.Struct, error) { - - var inputStruct SliceAudioInput - - err := base.ConvertFromStructpb(input, &inputStruct) - if err != nil { - return nil, err - } - - buf, err := base64.StdEncoding.DecodeString(util.TrimBase64Mime(string(inputStruct.Audio))) - if err != nil { - return nil, err - } - - segment, err := godub.NewLoader().Load(bytes.NewReader(buf)) - - if err != nil { - return nil, fmt.Errorf("failed to load audio: %w", err) - } - - startTime := time.Duration(inputStruct.StartTime) * time.Second - endTime := time.Duration(inputStruct.EndTime) * time.Second - - slicedSegment, err := segment.Slice(startTime, endTime) - if err != nil { - return nil, fmt.Errorf("failed to slice audio: %w", err) - } - - var wavBuf bytes.Buffer - err = wav.Encode(&wavBuf, slicedSegment.AsWaveAudio()) - if err != nil { - return nil, fmt.Errorf("failed to encode audio to wav: %w", err) - } - - output := SliceAudioOutput{ - Audio: Audio("data:audio/wav;base64," + base64.StdEncoding.EncodeToString(wavBuf.Bytes())), - } - - return base.ConvertToStructpb(output) -} - -func getStartTime(chunkSeconds float64, i int) time.Duration { - return time.Duration(chunkSeconds * float64(i)) -} - -func getEndTime(chunkSeconds float64, i, totalCount int, duration time.Duration) time.Duration { - if i == totalCount-1 { - return duration - } - return time.Duration(chunkSeconds * float64(i+1)) -} diff --git 
a/pkg/component/operator/audio/v0/config/definition.json b/pkg/component/operator/audio/v0/config/definition.json index 6e77ddad5..15f83127a 100644 --- a/pkg/component/operator/audio/v0/config/definition.json +++ b/pkg/component/operator/audio/v0/config/definition.json @@ -1,7 +1,7 @@ { "availableTasks": [ - "TASK_CHUNK_AUDIOS", - "TASK_SLICE_AUDIO" + "TASK_DETECT_ACTIVITY", + "TASK_SEGMENT" ], "documentationUrl": "https://www.instill.tech/docs/component/operator/audio", "icon": "assets/audio.svg", @@ -13,6 +13,6 @@ "uid": "b5c75caa-9261-4757-bfbf-12e908f59f16", "version": "0.1.0", "sourceUrl": "https://github.com/instill-ai/pipeline-backend/blob/main/pkg/component/operator/audio/v0", - "description": "Extract and manipulate audio from different sources", + "description": "Operate audio data.", "releaseStage": "RELEASE_STAGE_ALPHA" } diff --git a/pkg/component/operator/audio/v0/config/tasks.json b/pkg/component/operator/audio/v0/config/tasks.json index b2fa6de39..3eedbc0c3 100644 --- a/pkg/component/operator/audio/v0/config/tasks.json +++ b/pkg/component/operator/audio/v0/config/tasks.json @@ -1,18 +1,41 @@ { - "TASK_CHUNK_AUDIOS": { - "instillShortDescription": "Split audio file into chunks", - "input": { - "description": "Audio file to split", - "instillEditOnNodeFields": [ - "audio", - "chunk-count" + "$defs": { + "segment": { + "properties": { + "start-time": { + "title": "Start Time", + "type": "number", + "description": "The number of seconds from the beginning of the audio file to the start of this segment.", + "instillFormat": "number", + "instillUIOrder": 0 + }, + "end-time": { + "title": "End Time", + "type": "number", + "description": "The number of seconds from the beginning of the audio file to the end of this segment.", + "instillFormat": "number", + "instillUIOrder": 1 + } + }, + "required": [ + "start-time", + "end-time" ], + "title": "Segment", + "type": "object", + "description": "A time segment of audio data, defined by its start and end times in 
seconds." + } + }, + "TASK_DETECT_ACTIVITY": { + "instillShortDescription": "Detect speech segments in audio data using Voice Activity Detection (VAD). This task processes the input audio to 16kHz mono format, identifies periods of human speech, and outputs time segments for each detected speech activity.", + "input": { + "description": "Input", "instillUIOrder": 0, "properties": { "audio": { - "description": "Base64 encoded audio file to be split", + "description": "Audio file to analyze for speech activity.", "instillAcceptFormats": [ - "audio/*", + "audio/wav", "application/octet-stream" ], "instillUIOrder": 0, @@ -22,63 +45,67 @@ "title": "Audio", "type": "string" }, - "chunk-count": { - "description": "Number of chunks to equally split the audio into", + "min-silence-duration": { + "description": "Minimum duration of silence (in milliseconds) required to split speech segments. Longer values result in fewer, longer segments.", "instillAcceptFormats": [ "integer", "number" ], - "instillUpstreamTypes": [ - "reference", - "value" - ], "instillUIOrder": 1, - "title": "Chunk count", - "type": "integer" + "type": "integer", + "minimum": 0, + "title": "Minimum Silence Duration", + "default": 100 + }, + "speech-pad": { + "description": "Additional padding (in milliseconds) added to the start and end of each detected speech segment to prevent cutting off speech.", + "instillAcceptFormats": [ + "integer", + "number" + ], + "instillUIOrder": 2, + "type": "integer", + "minimum": 0, + "title": "Speech Pad", + "default": 30 } }, "required": [ - "audio", - "chunk-count" + "audio" ], "title": "Input", "type": "object" }, "output": { + "description": "Output", "instillUIOrder": 0, "properties": { - "audios": { - "description": "A list of base64 encoded audios", - "instillFormat": "array:audio/wav", + "segments": { + "description": "Array of time segments representing detected speech activity. 
Each segment contains start and end times in seconds.", + "instillFormat": "array:object", "instillUIOrder": 0, "items": { - "type": "string", - "title": "Audio" + "$ref": "#/$defs/segment" }, - "title": "Audios", + "title": "Segments", "type": "array" } }, "required": [ - "audios" + "segments" ], "title": "Output", "type": "object" } }, - "TASK_SLICE_AUDIO": { - "instillShortDescription": "Specify a time range to slice an audio file", + "TASK_SEGMENT": { + "instillShortDescription": "Segment audio data into pieces based on the provided time segments.", "input": { - "description": "Audio file to slice", - "instillEditOnNodeFields": [ - "audio", - "start-time", - "end-time" - ], + "description": "Input", "instillUIOrder": 0, "properties": { "audio": { - "description": "Base64 encoded audio file to be sliced", + "description": "Audio data to segment.", "instillAcceptFormats": [ "audio/*", "application/octet-stream" @@ -90,56 +117,42 @@ "title": "Audio", "type": "string" }, - "start-time": { - "description": "Start time of the slice in seconds", - "instillAcceptFormats": [ - "integer", - "number" - ], - "instillUpstreamTypes": [ - "reference", - "value" - ], + "segments": { + "description": "A list of time segments of audio data.", + "instillFormat": "array:object", "instillUIOrder": 1, - "title": "Start time", - "type": "integer" - }, - "end-time": { - "description": "End time of the slice in seconds", - "instillAcceptFormats": [ - "integer", - "number" - ], - "instillUpstreamTypes": [ - "reference", - "value" - ], - "instillUIOrder": 2, - "title": "End time", - "type": "integer" + "items": { + "$ref": "#/$defs/segment" + }, + "title": "Segments", + "type": "array" } }, "required": [ "audio", - "start-time", - "end-time" + "segments" ], "title": "Input", "type": "object" }, "output": { + "description": "Output", "instillUIOrder": 0, "properties": { - "audio": { - "description": "Base64 encoded audio slice", - "instillFormat": "audio/wav", + "audio-segments": { + 
"description": "A list of segmented audio data.", + "instillFormat": "array:audio/*", "instillUIOrder": 0, - "title": "Audio", - "type": "string" + "items": { + "type": "string", + "title": "Audio" + }, + "title": "Audios", + "type": "array" } }, "required": [ - "audio" + "audio-segments" ], "title": "Output", "type": "object" diff --git a/pkg/component/operator/audio/v0/io.go b/pkg/component/operator/audio/v0/io.go new file mode 100644 index 000000000..5dffc7e63 --- /dev/null +++ b/pkg/component/operator/audio/v0/io.go @@ -0,0 +1,29 @@ +package audio + +import ( + "github.com/instill-ai/pipeline-backend/pkg/data/format" +) + +type segmentData struct { + StartTime float64 `instill:"start-time"` + EndTime float64 `instill:"end-time"` +} + +type detectActivityInput struct { + Audio format.Audio `instill:"audio"` + MinSilenceDuration int `instill:"min-silence-duration"` + SpeechPad int `instill:"speech-pad"` +} + +type detectActivityOutput struct { + Segments []segmentData `instill:"segments"` +} + +type segmentInput struct { + Audio format.Audio `instill:"audio"` + Segments []segmentData `instill:"segments"` +} + +type segmentOutput struct { + AudioSegments []format.Audio `instill:"audio-segments"` +} diff --git a/pkg/component/operator/audio/v0/main.go b/pkg/component/operator/audio/v0/main.go index bb4a6d0d7..ba92308eb 100644 --- a/pkg/component/operator/audio/v0/main.go +++ b/pkg/component/operator/audio/v0/main.go @@ -8,14 +8,12 @@ import ( _ "embed" - "google.golang.org/protobuf/types/known/structpb" - "github.com/instill-ai/pipeline-backend/pkg/component/base" ) const ( - taskChunkAudios string = "TASK_CHUNK_AUDIOS" - taskSliceAudio string = "TASK_SLICE_AUDIO" + taskDetectActivity = "TASK_DETECT_ACTIVITY" + taskSegment = "TASK_SEGMENT" ) var ( @@ -33,8 +31,7 @@ type component struct { type execution struct { base.ComponentExecution - - execute func(*structpb.Struct) (*structpb.Struct, error) + execute func(context.Context, *base.Job) error } func Init(bc 
base.Component) *component { @@ -54,10 +51,10 @@ func (c *component) CreateExecution(x base.ComponentExecution) (base.IExecution, e := &execution{ComponentExecution: x} switch x.Task { - case taskChunkAudios: - e.execute = chunkAudios - case taskSliceAudio: - e.execute = sliceAudio + case taskDetectActivity: + e.execute = detectActivity + case taskSegment: + e.execute = segment default: return nil, fmt.Errorf("%s task is not supported", x.Task) } @@ -66,5 +63,5 @@ func (c *component) CreateExecution(x base.ComponentExecution) (base.IExecution, } func (e *execution) Execute(ctx context.Context, jobs []*base.Job) error { - return base.SequentialExecutor(ctx, jobs, e.execute) + return base.ConcurrentExecutor(ctx, jobs, e.execute) } diff --git a/pkg/component/operator/audio/v0/main_test.go b/pkg/component/operator/audio/v0/main_test.go deleted file mode 100644 index 9aead4749..000000000 --- a/pkg/component/operator/audio/v0/main_test.go +++ /dev/null @@ -1,4 +0,0 @@ -package audio - -// TODO chuang8511 Investigate how to run test case with installing ffmpeg in test env -// It will be arranged according to the product schedule diff --git a/pkg/component/operator/audio/v0/task_detect_activity.go b/pkg/component/operator/audio/v0/task_detect_activity.go new file mode 100644 index 000000000..604294f6a --- /dev/null +++ b/pkg/component/operator/audio/v0/task_detect_activity.go @@ -0,0 +1,137 @@ +//go:build onnx +// +build onnx + +// This task requires ONNX Runtime to be installed. Follow these steps to set it up: +// +// 1. Download ONNX Runtime: +// - Visit the official repository: https://github.com/microsoft/onnxruntime/releases +// - Choose the latest version compatible with your OS architecture +// +// 2. 
Install ONNX Runtime: +// - Extract the downloaded tar file to a directory (referred to as ONNXRUNTIME_ROOT_PATH) +// - Set up the environment: +// export C_INCLUDE_PATH=$ONNXRUNTIME_ROOT_PATH/include +// export LD_RUN_PATH=$ONNXRUNTIME_ROOT_PATH/lib +// export LIBRARY_PATH=$ONNXRUNTIME_ROOT_PATH/lib + +// This task requires the following libraries to be installed: +// - libsoxr-dev (required for github.com/zaf/resample) + +package audio + +import ( + "bytes" + "context" + "encoding/binary" + "fmt" + "math" + "os" + "path/filepath" + + "github.com/go-audio/audio" + "github.com/streamer45/silero-vad-go/speech" + "github.com/zaf/resample" + + "github.com/instill-ai/pipeline-backend/pkg/component/base" +) + +func detectActivity(ctx context.Context, job *base.Job) error { + var input detectActivityInput + if err := job.Input.ReadData(ctx, &input); err != nil { + return err + } + + audioBuf, dec, err := decodeAudioWAV(input.Audio) + if err != nil { + return err + } + + if audioBuf.Format.NumChannels > numChannel { + audioBuf = toMono(audioBuf) + } + + if audioBuf.Format.SampleRate != sampleRate { + resampledData, err := resampleAudio(audioBuf.AsFloatBuffer().Data, float64(dec.SampleRate), float64(sampleRate), audioBuf.Format.NumChannels) + if err != nil { + return fmt.Errorf("resampling audio: %w", err) + } + audioBuf.Format.SampleRate = sampleRate + audioBuf.Data = resampledData + } + + sd, err := speech.NewDetector(speech.DetectorConfig{ + ModelPath: filepath.Join(os.Getenv("ONNX_MODEL_FOLDER_PATH"), "silero_vad.onnx"), + SampleRate: sampleRate, + Threshold: 0.5, + MinSilenceDurationMs: input.MinSilenceDuration, + SpeechPadMs: input.SpeechPad, + }) + if err != nil { + return fmt.Errorf("creating voice activity detector: %w", err) + } + + defer func() { + if removeErr := sd.Destroy(); removeErr != nil { + if err == nil { + err = fmt.Errorf("destroy speech detector: %w", removeErr) + } + } + }() + + segments, err := sd.Detect(audioBuf.AsFloat32Buffer().Data) + if err != 
nil { + return fmt.Errorf("detect voice activity: %w", err) + } + + dao := detectActivityOutput{ + Segments: make([]segmentData, len(segments)), + } + for i, s := range segments { + dao.Segments[i] = segmentData{StartTime: s.SpeechStartAt, EndTime: s.SpeechEndAt} + } + + if err := job.Output.WriteData(ctx, dao); err != nil { + return err + } + + return nil +} + +func toMono(buffer *audio.IntBuffer) *audio.IntBuffer { + for i := 0; i < len(buffer.Data)/2; i++ { + buffer.Data[i] = (buffer.Data[2*i] + buffer.Data[2*i+1]) / 2 + } + buffer.Data = buffer.Data[:len(buffer.Data)/2] + buffer.Format.NumChannels = 1 + return buffer +} + +func resampleAudio(input []float64, inputRate, outputRate float64, channels int) ([]int, error) { + var buf bytes.Buffer + resampler, err := resample.New(&buf, inputRate, outputRate, channels, resample.F64, resample.HighQ) + if err != nil { + return nil, fmt.Errorf("creating resampler: %w", err) + } + defer resampler.Close() + + // Convert []float64 to []byte + inputBytes := make([]byte, len(input)*8) + for i, v := range input { + binary.LittleEndian.PutUint64(inputBytes[i*8:], math.Float64bits(v)) + } + + _, err = resampler.Write(inputBytes) + if err != nil { + return nil, fmt.Errorf("writing to resampler: %w", err) + } + + // Convert resampled []byte back to []int + resampledBytes := buf.Bytes() + resampledData := make([]int, len(resampledBytes)/8) + for i := 0; i < len(resampledData); i++ { + resampledFloat := math.Float64frombits(binary.LittleEndian.Uint64(resampledBytes[i*8:])) + resampledData[i] = int(resampledFloat) + } + + return resampledData, nil +} diff --git a/pkg/component/operator/audio/v0/task_detect_activity_nontag.go b/pkg/component/operator/audio/v0/task_detect_activity_nontag.go new file mode 100644 index 000000000..c20f2325b --- /dev/null +++ b/pkg/component/operator/audio/v0/task_detect_activity_nontag.go @@ -0,0 +1,15 @@ +//go:build !onnx +// +build !onnx + +package audio + +import ( + "context" + "fmt" + + 
"github.com/instill-ai/pipeline-backend/pkg/component/base" +) + +func detectActivity(ctx context.Context, job *base.Job) error { + return fmt.Errorf("the Audio operator wasn't built with onnxruntime") +} diff --git a/pkg/component/operator/audio/v0/task_detect_activity_test.go b/pkg/component/operator/audio/v0/task_detect_activity_test.go new file mode 100644 index 000000000..6b84d67e4 --- /dev/null +++ b/pkg/component/operator/audio/v0/task_detect_activity_test.go @@ -0,0 +1,159 @@ +//go:build onnx +// +build onnx + +package audio + +import ( + "context" + "encoding/json" + "io" + "math" + "os" + "testing" + + "github.com/go-audio/audio" + "github.com/google/go-cmp/cmp" + + qt "github.com/frankban/quicktest" + + "github.com/instill-ai/pipeline-backend/pkg/component/base" + "github.com/instill-ai/pipeline-backend/pkg/component/internal/mock" + "github.com/instill-ai/pipeline-backend/pkg/data" +) + +func TestDetectActivity(t *testing.T) { + c := qt.New(t) + + testCases := []struct { + name string + audioFile string + sampleRate int + threshold float64 + silenceDuration int + speechPad int + wantSegments string + expectedError string + }{ + { + name: "ok - detect voice activity (voice1)", + audioFile: "testdata/voice1.wav", + sampleRate: 16000, + threshold: 0.5, + silenceDuration: 500, + speechPad: 100, + wantSegments: "testdata/voice1-activity-segments.json", + }, + { + name: "ok - detect voice activity (voice2)", + audioFile: "testdata/voice2.wav", + sampleRate: 16000, + threshold: 0.5, + silenceDuration: 500, + speechPad: 30, + wantSegments: "testdata/voice2-activity-segments.json", + }, + } + + for _, tc := range testCases { + c.Run(tc.name, func(c *qt.C) { + component := Init(base.Component{}) + c.Assert(component, qt.IsNotNil) + + execution, err := component.CreateExecution(base.ComponentExecution{ + Component: component, + Task: taskDetectActivity, + }) + c.Assert(err, qt.IsNil) + c.Assert(execution, qt.IsNotNil) + + ir, ow, eh, job := mock.GenerateMockJob(c) 
+ + // Load audio data + audioFile, err := os.Open(tc.audioFile) + c.Assert(err, qt.IsNil) + defer audioFile.Close() + audioData, err := io.ReadAll(audioFile) + c.Assert(err, qt.IsNil) + + ir.ReadDataMock.Set(func(ctx context.Context, input any) error { + switch input := input.(type) { + case *detectActivityInput: + audio, err := data.NewAudioFromBytes(audioData, "audio/wav", "input.wav") + c.Assert(err, qt.IsNil) + *input = detectActivityInput{ + Audio: audio, + MinSilenceDuration: tc.silenceDuration, + SpeechPad: tc.speechPad, + } + } + return nil + }) + + var capturedOutput detectActivityOutput + ow.WriteDataMock.Set(func(ctx context.Context, output any) error { + capturedOutput = output.(detectActivityOutput) + return nil + }) + + eh.ErrorMock.Set(func(ctx context.Context, err error) { + c.Assert(err, qt.ErrorMatches, tc.expectedError) + }) + + if tc.expectedError != "" { + ow.WriteDataMock.Optional() + } else { + eh.ErrorMock.Optional() + } + + err = execution.Execute(context.Background(), []*base.Job{job}) + + if tc.expectedError == "" { + c.Assert(err, qt.IsNil) + + // Load expected segments + expectedSegmentsJSONData, err := os.ReadFile(tc.wantSegments) + c.Assert(err, qt.IsNil) + var expectedSegmentsStruct struct { + Segments []segmentData `instill:"segments"` + } + + var segmentsMap map[string]interface{} + err = json.Unmarshal(expectedSegmentsJSONData, &segmentsMap) + c.Assert(err, qt.IsNil) + + jsonValue, err := data.NewJSONValue(segmentsMap) + c.Assert(err, qt.IsNil) + + c.Assert(data.Unmarshal(jsonValue, &expectedSegmentsStruct), qt.IsNil) + expectedSegments := expectedSegmentsStruct.Segments + + c.Assert(capturedOutput.Segments, qt.HasLen, len(expectedSegments)) + + for i, actual := range capturedOutput.Segments { + expected := expectedSegments[i] + c.Assert(actual.StartTime, floatEquals(0.1), expected.StartTime) + c.Assert(actual.EndTime, floatEquals(0.1), expected.EndTime) + } + } + }) + } +} + +// floatEquals is a custom checker for comparing 
float64 values with an epsilon +func floatEquals(epsilon float64) qt.Checker { + return qt.CmpEquals(cmp.Comparer(func(x, y float64) bool { + return math.Abs(x-y) <= epsilon + })) +} + +func TestToMono(t *testing.T) { + c := qt.New(t) + + stereoBuffer := &audio.IntBuffer{ + Data: []int{1, 2, 3, 4, 5, 6}, + Format: &audio.Format{NumChannels: 2}, + } + monoBuffer := toMono(stereoBuffer) + c.Assert(monoBuffer.Format.NumChannels, qt.Equals, 1) + c.Assert(monoBuffer.Data, qt.DeepEquals, []int{1, 3, 5}) +} diff --git a/pkg/component/operator/audio/v0/task_segment.go b/pkg/component/operator/audio/v0/task_segment.go new file mode 100644 index 000000000..a7a54f605 --- /dev/null +++ b/pkg/component/operator/audio/v0/task_segment.go @@ -0,0 +1,107 @@ +package audio + +import ( + "context" + "fmt" + "io" + "os" + + "github.com/go-audio/audio" + "github.com/go-audio/wav" + + "github.com/instill-ai/pipeline-backend/pkg/component/base" + "github.com/instill-ai/pipeline-backend/pkg/data" + "github.com/instill-ai/pipeline-backend/pkg/data/format" +) + +func segment(ctx context.Context, job *base.Job) error { + + var input segmentInput + if err := job.Input.ReadData(ctx, &input); err != nil { + return err + } + + audioBuf, dec, err := decodeAudioWAV(input.Audio) + if err != nil { + return err + } + + output := segmentOutput{ + AudioSegments: make([]format.Audio, len(input.Segments)), + } + + for i, seg := range input.Segments { + seg, err := extractSegment(audioBuf, seg) + if err != nil { + return err + } + encSeg, err := encodeSegment(seg, audioBuf.Format, dec) + if err != nil { + return err + } + ad, err := data.NewAudioFromBytes(encSeg, "audio/wav", fmt.Sprintf("audio-segment-%d.wav", i)) + if err != nil { + return err + } + output.AudioSegments[i] = ad + } + + if err := job.Output.WriteData(ctx, output); err != nil { + return err + } + + return nil + +} + +func extractSegment(audioBuf *audio.IntBuffer, seg segmentData) ([]int, error) { + startSample := int(seg.StartTime * 
float64(audioBuf.Format.SampleRate) * float64(audioBuf.Format.NumChannels)) + endSample := int(seg.EndTime * float64(audioBuf.Format.SampleRate) * float64(audioBuf.Format.NumChannels)) + + if startSample < 0 { + startSample = 0 + } + if endSample > len(audioBuf.Data) { + endSample = len(audioBuf.Data) + } + + return audioBuf.Data[startSample:endSample], nil +} + +func encodeSegment(segment []int, format *audio.Format, dec *wav.Decoder) ([]byte, error) { + // Use a temporary file instead of a buffer + tempFile, err := os.CreateTemp("", "audio_segment_*.wav") + if err != nil { + return nil, fmt.Errorf("failed to create temp file: %w", err) + } + defer os.Remove(tempFile.Name()) // Clean up the temp file when we're done + + encoder := wav.NewEncoder(tempFile, format.SampleRate, int(dec.BitDepth), format.NumChannels, int(dec.WavAudioFormat)) + + segmentBuf := &audio.IntBuffer{ + Format: format, + Data: segment, + SourceBitDepth: int(dec.BitDepth), + } + + if err := encoder.Write(segmentBuf); err != nil { + return nil, fmt.Errorf("failed to write segment to buffer: %w", err) + } + + if err := encoder.Close(); err != nil { + return nil, fmt.Errorf("failed to close the encoder: %w", err) + } + + // Read the contents of the temp file + _, err = tempFile.Seek(0, 0) + if err != nil { + return nil, fmt.Errorf("failed to seek to the beginning of temp file: %w", err) + } + + fileContents, err := io.ReadAll(tempFile) + if err != nil { + return nil, fmt.Errorf("failed to read temp file: %w", err) + } + + return fileContents, nil +} diff --git a/pkg/component/operator/audio/v0/task_segment_test.go b/pkg/component/operator/audio/v0/task_segment_test.go new file mode 100644 index 000000000..bc9a0221f --- /dev/null +++ b/pkg/component/operator/audio/v0/task_segment_test.go @@ -0,0 +1,115 @@ +package audio + +import ( + "context" + "encoding/json" + "io" + "os" + "testing" + + qt "github.com/frankban/quicktest" + + "github.com/instill-ai/pipeline-backend/pkg/component/base" + 
"github.com/instill-ai/pipeline-backend/pkg/component/internal/mock" + "github.com/instill-ai/pipeline-backend/pkg/data" +) + +func TestSegment(t *testing.T) { + c := qt.New(t) + + testCases := []struct { + name string + audioFile string + segmentsFile string + expectedCount int + expectedError string + }{ + { + name: "ok - valid segmentation", + audioFile: "testdata/voice1.wav", + segmentsFile: "testdata/voice1-activity-segments.json", + expectedCount: 5, + }, + } + + for _, tc := range testCases { + c.Run(tc.name, func(c *qt.C) { + component := Init(base.Component{}) + c.Assert(component, qt.IsNotNil) + + execution, err := component.CreateExecution(base.ComponentExecution{ + Component: component, + Task: taskSegment, + }) + c.Assert(err, qt.IsNil) + c.Assert(execution, qt.IsNotNil) + + ir, ow, eh, job := mock.GenerateMockJob(c) + + // Load audio data + audioFile, err := os.Open(tc.audioFile) + c.Assert(err, qt.IsNil) + defer audioFile.Close() + audioData, err := io.ReadAll(audioFile) + c.Assert(err, qt.IsNil) + + // Load segments data + segmentsJSONData, err := os.ReadFile(tc.segmentsFile) + c.Assert(err, qt.IsNil) + var segmentsStruct struct { + Segments []segmentData `instill:"segments"` + } + + var segmentsMap map[string]interface{} + err = json.Unmarshal(segmentsJSONData, &segmentsMap) + c.Assert(err, qt.IsNil) + + jsonValue, err := data.NewJSONValue(segmentsMap) + c.Assert(err, qt.IsNil) + + c.Assert(data.Unmarshal(jsonValue, &segmentsStruct), qt.IsNil) + segments := segmentsStruct.Segments + + ir.ReadDataMock.Set(func(ctx context.Context, input any) error { + switch input := input.(type) { + case *segmentInput: + audio, err := data.NewAudioFromBytes(audioData, "audio/wav", "input.wav") + c.Assert(err, qt.IsNil) + *input = segmentInput{ + Audio: audio, + Segments: segments, + } + } + return nil + }) + + var capturedOutput segmentOutput + ow.WriteDataMock.Set(func(ctx context.Context, output any) error { + capturedOutput = output.(segmentOutput) + return nil 
+ }) + + eh.ErrorMock.Set(func(ctx context.Context, err error) { + c.Assert(err, qt.ErrorMatches, tc.expectedError) + }) + + if tc.expectedError != "" { + ow.WriteDataMock.Optional() + } else { + eh.ErrorMock.Optional() + } + + err = execution.Execute(context.Background(), []*base.Job{job}) + + if tc.expectedError == "" { + c.Assert(err, qt.IsNil) + c.Assert(capturedOutput.AudioSegments, qt.HasLen, tc.expectedCount) + + for i, segment := range capturedOutput.AudioSegments { + c.Assert(segment, qt.Not(qt.IsNil), qt.Commentf("Segment %d is nil", i)) + c.Assert(segment.ContentType().String(), qt.Equals, "audio/ogg", qt.Commentf("Segment %d has incorrect MIME type", i)) + } + } + }) + } +} diff --git a/pkg/component/operator/audio/v0/testdata/voice1-activity-segments.json b/pkg/component/operator/audio/v0/testdata/voice1-activity-segments.json new file mode 100644 index 000000000..d72f2c729 --- /dev/null +++ b/pkg/component/operator/audio/v0/testdata/voice1-activity-segments.json @@ -0,0 +1,24 @@ +{ + "segments": [ + { + "start-time": 1.5, + "end-time": 2.404 + }, + { + "start-time": 3.196, + "end-time": 4.068 + }, + { + "start-time": 4.604, + "end-time": 5.764 + }, + { + "start-time": 6.62, + "end-time": 14.948 + }, + { + "start-time": 15.836, + "end-time": 18.564 + } + ] +} diff --git a/pkg/component/operator/audio/v0/testdata/voice1.wav b/pkg/component/operator/audio/v0/testdata/voice1.wav new file mode 100644 index 000000000..cfc16301e Binary files /dev/null and b/pkg/component/operator/audio/v0/testdata/voice1.wav differ diff --git a/pkg/component/operator/audio/v0/testdata/voice2-activity-segments.json b/pkg/component/operator/audio/v0/testdata/voice2-activity-segments.json new file mode 100644 index 000000000..e6f7d5f57 --- /dev/null +++ b/pkg/component/operator/audio/v0/testdata/voice2-activity-segments.json @@ -0,0 +1,16 @@ +{ + "segments": [ + { + "start-time": 0.002, + "end-time": 9.406 + }, + { + "start-time": 10.146, + "end-time": 18.782 + }, + { + 
"start-time": 19.234, + "end-time": 30.878 + } + ] +} diff --git a/pkg/component/operator/audio/v0/testdata/voice2.wav b/pkg/component/operator/audio/v0/testdata/voice2.wav new file mode 100644 index 000000000..0d5dc63de Binary files /dev/null and b/pkg/component/operator/audio/v0/testdata/voice2.wav differ diff --git a/pkg/component/operator/document/v0/convert_test.go b/pkg/component/operator/document/v0/convert_test.go index 10b5985d1..8afe651a2 100644 --- a/pkg/component/operator/document/v0/convert_test.go +++ b/pkg/component/operator/document/v0/convert_test.go @@ -28,22 +28,20 @@ func TestConvertToText(t *testing.T) { expected: ConvertToTextOutput{ Body: "This is test file for markdown", Meta: map[string]string{ - "Custom Metadata": "no", - "Encrypted": "no", - "File size": "15489 bytes", - "Form": "none", - "JavaScript": "no", - "Metadata Stream": "no", - "Optimized": "no", - "PDF version": "1.4", - "Page rot": "0", - "Page size": "596 x 842 pts (A4)", - "Pages": "1", - "Producer": "Skia/PDF m128 Google Docs Renderer", - "Suspects": "no", - "Tagged": "no", - "Title": "Untitled document", - "UserProperties": "no", + "Encrypted": "no", + "File size": "15489 bytes", + "Form": "none", + "JavaScript": "no", + "Optimized": "no", + "PDF version": "1.4", + "Page rot": "0", + "Page size": "596 x 842 pts (A4)", + "Pages": "1", + "Producer": "Skia/PDF m128 Google Docs Renderer", + "Suspects": "no", + "Tagged": "no", + "Title": "Untitled document", + "UserProperties": "no", }, MSecs: 3, }, diff --git a/pkg/component/resources/onnx/silero_vad.onnx b/pkg/component/resources/onnx/silero_vad.onnx new file mode 100644 index 000000000..d0ccd9d7f Binary files /dev/null and b/pkg/component/resources/onnx/silero_vad.onnx differ