diff --git a/.env b/.env
index e152f994b..8d349a073 100644
--- a/.env
+++ b/.env
@@ -12,6 +12,12 @@ PUBLIC_SERVICE_PORT=8081
 DOCKER_BUILDKIT=1
 COMPOSE_DOCKER_CLI_BUILD=1
 
+# ONNX_MODEL_FOLDER_PATH specifies the directory where ONNX models are stored.
+# These models are loaded dynamically at runtime. The default is built from
+# ${PWD}, so it resolves to the in-repo model folder only when commands are
+# run from the project root; the Docker images set an absolute path instead.
+ONNX_MODEL_FOLDER_PATH=${PWD}/pkg/component/resources/onnx
+
 # test
 
 # TEST_DBHOST and TEST_DBNAME are used to initialize a separate database for
diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md
index ad681058e..3dde06506 100644
--- a/.github/CONTRIBUTING.md
+++ b/.github/CONTRIBUTING.md
@@ -85,11 +85,9 @@ will create and migrate a test database to keep these queries isolated from the
 main DB. You can set the database host and name by overriding the `TEST_DBHOST`
 and `TEST_DBNAME` values.
 
-Certain tests depend on the [`docconv`](https://github.com/sajari/docconv)
-package and aren't run by default. You can trigger them by adding the `OCR=true`
-flag to the coverage command. Make sure to install the [package
-dependencies](https://github.com/sajari/docconv?tab=readme-ov-file#dependencies)
-first.
+Certain tests depend on external packages and aren't run by default:
+- For [`docconv`](https://github.com/sajari/docconv) tests, add the `OCR=true` flag and install its [dependencies](https://github.com/sajari/docconv?tab=readme-ov-file#dependencies).
+- For [`onnxruntime`](https://github.com/microsoft/onnxruntime) tests, add the `ONNX=true` flag. Follow the [guide](#set-up-onnx-runtime-linux-only) to set up ONNX Runtime (Linux only).
 
 #### Run the integration tests
 
@@ -111,6 +109,22 @@ If empty, tests will try to connect to `localhost:5432`.
 $ make rm
 ```
 
+### Set up ONNX Runtime (Linux only)
+
+1. Download the latest [ONNX Runtime release](https://github.com/microsoft/onnxruntime/releases) for your system.
+
+2. Install ONNX Runtime:
+   ```bash
+   sudo mkdir -p /usr/local/onnxruntime
+   sudo tar -xzf onnxruntime-*-*-*.tgz -C /usr/local/onnxruntime --strip-components=1
+   export ONNXRUNTIME_ROOT_PATH=/usr/local/onnxruntime
+   export LD_RUN_PATH=$ONNXRUNTIME_ROOT_PATH/lib
+   export LIBRARY_PATH=$ONNXRUNTIME_ROOT_PATH/lib
+   export C_INCLUDE_PATH=$ONNXRUNTIME_ROOT_PATH/include
+   ```
+
+**Note:** If you don't have sudo access, extract the archive to a user-writable location (e.g., `~/onnxruntime`), set `ONNXRUNTIME_ROOT_PATH` to that directory, and export the remaining variables as shown above. No symlinks are required in either case: the exported variables are what the compiler and linker use to locate ONNX Runtime.
+
 ## Codebase contribution
 
 ### Pre-commit hooks
diff --git a/.github/workflows/coverage.yml b/.github/workflows/coverage.yml
index d4a85df7e..2c2c710b6 100644
--- a/.github/workflows/coverage.yml
+++ b/.github/workflows/coverage.yml
@@ -38,10 +38,23 @@ jobs:
 
       - uses: actions/checkout@v3
 
+      - name: Install onnxruntime library and headers
+        run: |
+          export ONNXRUNTIME_ROOT_PATH=$GITHUB_WORKSPACE/onnxruntime
+          LATEST_VERSION=$(wget -qO- https://api.github.com/repos/microsoft/onnxruntime/releases/latest | jq -r .tag_name)
+          ONNX_ARCH=$([ "$RUNNER_ARCH" = "ARM64" ] && echo "aarch64" || echo "x64")  # RUNNER_ARCH is set by the runner; TARGETARCH only exists inside Docker builds
+          wget https://github.com/microsoft/onnxruntime/releases/download/${LATEST_VERSION}/onnxruntime-linux-${ONNX_ARCH}-${LATEST_VERSION#v}.tgz
+          tar -xzf onnxruntime-linux-${ONNX_ARCH}-${LATEST_VERSION#v}.tgz
+          mv onnxruntime-linux-${ONNX_ARCH}-${LATEST_VERSION#v} ${ONNXRUNTIME_ROOT_PATH}
+          rm onnxruntime-linux-${ONNX_ARCH}-${LATEST_VERSION#v}.tgz
+          echo "LD_RUN_PATH=${ONNXRUNTIME_ROOT_PATH}/lib" >> $GITHUB_ENV  # GITHUB_ENV entries are visible to the following steps
+          echo "LIBRARY_PATH=${ONNXRUNTIME_ROOT_PATH}/lib" >> $GITHUB_ENV
+          echo "C_INCLUDE_PATH=${ONNXRUNTIME_ROOT_PATH}/include" >> $GITHUB_ENV
+
       - name: Generate coverage report
         run: |
           make build-dev
-          make coverage DBTEST=true OCR=true
+          make coverage DBTEST=true OCR=true ONNX=true
 
       - name: Upload coverage report
         uses: codecov/codecov-action@v2
diff --git a/.github/workflows/golangci-lint.yml b/.github/workflows/golangci-lint.yml
index 2483e2547..0f3f6372b 100644
--- a/.github/workflows/golangci-lint.yml
+++ b/.github/workflows/golangci-lint.yml
@@ -21,8 +21,24 @@ jobs:
         with:
           go-version: ${{ env.GOLANG_VERSION }}
           cache: false
+      - name: Install soxr
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y libsoxr-dev
+      - name: Install onnxruntime library and headers
+        run: |
+          export ONNXRUNTIME_ROOT_PATH=$GITHUB_WORKSPACE/onnxruntime
+          LATEST_VERSION=$(wget -qO- https://api.github.com/repos/microsoft/onnxruntime/releases/latest | jq -r .tag_name)
+          ONNX_ARCH=$([ "$RUNNER_ARCH" = "ARM64" ] && echo "aarch64" || echo "x64")  # RUNNER_ARCH is set by the runner; TARGETARCH only exists inside Docker builds
+          wget https://github.com/microsoft/onnxruntime/releases/download/${LATEST_VERSION}/onnxruntime-linux-${ONNX_ARCH}-${LATEST_VERSION#v}.tgz
+          tar -xzf onnxruntime-linux-${ONNX_ARCH}-${LATEST_VERSION#v}.tgz
+          mv onnxruntime-linux-${ONNX_ARCH}-${LATEST_VERSION#v} ${ONNXRUNTIME_ROOT_PATH}
+          rm onnxruntime-linux-${ONNX_ARCH}-${LATEST_VERSION#v}.tgz
+          echo "C_INCLUDE_PATH=${ONNXRUNTIME_ROOT_PATH}/include" >> $GITHUB_ENV  # GITHUB_ENV entries are visible to the following steps
+          echo "LD_RUN_PATH=${ONNXRUNTIME_ROOT_PATH}/lib" >> $GITHUB_ENV
+          echo "LIBRARY_PATH=${ONNXRUNTIME_ROOT_PATH}/lib" >> $GITHUB_ENV
       - name: golangci-lint
         uses: golangci/golangci-lint-action@v6
         with:
-          version: v1.59
-          args: --timeout=10m
+          version: v1.61
+          args: --timeout=10m --build-tags onnx
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 9faea3c61..3de7cbc40 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -12,6 +12,7 @@ repos:
     rev: v0.5.1
     hooks:
       - id: golangci-lint
+        args: ["--build-tags", "onnx"]
       - id: go-mod-tidy
   - repo: https://github.com/pinglin/conventional-pre-commit
     rev: v1.1.0
diff --git a/Dockerfile b/Dockerfile
index 21088b6d6..12f91bbdf 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,7 +1,33 @@
 ARG GOLANG_VERSION=1.22.5
-FROM golang:${GOLANG_VERSION}-alpine3.19 AS build
-
-RUN apk add --no-cache build-base leptonica-dev tesseract-ocr-dev musl-dev
+FROM golang:${GOLANG_VERSION}-bullseye AS build
+
+ARG TARGETOS TARGETARCH K6_VERSION XK6_VERSION
+
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    libleptonica-dev \
+    libtesseract-dev \
+    libsoxr-dev \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install ONNX Runtime (latest release)
+ENV ONNXRUNTIME_ROOT_PATH=/usr/local/onnxruntime
+RUN apt update && \
+    apt install -y wget jq && \
+    LATEST_VERSION=$(wget -qO- https://api.github.com/repos/microsoft/onnxruntime/releases/latest | jq -r .tag_name) && \
+    ONNX_ARCH=$([ "$TARGETARCH" = "arm64" ] && echo "aarch64" || echo "x64") && \
+    wget https://github.com/microsoft/onnxruntime/releases/download/${LATEST_VERSION}/onnxruntime-linux-${ONNX_ARCH}-${LATEST_VERSION#v}.tgz && \
+    tar -xzf onnxruntime-linux-${ONNX_ARCH}-${LATEST_VERSION#v}.tgz && \
+    mv onnxruntime-linux-${ONNX_ARCH}-${LATEST_VERSION#v} ${ONNXRUNTIME_ROOT_PATH} && \
+    rm onnxruntime-linux-${ONNX_ARCH}-${LATEST_VERSION#v}.tgz && \
+    apt remove -y wget jq && \
+    apt autoremove -y && \
+    rm -rf /var/lib/apt/lists/*
+
+# Point the C compiler and linker at the ONNX Runtime headers and libraries
+ENV C_INCLUDE_PATH=${ONNXRUNTIME_ROOT_PATH}/include
+ENV LD_RUN_PATH=${ONNXRUNTIME_ROOT_PATH}/lib
+ENV LIBRARY_PATH=${ONNXRUNTIME_ROOT_PATH}/lib
 
 WORKDIR /src
 
@@ -9,51 +35,28 @@ COPY go.mod go.sum ./
 RUN go mod download
 COPY . .
 
-RUN go get github.com/otiai10/gosseract/v2
-
 ARG SERVICE_NAME TARGETOS TARGETARCH
+RUN --mount=target=. --mount=type=cache,target=/root/.cache/go-build --mount=type=cache,target=/go/pkg GOOS=$TARGETOS GOARCH=$TARGETARCH go build -tags=ocr,onnx -o /${SERVICE_NAME} ./cmd/main
+RUN --mount=target=. --mount=type=cache,target=/root/.cache/go-build --mount=type=cache,target=/go/pkg GOOS=$TARGETOS GOARCH=$TARGETARCH go build -tags=ocr,onnx -o /${SERVICE_NAME}-worker ./cmd/worker
+RUN --mount=target=. --mount=type=cache,target=/root/.cache/go-build --mount=type=cache,target=/go/pkg GOOS=$TARGETOS GOARCH=$TARGETARCH go build -o /${SERVICE_NAME}-migrate ./cmd/migration
+RUN --mount=target=. --mount=type=cache,target=/root/.cache/go-build --mount=type=cache,target=/go/pkg GOOS=$TARGETOS GOARCH=$TARGETARCH go build -o /${SERVICE_NAME}-init ./cmd/init
+
+FROM debian:bullseye-slim
 
-RUN --mount=target=. --mount=type=cache,target=/root/.cache/go-build --mount=type=cache,target=/go/pkg GOOS=$TARGETOS GOARCH=$TARGETARCH CGO_ENABLED=1 go build -tags=ocr,musl -o /${SERVICE_NAME} ./cmd/main
-RUN --mount=target=. --mount=type=cache,target=/root/.cache/go-build --mount=type=cache,target=/go/pkg GOOS=$TARGETOS GOARCH=$TARGETARCH CGO_ENABLED=1 go build -tags=ocr,musl -o /${SERVICE_NAME}-worker ./cmd/worker
-RUN --mount=target=. --mount=type=cache,target=/root/.cache/go-build --mount=type=cache,target=/go/pkg GOOS=$TARGETOS GOARCH=$TARGETARCH go build -tags=musl -o /${SERVICE_NAME}-migrate ./cmd/migration
-RUN --mount=target=. --mount=type=cache,target=/root/.cache/go-build --mount=type=cache,target=/go/pkg GOOS=$TARGETOS GOARCH=$TARGETARCH go build -tags=musl -o /${SERVICE_NAME}-init ./cmd/init
-
-FROM alpine:3.19
-
-RUN apk add --no-cache \
-    curl \
-    poppler-utils \
-    wv \
-    tidyhtml \
-    libc6-compat \
-    tesseract-ocr \
-    python3 \
-    py3-pip \
-    build-base \
-    python3-dev \
-    libffi-dev \
-    libreoffice \
-    qpdf \
-    msttcorefonts-installer \
-    font-noto \
-    font-noto-cjk \
-    ffmpeg \
-    chromium \
-    && update-ms-fonts \
-    && fc-cache -f \
-    && python3 -m venv /opt/venv \
-    && /opt/venv/bin/pip install --upgrade pip \
-    && /opt/venv/bin/pip install pdfplumber tokenizers \
-    && rm -rf /var/cache/apk/* /var/cache/fontconfig/*
-
-# Download tesseract data
-RUN curl -L https://github.com/tesseract-ocr/tessdata_best/raw/main/eng.traineddata \
-    -o /usr/share/tessdata/eng.traineddata
-
-ARG TARGETARCH
-ARG BUILDARCH
-RUN apk add unrtf --repository=http://dl-cdn.alpinelinux.org/alpine/edge/community
+# Install runtime tools (document conversion, OCR, audio, browser), create a Python virtual environment, and install the Python packages
+RUN apt update && \
+    apt install -y curl python3 python3-venv poppler-utils wv unrtf tidy tesseract-ocr libtesseract-dev libreoffice ffmpeg libsoxr-dev chromium qpdf && \
+    python3 -m venv /opt/venv && \
+    /opt/venv/bin/pip install pdfplumber mistral-common tokenizers && \
+    rm -rf /var/lib/apt/lists/*
 
+# copy ONNX runtime from build stage
+ENV ONNXRUNTIME_ROOT_PATH=/usr/local/onnxruntime
+COPY --from=build --chown=nobody:nogroup /usr/local/onnxruntime ${ONNXRUNTIME_ROOT_PATH}
+
+# Set environment variables and create symlinks for ONNX Runtime
+ENV C_INCLUDE_PATH=${ONNXRUNTIME_ROOT_PATH}/include
+RUN ln -s ${ONNXRUNTIME_ROOT_PATH}/lib/libonnxruntime.so* /usr/lib/
 
 USER nobody:nogroup
 
@@ -71,3 +74,7 @@ COPY --from=build --chown=nobody:nogroup /${SERVICE_NAME}-migrate ./
 COPY --from=build --chown=nobody:nogroup /${SERVICE_NAME}-init ./
 COPY --from=build --chown=nobody:nogroup /${SERVICE_NAME}-worker ./
 COPY --from=build --chown=nobody:nogroup /${SERVICE_NAME} ./
+
+# Set up ONNX model and environment variable
+COPY --chown=nobody:nogroup ./pkg/component/resources/onnx/silero_vad.onnx /${SERVICE_NAME}/pkg/component/resources/onnx/silero_vad.onnx
+ENV ONNX_MODEL_FOLDER_PATH=/${SERVICE_NAME}/pkg/component/resources/onnx
diff --git a/Dockerfile.dev b/Dockerfile.dev
index a34325e65..6e5987dc0 100644
--- a/Dockerfile.dev
+++ b/Dockerfile.dev
@@ -1,5 +1,5 @@
 ARG GOLANG_VERSION=1.22.5
-FROM golang:${GOLANG_VERSION}
+FROM golang:${GOLANG_VERSION}-bullseye
 
 ARG SERVICE_NAME
 
@@ -11,18 +11,39 @@ ARG TARGETOS TARGETARCH K6_VERSION XK6_VERSION
 
 # Install Python, create virtual environment, and install pdfplumber
 RUN apt update && \
-    apt install -y python3 python3-venv poppler-utils wv unrtf tidy tesseract-ocr libtesseract-dev libreoffice ffmpeg chromium qpdf && \
+    apt install -y python3 python3-venv poppler-utils wv unrtf tidy tesseract-ocr libtesseract-dev libreoffice ffmpeg libsoxr-dev chromium qpdf && \
     python3 -m venv /opt/venv && \
     /opt/venv/bin/pip install pdfplumber mistral-common tokenizers && \
     rm -rf /var/lib/apt/lists/*
 
+# Install ONNX Runtime (latest release)
+ENV ONNXRUNTIME_ROOT_PATH=/usr/local/onnxruntime
+RUN apt update && \
+    apt install -y wget jq && \
+    LATEST_VERSION=$(wget -qO- https://api.github.com/repos/microsoft/onnxruntime/releases/latest | jq -r .tag_name) && \
+    ONNX_ARCH=$([ "$TARGETARCH" = "arm64" ] && echo "aarch64" || echo "x64") && \
+    wget https://github.com/microsoft/onnxruntime/releases/download/${LATEST_VERSION}/onnxruntime-linux-${ONNX_ARCH}-${LATEST_VERSION#v}.tgz && \
+    tar -xzf onnxruntime-linux-${ONNX_ARCH}-${LATEST_VERSION#v}.tgz && \
+    mv onnxruntime-linux-${ONNX_ARCH}-${LATEST_VERSION#v} ${ONNXRUNTIME_ROOT_PATH} && \
+    rm onnxruntime-linux-${ONNX_ARCH}-${LATEST_VERSION#v}.tgz && \
+    apt remove -y wget jq && \
+    apt autoremove -y && \
+    rm -rf /var/lib/apt/lists/*
+
+# Point the C compiler and linker at the ONNX Runtime headers and libraries
+ENV C_INCLUDE_PATH=${ONNXRUNTIME_ROOT_PATH}/include
+ENV LD_RUN_PATH=${ONNXRUNTIME_ROOT_PATH}/lib
+ENV LIBRARY_PATH=${ONNXRUNTIME_ROOT_PATH}/lib
+
+# tparse (pinned, like air and xk6 below, so image builds stay reproducible)
+RUN --mount=target=. --mount=type=cache,target=/root/.cache/go-build --mount=type=cache,target=/go/pkg GOOS=$TARGETOS GOARCH=$TARGETARCH go install github.com/mfridman/tparse@v0.15.0
+
 # air
 RUN --mount=target=. --mount=type=cache,target=/root/.cache/go-build --mount=type=cache,target=/go/pkg GOOS=$TARGETOS GOARCH=$TARGETARCH go install github.com/cosmtrek/air@v1.49
 
 # k6
-RUN go install go.k6.io/xk6/cmd/xk6@v${XK6_VERSION}
-RUN go install github.com/mfridman/tparse@v0.15.0
-RUN xk6 build v${K6_VERSION} --with github.com/grafana/xk6-sql --output /usr/bin/k6
+RUN --mount=target=. --mount=type=cache,target=/root/.cache/go-build --mount=type=cache,target=/go/pkg GOOS=$TARGETOS GOARCH=$TARGETARCH go install go.k6.io/xk6/cmd/xk6@v${XK6_VERSION}
+RUN --mount=target=. --mount=type=cache,target=/root/.cache/go-build --mount=type=cache,target=/go/pkg GOOS=$TARGETOS GOARCH=$TARGETARCH xk6 build v${K6_VERSION} --with github.com/grafana/xk6-sql --output /usr/bin/k6
 
 # -- set up Go
 
@@ -38,6 +59,10 @@ ENV GOENV=/go/.config/go/env
 # required to restore compatibility with those versions.
 ENV GODEBUG=tlsrsakex=1
 
+# Set up ONNX model and environment variable
+COPY ./pkg/component/resources/onnx/silero_vad.onnx /${SERVICE_NAME}/pkg/component/resources/onnx/silero_vad.onnx
+ENV ONNX_MODEL_FOLDER_PATH=/${SERVICE_NAME}/pkg/component/resources/onnx
+
 USER nobody:nogroup
 
 ENTRYPOINT ["tail", "-f", "/dev/null"]
diff --git a/Makefile b/Makefile
index 7e92334d0..a4827beb2 100644
--- a/Makefile
+++ b/Makefile
@@ -7,13 +7,6 @@ include .env
 export
 
 GOTEST_FLAGS := CFG_DATABASE_HOST=${TEST_DBHOST} CFG_DATABASE_NAME=${TEST_DBNAME}
-ifeq (${DBTEST}, true)
-	GOTEST_TAGS := -tags=dbtest
-endif
-ifeq (${OCR}, true)
-	GOTEST_TAGS := -tags=ocr
-endif
-
 
 #============================================================================
 
@@ -40,10 +33,10 @@ latest:							## Run latest container
 		echo "Run latest container ${SERVICE_NAME} and ${SERVICE_NAME}-worker. To stop it, run \"make stop\"."
 	@docker run --network=instill-network \
 		--name ${SERVICE_NAME} \
-		-d ${SERVICE_NAME}:latest ./${SERVICE_NAME}
+		-d instill/${SERVICE_NAME}:latest ./${SERVICE_NAME}
 	@docker run --network=instill-network \
 		--name ${SERVICE_NAME}-worker \
-		-d ${SERVICE_NAME}:latest ./${SERVICE_NAME}-worker
+		-d instill/${SERVICE_NAME}:latest ./${SERVICE_NAME}-worker
 
 .PHONY: rm
 rm:								## Remove all running containers
@@ -60,10 +53,10 @@ build-dev:							## Build dev docker image
 
 .PHONY: build-latest
 build-latest:							## Build latest docker image
-	@docker buildx build \
+	@docker build \
 		--build-arg GOLANG_VERSION=${GOLANG_VERSION} \
 		--build-arg SERVICE_NAME=${SERVICE_NAME} \
-		-t pipeline-backend:latest .
+		-t instill/pipeline-backend:latest .
 
 .PHONY: go-gen
 go-gen:       					## Generate codes
@@ -94,40 +87,32 @@ coverage:
 				rm coverage.out; \
 	fi
 
+# This target invokes `go test` and `tparse` directly in the calling shell.
+# The dev image ships tparse; to run outside the container, install it with:
+# go install github.com/mfridman/tparse@latest
 .PHONY: test
 test:
-# Ideally, it should be ok to run without installing tparse locally.
-# However, there may be some issues that arise from running the tests
-# in the container. If you encounter any issues, please install tparse
-# locally via `go install github.com/mfridman/tparse/cmd/tparse@latest`
-# and run the tests locally.
-	@if [ "${OCR}" = "true" ]; then \
-		docker run --rm \
-			-v $(PWD):/${SERVICE_NAME} \
-			--user $(id -u):$(id -g) \
-			--entrypoint= \
-			instill/${SERVICE_NAME}:dev \
-				make test-ocr; \
+	@TAGS=""; \
+	if [ "$${OCR}" = "true" ]; then \
+		TAGS="$$TAGS,ocr"; \
+		[ "$$(uname)" = "Darwin" ] && export TESSDATA_PREFIX=$$(dirname $$(brew list tesseract | grep share/tessdata/eng.traineddata)); \
+	fi; \
+	if [ "$${ONNX}" = "true" ]; then \
+		if [ "$$(uname)" = "Darwin" ]; then \
+			echo "ONNX Runtime test is not supported on Darwin (macOS)."; \
+		else \
+			TAGS="$$TAGS,onnx"; \
+		fi; \
+	fi; \
+	TAGS=$${TAGS#,}; \
+	if [ -n "$$TAGS" ]; then \
+		echo "Running tests with tags: $$TAGS"; \
+		go test -v -tags="$$TAGS" ./... -json | tparse --notests --all; \
 	else \
-		docker run --rm \
-			-v $(PWD):/${SERVICE_NAME} \
-			--user $(id -u):$(id -g) \
-			--entrypoint= \
-			instill/${SERVICE_NAME}:dev \
-				go test -v ./... -json | tparse --notests --all;  \
+		echo "Running standard tests"; \
+		go test -v ./... -json | tparse --notests --all; \
 	fi
 
-.PHONY: test-ocr
-test-ocr:
-# Certain component tests require additional dependencies.
-# Install tesseract via `brew install tesseract`
-# Setup `export LIBRARY_PATH="/opt/homebrew/lib"` `export CPATH="/opt/homebrew/include"`
-ifeq ($(shell uname), Darwin)
-	@TESSDATA_PREFIX=$(shell dirname $(shell brew list tesseract | grep share/tessdata/eng.traineddata)) ${GOTEST_FLAGS} go test -v ./... -json | tparse --notests --all
-else
-	@echo "This target can only be executed on Darwin (macOS)."
-endif
-
 .PHONY: integration-test
 integration-test:				## Run integration test
 	@ # DB_HOST points to localhost by default. Override this variable if
diff --git a/go.mod b/go.mod
index 70b7d8c3a..5c1adb8f8 100644
--- a/go.mod
+++ b/go.mod
@@ -22,6 +22,7 @@ require (
 	github.com/frankban/quicktest v1.14.6
 	github.com/gabriel-vasile/mimetype v1.4.3
 	github.com/gage-technologies/mistral-go v1.1.0
+	github.com/go-audio/audio v1.0.0
 	github.com/go-chi/chi/v5 v5.1.0
 	github.com/go-openapi/strfmt v0.23.0
 	github.com/go-redis/redismock/v9 v9.2.0
@@ -35,7 +36,6 @@ require (
 	github.com/grpc-ecosystem/go-grpc-middleware v1.3.0
 	github.com/grpc-ecosystem/grpc-gateway/v2 v2.20.0
 	github.com/h2non/filetype v1.1.3
-	github.com/iFaceless/godub v0.0.0-20200728093528-a30bb4d1a0f1
 	github.com/iancoleman/strcase v0.3.0
 	github.com/influxdata/influxdb-client-go/v2 v2.12.3
 	github.com/instill-ai/protogen-go v0.3.3-alpha.0.20241029162707-1398399a24ee
@@ -99,15 +99,21 @@ require (
 	gorm.io/plugin/dbresolver v1.5.1
 )
 
+require github.com/dh1tw/gosamplerate v0.1.2 // indirect
+
 require (
 	cloud.google.com/go v0.115.0 // indirect
 	cloud.google.com/go/auth v0.7.2 // indirect
 	cloud.google.com/go/auth/oauth2adapt v0.2.3 // indirect
 	cloud.google.com/go/compute/metadata v0.5.0 // indirect
 	filippo.io/edwards25519 v1.1.0 // indirect
+	github.com/JalfResi/justext v0.0.0-20221106200834-be571e3e3052 // indirect
 	github.com/PaesslerAG/gval v1.0.0 // indirect
+	github.com/PuerkitoBio/goquery v1.9.1
 	github.com/PuerkitoBio/purell v1.1.1 // indirect
 	github.com/PuerkitoBio/urlesc v0.0.0-20170810143723-de5bf2ad4578 // indirect
+	github.com/advancedlogic/GoOse v0.0.0-20191112112754-e742535969c1 // indirect
+	github.com/andybalholm/cascadia v1.3.2 // indirect
 	github.com/antchfx/htmlquery v1.3.0 // indirect
 	github.com/antchfx/xmlquery v1.3.17 // indirect
 	github.com/antchfx/xpath v1.2.4 // indirect
@@ -116,16 +122,30 @@ require (
 	github.com/aws/aws-sdk-go v1.55.1 // indirect
 	github.com/aws/aws-sdk-go-v2 v1.30.3 // indirect
 	github.com/aws/smithy-go v1.20.3 // indirect
+	github.com/catalinc/hashcash v0.0.0-20220723060415-5e3ec3e24f67 // indirect
+	github.com/cenkalti/backoff/v4 v4.2.1 // indirect
+	github.com/cespare/xxhash/v2 v2.3.0 // indirect
 	github.com/chromedp/cdproto v0.0.0-20240801214329-3f85d328b335 // indirect
 	github.com/chromedp/sysutil v1.0.0 // indirect
+	github.com/davecgh/go-spew v1.1.1 // indirect
+	github.com/deepmap/oapi-codegen v1.8.2 // indirect
+	github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f // indirect
 	github.com/dlclark/regexp2 v1.10.0 // indirect
 	github.com/dustin/go-humanize v1.0.1 // indirect
 	github.com/elastic/elastic-transport-go/v8 v8.6.0 // indirect
 	github.com/emersion/go-sasl v0.0.0-20231106173351-e73c9f7bad43 // indirect
 	github.com/envoyproxy/protoc-gen-validate v1.0.4 // indirect
 	github.com/extrame/ole2 v0.0.0-20160812065207-d69429661ad7 // indirect
+	github.com/facebookgo/clock v0.0.0-20150410010913-600d898af40a // indirect
+	github.com/fatih/set v0.2.1 // indirect
 	github.com/felixge/httpsnoop v1.0.4 // indirect
+	github.com/fsnotify/fsnotify v1.6.0 // indirect
+	github.com/gigawattio/window v0.0.0-20180317192513-0f5467e35573 // indirect
+	github.com/go-audio/riff v1.0.0 // indirect
+	github.com/go-audio/wav v1.1.0
 	github.com/go-ini/ini v1.67.0 // indirect
+	github.com/go-logr/logr v1.4.2 // indirect
+	github.com/go-logr/stdr v1.2.2 // indirect
 	github.com/go-openapi/analysis v0.21.2 // indirect
 	github.com/go-openapi/errors v0.22.0 // indirect
 	github.com/go-openapi/jsonpointer v0.19.6 // indirect
@@ -134,23 +154,41 @@ require (
 	github.com/go-openapi/spec v0.20.4 // indirect
 	github.com/go-openapi/swag v0.22.4 // indirect
 	github.com/go-openapi/validate v0.21.0 // indirect
+	github.com/go-resty/resty/v2 v2.12.0
+	github.com/go-sql-driver/mysql v1.8.1
 	github.com/gobwas/glob v0.2.3 // indirect
 	github.com/gobwas/httphead v0.1.0 // indirect
 	github.com/gobwas/pool v0.2.1 // indirect
 	github.com/gobwas/ws v1.4.0 // indirect
 	github.com/goccy/go-json v0.10.3 // indirect
+	github.com/gogo/googleapis v1.4.1 // indirect
+	github.com/gogo/protobuf v1.3.2 // indirect
 	github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0 // indirect
 	github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect
 	github.com/golang/mock v1.6.0 // indirect
+	github.com/golang/protobuf v1.5.4 // indirect
 	github.com/golang/snappy v0.0.4 // indirect
 	github.com/google/flatbuffers v23.5.26+incompatible // indirect
 	github.com/google/go-querystring v1.1.0 // indirect
 	github.com/google/s2a-go v0.1.7 // indirect
+	github.com/google/uuid v1.6.0
 	github.com/googleapis/enterprise-certificate-proxy v0.3.2 // indirect
 	github.com/googleapis/gax-go/v2 v2.12.5 // indirect
 	github.com/gorilla/websocket v1.5.1 // indirect
+	github.com/hashicorp/errwrap v1.1.0 // indirect
+	github.com/hashicorp/go-multierror v1.1.1 // indirect
+	github.com/influxdata/line-protocol v0.0.0-20200327222509-2487e7298839 // indirect
 	github.com/itchyny/timefmt-go v0.1.5 // indirect
+	github.com/jackc/chunkreader/v2 v2.0.1 // indirect
+	github.com/jackc/pgconn v1.14.3
+	github.com/jackc/pgio v1.0.0 // indirect
+	github.com/jackc/pgpassfile v1.0.0 // indirect
+	github.com/jackc/pgproto3/v2 v2.3.3 // indirect
+	github.com/jackc/pgservicefile v0.0.0-20221227161230-091c0ba34f0a // indirect
 	github.com/jackc/puddle/v2 v2.2.1 // indirect
+	github.com/jaytaylor/html2text v0.0.0-20230321000545-74c2419ad056 // indirect
+	github.com/jinzhu/inflection v1.0.0 // indirect
+	github.com/jinzhu/now v1.1.5 // indirect
 	github.com/jmespath/go-jmespath v0.4.0 // indirect
 	github.com/josharian/intern v1.0.0 // indirect
 	github.com/kardianos/osext v0.0.0-20190222173326-2bc1f35cddc0 // indirect
@@ -159,19 +197,40 @@ require (
 	github.com/klauspost/cpuid/v2 v2.2.8 // indirect
 	github.com/kr/pretty v0.3.1 // indirect
 	github.com/kr/text v0.2.0 // indirect
+	github.com/levigross/exp-html v0.0.0-20120902181939-8df60c69a8f5 // indirect
+	github.com/lib/pq v1.10.9
 	github.com/mailru/easyjson v0.7.7 // indirect
+	github.com/mattn/go-runewidth v0.0.15 // indirect
 	github.com/minio/md5-simd v1.1.2 // indirect
+	github.com/mitchellh/copystructure v1.2.0 // indirect
+	github.com/mitchellh/mapstructure v1.5.0 // indirect
+	github.com/mitchellh/reflectwalk v1.0.2 // indirect
+	github.com/mjibson/go-dsp v0.0.0-20180508042940-11479a337f12
 	github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
 	github.com/modern-go/reflect2 v1.0.2 // indirect
 	github.com/mohae/deepcopy v0.0.0-20170929034955-c48cc78d4826 // indirect
 	github.com/montanaflynn/stats v0.7.1 // indirect
 	github.com/oklog/ulid v1.3.1 // indirect
+	github.com/olekukonko/tablewriter v0.0.4 // indirect
+	github.com/otiai10/gosseract/v2 v2.4.1 // indirect
+	github.com/pborman/uuid v1.2.1 // indirect
 	github.com/pierrec/lz4/v4 v4.1.18 // indirect
+	github.com/pkg/errors v0.9.1
+	github.com/pmezard/go-difflib v1.0.0 // indirect
 	github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect
+	github.com/richardlehane/mscfb v1.0.4 // indirect
+	github.com/richardlehane/msoleps v1.0.3 // indirect
 	github.com/rivo/uniseg v0.4.4 // indirect
+	github.com/robfig/cron v1.2.0 // indirect
+	github.com/rogpeppe/go-internal v1.11.0 // indirect
 	github.com/rs/xid v1.6.0 // indirect
 	github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d // indirect
+	github.com/santhosh-tekuri/jsonschema/v5 v5.3.1
 	github.com/shopspring/decimal v1.2.0 // indirect
+	github.com/ssor/bom v0.0.0-20170718123548-6386211fdfcf // indirect
+	github.com/streamer45/silero-vad-go v0.2.1
+	github.com/stretchr/objx v0.5.2 // indirect
+	github.com/stretchr/testify v1.9.0
 	github.com/temoto/robotstxt v1.1.2 // indirect
 	github.com/tidwall/gjson v1.14.4 // indirect
 	github.com/tidwall/match v1.1.1 // indirect
@@ -185,6 +244,7 @@ require (
 	github.com/xuri/efp v0.0.0-20231025114914-d1ff6096ae53 // indirect
 	github.com/xuri/nfp v0.0.0-20230919160717-d98342af3f05 // indirect
 	github.com/youmark/pkcs8 v0.0.0-20181117223130-1be2e3e5546d // indirect
+	github.com/zaf/resample v1.5.0
 	github.com/zeebo/xxh3 v1.0.2 // indirect
 	gitlab.com/golang-commonmark/html v0.0.0-20191124015941-a22733972181 // indirect
 	gitlab.com/golang-commonmark/linkify v0.0.0-20191026162114-a0c2df6c8f82 // indirect
@@ -264,10 +324,12 @@ require (
 	go.uber.org/atomic v1.10.0 // indirect
 	go.uber.org/multierr v1.10.0 // indirect
 	golang.org/x/crypto v0.26.0
+	golang.org/x/sync v0.8.0 // indirect
 	golang.org/x/sys v0.24.0 // indirect
 	golang.org/x/text v0.17.0
 	golang.org/x/time v0.5.0 // indirect
 	google.golang.org/genproto v0.0.0-20240722135656-d784300faade // indirect
 	gopkg.in/yaml.v3 v3.0.1
 	gorm.io/driver/mysql v1.4.7 // indirect
+	modernc.org/mathutil v1.5.0 // indirect
 )
diff --git a/go.sum b/go.sum
index b5892a129..ce630fe4d 100644
--- a/go.sum
+++ b/go.sum
@@ -783,6 +783,8 @@ github.com/dgrijalva/jwt-go v3.2.0+incompatible/go.mod h1:E3ru+11k8xSBh+hMPgOLZm
 github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f h1:lO4WD4F/rVNCu3HqELle0jiPLLBs70cWOduZpkS1E78=
 github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f/go.mod h1:cuUVRXasLTGF7a8hSLbxyZXjz+1KgoB3wDUb6vlszIc=
 github.com/dgryski/go-sip13 v0.0.0-20181026042036-e10d5fee7954/go.mod h1:vAd38F8PWV+bWy6jNmig1y/TA+kYO4g3RSRF0IAv0no=
+github.com/dh1tw/gosamplerate v0.1.2 h1:oyqtZk67xB9B4l+vIZCZ3F0RYV/z66W58VOah11/ktI=
+github.com/dh1tw/gosamplerate v0.1.2/go.mod h1:zooTyHpoR7hE+FLfdE3yjLHb2QA2NpMusNfuaZqEACM=
 github.com/dhui/dktest v0.3.10 h1:0frpeeoM9pHouHjhLeZDuDTJ0PqjDTrycaHaMmkJAo8=
 github.com/dhui/dktest v0.3.10/go.mod h1:h5Enh0nG3Qbo9WjNFRrwmKUaePEBhXMOygbz3Ww7Sz0=
 github.com/disintegration/imaging v1.6.2/go.mod h1:44/5580QXChDfwIclfc/PCwrr44amcmDAg8hxG0Ewe4=
@@ -890,6 +892,12 @@ github.com/ghodss/yaml v0.0.0-20150909031657-73d445a93680/go.mod h1:4dBDuWmgqj2H
 github.com/ghodss/yaml v1.0.0/go.mod h1:4dBDuWmgqj2HViK6kFavaiC9ZROes6MMH2rRYeMEF04=
 github.com/gigawattio/window v0.0.0-20180317192513-0f5467e35573 h1:u8AQ9bPa9oC+8/A/jlWouakhIvkFfuxgIIRjiy8av7I=
 github.com/gigawattio/window v0.0.0-20180317192513-0f5467e35573/go.mod h1:eBvb3i++NHDH4Ugo9qCvMw8t0mTSctaEa5blJbWcNxs=
+github.com/go-audio/audio v1.0.0 h1:zS9vebldgbQqktK4H0lUqWrG8P0NxCJVqcj7ZpNnwd4=
+github.com/go-audio/audio v1.0.0/go.mod h1:6uAu0+H2lHkwdGsAY+j2wHPNPpPoeg5AaEFh9FlA+Zs=
+github.com/go-audio/riff v1.0.0 h1:d8iCGbDvox9BfLagY94fBynxSPHO80LmZCaOsmKxokA=
+github.com/go-audio/riff v1.0.0/go.mod h1:l3cQwc85y79NQFCRB7TiPoNiaijp6q8Z0Uv38rVG498=
+github.com/go-audio/wav v1.1.0 h1:jQgLtbqBzY7G+BM8fXF7AHUk1uHUviWS4X39d5rsL2g=
+github.com/go-audio/wav v1.1.0/go.mod h1:mpe9qfwbScEbkd8uybLuIpTgHyrISw/OTuvjUW2iGtE=
 github.com/go-chi/chi/v5 v5.0.0/go.mod h1:BBug9lr0cqtdAhsu6R4AAdvufI0/XBzAQSsUqJpoZOs=
 github.com/go-chi/chi/v5 v5.1.0 h1:acVI1TYaD+hhedDJ3r54HyA6sExp3HfXq7QWEEY/xMw=
 github.com/go-chi/chi/v5 v5.1.0/go.mod h1:DslCQbL2OYiznFReuXYUmQ2hGd1aDpCnlMNITLSKoi8=
@@ -1260,8 +1268,6 @@ github.com/hashicorp/yamux v0.0.0-20181012175058-2f1d1f20f75d/go.mod h1:+NfK9FKe
 github.com/hjson/hjson-go/v4 v4.0.0 h1:wlm6IYYqHjOdXH1gHev4VoXCaW20HdQAGCxdOEEg2cs=
 github.com/hjson/hjson-go/v4 v4.0.0/go.mod h1:KaYt3bTw3zhBjYqnXkYywcYctk0A2nxeEFTse3rH13E=
 github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU=
-github.com/iFaceless/godub v0.0.0-20200728093528-a30bb4d1a0f1 h1:oqeURuHQrImMykykqJgFbStlaDXyY7JpXXrwXyjr9ls=
-github.com/iFaceless/godub v0.0.0-20200728093528-a30bb4d1a0f1/go.mod h1:tKRg0K9YmfD3eD6KFos+YHIVMouKMzxDSK5XpdxdCUI=
 github.com/iancoleman/strcase v0.2.0/go.mod h1:iwCmte+B7n89clKwxIoIXy/HfoL7AsD47ZCWhYzw7ho=
 github.com/iancoleman/strcase v0.3.0 h1:nTXanmYxhfFAMjZL34Ov6gkzEsSJZ5DbhxWjvSASxEI=
 github.com/iancoleman/strcase v0.3.0/go.mod h1:iwCmte+B7n89clKwxIoIXy/HfoL7AsD47ZCWhYzw7ho=
@@ -1548,6 +1554,8 @@ github.com/mitchellh/osext v0.0.0-20151018003038-5e2d6d41470f/go.mod h1:OkQIRizQ
 github.com/mitchellh/reflectwalk v1.0.0/go.mod h1:mSTlrgnPZtwu0c4WaC2kGObEpuNDbx0jmZXqmk4esnw=
 github.com/mitchellh/reflectwalk v1.0.2 h1:G2LzWKi524PWgd3mLHV8Y5k7s6XUvT0Gef6zxSIeXaQ=
 github.com/mitchellh/reflectwalk v1.0.2/go.mod h1:mSTlrgnPZtwu0c4WaC2kGObEpuNDbx0jmZXqmk4esnw=
+github.com/mjibson/go-dsp v0.0.0-20180508042940-11479a337f12 h1:dd7vnTDfjtwCETZDrRe+GPYNLA1jBtbZeyfyE8eZCyk=
+github.com/mjibson/go-dsp v0.0.0-20180508042940-11479a337f12/go.mod h1:i/KKcxEWEO8Yyl11DYafRPKOPVYTrhxiTRigjtEEXZU=
 github.com/moby/docker-image-spec v1.3.1 h1:jMKff3w6PgbfSa69GfNg+zN/XLhfXJGnEx3Nl2EsFP0=
 github.com/moby/docker-image-spec v1.3.1/go.mod h1:eKmb5VW8vQEh/BAr2yvVNvuiJuY6UIocYsFu/DxxRpo=
 github.com/moby/locker v1.0.1/go.mod h1:S7SDdo5zpBK84bzzVlKr2V0hz+7x9hWbYC/kq7oQppc=
@@ -1839,6 +1847,8 @@ github.com/ssor/bom v0.0.0-20170718123548-6386211fdfcf h1:pvbZ0lM0XWPBqUKqFU8cma
 github.com/ssor/bom v0.0.0-20170718123548-6386211fdfcf/go.mod h1:RJID2RhlZKId02nZ62WenDCkgHFerpIOmW0iT7GKmXM=
 github.com/stefanberger/go-pkcs11uri v0.0.0-20201008174630-78d3cae3a980/go.mod h1:AO3tvPzVZ/ayst6UlUKUv6rcPQInYe3IknH3jYhAKu8=
 github.com/stoewer/go-strcase v1.2.0/go.mod h1:IBiWB2sKIp3wVVQ3Y035++gc+knqhUQag1KpM8ahLw8=
+github.com/streamer45/silero-vad-go v0.2.1 h1:Li1/tTC4H/3cyw6q4weX+U8GWwEL3lTekK/nYa1Cvuk=
+github.com/streamer45/silero-vad-go v0.2.1/go.mod h1:B+2FXs/5fZ6pzl6unUZYhZqkYdOB+3saBVzjOzdZnUs=
 github.com/stretchr/objx v0.0.0-20180129172003-8a3f7159479f/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
 github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
 github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
@@ -1947,6 +1957,8 @@ github.com/yuin/goldmark v1.6.0/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5ta
 github.com/yvasiyarov/go-metrics v0.0.0-20140926110328-57bccd1ccd43/go.mod h1:aX5oPXxHm3bOH+xeAttToC8pqch2ScQN/JoXYupl6xs=
 github.com/yvasiyarov/gorelic v0.0.0-20141212073537-a9bba5b9ab50/go.mod h1:NUSPSUX/bi6SeDMUh6brw0nXpxHnc96TguQh0+r/ssA=
 github.com/yvasiyarov/newrelic_platform_go v0.0.0-20140908184405-b21fdbd4370f/go.mod h1:GlGEuHIJweS1mbCqG+7vt2nvWLzLLnRHbXz5JKd/Qbg=
+github.com/zaf/resample v1.5.0 h1:c3yumHrV1cJoED8ZY2Ai3cehS8s0mJSroA9/vMaUcho=
+github.com/zaf/resample v1.5.0/go.mod h1:e4yWalfgRccQrnZSrkIxTqmMCOPhTi1xvYpNpRIB13k=
 github.com/zeebo/assert v1.3.0 h1:g7C04CbJuIDKNPFHmsk4hwZDO5O+kntRxzaUoNXj+IQ=
 github.com/zeebo/assert v1.3.0/go.mod h1:Pq9JiuJQpG8JLJdtkwrJESF0Foym2/D9XMU5ciN/wJ0=
 github.com/zeebo/xxh3 v1.0.2 h1:xZmwmqxHZA8AI603jOQ0tMqmBr9lPeFwGg6d+xy9DC0=
diff --git a/pkg/component/operator/audio/v0/.compogen/bottom.mdx b/pkg/component/operator/audio/v0/.compogen/bottom.mdx
index 1dee42721..31e5d6f97 100644
--- a/pkg/component/operator/audio/v0/.compogen/bottom.mdx
+++ b/pkg/component/operator/audio/v0/.compogen/bottom.mdx
@@ -5,36 +5,27 @@ Recipe for the [Audio Transcription Generator](https://instill.tech/instill-ai/p
 ```yaml
 version: v1beta
 component:
-  audio-spliter:
+  audio-vad:
     type: audio
-    task: TASK_SLICE_AUDIO
     input:
       audio: ${variable.audio}
-      end-time: ${variable.end_time}
-      start-time: ${variable.start_time}
-  get-transcription:
-    type: openai
-    task: TASK_SPEECH_RECOGNITION
+      min-silence-duration: 300
+      speech-pad: 10
+    task: TASK_DETECT_ACTIVITY
+  audio-segment:
+    type: audio
     input:
-      audio: ${audio-spliter.output.audio}
-      model: whisper-1
-    setup:
-      api-key: ${secret.INSTILL_SECRET}
+      audio: ${variable.audio}
+      segments: ${audio-vad.output.segments}
+    task: TASK_SEGMENT
 variable:
   audio:
-    title: audio
-    description: the audio you want to get the transcription from
-    instill-format: audio/*
-  end_time:
-    title: end-time
-    description: the end time you want to extract in seconds i.e. 2 mins is 120 seconds
-    instill-format: number
-  start_time:
-    title: start-time
-    description: the start time you want to extract in seconds i.e. 2 mins is 120 seconds
-    instill-format: number
+    title: Audio to test
+    description: Audio to test VAD and extraction
+    instill-format: audio
 output:
-  result:
-    title: result
-    value: ${get-transcription.output.text}
+  samples:
+    title: Output audio segments
+    description: Output extracted audio segments
+    value: ${audio-segment.output.audio-segments}
 ```
diff --git a/pkg/component/operator/audio/v0/README.mdx b/pkg/component/operator/audio/v0/README.mdx
index 12b170085..89afa080d 100644
--- a/pkg/component/operator/audio/v0/README.mdx
+++ b/pkg/component/operator/audio/v0/README.mdx
@@ -5,10 +5,10 @@ draft: false
 description: "Learn about how to set up a VDP Audio component https://github.com/instill-ai/instill-core"
 ---
 
-The Audio component is an operator component that allows users to extract and manipulate audio from different sources.
+The Audio component is an operator component that allows users to operate on audio data.
 It can carry out the following tasks:
-- [Chunk Audios](#chunk-audios)
-- [Slice Audio](#slice-audio)
+- [Detect Activity](#detect-activity)
+- [Segment](#segment)
 
 
 
@@ -29,17 +29,18 @@ The component definition and tasks are defined in the [definition.json](https://
 
 ## Supported Tasks
 
-### Chunk Audios
+### Detect Activity
 
-Split audio file into chunks
+Detect speech segments in audio data using Voice Activity Detection (VAD). This task processes the input audio to 16kHz mono format, identifies periods of human speech, and outputs time segments for each detected speech activity.
 
 <div class="markdown-col-no-wrap" data-col-1 data-col-2>
 
 | Input | ID | Type | Description |
 | :--- | :--- | :--- | :--- |
-| Task ID (required) | `task` | string | `TASK_CHUNK_AUDIOS` |
-| Audio (required) | `audio` | string | Base64 encoded audio file to be split |
-| Chunk Count (required) | `chunk-count` | integer | Number of chunks to equally split the audio into |
+| Task ID (required) | `task` | string | `TASK_DETECT_ACTIVITY` |
+| Audio (required) | `audio` | string | Audio file to analyze for speech activity. |
+| Minimum Silence Duration | `min-silence-duration` | integer | Minimum duration of silence (in milliseconds) required to split speech segments. Longer values result in fewer, longer segments. |
+| Speech Pad | `speech-pad` | integer | Additional padding (in milliseconds) added to the start and end of each detected speech segment to prevent cutting off speech. |
 </div>
 
 
@@ -51,25 +52,52 @@ Split audio file into chunks
 
 | Output | ID | Type | Description |
 | :--- | :--- | :--- | :--- |
-| Audios | `audios` | array[string] | A list of base64 encoded audios |
+| [Segments](#detect-activity-segments) | `segments` | array[object] | Array of time segments representing detected speech activity. Each segment contains start and end times in seconds. |
 </div>
 
-### Slice Audio
+<details>
+<summary> Output Objects in Detect Activity</summary>
 
-Specify a time range to slice an audio file
+<h4 id="detect-activity-segments">Segments</h4>
+
+<div class="markdown-col-no-wrap" data-col-1 data-col-2>
+
+| Field | Field ID | Type | Note |
+| :--- | :--- | :--- | :--- |
+| End Time | `end-time` | number | The number of seconds from the beginning of the audio file to the end of this segment. |
+| Start Time | `start-time` | number | The number of seconds from the beginning of the audio file to the start of this segment. |
+</div>
+</details>
+
+### Segment
+
+Segment audio data into pieces based on the provided time segments.
 
 <div class="markdown-col-no-wrap" data-col-1 data-col-2>
 
 | Input | ID | Type | Description |
 | :--- | :--- | :--- | :--- |
-| Task ID (required) | `task` | string | `TASK_SLICE_AUDIO` |
-| Audio (required) | `audio` | string | Base64 encoded audio file to be sliced |
-| Start Time (required) | `start-time` | integer | Start time of the slice in seconds |
-| End Time (required) | `end-time` | integer | End time of the slice in seconds |
+| Task ID (required) | `task` | string | `TASK_SEGMENT` |
+| Audio (required) | `audio` | string | Audio data to segment. |
+| [Segments](#segment-segments) (required) | `segments` | array[object] | A list of time segments of audio data. |
 </div>
 
 
+<details>
+<summary> Input Objects in Segment</summary>
 
+<h4 id="segment-segments">Segments</h4>
+
+A list of time segments of audio data.
+
+<div class="markdown-col-no-wrap" data-col-1 data-col-2>
+
+| Field | Field ID | Type | Note |
+| :--- | :--- | :--- | :--- |
+| End Time | `end-time` | number | The number of seconds from the beginning of the audio file to the end of this segment.  |
+| Start Time | `start-time` | number | The number of seconds from the beginning of the audio file to the start of this segment.  |
+</div>
+</details>
 
 
 
@@ -77,7 +105,7 @@ Specify a time range to slice an audio file
 
 | Output | ID | Type | Description |
 | :--- | :--- | :--- | :--- |
-| Audio | `audio` | string | Base64 encoded audio slice |
+| Audios | `audio-segments` | array[string] | A list of segmented audio data. |
 </div>
 
 
@@ -88,36 +116,27 @@ Recipe for the [Audio Transcription Generator](https://instill.tech/instill-ai/p
 ```yaml
 version: v1beta
 component:
-  audio-spliter:
+  audio-vad:
     type: audio
-    task: TASK_SLICE_AUDIO
     input:
       audio: ${variable.audio}
-      end-time: ${variable.end_time}
-      start-time: ${variable.start_time}
-  get-transcription:
-    type: openai
-    task: TASK_SPEECH_RECOGNITION
+      min-silence-duration: 300
+      speech-pad: 10
+    task: TASK_DETECT_ACTIVITY
+  audio-segment:
+    type: audio
     input:
-      audio: ${audio-spliter.output.audio}
-      model: whisper-1
-    setup:
-      api-key: ${secret.INSTILL_SECRET}
+      audio: ${variable.audio}
+      segments: ${audio-vad.output.segments}
+    task: TASK_SEGMENT
 variable:
   audio:
-    title: audio
-    description: the audio you want to get the transcription from
-    instill-format: audio/*
-  end_time:
-    title: end-time
-    description: the end time you want to extract in seconds i.e. 2 mins is 120 seconds
-    instill-format: number
-  start_time:
-    title: start-time
-    description: the start time you want to extract in seconds i.e. 2 mins is 120 seconds
-    instill-format: number
+    title: Audio to test
+    description: Audio to test VAD and extraction
+    instill-format: audio
 output:
-  result:
-    title: result
-    value: ${get-transcription.output.text}
+  samples:
+    title: Output audio segments
+    description: Output extracted audio segments
+    value: ${audio-segment.output.audio-segments}
 ```
diff --git a/pkg/component/operator/audio/v0/audio.go b/pkg/component/operator/audio/v0/audio.go
new file mode 100644
index 000000000..52eb337b1
--- /dev/null
+++ b/pkg/component/operator/audio/v0/audio.go
@@ -0,0 +1,53 @@
+package audio
+
+import (
+	"bytes"
+	"fmt"
+
+	"github.com/go-audio/audio"
+	"github.com/go-audio/wav"
+	"github.com/instill-ai/pipeline-backend/pkg/data"
+	"github.com/instill-ai/pipeline-backend/pkg/data/format"
+)
+
+const (
+	sampleRate = 16000 // sample rate in Hz that detectActivity resamples to and passes to the speech detector
+	numChannel = 1     // detectActivity downmixes audio that has more channels than this
+)
+
+// decodeAudioWAV converts audioData to WAV unless its content type already is
+// data.WAV, then decodes the PCM samples into an integer buffer. It returns
+// that buffer together with the decoder that read the WAV header.
+func decodeAudioWAV(audioData format.Audio) (*audio.IntBuffer, *wav.Decoder, error) {
+	wavAudioData := audioData
+	if audioData.ContentType().String() != data.WAV {
+		converted, err := audioData.Convert(data.WAV)
+		if err != nil {
+			return nil, nil, fmt.Errorf("error converting audio data to WAV: %v", err)
+		}
+		wavAudioData = converted
+	}
+
+	binary, err := wavAudioData.Binary()
+	if err != nil {
+		return nil, nil, fmt.Errorf("error getting binary data for audio: %v", err)
+	}
+	raw := binary.ByteArray()
+	dec := wav.NewDecoder(bytes.NewReader(raw))
+	if !dec.IsValidFile() {
+		return nil, nil, fmt.Errorf("invalid WAV file")
+	}
+
+	audioBuf := &audio.IntBuffer{
+		Format: &audio.Format{
+			NumChannels: int(dec.NumChans),
+			SampleRate:  int(dec.SampleRate),
+		},
+		Data:           make([]int, len(raw)), // one slot per input byte
+		SourceBitDepth: int(dec.BitDepth),
+	}
+	if _, err := dec.PCMBuffer(audioBuf); err != nil {
+		return nil, nil, fmt.Errorf("reading audio data: %w", err)
+	}
+	return audioBuf, dec, nil
+}
diff --git a/pkg/component/operator/audio/v0/audio_operation.go b/pkg/component/operator/audio/v0/audio_operation.go
deleted file mode 100644
index 6559b6cbe..000000000
--- a/pkg/component/operator/audio/v0/audio_operation.go
+++ /dev/null
@@ -1,155 +0,0 @@
-package audio
-
-import (
-	"bytes"
-	"encoding/base64"
-	"fmt"
-	"time"
-
-	"github.com/iFaceless/godub"
-	"github.com/iFaceless/godub/wav"
-	"google.golang.org/protobuf/types/known/structpb"
-
-	"github.com/instill-ai/pipeline-backend/pkg/component/base"
-	"github.com/instill-ai/pipeline-backend/pkg/component/internal/util"
-)
-
-type ChunkAudiosInput struct {
-	Audio      Audio `json:"audio"`
-	ChunkCount int   `json:"chunk-count"`
-}
-
-type ChunkAudiosOutput struct {
-	Audios []Audio `json:"audios"`
-}
-
-type SliceAudioInput struct {
-	Audio     Audio `json:"audio"`
-	StartTime int   `json:"start-time"`
-	EndTime   int   `json:"end-time"`
-}
-
-type SliceAudioOutput struct {
-	Audio Audio `json:"audio"`
-}
-
-type ConcatenateInput struct {
-	Audios []Audio `json:"audios"`
-}
-
-type ConcatenateOutput struct {
-	Audio Audio `json:"audio"`
-}
-
-// Base64 encoded audio
-type Audio string
-
-func chunkAudios(input *structpb.Struct) (*structpb.Struct, error) {
-
-	var inputStruct ChunkAudiosInput
-
-	err := base.ConvertFromStructpb(input, &inputStruct)
-	if err != nil {
-		return nil, err
-	}
-
-	buf, err := base64.StdEncoding.DecodeString(util.TrimBase64Mime(string(inputStruct.Audio)))
-	if err != nil {
-		return nil, err
-	}
-
-	segment, err := godub.NewLoader().Load(bytes.NewReader(buf))
-
-	if err != nil {
-		return nil, fmt.Errorf("failed to load audio: %w", err)
-	}
-
-	duration := segment.Duration()
-
-	chunkSeconds := float64(duration) / float64(inputStruct.ChunkCount)
-
-	var audioSegments []*godub.AudioSegment
-
-	var startTime time.Duration
-	for i := 0; i < inputStruct.ChunkCount; i++ {
-		startTime = getStartTime(chunkSeconds, i)
-		endTime := getEndTime(chunkSeconds, i, inputStruct.ChunkCount, duration)
-
-		slicedSegment, err := segment.Slice(startTime, endTime)
-		if err != nil {
-			return nil, fmt.Errorf("failed to slice audio: %w in chunk %v", err, i)
-		}
-		audioSegments = append(audioSegments, slicedSegment)
-	}
-
-	var audios []Audio
-	prefix := "data:audio/wav;base64,"
-	for _, segment := range audioSegments {
-		var wavBuf bytes.Buffer
-		err = wav.Encode(&wavBuf, segment.AsWaveAudio())
-
-		if err != nil {
-			return nil, fmt.Errorf("failed to encode audio to wav: %w", err)
-		}
-
-		audios = append(audios, Audio(prefix+base64.StdEncoding.EncodeToString(wavBuf.Bytes())))
-	}
-
-	output := ChunkAudiosOutput{
-		Audios: audios,
-	}
-
-	return base.ConvertToStructpb(output)
-}
-
-func sliceAudio(input *structpb.Struct) (*structpb.Struct, error) {
-
-	var inputStruct SliceAudioInput
-
-	err := base.ConvertFromStructpb(input, &inputStruct)
-	if err != nil {
-		return nil, err
-	}
-
-	buf, err := base64.StdEncoding.DecodeString(util.TrimBase64Mime(string(inputStruct.Audio)))
-	if err != nil {
-		return nil, err
-	}
-
-	segment, err := godub.NewLoader().Load(bytes.NewReader(buf))
-
-	if err != nil {
-		return nil, fmt.Errorf("failed to load audio: %w", err)
-	}
-
-	startTime := time.Duration(inputStruct.StartTime) * time.Second
-	endTime := time.Duration(inputStruct.EndTime) * time.Second
-
-	slicedSegment, err := segment.Slice(startTime, endTime)
-	if err != nil {
-		return nil, fmt.Errorf("failed to slice audio: %w", err)
-	}
-
-	var wavBuf bytes.Buffer
-	err = wav.Encode(&wavBuf, slicedSegment.AsWaveAudio())
-	if err != nil {
-		return nil, fmt.Errorf("failed to encode audio to wav: %w", err)
-	}
-
-	output := SliceAudioOutput{
-		Audio: Audio("data:audio/wav;base64," + base64.StdEncoding.EncodeToString(wavBuf.Bytes())),
-	}
-
-	return base.ConvertToStructpb(output)
-}
-
-func getStartTime(chunkSeconds float64, i int) time.Duration {
-	return time.Duration(chunkSeconds * float64(i))
-}
-
-func getEndTime(chunkSeconds float64, i, totalCount int, duration time.Duration) time.Duration {
-	if i == totalCount-1 {
-		return duration
-	}
-	return time.Duration(chunkSeconds * float64(i+1))
-}
diff --git a/pkg/component/operator/audio/v0/config/definition.json b/pkg/component/operator/audio/v0/config/definition.json
index 6e77ddad5..15f83127a 100644
--- a/pkg/component/operator/audio/v0/config/definition.json
+++ b/pkg/component/operator/audio/v0/config/definition.json
@@ -1,7 +1,7 @@
 {
   "availableTasks": [
-    "TASK_CHUNK_AUDIOS",
-    "TASK_SLICE_AUDIO"
+    "TASK_DETECT_ACTIVITY",
+    "TASK_SEGMENT"
   ],
   "documentationUrl": "https://www.instill.tech/docs/component/operator/audio",
   "icon": "assets/audio.svg",
@@ -13,6 +13,6 @@
   "uid": "b5c75caa-9261-4757-bfbf-12e908f59f16",
   "version": "0.1.0",
   "sourceUrl": "https://github.com/instill-ai/pipeline-backend/blob/main/pkg/component/operator/audio/v0",
-  "description": "Extract and manipulate audio from different sources",
+  "description": "Operate on audio data.",
   "releaseStage": "RELEASE_STAGE_ALPHA"
 }
diff --git a/pkg/component/operator/audio/v0/config/tasks.json b/pkg/component/operator/audio/v0/config/tasks.json
index b2fa6de39..3eedbc0c3 100644
--- a/pkg/component/operator/audio/v0/config/tasks.json
+++ b/pkg/component/operator/audio/v0/config/tasks.json
@@ -1,18 +1,41 @@
 {
-  "TASK_CHUNK_AUDIOS": {
-    "instillShortDescription": "Split audio file into chunks",
-    "input": {
-      "description": "Audio file to split",
-      "instillEditOnNodeFields": [
-        "audio",
-        "chunk-count"
+  "$defs": {
+    "segment": {
+      "properties": {
+        "start-time": {
+          "title": "Start Time",
+          "type": "number",
+          "description": "The number of seconds from the beginning of the audio file to the start of this segment.",
+          "instillFormat": "number",
+          "instillUIOrder": 0
+        },
+        "end-time": {
+          "title": "End Time",
+          "type": "number",
+          "description": "The number of seconds from the beginning of the audio file to the end of this segment.",
+          "instillFormat": "number",
+          "instillUIOrder": 1
+        }
+      },
+      "required": [
+        "start-time",
+        "end-time"
       ],
+      "title": "Segment",
+      "type": "object",
+      "description": "A time segment of audio data, defined by its start and end times in seconds."
+    }
+  },
+  "TASK_DETECT_ACTIVITY": {
+    "instillShortDescription": "Detect speech segments in audio data using Voice Activity Detection (VAD). This task processes the input audio to 16kHz mono format, identifies periods of human speech, and outputs time segments for each detected speech activity.",
+    "input": {
+      "description": "Input",
       "instillUIOrder": 0,
       "properties": {
         "audio": {
-          "description": "Base64 encoded audio file to be split",
+          "description": "Audio file to analyze for speech activity.",
           "instillAcceptFormats": [
-            "audio/*",
+            "audio/wav",
             "application/octet-stream"
           ],
           "instillUIOrder": 0,
@@ -22,63 +45,67 @@
           "title": "Audio",
           "type": "string"
         },
-        "chunk-count": {
-          "description": "Number of chunks to equally split the audio into",
+        "min-silence-duration": {
+          "description": "Minimum duration of silence (in milliseconds) required to split speech segments. Longer values result in fewer, longer segments.",
           "instillAcceptFormats": [
             "integer",
             "number"
           ],
-          "instillUpstreamTypes": [
-            "reference",
-            "value"
-          ],
           "instillUIOrder": 1,
-          "title": "Chunk count",
-          "type": "integer"
+          "type": "integer",
+          "minimum": 0,
+          "title": "Minimum Silence Duration",
+          "default": 100
+        },
+        "speech-pad": {
+          "description": "Additional padding (in milliseconds) added to the start and end of each detected speech segment to prevent cutting off speech.",
+          "instillAcceptFormats": [
+            "integer",
+            "number"
+          ],
+          "instillUIOrder": 2,
+          "type": "integer",
+          "minimum": 0,
+          "title": "Speech Pad",
+          "default": 30
         }
       },
       "required": [
-        "audio",
-        "chunk-count"
+        "audio"
       ],
       "title": "Input",
       "type": "object"
     },
     "output": {
+      "description": "Output",
       "instillUIOrder": 0,
       "properties": {
-        "audios": {
-          "description": "A list of base64 encoded audios",
-          "instillFormat": "array:audio/wav",
+        "segments": {
+          "description": "Array of time segments representing detected speech activity. Each segment contains start and end times in seconds.",
+          "instillFormat": "array:object",
           "instillUIOrder": 0,
           "items": {
-            "type": "string",
-            "title": "Audio"
+            "$ref": "#/$defs/segment"
           },
-          "title": "Audios",
+          "title": "Segments",
           "type": "array"
         }
       },
       "required": [
-        "audios"
+        "segments"
       ],
       "title": "Output",
       "type": "object"
     }
   },
-  "TASK_SLICE_AUDIO": {
-    "instillShortDescription": "Specify a time range to slice an audio file",
+  "TASK_SEGMENT": {
+    "instillShortDescription": "Segment audio data into pieces based on the provided time segments.",
     "input": {
-      "description": "Audio file to slice",
-      "instillEditOnNodeFields": [
-        "audio",
-        "start-time",
-        "end-time"
-      ],
+      "description": "Input",
       "instillUIOrder": 0,
       "properties": {
         "audio": {
-          "description": "Base64 encoded audio file to be sliced",
+          "description": "Audio data to segment.",
           "instillAcceptFormats": [
             "audio/*",
             "application/octet-stream"
@@ -90,56 +117,42 @@
           "title": "Audio",
           "type": "string"
         },
-        "start-time": {
-          "description": "Start time of the slice in seconds",
-          "instillAcceptFormats": [
-            "integer",
-            "number"
-          ],
-          "instillUpstreamTypes": [
-            "reference",
-            "value"
-          ],
+        "segments": {
+          "description": "A list of time segments of audio data.",
+          "instillFormat": "array:object",
           "instillUIOrder": 1,
-          "title": "Start time",
-          "type": "integer"
-        },
-        "end-time": {
-          "description": "End time of the slice in seconds",
-          "instillAcceptFormats": [
-            "integer",
-            "number"
-          ],
-          "instillUpstreamTypes": [
-            "reference",
-            "value"
-          ],
-          "instillUIOrder": 2,
-          "title": "End time",
-          "type": "integer"
+          "items": {
+            "$ref": "#/$defs/segment"
+          },
+          "title": "Segments",
+          "type": "array"
         }
       },
       "required": [
         "audio",
-        "start-time",
-        "end-time"
+        "segments"
       ],
       "title": "Input",
       "type": "object"
     },
     "output": {
+      "description": "Output",
       "instillUIOrder": 0,
       "properties": {
-        "audio": {
-          "description": "Base64 encoded audio slice",
-          "instillFormat": "audio/wav",
+        "audio-segments": {
+          "description": "A list of segmented audio data.",
+          "instillFormat": "array:audio/*",
           "instillUIOrder": 0,
-          "title": "Audio",
-          "type": "string"
+          "items": {
+            "type": "string",
+            "title": "Audio"
+          },
+          "title": "Audios",
+          "type": "array"
         }
       },
       "required": [
-        "audio"
+        "audio-segments"
       ],
       "title": "Output",
       "type": "object"
diff --git a/pkg/component/operator/audio/v0/io.go b/pkg/component/operator/audio/v0/io.go
new file mode 100644
index 000000000..5dffc7e63
--- /dev/null
+++ b/pkg/component/operator/audio/v0/io.go
@@ -0,0 +1,29 @@
+package audio
+
+import (
+	"github.com/instill-ai/pipeline-backend/pkg/data/format"
+)
+
+type segmentData struct {
+	StartTime float64 `instill:"start-time"` // seconds from the beginning of the audio to the start of the segment
+	EndTime   float64 `instill:"end-time"`   // seconds from the beginning of the audio to the end of the segment
+}
+
+type detectActivityInput struct {
+	Audio              format.Audio `instill:"audio"`                // audio to analyze for speech activity
+	MinSilenceDuration int          `instill:"min-silence-duration"` // milliseconds; passed to the detector as MinSilenceDurationMs
+	SpeechPad          int          `instill:"speech-pad"`           // milliseconds; passed to the detector as SpeechPadMs
+}
+
+type detectActivityOutput struct {
+	Segments []segmentData `instill:"segments"` // one entry per speech segment reported by the detector
+}
+
+type segmentInput struct {
+	Audio    format.Audio  `instill:"audio"`    // audio data to segment
+	Segments []segmentData `instill:"segments"` // time segments of the audio data
+}
+
+type segmentOutput struct {
+	AudioSegments []format.Audio `instill:"audio-segments"` // list of segmented audio data
+}
diff --git a/pkg/component/operator/audio/v0/main.go b/pkg/component/operator/audio/v0/main.go
index bb4a6d0d7..ba92308eb 100644
--- a/pkg/component/operator/audio/v0/main.go
+++ b/pkg/component/operator/audio/v0/main.go
@@ -8,14 +8,12 @@ import (
 
 	_ "embed"
 
-	"google.golang.org/protobuf/types/known/structpb"
-
 	"github.com/instill-ai/pipeline-backend/pkg/component/base"
 )
 
 const (
-	taskChunkAudios string = "TASK_CHUNK_AUDIOS"
-	taskSliceAudio  string = "TASK_SLICE_AUDIO"
+	taskDetectActivity = "TASK_DETECT_ACTIVITY"
+	taskSegment        = "TASK_SEGMENT"
 )
 
 var (
@@ -33,8 +31,7 @@ type component struct {
 
 type execution struct {
 	base.ComponentExecution
-
-	execute func(*structpb.Struct) (*structpb.Struct, error)
+	execute func(context.Context, *base.Job) error
 }
 
 func Init(bc base.Component) *component {
@@ -54,10 +51,10 @@ func (c *component) CreateExecution(x base.ComponentExecution) (base.IExecution,
 	e := &execution{ComponentExecution: x}
 
 	switch x.Task {
-	case taskChunkAudios:
-		e.execute = chunkAudios
-	case taskSliceAudio:
-		e.execute = sliceAudio
+	case taskDetectActivity:
+		e.execute = detectActivity
+	case taskSegment:
+		e.execute = segment
 	default:
 		return nil, fmt.Errorf("%s task is not supported", x.Task)
 	}
@@ -66,5 +63,5 @@ func (c *component) CreateExecution(x base.ComponentExecution) (base.IExecution,
 }
 
 func (e *execution) Execute(ctx context.Context, jobs []*base.Job) error {
-	return base.SequentialExecutor(ctx, jobs, e.execute)
+	return base.ConcurrentExecutor(ctx, jobs, e.execute)
 }
diff --git a/pkg/component/operator/audio/v0/main_test.go b/pkg/component/operator/audio/v0/main_test.go
deleted file mode 100644
index 9aead4749..000000000
--- a/pkg/component/operator/audio/v0/main_test.go
+++ /dev/null
@@ -1,4 +0,0 @@
-package audio
-
-// TODO chuang8511 Investigate how to run test case with installing ffmpeg in test env
-// It will be arranged according to the product schedule
diff --git a/pkg/component/operator/audio/v0/task_detect_activity.go b/pkg/component/operator/audio/v0/task_detect_activity.go
new file mode 100644
index 000000000..604294f6a
--- /dev/null
+++ b/pkg/component/operator/audio/v0/task_detect_activity.go
@@ -0,0 +1,137 @@
+//go:build onnx
+// +build onnx
+
+// This task requires ONNX Runtime to be installed. Follow these steps to set it up:
+//
+// 1. Download ONNX Runtime:
+//   - Visit the official repository: https://github.com/microsoft/onnxruntime/releases
+//   - Choose the latest version compatible with your OS architecture
+//
+// 2. Install ONNX Runtime:
+//   - Extract the downloaded tar file to a directory (referred to as ONNXRUNTIME_ROOT_PATH)
+//   - Set up the environment:
+//     export C_INCLUDE_PATH=$ONNXRUNTIME_ROOT_PATH/include
+//     export LD_RUN_PATH=$ONNXRUNTIME_ROOT_PATH/lib
+//     export LIBRARY_PATH=$ONNXRUNTIME_ROOT_PATH/lib
+
+// This task requires the following libraries to be installed:
+//   - libsoxr-dev (required for github.com/zaf/resample)
+
+package audio
+
+import (
+	"bytes"
+	"context"
+	"encoding/binary"
+	"fmt"
+	"math"
+	"os"
+	"path/filepath"
+
+	"github.com/go-audio/audio"
+	"github.com/streamer45/silero-vad-go/speech"
+	"github.com/zaf/resample"
+
+	"github.com/instill-ai/pipeline-backend/pkg/component/base"
+)
+
+// detectActivity runs voice activity detection on the job's input audio and
+// writes the detected speech segments to the job output. The audio is decoded
+// as WAV, then downmixed and resampled to sampleRate when necessary.
+//
+// The error result is named so the deferred Destroy call can surface a
+// cleanup failure when everything before it succeeded.
+func detectActivity(ctx context.Context, job *base.Job) (err error) {
+	var input detectActivityInput
+	if err := job.Input.ReadData(ctx, &input); err != nil {
+		return err
+	}
+
+	audioBuf, dec, err := decodeAudioWAV(input.Audio)
+	if err != nil {
+		return err
+	}
+
+	if audioBuf.Format.NumChannels > numChannel {
+		audioBuf = toMono(audioBuf)
+	}
+
+	if audioBuf.Format.SampleRate != sampleRate {
+		resampledData, err := resampleAudio(audioBuf.AsFloatBuffer().Data, float64(dec.SampleRate), float64(sampleRate), audioBuf.Format.NumChannels)
+		if err != nil {
+			return fmt.Errorf("resampling audio: %w", err)
+		}
+		audioBuf.Format.SampleRate = sampleRate
+		audioBuf.Data = resampledData
+	}
+
+	sd, err := speech.NewDetector(speech.DetectorConfig{
+		ModelPath:            filepath.Join(os.Getenv("ONNX_MODEL_FOLDER_PATH"), "silero_vad.onnx"),
+		SampleRate:           sampleRate,
+		Threshold:            0.5,
+		MinSilenceDurationMs: input.MinSilenceDuration,
+		SpeechPadMs:          input.SpeechPad,
+	})
+	if err != nil {
+		return fmt.Errorf("creating voice activity detector: %w", err)
+	}
+
+	defer func() {
+		if destroyErr := sd.Destroy(); destroyErr != nil && err == nil {
+			err = fmt.Errorf("destroy speech detector: %w", destroyErr)
+		}
+	}()
+
+	segments, err := sd.Detect(audioBuf.AsFloat32Buffer().Data)
+	if err != nil {
+		return fmt.Errorf("detect voice activity: %w", err)
+	}
+
+	dao := detectActivityOutput{
+		Segments: make([]segmentData, len(segments)),
+	}
+	for i, s := range segments {
+		dao.Segments[i] = segmentData{StartTime: s.SpeechStartAt, EndTime: s.SpeechEndAt}
+	}
+
+	return job.Output.WriteData(ctx, dao)
+}
+
+// toMono averages each pair of adjacent samples in place, i.e. it treats the data as 2-channel interleaved.
+func toMono(buffer *audio.IntBuffer) *audio.IntBuffer {
+	for i := range buffer.Data[:len(buffer.Data)/2] {
+		buffer.Data[i] = (buffer.Data[2*i] + buffer.Data[2*i+1]) / 2
+	}
+	buffer.Data, buffer.Format.NumChannels = buffer.Data[:len(buffer.Data)/2], 1
+	return buffer
+}
+
+// resampleAudio converts interleaved float64 samples from inputRate to
+// outputRate and returns the result truncated to ints.
+func resampleAudio(input []float64, inputRate, outputRate float64, channels int) ([]int, error) {
+	var buf bytes.Buffer
+	resampler, err := resample.New(&buf, inputRate, outputRate, channels, resample.F64, resample.HighQ)
+	if err != nil {
+		return nil, fmt.Errorf("creating resampler: %w", err)
+	}
+
+	// Convert []float64 to []byte
+	inputBytes := make([]byte, len(input)*8)
+	for i, v := range input {
+		binary.LittleEndian.PutUint64(inputBytes[i*8:], math.Float64bits(v))
+	}
+	_, err = resampler.Write(inputBytes)
+	// Close flushes buffered output into buf, so it must run before buf is read.
+	if closeErr := resampler.Close(); err == nil {
+		err = closeErr
+	}
+	if err != nil {
+		return nil, fmt.Errorf("writing to resampler: %w", err)
+	}
+	// Convert resampled []byte back to []int
+	resampledData := make([]int, buf.Len()/8)
+	for i := range resampledData {
+		resampledData[i] = int(math.Float64frombits(binary.LittleEndian.Uint64(buf.Bytes()[i*8:])))
+	}
+	return resampledData, nil
+}
diff --git a/pkg/component/operator/audio/v0/task_detect_activity_nontag.go b/pkg/component/operator/audio/v0/task_detect_activity_nontag.go
new file mode 100644
index 000000000..c20f2325b
--- /dev/null
+++ b/pkg/component/operator/audio/v0/task_detect_activity_nontag.go
@@ -0,0 +1,15 @@
+//go:build !onnx
+// +build !onnx
+
+package audio
+
+import (
+	"context"
+	"fmt"
+
+	"github.com/instill-ai/pipeline-backend/pkg/component/base"
+)
+
+func detectActivity(ctx context.Context, job *base.Job) error {
+	return fmt.Errorf("the Audio operator wasn't built with onnxruntime") // always fails: this file is compiled only without the onnx build tag
+}
diff --git a/pkg/component/operator/audio/v0/task_detect_activity_test.go b/pkg/component/operator/audio/v0/task_detect_activity_test.go
new file mode 100644
index 000000000..6b84d67e4
--- /dev/null
+++ b/pkg/component/operator/audio/v0/task_detect_activity_test.go
@@ -0,0 +1,159 @@
+//go:build onnx
+// +build onnx
+
+package audio
+
+import (
+	"context"
+	"encoding/json"
+	"io"
+	"math"
+	"os"
+	"testing"
+
+	"github.com/go-audio/audio"
+	"github.com/google/go-cmp/cmp"
+
+	qt "github.com/frankban/quicktest"
+
+	"github.com/instill-ai/pipeline-backend/pkg/component/base"
+	"github.com/instill-ai/pipeline-backend/pkg/component/internal/mock"
+	"github.com/instill-ai/pipeline-backend/pkg/data"
+)
+
+func TestDetectActivity(t *testing.T) {
+	c := qt.New(t)
+	// Table-driven: every case runs the detect-activity task on a WAV fixture and compares the result with a JSON fixture.
+	testCases := []struct {
+		name            string
+		audioFile       string
+		sampleRate      int     // NOTE(review): set in every case but never read by the test body
+		threshold       float64 // NOTE(review): set in every case but never read by the test body
+		silenceDuration int
+		speechPad       int
+		wantSegments    string
+		expectedError   string
+	}{
+		{
+			name:            "ok - detect voice activity (voice1)",
+			audioFile:       "testdata/voice1.wav",
+			sampleRate:      16000,
+			threshold:       0.5,
+			silenceDuration: 500,
+			speechPad:       100,
+			wantSegments:    "testdata/voice1-activity-segments.json",
+		},
+		{
+			name:            "ok - detect voice activity (voice2)",
+			audioFile:       "testdata/voice2.wav",
+			sampleRate:      16000,
+			threshold:       0.5,
+			silenceDuration: 500,
+			speechPad:       30,
+			wantSegments:    "testdata/voice2-activity-segments.json",
+		},
+	}
+
+	for _, tc := range testCases {
+		c.Run(tc.name, func(c *qt.C) {
+			component := Init(base.Component{})
+			c.Assert(component, qt.IsNotNil)
+
+			execution, err := component.CreateExecution(base.ComponentExecution{
+				Component: component,
+				Task:      taskDetectActivity,
+			})
+			c.Assert(err, qt.IsNil)
+			c.Assert(execution, qt.IsNotNil)
+
+			ir, ow, eh, job := mock.GenerateMockJob(c)
+
+			// Load the raw bytes of the audio fixture.
+			audioFile, err := os.Open(tc.audioFile)
+			c.Assert(err, qt.IsNil)
+			defer audioFile.Close()
+			audioData, err := io.ReadAll(audioFile)
+			c.Assert(err, qt.IsNil)
+			// Hand the fixture audio and the case parameters to the task when it reads its input.
+			ir.ReadDataMock.Set(func(ctx context.Context, input any) error {
+				switch input := input.(type) {
+				case *detectActivityInput:
+					audio, err := data.NewAudioFromBytes(audioData, "audio/wav", "input.wav")
+					c.Assert(err, qt.IsNil)
+					*input = detectActivityInput{
+						Audio:              audio,
+						MinSilenceDuration: tc.silenceDuration,
+						SpeechPad:          tc.speechPad,
+					}
+				}
+				return nil
+			})
+			// Keep whatever the task writes so it can be inspected after Execute returns.
+			var capturedOutput detectActivityOutput
+			ow.WriteDataMock.Set(func(ctx context.Context, output any) error {
+				capturedOutput = output.(detectActivityOutput)
+				return nil
+			})
+			// Any error reported by the task has to match the case's expectedError pattern.
+			eh.ErrorMock.Set(func(ctx context.Context, err error) {
+				c.Assert(err, qt.ErrorMatches, tc.expectedError)
+			})
+			// Mark as optional the mock that the expected outcome should leave unused.
+			if tc.expectedError != "" {
+				ow.WriteDataMock.Optional()
+			} else {
+				eh.ErrorMock.Optional()
+			}
+
+			err = execution.Execute(context.Background(), []*base.Job{job})
+
+			if tc.expectedError == "" {
+				c.Assert(err, qt.IsNil)
+
+				// Load the expected segments from the JSON fixture via the data package.
+				expectedSegmentsJSONData, err := os.ReadFile(tc.wantSegments)
+				c.Assert(err, qt.IsNil)
+				var expectedSegmentsStruct struct {
+					Segments []segmentData `instill:"segments"`
+				}
+
+				var segmentsMap map[string]interface{}
+				err = json.Unmarshal(expectedSegmentsJSONData, &segmentsMap)
+				c.Assert(err, qt.IsNil)
+
+				jsonValue, err := data.NewJSONValue(segmentsMap)
+				c.Assert(err, qt.IsNil)
+
+				c.Assert(data.Unmarshal(jsonValue, &expectedSegmentsStruct), qt.IsNil)
+				expectedSegments := expectedSegmentsStruct.Segments
+
+				c.Assert(capturedOutput.Segments, qt.HasLen, len(expectedSegments))
+				// Start and end times may differ from the fixture by at most 0.1.
+				for i, actual := range capturedOutput.Segments {
+					expected := expectedSegments[i]
+					c.Assert(actual.StartTime, floatEquals(0.1), expected.StartTime)
+					c.Assert(actual.EndTime, floatEquals(0.1), expected.EndTime)
+				}
+			}
+		})
+	}
+}
+
+// floatEquals builds a quicktest checker under which two float64 values are
+// equal when their absolute difference does not exceed epsilon.
+func floatEquals(epsilon float64) qt.Checker {
+	withinEpsilon := func(x, y float64) bool { return math.Abs(x-y) <= epsilon }
+	return qt.CmpEquals(cmp.Comparer(withinEpsilon))
+}
+
+// TestToMono checks that toMono folds interleaved two-channel samples into a
+// single channel and updates the channel count of the buffer format.
+func TestToMono(t *testing.T) {
+	c := qt.New(t)
+
+	stereo := &audio.IntBuffer{Format: &audio.Format{NumChannels: 2}, Data: []int{1, 2, 3, 4, 5, 6}}
+	// The pairs (1,2), (3,4) and (5,6) are expected to come out as 1, 3 and 5.
+	mono := toMono(stereo)
+	c.Assert(mono.Format.NumChannels, qt.Equals, 1)
+	c.Assert(mono.Data, qt.DeepEquals, []int{1, 3, 5})
+}
diff --git a/pkg/component/operator/audio/v0/task_segment.go b/pkg/component/operator/audio/v0/task_segment.go
new file mode 100644
index 000000000..a7a54f605
--- /dev/null
+++ b/pkg/component/operator/audio/v0/task_segment.go
@@ -0,0 +1,107 @@
+package audio
+
+import (
+	"context"
+	"fmt"
+	"io"
+	"os"
+
+	"github.com/go-audio/audio"
+	"github.com/go-audio/wav"
+
+	"github.com/instill-ai/pipeline-backend/pkg/component/base"
+	"github.com/instill-ai/pipeline-backend/pkg/data"
+	"github.com/instill-ai/pipeline-backend/pkg/data/format"
+)
+
+// segment reads a segmentInput from the job, decodes its audio with
+// decodeAudioWAV, cuts out every requested segment and writes the pieces back
+// as a segmentOutput. The first error encountered is returned unchanged.
+func segment(ctx context.Context, job *base.Job) error {
+	var input segmentInput
+	if err := job.Input.ReadData(ctx, &input); err != nil {
+		return err
+	}
+
+	// dec is kept because the encoder reuses its bit depth and audio format.
+	audioBuf, dec, err := decodeAudioWAV(input.Audio)
+	if err != nil {
+		return err
+	}
+
+	// One output slot per requested segment, filled in input order.
+	clips := make([]format.Audio, len(input.Segments))
+	for idx := range input.Segments {
+		samples, err := extractSegment(audioBuf, input.Segments[idx])
+		if err != nil {
+			return err
+		}
+		wavBytes, err := encodeSegment(samples, audioBuf.Format, dec)
+		if err != nil {
+			return err
+		}
+		clip, err := data.NewAudioFromBytes(wavBytes, "audio/wav", fmt.Sprintf("audio-segment-%d.wav", idx))
+		if err != nil {
+			return err
+		}
+		clips[idx] = clip
+	}
+
+	output := segmentOutput{AudioSegments: clips}
+	if err := job.Output.WriteData(ctx, output); err != nil {
+		return err
+	}
+	return nil
+}
+
+// extractSegment returns the interleaved samples of audioBuf between
+// seg.StartTime and seg.EndTime. Both times are converted to whole frames, so a
+// cut never separates the channels of one frame, and clamped to the buffer. A
+// range that ends before it starts, or starts past the data, is an error.
+func extractSegment(audioBuf *audio.IntBuffer, seg segmentData) ([]int, error) {
+	rate, channels := float64(audioBuf.Format.SampleRate), audioBuf.Format.NumChannels
+	startSample := int(seg.StartTime*rate) * channels
+	endSample := int(seg.EndTime*rate) * channels
+	if startSample < 0 {
+		startSample = 0
+	}
+	if endSample > len(audioBuf.Data) {
+		endSample = len(audioBuf.Data)
+	}
+	if startSample > endSample {
+		return nil, fmt.Errorf("invalid segment [%v, %v]: start is after end or beyond the audio", seg.StartTime, seg.EndTime)
+	}
+	return audioBuf.Data[startSample:endSample], nil
+}
+
+// encodeSegment WAV-encodes segment with the sample rate and channel count of
+// format and the bit depth and audio format of dec, and returns the file bytes.
+// The encoder is given a temporary file, which is closed and removed on return.
+func encodeSegment(segment []int, format *audio.Format, dec *wav.Decoder) ([]byte, error) {
+	tempFile, err := os.CreateTemp("", "audio_segment_*.wav")
+	if err != nil {
+		return nil, fmt.Errorf("failed to create temp file: %w", err)
+	}
+	// Deferred calls run last-in first-out: the descriptor is closed, then the file is removed.
+	defer os.Remove(tempFile.Name())
+	defer tempFile.Close()
+
+	encoder := wav.NewEncoder(tempFile, format.SampleRate, int(dec.BitDepth), format.NumChannels, int(dec.WavAudioFormat))
+	segmentBuf := &audio.IntBuffer{Format: format, Data: segment, SourceBitDepth: int(dec.BitDepth)}
+	if err := encoder.Write(segmentBuf); err != nil {
+		return nil, fmt.Errorf("failed to write segment to buffer: %w", err)
+	}
+	if err := encoder.Close(); err != nil {
+		return nil, fmt.Errorf("failed to close the encoder: %w", err)
+	}
+
+	// Rewind to read back what the encoder wrote.
+	if _, err = tempFile.Seek(0, io.SeekStart); err != nil {
+		return nil, fmt.Errorf("failed to seek to the beginning of temp file: %w", err)
+	}
+	fileContents, err := io.ReadAll(tempFile)
+	if err != nil {
+		return nil, fmt.Errorf("failed to read temp file: %w", err)
+	}
+	return fileContents, nil
+}
diff --git a/pkg/component/operator/audio/v0/task_segment_test.go b/pkg/component/operator/audio/v0/task_segment_test.go
new file mode 100644
index 000000000..bc9a0221f
--- /dev/null
+++ b/pkg/component/operator/audio/v0/task_segment_test.go
@@ -0,0 +1,115 @@
+package audio
+
+import (
+	"context"
+	"encoding/json"
+	"io"
+	"os"
+	"testing"
+
+	qt "github.com/frankban/quicktest"
+
+	"github.com/instill-ai/pipeline-backend/pkg/component/base"
+	"github.com/instill-ai/pipeline-backend/pkg/component/internal/mock"
+	"github.com/instill-ai/pipeline-backend/pkg/data"
+)
+
+func TestSegment(t *testing.T) {
+	c := qt.New(t)
+	// Table-driven: every case runs the segment task on a WAV fixture with segments taken from a JSON fixture.
+	testCases := []struct {
+		name          string
+		audioFile     string
+		segmentsFile  string
+		expectedCount int
+		expectedError string
+	}{
+		{
+			name:          "ok - valid segmentation",
+			audioFile:     "testdata/voice1.wav",
+			segmentsFile:  "testdata/voice1-activity-segments.json",
+			expectedCount: 5,
+		},
+	}
+
+	for _, tc := range testCases {
+		c.Run(tc.name, func(c *qt.C) {
+			component := Init(base.Component{})
+			c.Assert(component, qt.IsNotNil)
+
+			execution, err := component.CreateExecution(base.ComponentExecution{
+				Component: component,
+				Task:      taskSegment,
+			})
+			c.Assert(err, qt.IsNil)
+			c.Assert(execution, qt.IsNotNil)
+
+			ir, ow, eh, job := mock.GenerateMockJob(c)
+
+			// Load the raw bytes of the audio fixture.
+			audioFile, err := os.Open(tc.audioFile)
+			c.Assert(err, qt.IsNil)
+			defer audioFile.Close()
+			audioData, err := io.ReadAll(audioFile)
+			c.Assert(err, qt.IsNil)
+
+			// Load the segments from the JSON fixture via the data package.
+			segmentsJSONData, err := os.ReadFile(tc.segmentsFile)
+			c.Assert(err, qt.IsNil)
+			var segmentsStruct struct {
+				Segments []segmentData `instill:"segments"`
+			}
+
+			var segmentsMap map[string]interface{}
+			err = json.Unmarshal(segmentsJSONData, &segmentsMap)
+			c.Assert(err, qt.IsNil)
+
+			jsonValue, err := data.NewJSONValue(segmentsMap)
+			c.Assert(err, qt.IsNil)
+
+			c.Assert(data.Unmarshal(jsonValue, &segmentsStruct), qt.IsNil)
+			segments := segmentsStruct.Segments
+			// Hand the fixture audio and the parsed segments to the task when it reads its input.
+			ir.ReadDataMock.Set(func(ctx context.Context, input any) error {
+				switch input := input.(type) {
+				case *segmentInput:
+					audio, err := data.NewAudioFromBytes(audioData, "audio/wav", "input.wav")
+					c.Assert(err, qt.IsNil)
+					*input = segmentInput{
+						Audio:    audio,
+						Segments: segments,
+					}
+				}
+				return nil
+			})
+			// Keep whatever the task writes so it can be inspected after Execute returns.
+			var capturedOutput segmentOutput
+			ow.WriteDataMock.Set(func(ctx context.Context, output any) error {
+				capturedOutput = output.(segmentOutput)
+				return nil
+			})
+			// Any error reported by the task has to match the case's expectedError pattern.
+			eh.ErrorMock.Set(func(ctx context.Context, err error) {
+				c.Assert(err, qt.ErrorMatches, tc.expectedError)
+			})
+			// Mark as optional the mock that the expected outcome should leave unused.
+			if tc.expectedError != "" {
+				ow.WriteDataMock.Optional()
+			} else {
+				eh.ErrorMock.Optional()
+			}
+
+			err = execution.Execute(context.Background(), []*base.Job{job})
+
+			if tc.expectedError == "" {
+				c.Assert(err, qt.IsNil)
+				c.Assert(capturedOutput.AudioSegments, qt.HasLen, tc.expectedCount)
+				// NOTE(review): segment() builds each piece as "audio/wav" yet "audio/ogg" is asserted; presumably data.NewAudioFromBytes normalises the format — confirm.
+				for i, segment := range capturedOutput.AudioSegments {
+					c.Assert(segment, qt.Not(qt.IsNil), qt.Commentf("Segment %d is nil", i))
+					c.Assert(segment.ContentType().String(), qt.Equals, "audio/ogg", qt.Commentf("Segment %d has incorrect MIME type", i))
+				}
+			}
+		})
+	}
+}
diff --git a/pkg/component/operator/audio/v0/testdata/voice1-activity-segments.json b/pkg/component/operator/audio/v0/testdata/voice1-activity-segments.json
new file mode 100644
index 000000000..d72f2c729
--- /dev/null
+++ b/pkg/component/operator/audio/v0/testdata/voice1-activity-segments.json
@@ -0,0 +1,24 @@
+{
+  "segments": [
+    {
+      "start-time": 1.5,
+      "end-time": 2.404
+    },
+    {
+      "start-time": 3.196,
+      "end-time": 4.068
+    },
+    {
+      "start-time": 4.604,
+      "end-time": 5.764
+    },
+    {
+      "start-time": 6.62,
+      "end-time": 14.948
+    },
+    {
+      "start-time": 15.836,
+      "end-time": 18.564
+    }
+  ]
+}
diff --git a/pkg/component/operator/audio/v0/testdata/voice1.wav b/pkg/component/operator/audio/v0/testdata/voice1.wav
new file mode 100644
index 000000000..cfc16301e
Binary files /dev/null and b/pkg/component/operator/audio/v0/testdata/voice1.wav differ
diff --git a/pkg/component/operator/audio/v0/testdata/voice2-activity-segments.json b/pkg/component/operator/audio/v0/testdata/voice2-activity-segments.json
new file mode 100644
index 000000000..e6f7d5f57
--- /dev/null
+++ b/pkg/component/operator/audio/v0/testdata/voice2-activity-segments.json
@@ -0,0 +1,16 @@
+{
+  "segments": [
+    {
+      "start-time": 0.002,
+      "end-time": 9.406
+    },
+    {
+      "start-time": 10.146,
+      "end-time": 18.782
+    },
+    {
+      "start-time": 19.234,
+      "end-time": 30.878
+    }
+  ]
+}
diff --git a/pkg/component/operator/audio/v0/testdata/voice2.wav b/pkg/component/operator/audio/v0/testdata/voice2.wav
new file mode 100644
index 000000000..0d5dc63de
Binary files /dev/null and b/pkg/component/operator/audio/v0/testdata/voice2.wav differ
diff --git a/pkg/component/operator/document/v0/convert_test.go b/pkg/component/operator/document/v0/convert_test.go
index 10b5985d1..8afe651a2 100644
--- a/pkg/component/operator/document/v0/convert_test.go
+++ b/pkg/component/operator/document/v0/convert_test.go
@@ -28,22 +28,20 @@ func TestConvertToText(t *testing.T) {
 			expected: ConvertToTextOutput{
 				Body: "This is test file for markdown",
 				Meta: map[string]string{
-					"Custom Metadata": "no",
-					"Encrypted":       "no",
-					"File size":       "15489 bytes",
-					"Form":            "none",
-					"JavaScript":      "no",
-					"Metadata Stream": "no",
-					"Optimized":       "no",
-					"PDF version":     "1.4",
-					"Page rot":        "0",
-					"Page size":       "596 x 842 pts (A4)",
-					"Pages":           "1",
-					"Producer":        "Skia/PDF m128 Google Docs Renderer",
-					"Suspects":        "no",
-					"Tagged":          "no",
-					"Title":           "Untitled document",
-					"UserProperties":  "no",
+					"Encrypted":      "no",
+					"File size":      "15489 bytes",
+					"Form":           "none",
+					"JavaScript":     "no",
+					"Optimized":      "no",
+					"PDF version":    "1.4",
+					"Page rot":       "0",
+					"Page size":      "596 x 842 pts (A4)",
+					"Pages":          "1",
+					"Producer":       "Skia/PDF m128 Google Docs Renderer",
+					"Suspects":       "no",
+					"Tagged":         "no",
+					"Title":          "Untitled document",
+					"UserProperties": "no",
 				},
 				MSecs: 3,
 			},
diff --git a/pkg/component/resources/onnx/silero_vad.onnx b/pkg/component/resources/onnx/silero_vad.onnx
new file mode 100644
index 000000000..d0ccd9d7f
Binary files /dev/null and b/pkg/component/resources/onnx/silero_vad.onnx differ