Skip to content

Commit

Permalink
feat(component,audio): add TASK_DETECT_ACTIVITY and TASK_SEGMENT (#762)
Browse files Browse the repository at this point in the history
Because

- Voice activity detection (VAD) is an essential feature for audio and
video segmentation, enabling further text transcription for RAG
embedding and LLM understanding.

This commit

- adds `TASK_DETECT_ACTIVITY`
- adds `TASK_SEGMENT`
- refactors the previous Audio operator
  - removing `TASK_CHUNK_AUDIOS`
  - refactoring `TASK_SLICE_AUDIO` to `TASK_SEGMENT`
- makes the production image use the base image `debian:bullseye-slim`
because `onnxruntime` doesn't support Alpine (apk) and we don't want to
build it from scratch
  • Loading branch information
pinglin authored Nov 3, 2024
1 parent 2368f76 commit 9e92a31
Show file tree
Hide file tree
Showing 30 changed files with 1,080 additions and 425 deletions.
6 changes: 6 additions & 0 deletions .env
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,12 @@ PUBLIC_SERVICE_PORT=8081
DOCKER_BUILDKIT=1
COMPOSE_DOCKER_CLI_BUILD=1

# ONNX_MODEL_FOLDER_PATH specifies the directory where ONNX models are stored.
# These models are loaded dynamically at runtime. The path is set relative to
# the project root, allowing for consistent model loading across different
# deployment environments.
ONNX_MODEL_FOLDER_PATH=${PWD}/pkg/component/resources/onnx

# test

# TEST_DBHOST and TEST_DBNAME are used to initialize a separate database for
Expand Down
24 changes: 19 additions & 5 deletions .github/CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -85,11 +85,9 @@ will create and migrate a test database to keep these queries isolated from the
main DB. You can set the database host and name by overriding the `TEST_DBHOST`
and `TEST_DBNAME` values.

Certain tests depend on the [`docconv`](https://github.com/sajari/docconv)
package and aren't run by default. You can trigger them by adding the `OCR=true`
flag to the coverage command. Make sure to install the [package
dependencies](https://github.com/sajari/docconv?tab=readme-ov-file#dependencies)
first.
Certain tests depend on external packages and aren't run by default:
- For [`docconv`](https://github.com/sajari/docconv) tests, add the `OCR=true` flag to the coverage command and install its [dependencies](https://github.com/sajari/docconv?tab=readme-ov-file#dependencies).
- For [`onnxruntime`](https://github.com/microsoft/onnxruntime) tests, add the `ONNX=true` flag to the coverage command. Follow the [guideline](#set-up-onnx-runtime-linux-only) to set up ONNX Runtime (Linux only).

#### Run the integration tests

Expand All @@ -111,6 +109,22 @@ If empty, tests will try to connect to `localhost:5432`.
$ make rm
```

### Set up ONNX Runtime (Linux only)

1. Download the latest [ONNX Runtime release](https://github.com/microsoft/onnxruntime/releases) for your system.

2. Install ONNX Runtime:
```bash
sudo mkdir -p /usr/local/onnxruntime
sudo tar -xzf onnxruntime-*-*-*.tgz -C /usr/local/onnxruntime --strip-components=1
export ONNXRUNTIME_ROOT_PATH=/usr/local/onnxruntime
export LD_RUN_PATH=$ONNXRUNTIME_ROOT_PATH/lib
export LIBRARY_PATH=$ONNXRUNTIME_ROOT_PATH/lib
export C_INCLUDE_PATH=$ONNXRUNTIME_ROOT_PATH/include
```

**Note:** If you don't have sudo access, extract to a user-writeable location (e.g., `~/onnxruntime`), set `ONNXRUNTIME_ROOT_PATH` accordingly, and adjust the environment variables as shown above. No need to create symlinks in this case.
## Codebase contribution
### Pre-commit hooks
Expand Down
15 changes: 14 additions & 1 deletion .github/workflows/coverage.yml
Original file line number Diff line number Diff line change
Expand Up @@ -38,10 +38,23 @@ jobs:

- uses: actions/checkout@v3

- name: Install onnxruntime library and headers
run: |
export ONNXRUNTIME_ROOT_PATH=$GITHUB_WORKSPACE/onnxruntime
LATEST_VERSION=$(wget -qO- https://api.github.com/repos/microsoft/onnxruntime/releases/latest | jq -r .tag_name)
ONNX_ARCH=$([ "$TARGETARCH" = "arm64" ] && echo "aarch64" || echo "x64")
wget https://github.com/microsoft/onnxruntime/releases/download/${LATEST_VERSION}/onnxruntime-linux-${ONNX_ARCH}-${LATEST_VERSION#v}.tgz
tar -xzf onnxruntime-linux-${ONNX_ARCH}-${LATEST_VERSION#v}.tgz
mv onnxruntime-linux-${ONNX_ARCH}-${LATEST_VERSION#v} ${ONNXRUNTIME_ROOT_PATH}
rm onnxruntime-linux-${ONNX_ARCH}-${LATEST_VERSION#v}.tgz
echo "LD_RUN_PATH=${ONNXRUNTIME_ROOT_PATH}/lib" >> $GITHUB_ENV
echo "LIBRARY_PATH=${ONNXRUNTIME_ROOT_PATH}/lib" >> $GITHUB_ENV
echo "C_INCLUDE_PATH=${ONNXRUNTIME_ROOT_PATH}/include" >> $GITHUB_ENV
- name: Generate coverage report
run: |
make build-dev
make coverage DBTEST=true OCR=true
make coverage DBTEST=true OCR=true ONNX=true
- name: Upload coverage report
uses: codecov/codecov-action@v2
Expand Down
20 changes: 18 additions & 2 deletions .github/workflows/golangci-lint.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,24 @@ jobs:
with:
go-version: ${{ env.GOLANG_VERSION }}
cache: false
- name: Install soxr
run: |
sudo apt-get update
sudo apt-get install -y libsoxr-dev
- name: Install onnxruntime library and headers
run: |
export ONNXRUNTIME_ROOT_PATH=$GITHUB_WORKSPACE/onnxruntime
LATEST_VERSION=$(wget -qO- https://api.github.com/repos/microsoft/onnxruntime/releases/latest | jq -r .tag_name)
ONNX_ARCH=$([ "$TARGETARCH" = "arm64" ] && echo "aarch64" || echo "x64")
wget https://github.com/microsoft/onnxruntime/releases/download/${LATEST_VERSION}/onnxruntime-linux-${ONNX_ARCH}-${LATEST_VERSION#v}.tgz
tar -xzf onnxruntime-linux-${ONNX_ARCH}-${LATEST_VERSION#v}.tgz
mv onnxruntime-linux-${ONNX_ARCH}-${LATEST_VERSION#v} ${ONNXRUNTIME_ROOT_PATH}
rm onnxruntime-linux-${ONNX_ARCH}-${LATEST_VERSION#v}.tgz
echo "C_INCLUDE_PATH=${ONNXRUNTIME_ROOT_PATH}/include" >> $GITHUB_ENV
echo "LD_RUN_PATH=${ONNXRUNTIME_ROOT_PATH}/lib" >> $GITHUB_ENV
echo "LIBRARY_PATH=${ONNXRUNTIME_ROOT_PATH}/lib" >> $GITHUB_ENV
- name: golangci-lint
uses: golangci/golangci-lint-action@v6
with:
version: v1.59
args: --timeout=10m
version: v1.61
args: --timeout=10m --build-tags onnx
1 change: 1 addition & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ repos:
rev: v0.5.1
hooks:
- id: golangci-lint
args: ["--build-tags", "onnx"]
- id: go-mod-tidy
- repo: https://github.com/pinglin/conventional-pre-commit
rev: v1.1.0
Expand Down
97 changes: 52 additions & 45 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,59 +1,62 @@
ARG GOLANG_VERSION=1.22.5
FROM golang:${GOLANG_VERSION}-alpine3.19 AS build

RUN apk add --no-cache build-base leptonica-dev tesseract-ocr-dev musl-dev
FROM golang:${GOLANG_VERSION}-bullseye AS build

ARG TARGETOS TARGETARCH K6_VERSION XK6_VERSION

RUN apt-get update && apt-get install -y \
build-essential \
libleptonica-dev \
libtesseract-dev \
libsoxr-dev \
&& rm -rf /var/lib/apt/lists/*

# Install ONNX Runtime (latest release)
ENV ONNXRUNTIME_ROOT_PATH=/usr/local/onnxruntime
RUN apt update && \
apt install -y wget jq && \
LATEST_VERSION=$(wget -qO- https://api.github.com/repos/microsoft/onnxruntime/releases/latest | jq -r .tag_name) && \
ONNX_ARCH=$([ "$TARGETARCH" = "arm64" ] && echo "aarch64" || echo "x64") && \
wget https://github.com/microsoft/onnxruntime/releases/download/${LATEST_VERSION}/onnxruntime-linux-${ONNX_ARCH}-${LATEST_VERSION#v}.tgz && \
tar -xzf onnxruntime-linux-${ONNX_ARCH}-${LATEST_VERSION#v}.tgz && \
mv onnxruntime-linux-${ONNX_ARCH}-${LATEST_VERSION#v} ${ONNXRUNTIME_ROOT_PATH} && \
rm onnxruntime-linux-${ONNX_ARCH}-${LATEST_VERSION#v}.tgz && \
apt remove -y wget jq && \
apt autoremove -y && \
rm -rf /var/lib/apt/lists/*

# Set include and library path environment variables for ONNX Runtime
ENV C_INCLUDE_PATH=${ONNXRUNTIME_ROOT_PATH}/include
ENV LD_RUN_PATH=${ONNXRUNTIME_ROOT_PATH}/lib
ENV LIBRARY_PATH=${ONNXRUNTIME_ROOT_PATH}/lib

WORKDIR /src

COPY go.mod go.sum ./
RUN go mod download
COPY . .

RUN go get github.com/otiai10/gosseract/v2

ARG SERVICE_NAME TARGETOS TARGETARCH
RUN --mount=target=. --mount=type=cache,target=/root/.cache/go-build --mount=type=cache,target=/go/pkg GOOS=$TARGETOS GOARCH=$TARGETARCH go build -tags=ocr,onnx -o /${SERVICE_NAME} ./cmd/main
RUN --mount=target=. --mount=type=cache,target=/root/.cache/go-build --mount=type=cache,target=/go/pkg GOOS=$TARGETOS GOARCH=$TARGETARCH go build -o /${SERVICE_NAME}-worker ./cmd/worker
RUN --mount=target=. --mount=type=cache,target=/root/.cache/go-build --mount=type=cache,target=/go/pkg GOOS=$TARGETOS GOARCH=$TARGETARCH go build -o /${SERVICE_NAME}-migrate ./cmd/migration
RUN --mount=target=. --mount=type=cache,target=/root/.cache/go-build --mount=type=cache,target=/go/pkg GOOS=$TARGETOS GOARCH=$TARGETARCH go build -o /${SERVICE_NAME}-init ./cmd/init

FROM debian:bullseye-slim

RUN --mount=target=. --mount=type=cache,target=/root/.cache/go-build --mount=type=cache,target=/go/pkg GOOS=$TARGETOS GOARCH=$TARGETARCH CGO_ENABLED=1 go build -tags=ocr,musl -o /${SERVICE_NAME} ./cmd/main
RUN --mount=target=. --mount=type=cache,target=/root/.cache/go-build --mount=type=cache,target=/go/pkg GOOS=$TARGETOS GOARCH=$TARGETARCH CGO_ENABLED=1 go build -tags=ocr,musl -o /${SERVICE_NAME}-worker ./cmd/worker
RUN --mount=target=. --mount=type=cache,target=/root/.cache/go-build --mount=type=cache,target=/go/pkg GOOS=$TARGETOS GOARCH=$TARGETARCH go build -tags=musl -o /${SERVICE_NAME}-migrate ./cmd/migration
RUN --mount=target=. --mount=type=cache,target=/root/.cache/go-build --mount=type=cache,target=/go/pkg GOOS=$TARGETOS GOARCH=$TARGETARCH go build -tags=musl -o /${SERVICE_NAME}-init ./cmd/init

FROM alpine:3.19

RUN apk add --no-cache \
curl \
poppler-utils \
wv \
tidyhtml \
libc6-compat \
tesseract-ocr \
python3 \
py3-pip \
build-base \
python3-dev \
libffi-dev \
libreoffice \
qpdf \
msttcorefonts-installer \
font-noto \
font-noto-cjk \
ffmpeg \
chromium \
&& update-ms-fonts \
&& fc-cache -f \
&& python3 -m venv /opt/venv \
&& /opt/venv/bin/pip install --upgrade pip \
&& /opt/venv/bin/pip install pdfplumber tokenizers \
&& rm -rf /var/cache/apk/* /var/cache/fontconfig/*

# Download tesseract data
RUN curl -L https://github.com/tesseract-ocr/tessdata_best/raw/main/eng.traineddata \
-o /usr/share/tessdata/eng.traineddata

ARG TARGETARCH
ARG BUILDARCH
RUN apk add unrtf --repository=http://dl-cdn.alpinelinux.org/alpine/edge/community
# Install system dependencies (Python, OCR, document and media tools), create a
# virtual environment, and install the Python packages
RUN apt update && \
apt install -y curl python3 python3-venv poppler-utils wv unrtf tidy tesseract-ocr libtesseract-dev libreoffice ffmpeg libsoxr-dev chromium qpdf && \
python3 -m venv /opt/venv && \
/opt/venv/bin/pip install pdfplumber mistral-common tokenizers && \
rm -rf /var/lib/apt/lists/*

# copy ONNX runtime from build stage
ENV ONNXRUNTIME_ROOT_PATH=/usr/local/onnxruntime
COPY --from=build --chown=nobody:nogroup /usr/local/onnxruntime ${ONNXRUNTIME_ROOT_PATH}

# Set environment variables and create symlinks for ONNX Runtime
ENV C_INCLUDE_PATH=${ONNXRUNTIME_ROOT_PATH}/include
RUN ln -s ${ONNXRUNTIME_ROOT_PATH}/lib/libonnxruntime.so* /usr/lib/

USER nobody:nogroup

Expand All @@ -71,3 +74,7 @@ COPY --from=build --chown=nobody:nogroup /${SERVICE_NAME}-migrate ./
COPY --from=build --chown=nobody:nogroup /${SERVICE_NAME}-init ./
COPY --from=build --chown=nobody:nogroup /${SERVICE_NAME}-worker ./
COPY --from=build --chown=nobody:nogroup /${SERVICE_NAME} ./

# Set up ONNX model and environment variable
COPY --chown=nobody:nogroup ./pkg/component/resources/onnx/silero_vad.onnx /${SERVICE_NAME}/pkg/component/resources/onnx/silero_vad.onnx
ENV ONNX_MODEL_FOLDER_PATH=/${SERVICE_NAME}/pkg/component/resources/onnx
35 changes: 30 additions & 5 deletions Dockerfile.dev
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
ARG GOLANG_VERSION=1.22.5
FROM golang:${GOLANG_VERSION}
FROM golang:${GOLANG_VERSION}-bullseye

ARG SERVICE_NAME

Expand All @@ -11,18 +11,39 @@ ARG TARGETOS TARGETARCH K6_VERSION XK6_VERSION

# Install Python, create virtual environment, and install pdfplumber
RUN apt update && \
apt install -y python3 python3-venv poppler-utils wv unrtf tidy tesseract-ocr libtesseract-dev libreoffice ffmpeg chromium qpdf && \
apt install -y python3 python3-venv poppler-utils wv unrtf tidy tesseract-ocr libtesseract-dev libreoffice ffmpeg libsoxr-dev chromium qpdf && \
python3 -m venv /opt/venv && \
/opt/venv/bin/pip install pdfplumber mistral-common tokenizers && \
rm -rf /var/lib/apt/lists/*

# Install ONNX Runtime (latest release)
ENV ONNXRUNTIME_ROOT_PATH=/usr/local/onnxruntime
RUN apt update && \
apt install -y wget jq && \
LATEST_VERSION=$(wget -qO- https://api.github.com/repos/microsoft/onnxruntime/releases/latest | jq -r .tag_name) && \
ONNX_ARCH=$([ "$TARGETARCH" = "arm64" ] && echo "aarch64" || echo "x64") && \
wget https://github.com/microsoft/onnxruntime/releases/download/${LATEST_VERSION}/onnxruntime-linux-${ONNX_ARCH}-${LATEST_VERSION#v}.tgz && \
tar -xzf onnxruntime-linux-${ONNX_ARCH}-${LATEST_VERSION#v}.tgz && \
mv onnxruntime-linux-${ONNX_ARCH}-${LATEST_VERSION#v} ${ONNXRUNTIME_ROOT_PATH} && \
rm onnxruntime-linux-${ONNX_ARCH}-${LATEST_VERSION#v}.tgz && \
apt remove -y wget jq && \
apt autoremove -y && \
rm -rf /var/lib/apt/lists/*

# Set include and library path environment variables for ONNX Runtime
ENV C_INCLUDE_PATH=${ONNXRUNTIME_ROOT_PATH}/include
ENV LD_RUN_PATH=${ONNXRUNTIME_ROOT_PATH}/lib
ENV LIBRARY_PATH=${ONNXRUNTIME_ROOT_PATH}/lib

# tparse
RUN --mount=target=. --mount=type=cache,target=/root/.cache/go-build --mount=type=cache,target=/go/pkg GOOS=$TARGETOS GOARCH=$TARGETARCH go install github.com/mfridman/tparse@latest

# air
RUN --mount=target=. --mount=type=cache,target=/root/.cache/go-build --mount=type=cache,target=/go/pkg GOOS=$TARGETOS GOARCH=$TARGETARCH go install github.com/cosmtrek/[email protected]

# k6
RUN go install go.k6.io/xk6/cmd/xk6@v${XK6_VERSION}
RUN go install github.com/mfridman/[email protected]
RUN xk6 build v${K6_VERSION} --with github.com/grafana/xk6-sql --output /usr/bin/k6
RUN --mount=target=. --mount=type=cache,target=/root/.cache/go-build --mount=type=cache,target=/go/pkg GOOS=$TARGETOS GOARCH=$TARGETARCH go install go.k6.io/xk6/cmd/xk6@v${XK6_VERSION}
RUN --mount=target=. --mount=type=cache,target=/root/.cache/go-build --mount=type=cache,target=/go/pkg GOOS=$TARGETOS GOARCH=$TARGETARCH xk6 build v${K6_VERSION} --with github.com/grafana/xk6-sql --output /usr/bin/k6

# -- set up Go

Expand All @@ -38,6 +59,10 @@ ENV GOENV=/go/.config/go/env
# required to restore compatibility with those versions.
ENV GODEBUG=tlsrsakex=1

# Set up ONNX model and environment variable
COPY ./pkg/component/resources/onnx/silero_vad.onnx /${SERVICE_NAME}/pkg/component/resources/onnx/silero_vad.onnx
ENV ONNX_MODEL_FOLDER_PATH=/${SERVICE_NAME}/pkg/component/resources/onnx

USER nobody:nogroup

ENTRYPOINT ["tail", "-f", "/dev/null"]
65 changes: 25 additions & 40 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,6 @@ include .env
export

GOTEST_FLAGS := CFG_DATABASE_HOST=${TEST_DBHOST} CFG_DATABASE_NAME=${TEST_DBNAME}
ifeq (${DBTEST}, true)
GOTEST_TAGS := -tags=dbtest
endif
ifeq (${OCR}, true)
GOTEST_TAGS := -tags=ocr
endif


#============================================================================

Expand All @@ -40,10 +33,10 @@ latest: ## Run latest container
echo "Run latest container ${SERVICE_NAME} and ${SERVICE_NAME}-worker. To stop it, run \"make stop\"."
@docker run --network=instill-network \
--name ${SERVICE_NAME} \
-d ${SERVICE_NAME}:latest ./${SERVICE_NAME}
-d instill/${SERVICE_NAME}:latest ./${SERVICE_NAME}
@docker run --network=instill-network \
--name ${SERVICE_NAME}-worker \
-d ${SERVICE_NAME}:latest ./${SERVICE_NAME}-worker
-d instill/${SERVICE_NAME}:latest ./${SERVICE_NAME}-worker

.PHONY: rm
rm: ## Remove all running containers
Expand All @@ -60,10 +53,10 @@ build-dev: ## Build dev docker image

.PHONY: build-latest
build-latest: ## Build latest docker image
@docker buildx build \
@docker build \
--build-arg GOLANG_VERSION=${GOLANG_VERSION} \
--build-arg SERVICE_NAME=${SERVICE_NAME} \
-t pipeline-backend:latest .
-t instill/pipeline-backend:latest .

.PHONY: go-gen
go-gen: ## Generate codes
Expand Down Expand Up @@ -94,40 +87,32 @@ coverage:
rm coverage.out; \
fi

# Tests should run in container without local tparse installation.
# If you encounter container test issues, install tparse locally:
# go install github.com/mfridman/tparse/cmd/tparse@latest
.PHONY: test
test:
# Ideally, it should be ok to run without installing tparse locally.
# However, there may be some issues that arise from running the tests
# in the container. If you encounter any issues, please install tparse
# locally via `go install github.com/mfridman/tparse/cmd/tparse@latest`
# and run the tests locally.
@if [ "${OCR}" = "true" ]; then \
docker run --rm \
-v $(PWD):/${SERVICE_NAME} \
--user $(id -u):$(id -g) \
--entrypoint= \
instill/${SERVICE_NAME}:dev \
make test-ocr; \
@TAGS=""; \
if [ "$${OCR}" = "true" ]; then \
TAGS="$$TAGS,ocr"; \
[ "$$(uname)" = "Darwin" ] && export TESSDATA_PREFIX=$$(dirname $$(brew list tesseract | grep share/tessdata/eng.traineddata)); \
fi; \
if [ "$${ONNX}" = "true" ]; then \
if [ "$$(uname)" = "Darwin" ]; then \
echo "ONNX Runtime test is not supported on Darwin (macOS)."; \
else \
TAGS="$$TAGS,onnx"; \
fi; \
fi; \
TAGS=$${TAGS#,}; \
if [ -n "$$TAGS" ]; then \
echo "Running tests with tags: $$TAGS"; \
go test -v -tags="$$TAGS" ./... -json | tparse --notests --all; \
else \
docker run --rm \
-v $(PWD):/${SERVICE_NAME} \
--user $(id -u):$(id -g) \
--entrypoint= \
instill/${SERVICE_NAME}:dev \
go test -v ./... -json | tparse --notests --all; \
echo "Running standard tests"; \
go test -v ./... -json | tparse --notests --all; \
fi

.PHONY: test-ocr
test-ocr:
# Certain component tests require additional dependencies.
# Install tesseract via `brew install tesseract`
# Setup `export LIBRARY_PATH="/opt/homebrew/lib"` `export CPATH="/opt/homebrew/include"`
ifeq ($(shell uname), Darwin)
@TESSDATA_PREFIX=$(shell dirname $(shell brew list tesseract | grep share/tessdata/eng.traineddata)) ${GOTEST_FLAGS} go test -v ./... -json | tparse --notests --all
else
@echo "This target can only be executed on Darwin (macOS)."
endif

.PHONY: integration-test
integration-test: ## Run integration test
@ # DB_HOST points to localhost by default. Override this variable if
Expand Down
Loading

0 comments on commit 9e92a31

Please sign in to comment.