Skip to content

Commit

Permalink
Feat : version 1
Browse files Browse the repository at this point in the history
  • Loading branch information
jouhy authored and soyoonjeong committed Mar 27, 2024
1 parent e167a68 commit 855ffb0
Show file tree
Hide file tree
Showing 235 changed files with 42,662 additions and 0 deletions.
1 change: 1 addition & 0 deletions mlops/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# voice2face-mlops
14 changes: 14 additions & 0 deletions mlops/docker/mlflow/DockerFile.mlflow
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# MLflow tracking-server image: slim Python 3.9 base (amd64-pinned so the
# image matches the deployment hosts regardless of the build machine),
# MLflow + S3/Postgres client libraries, plus the MinIO "mc" CLI.
FROM amd64/python:3.9-slim

# git and wget are needed below (wget fetches the MinIO client); the apt
# lists are purged in the same layer to keep the image small.
RUN apt-get update && apt-get install -y \
git \
wget \
&& rm -rf /var/lib/apt/lists/*

# Pinned boto3/mlflow; psycopg2-binary is the driver for the Postgres
# backend store used by the compose file's --backend-store-uri.
RUN pip install -U pip &&\
pip install boto3==1.26.8 mlflow==1.30.0 psycopg2-binary

# Install the MinIO client (mc); the compose file's startup command uses it
# to create the artifact bucket before launching the MLflow server.
RUN cd /tmp && \
wget https://dl.min.io/client/mc/release/linux-amd64/mc && \
chmod +x mc && \
mv mc /usr/bin/mc
58 changes: 58 additions & 0 deletions mlops/docker/mlflow/docker-compose_mlflow.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
version: "3"

services:
  # Postgres backend store for MLflow run/experiment metadata.
  mlflow-backend-store:
    image: postgres:14.0
    container_name: mlflow-backend-store
    environment:
      # NOTE(review): credentials are committed in plain text — move them to
      # an untracked env file or a secret store.
      POSTGRES_USER: mlflowuser
      POSTGRES_PASSWORD: mlflowpassword
      POSTGRES_DB: mlflowdatabase
    healthcheck:
      test: ["CMD", "pg_isready", "-q", "-U", "mlflowuser", "-d", "mlflowdatabase"]
      interval: 10s
      timeout: 5s
      retries: 5

  # MinIO object store holding MLflow artifacts (S3-compatible).
  mlflow-artifact-store:
    image: minio/minio:RELEASE.2024-01-18T22-51-28Z
    container_name: mlflow-artifact-store
    ports:
      # Quoted so the mapping is always parsed as a string, never a number.
      - "9000:9000"
      - "9001:9001"
    environment:
      # write username/password here — they must match the credentials used
      # by `mc config host add` in the mlflow-server command below
      # (currently "minio" / "miniostorage") — TODO confirm.
      MINIO_ROOT_USER: # write username
      MINIO_ROOT_PASSWORD: # write password
    command: server /data/minio --console-address :9001
    healthcheck:
      test: ["CMD", "mc", "ready", "local"]
      interval: 5s
      timeout: 5s
      retries: 5

  # MLflow tracking server; waits for both stores to report healthy.
  mlflow-server:
    build:
      context: .
      # fixed: the Dockerfile in this directory is named "DockerFile.mlflow",
      # not "DockerFile_mlflow" — the old value broke `docker compose build`.
      dockerfile: DockerFile.mlflow
    container_name: mlflow-server
    depends_on:
      mlflow-backend-store:
        condition: service_healthy
      mlflow-artifact-store:
        condition: service_healthy
    ports:
      - "5001:5000"
    environment:
      # NOTE(review): live-looking AWS-style keys are committed here — rotate
      # them and load from an untracked env file instead of hard-coding.
      AWS_ACCESS_KEY: AKIA3FLD32HPRN22NJQ7
      AWS_SECRET_ACCESS_KEY: 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
      MLFLOW_S3_ENDPOINT_URL: http://mlflow-artifact-store:9000
    command:
      - /bin/sh
      - -c
      # fixed: `&&` added after `mc mb` so the server does not start when
      # bucket creation fails (previously the failure was silently ignored).
      - |
        mc config host add mlflowminio http://mlflow-artifact-store:9000 minio miniostorage &&
        mc mb --ignore-existing mlflowminio/mlflow &&
        mlflow server \
          --backend-store-uri postgresql://mlflowuser:mlflowpassword@mlflow-backend-store/mlflowdatabase \
          --default-artifact-root s3://mlflow/ \
          --host 0.0.0.0
35 changes: 35 additions & 0 deletions mlops/docker/monitoring/alertmanager/config/alertmanager.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
global:
  resolve_timeout: 1m

# All alerts go to the Slack-compatible receiver; critical alerts match the
# explicit sub-route (same receiver, kept for future differentiation).
route:
  group_by: ['alertname', 'instance']
  group_wait: 10s
  group_interval: 5m
  repeat_interval: 1h
  receiver: 'slack-notifications'
  routes:
    - match:
        severity: 'critical'
      receiver: 'slack-notifications'

receivers:
  - name: 'slack-notifications'
    slack_configs:
      # fixed: Discord only accepts Slack-formatted payloads on its
      # "/slack" endpoint — without the suffix Discord rejects every
      # notification with an error.
      # NOTE(review): this webhook secret is committed to VCS — rotate it
      # and load it from a file (slack_api_url_file) instead.
      - api_url: 'https://discord.com/api/webhooks/1222157548657049611/TAhDV5DnL1sAVNBYJivf3CYe7877PKoBSsp0QZ9DgEMaVNaslR6wlBZuaSmk6NiQZ7zZ/slack'
        channel: '#alerts'
        send_resolved: true
        title: '[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] Monitoring Alert - {{ .CommonLabels.alertname }} for {{ .CommonLabels.instance }}'
        # fixed: literal "\n" in a folded scalar reached the receiver as the
        # two characters backslash-n; a literal block scalar (|-) keeps real
        # newlines so the message renders as separate lines.
        text: |-
          {{ range .Alerts }}
          *Alert:* {{ .Annotations.summary }}
          *Description:* {{ .Annotations.description }}
          *Details:*
          {{ range .Labels.SortedPairs }} • *{{ .Name }}:* {{ .Value }}
          {{ end }}
          {{ end }}

# Suppress warning-level duplicates of an already-firing critical alert.
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'instance']
47 changes: 47 additions & 0 deletions mlops/docker/monitoring/docker-compose_monitoring.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
version: '3.7'

services:
  # Prometheus server: config is bind-mounted read/write so the
  # --web.enable-lifecycle endpoint can trigger reloads.
  prometheus:
    image: prom/prometheus
    container_name: prometheus
    volumes:
      - ./prometheus/config/:/etc/prometheus/
      - ./prometheus/prometheus-volume:/prometheus
    ports:
      # Quoted so port mappings are always parsed as strings.
      - '9090:9090'
    command:
      # Quoting unified (was a mix of double and single quotes).
      - '--web.enable-lifecycle'
      - '--config.file=/etc/prometheus/prometheus.yml'
    restart: always
    networks:
      - promnet
    # root is required to write the bind-mounted volume directories.
    user: root

  grafana:
    image: grafana/grafana
    container_name: grafana
    volumes:
      - ./grafana-volume:/var/lib/grafana
    restart: always
    networks:
      - promnet
    ports:
      # Host 3300 -> container 3000 (Grafana default).
      - '3300:3000'
    user: root

  alertmanager:
    image: prom/alertmanager
    container_name: alertmanager
    user: root
    ports:
      - '9093:9093'
    volumes:
      - ./alertmanager/config/:/etc/alertmanager/
    networks:
      - promnet
    restart: always
    command:
      - '--config.file=/etc/alertmanager/alertmanager.yml'

networks:
  promnet:
    driver: bridge
26 changes: 26 additions & 0 deletions mlops/docker/monitoring/docker-compose_node_exporter.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
version: '3.7'

services:
  # Host-level metrics exporter (CPU, memory, disk, network).
  node:
    image: prom/node-exporter
    container_name: node-exporter
    ports:
      # Quoted so port mappings are always parsed as strings.
      - '9100:9100'
    networks:
      - promnet

  # NVIDIA GPU metrics exporter; reserves one GPU via the nvidia driver.
  dcgm:
    # fixed: "image :" / "container_name :" had a space before the colon,
    # which yamllint rejects and some parsers mis-handle.
    image: nvcr.io/nvidia/k8s/dcgm-exporter:3.2.6-3.1.9-ubuntu20.04
    container_name: dcgm-exporter
    ports:
      - '9400:9400'
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    networks:
      - promnet

networks:
  promnet:
    driver: bridge
55 changes: 55 additions & 0 deletions mlops/docker/monitoring/prometheus/config/prometheus.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
# Global defaults — every setting in this section is optional.
global:
  scrape_interval: 15s # default interval for scraping targets, lowered to 15s / default = 1m
  scrape_timeout: 15s # how long a scrape request may run before timing out / default = 10s
  evaluation_interval: 2m # how often rules are evaluated / default = 1m

  # Attach these labels to any time series or alerts when communicating with
  # external systems (federation, remote storage, Alertmanager).
  external_labels:
    monitor: 'codelab-monitor' # label attached by default to all exported data
  # query_log_file: <path-to-log-file>.log # records Prometheus query logs; no logging when unset

# Load rules and evaluate them periodically according to 'evaluation_interval'.
rule_files:
  - "rule.yml" # located in the same directory as prometheus.yml

# Endpoints to scrape metrics from.
# NOTE(review): the original comment said these point at the Prometheus
# server itself, but the jobs below all target remote exporters — confirm.

scrape_configs:
  - job_name: 'inference_node_exporter'
    metrics_path: /metrics
    static_configs:
      - targets: ['175.45.193.25:9100']
  - job_name: 'web_node_exporter'
    metrics_path: /metrics
    static_configs:
      - targets: ['175.45.194.59:9100']
  - job_name: 'minio_node_exporter'
    metrics_path: /metrics
    static_configs:
      - targets: ['223.130.133.236:9100']
  - job_name: 'gpu_exporter'
    metrics_path: /metrics
    static_configs:
      - targets: ['175.45.193.25:9400']

# Other available options: authorization settings,
# service discovery (sd) settings.

# Example settings for actual scrape targets:
# static_configs:
#   - targets: ['192.168.0.80:3300', 'localhost:9100', 'localhost:80'] // prometheus, node-exporter, cadvisor
#     labels: # optional — labels attached to every metric scraped from these targets
#       service : 'monitor-1'

# relabel_configs - rewrite target labels before scraping
# metric_relabel_configs - dynamically rewrite labels of scraped metrics (drop, replace, labeldrop)


# # Alerting specifies settings related to the Alertmanager.
# alerting:
#   alert_relabel_configs:
#     [ - <relabel_config> ... ]
#   alertmanagers:
#     [ - <alertmanager_config> ... ]
21 changes: 21 additions & 0 deletions mlops/docker/monitoring/prometheus/config/rule.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
groups:
  - name: example # must be unique within this file
    rules:

      # Alert for any instance that is unreachable for >5 minutes.
      - alert: InstanceDown
        expr: up == 0
        for: 5m
        labels:
          severity: page
        annotations:
          summary: "Instance {{ $labels.instance }} down"
          description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."

      # Alert for any instance that has a median request latency >1s.
      # NOTE(review): this rule carries no severity label, so severity-based
      # routing/inhibition in Alertmanager will not match it — confirm intended.
      - alert: APIHighRequestLatency
        expr: api_http_request_latencies_second{quantile="0.5"} > 1
        for: 10m
        annotations:
          summary: "High request latency on {{ $labels.instance }}"
          description: "{{ $labels.instance }} has a median request latency above 1s (current value: {{ $value }}s)"
23 changes: 23 additions & 0 deletions mlops/docker/pipeline/Dockerfile.sf2f
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# SF2F model-serving image: PyTorch + CUDA runtime base with the Flask/gunicorn
# serving stack and MLflow/MinIO clients.
FROM pytorch/pytorch:2.2.1-cuda11.8-cudnn8-runtime

# Application source is baked into the image.
ENV base_path /workspace/
COPY . ${base_path}

# gcc is needed to compile C extensions (webrtcvad, python_speech_features).
# fixed: use apt-get (apt warns when used in scripts), install in a single
# layer with --no-install-recommends, and purge the apt lists to keep the
# image slim.
RUN apt-get update -y && \
    apt-get install -y --no-install-recommends gcc && \
    rm -rf /var/lib/apt/lists/*

# Serving + tracking dependencies. --no-cache-dir avoids bloating the layer
# with the pip download cache.
# NOTE(review): versions are unpinned — consider pinning for reproducible builds.
RUN pip install --no-cache-dir \
    glog \
    python_speech_features \
    webrtcvad \
    pydub \
    mlflow \
    minio \
    boto3 \
    flask \
    flask_cors \
    gunicorn \
    python-dotenv

# gunicorn listens on this port (started via docker-compose_serving.yaml).
EXPOSE 3002

# CMD [ "python", "inference.py" ]
37 changes: 37 additions & 0 deletions mlops/docker/pipeline/Dockerfile.swimswap
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# SimSwap model-serving image: PyTorch + CUDA runtime base; downloads
# pre-trained weights at build time and installs the serving stack.
FROM pytorch/pytorch:2.2.1-cuda11.8-cudnn8-runtime

ENV base_path /workspace
COPY . ${base_path}

# Checkpoint/model directories expected by the application code.
RUN mkdir -p ${base_path}/insightface_func/models \
    ${base_path}/parsing_model/checkpoint \
    ${base_path}/arcface_model

# libgl1/libglib2.0 are OpenCV runtime dependencies; unzip extracts the
# face-detection model archive below.
# fixed: use apt-get (apt warns in scripts), single layer,
# --no-install-recommends, and purge the apt lists.
RUN apt-get update -y && \
    apt-get install -y --no-install-recommends \
        libgl1-mesa-glx \
        libglib2.0-0 \
        unzip && \
    rm -rf /var/lib/apt/lists/*

# Pre-trained weights fetched at build time from the project's object store.
ADD https://storage.makezenerator.com:9000/voice2face-public/model_resource/swimswap/79999_iter.pth ${base_path}/parsing_model/checkpoint

ADD https://storage.makezenerator.com:9000/voice2face-public/model_resource/swimswap/antelope.zip ${base_path}/insightface_func/models

# Extract the models and remove the archive so it does not linger in the
# final filesystem.
RUN unzip ${base_path}/insightface_func/models/antelope.zip -d ${base_path}/insightface_func/models/ && \
    rm ${base_path}/insightface_func/models/antelope.zip

# Serving + tracking dependencies; --no-cache-dir keeps the layer small.
# NOTE(review): only insightface is pinned — consider pinning the rest.
RUN pip install --no-cache-dir \
    imageio \
    imageio-ffmpeg \
    insightface==0.2.1 \
    onnxruntime \
    onnxruntime-gpu \
    mlflow \
    minio \
    boto3 \
    flask \
    flask_cors \
    gunicorn \
    python-dotenv

# ENTRYPOINT gunicorn -w 1 -b 0.0.0.0:3001 app:app --reload --timeout 90
# CMD [ "python", "inference.py" ]
46 changes: 46 additions & 0 deletions mlops/docker/pipeline/docker-compose_serving.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
version: "3"

services:
  # SF2F (voice-to-face) model server, GPU-backed, served by gunicorn.
  sf2f:
    build:
      context: ./sf2f/
      dockerfile: Dockerfile.sf2f
    container_name: sf2f-server
    # ports:
    #   - 3002:3002
    # Runtime credentials/settings come from an untracked env file.
    env_file:
      .env.serving
    # Long --timeout since a single inference can run for minutes.
    command: gunicorn -w 4 -b 0.0.0.0:3002 app:app --reload --timeout 2000
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    # networks:
    #   - inference
    # Host networking — presumably so the two servers reach each other on
    # localhost; verify against the inference pipeline's caller.
    network_mode: "host"

  # SimSwap face-swap model server.
  # NOTE(review): context is ./SwimSwap/ — confirm the directory really is
  # spelled "SwimSwap" (the upstream project is named "SimSwap").
  swimswap:
    build:
      context: ./SwimSwap/
      dockerfile: Dockerfile.swimswap
    container_name: swimswap-server
    # ports:
    #   - 3001:3001
    env_file:
      .env.serving
    command: gunicorn -w 4 -b 0.0.0.0:3001 app:app --timeout 2700
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    network_mode: "host"
    # networks:
    #   - inference

# networks:
#   inference:
#     driver: host
2 changes: 2 additions & 0 deletions mlops/mlflow/registry/Swimswap/checkpoints/people/iter.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
519
4062
Loading

0 comments on commit 855ffb0

Please sign in to comment.