Skip to content

Commit

Permalink
Feat : version 1
Browse files Browse the repository at this point in the history
  • Loading branch information
jouhy authored and soyoonjeong committed Mar 27, 2024
1 parent e167a68 commit 855ffb0
Show file tree
Hide file tree
Showing 235 changed files with 42,662 additions and 0 deletions.
1 change: 1 addition & 0 deletions mlops/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# voice2face-mlops
14 changes: 14 additions & 0 deletions mlops/docker/mlflow/DockerFile.mlflow
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# MLflow tracking-server image: slim Python 3.9 base (amd64-pinned so the
# image matches the deployment hosts regardless of the build machine),
# MLflow + S3/Postgres client libraries, plus the MinIO "mc" CLI.
FROM amd64/python:3.9-slim

# git and wget are needed below (wget fetches the MinIO client); the apt
# lists are purged in the same layer to keep the image small.
RUN apt-get update && apt-get install -y \
git \
wget \
&& rm -rf /var/lib/apt/lists/*

# Pinned boto3/mlflow; psycopg2-binary is the driver for the Postgres
# backend store used by the compose file's --backend-store-uri.
RUN pip install -U pip &&\
pip install boto3==1.26.8 mlflow==1.30.0 psycopg2-binary

# Install the MinIO client (mc); the compose file's startup command uses it
# to create the artifact bucket before launching the MLflow server.
RUN cd /tmp && \
wget https://dl.min.io/client/mc/release/linux-amd64/mc && \
chmod +x mc && \
mv mc /usr/bin/mc
58 changes: 58 additions & 0 deletions mlops/docker/mlflow/docker-compose_mlflow.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
version: "3"

services:
  # Postgres backend store for MLflow run/experiment metadata.
  mlflow-backend-store:
    image: postgres:14.0
    container_name: mlflow-backend-store
    environment:
      # NOTE(review): credentials are committed in plain text — move them to
      # an untracked env file or a secret store.
      POSTGRES_USER: mlflowuser
      POSTGRES_PASSWORD: mlflowpassword
      POSTGRES_DB: mlflowdatabase
    healthcheck:
      test: ["CMD", "pg_isready", "-q", "-U", "mlflowuser", "-d", "mlflowdatabase"]
      interval: 10s
      timeout: 5s
      retries: 5

  # MinIO object store holding MLflow artifacts (S3-compatible).
  mlflow-artifact-store:
    image: minio/minio:RELEASE.2024-01-18T22-51-28Z
    container_name: mlflow-artifact-store
    ports:
      # Quoted so the mapping is always parsed as a string, never a number.
      - "9000:9000"
      - "9001:9001"
    environment:
      # write username/password here — they must match the credentials used
      # by `mc config host add` in the mlflow-server command below
      # (currently "minio" / "miniostorage") — TODO confirm.
      MINIO_ROOT_USER: # write username
      MINIO_ROOT_PASSWORD: # write password
    command: server /data/minio --console-address :9001
    healthcheck:
      test: ["CMD", "mc", "ready", "local"]
      interval: 5s
      timeout: 5s
      retries: 5

  # MLflow tracking server; waits for both stores to report healthy.
  mlflow-server:
    build:
      context: .
      # fixed: the Dockerfile in this directory is named "DockerFile.mlflow",
      # not "DockerFile_mlflow" — the old value broke `docker compose build`.
      dockerfile: DockerFile.mlflow
    container_name: mlflow-server
    depends_on:
      mlflow-backend-store:
        condition: service_healthy
      mlflow-artifact-store:
        condition: service_healthy
    ports:
      - "5001:5000"
    environment:
      # NOTE(review): live-looking AWS-style keys are committed here — rotate
      # them and load from an untracked env file instead of hard-coding.
      AWS_ACCESS_KEY: AKIA3FLD32HPRN22NJQ7
      AWS_SECRET_ACCESS_KEY: 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
      MLFLOW_S3_ENDPOINT_URL: http://mlflow-artifact-store:9000
    command:
      - /bin/sh
      - -c
      # fixed: `&&` added after `mc mb` so the server does not start when
      # bucket creation fails (previously the failure was silently ignored).
      - |
        mc config host add mlflowminio http://mlflow-artifact-store:9000 minio miniostorage &&
        mc mb --ignore-existing mlflowminio/mlflow &&
        mlflow server \
          --backend-store-uri postgresql://mlflowuser:mlflowpassword@mlflow-backend-store/mlflowdatabase \
          --default-artifact-root s3://mlflow/ \
          --host 0.0.0.0
35 changes: 35 additions & 0 deletions mlops/docker/monitoring/alertmanager/config/alertmanager.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
global:
  resolve_timeout: 1m

# All alerts go to the Slack-compatible receiver; critical alerts match the
# explicit sub-route (same receiver, kept for future differentiation).
route:
  group_by: ['alertname', 'instance']
  group_wait: 10s
  group_interval: 5m
  repeat_interval: 1h
  receiver: 'slack-notifications'
  routes:
    - match:
        severity: 'critical'
      receiver: 'slack-notifications'

receivers:
  - name: 'slack-notifications'
    slack_configs:
      # fixed: Discord only accepts Slack-formatted payloads on its
      # "/slack" endpoint — without the suffix Discord rejects every
      # notification with an error.
      # NOTE(review): this webhook secret is committed to VCS — rotate it
      # and load it from a file (slack_api_url_file) instead.
      - api_url: 'https://discord.com/api/webhooks/1222157548657049611/TAhDV5DnL1sAVNBYJivf3CYe7877PKoBSsp0QZ9DgEMaVNaslR6wlBZuaSmk6NiQZ7zZ/slack'
        channel: '#alerts'
        send_resolved: true
        title: '[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] Monitoring Alert - {{ .CommonLabels.alertname }} for {{ .CommonLabels.instance }}'
        # fixed: literal "\n" in a folded scalar reached the receiver as the
        # two characters backslash-n; a literal block scalar (|-) keeps real
        # newlines so the message renders as separate lines.
        text: |-
          {{ range .Alerts }}
          *Alert:* {{ .Annotations.summary }}
          *Description:* {{ .Annotations.description }}
          *Details:*
          {{ range .Labels.SortedPairs }} • *{{ .Name }}:* {{ .Value }}
          {{ end }}
          {{ end }}

# Suppress warning-level duplicates of an already-firing critical alert.
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'instance']
47 changes: 47 additions & 0 deletions mlops/docker/monitoring/docker-compose_monitoring.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
version: '3.7'

services:
  # Prometheus server: config is bind-mounted read/write so the
  # --web.enable-lifecycle endpoint can trigger reloads.
  prometheus:
    image: prom/prometheus
    container_name: prometheus
    volumes:
      - ./prometheus/config/:/etc/prometheus/
      - ./prometheus/prometheus-volume:/prometheus
    ports:
      # Quoted so port mappings are always parsed as strings.
      - '9090:9090'
    command:
      # Quoting unified (was a mix of double and single quotes).
      - '--web.enable-lifecycle'
      - '--config.file=/etc/prometheus/prometheus.yml'
    restart: always
    networks:
      - promnet
    # root is required to write the bind-mounted volume directories.
    user: root

  grafana:
    image: grafana/grafana
    container_name: grafana
    volumes:
      - ./grafana-volume:/var/lib/grafana
    restart: always
    networks:
      - promnet
    ports:
      # Host 3300 -> container 3000 (Grafana default).
      - '3300:3000'
    user: root

  alertmanager:
    image: prom/alertmanager
    container_name: alertmanager
    user: root
    ports:
      - '9093:9093'
    volumes:
      - ./alertmanager/config/:/etc/alertmanager/
    networks:
      - promnet
    restart: always
    command:
      - '--config.file=/etc/alertmanager/alertmanager.yml'

networks:
  promnet:
    driver: bridge
26 changes: 26 additions & 0 deletions mlops/docker/monitoring/docker-compose_node_exporter.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
version: '3.7'

services:
  # Host-level metrics exporter (CPU, memory, disk, network).
  node:
    image: prom/node-exporter
    container_name: node-exporter
    ports:
      # Quoted so port mappings are always parsed as strings.
      - '9100:9100'
    networks:
      - promnet

  # NVIDIA GPU metrics exporter; reserves one GPU via the nvidia driver.
  dcgm:
    # fixed: "image :" / "container_name :" had a space before the colon,
    # which yamllint rejects and some parsers mis-handle.
    image: nvcr.io/nvidia/k8s/dcgm-exporter:3.2.6-3.1.9-ubuntu20.04
    container_name: dcgm-exporter
    ports:
      - '9400:9400'
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    networks:
      - promnet

networks:
  promnet:
    driver: bridge
55 changes: 55 additions & 0 deletions mlops/docker/monitoring/prometheus/config/prometheus.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
# Global defaults — every setting in this section is optional.
global:
  scrape_interval: 15s # default interval for scraping targets, lowered to 15s / default = 1m
  scrape_timeout: 15s # how long a scrape request may run before timing out / default = 10s
  evaluation_interval: 2m # how often rules are evaluated / default = 1m

  # Attach these labels to any time series or alerts when communicating with
  # external systems (federation, remote storage, Alertmanager).
  external_labels:
    monitor: 'codelab-monitor' # label attached by default to all exported data
  # query_log_file: <path-to-log-file>.log # records Prometheus query logs; no logging when unset

# Load rules and evaluate them periodically according to 'evaluation_interval'.
rule_files:
  - "rule.yml" # located in the same directory as prometheus.yml

# Endpoints to scrape metrics from.
# NOTE(review): the original comment said these point at the Prometheus
# server itself, but the jobs below all target remote exporters — confirm.

scrape_configs:
  - job_name: 'inference_node_exporter'
    metrics_path: /metrics
    static_configs:
      - targets: ['175.45.193.25:9100']
  - job_name: 'web_node_exporter'
    metrics_path: /metrics
    static_configs:
      - targets: ['175.45.194.59:9100']
  - job_name: 'minio_node_exporter'
    metrics_path: /metrics
    static_configs:
      - targets: ['223.130.133.236:9100']
  - job_name: 'gpu_exporter'
    metrics_path: /metrics
    static_configs:
      - targets: ['175.45.193.25:9400']

# Other available options: authorization settings,
# service discovery (sd) settings.

# Example settings for actual scrape targets:
# static_configs:
#   - targets: ['192.168.0.80:3300', 'localhost:9100', 'localhost:80'] // prometheus, node-exporter, cadvisor
#     labels: # optional — labels attached to every metric scraped from these targets
#       service : 'monitor-1'

# relabel_configs - rewrite target labels before scraping
# metric_relabel_configs - dynamically rewrite labels of scraped metrics (drop, replace, labeldrop)


# # Alerting specifies settings related to the Alertmanager.
# alerting:
#   alert_relabel_configs:
#     [ - <relabel_config> ... ]
#   alertmanagers:
#     [ - <alertmanager_config> ... ]
21 changes: 21 additions & 0 deletions mlops/docker/monitoring/prometheus/config/rule.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
groups:
  - name: example # must be unique within this file
    rules:

      # Alert for any instance that is unreachable for >5 minutes.
      - alert: InstanceDown
        expr: up == 0
        for: 5m
        labels:
          severity: page
        annotations:
          summary: "Instance {{ $labels.instance }} down"
          description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."

      # Alert for any instance that has a median request latency >1s.
      # NOTE(review): this rule carries no severity label, so severity-based
      # routing/inhibition in Alertmanager will not match it — confirm intended.
      - alert: APIHighRequestLatency
        expr: api_http_request_latencies_second{quantile="0.5"} > 1
        for: 10m
        annotations:
          summary: "High request latency on {{ $labels.instance }}"
          description: "{{ $labels.instance }} has a median request latency above 1s (current value: {{ $value }}s)"
23 changes: 23 additions & 0 deletions mlops/docker/pipeline/Dockerfile.sf2f
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# SF2F model-serving image: PyTorch + CUDA runtime base with the Flask/gunicorn
# serving stack and MLflow/MinIO clients.
FROM pytorch/pytorch:2.2.1-cuda11.8-cudnn8-runtime

# Application source is baked into the image.
ENV base_path /workspace/
COPY . ${base_path}

# gcc is needed to compile C extensions (webrtcvad, python_speech_features).
# fixed: use apt-get (apt warns when used in scripts), install in a single
# layer with --no-install-recommends, and purge the apt lists to keep the
# image slim.
RUN apt-get update -y && \
    apt-get install -y --no-install-recommends gcc && \
    rm -rf /var/lib/apt/lists/*

# Serving + tracking dependencies. --no-cache-dir avoids bloating the layer
# with the pip download cache.
# NOTE(review): versions are unpinned — consider pinning for reproducible builds.
RUN pip install --no-cache-dir \
    glog \
    python_speech_features \
    webrtcvad \
    pydub \
    mlflow \
    minio \
    boto3 \
    flask \
    flask_cors \
    gunicorn \
    python-dotenv

# gunicorn listens on this port (started via docker-compose_serving.yaml).
EXPOSE 3002

# CMD [ "python", "inference.py" ]
37 changes: 37 additions & 0 deletions mlops/docker/pipeline/Dockerfile.swimswap
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# SimSwap model-serving image: PyTorch + CUDA runtime base; downloads
# pre-trained weights at build time and installs the serving stack.
FROM pytorch/pytorch:2.2.1-cuda11.8-cudnn8-runtime

ENV base_path /workspace
COPY . ${base_path}

# Checkpoint/model directories expected by the application code.
RUN mkdir -p ${base_path}/insightface_func/models \
    ${base_path}/parsing_model/checkpoint \
    ${base_path}/arcface_model

# libgl1/libglib2.0 are OpenCV runtime dependencies; unzip extracts the
# face-detection model archive below.
# fixed: use apt-get (apt warns in scripts), single layer,
# --no-install-recommends, and purge the apt lists.
RUN apt-get update -y && \
    apt-get install -y --no-install-recommends \
        libgl1-mesa-glx \
        libglib2.0-0 \
        unzip && \
    rm -rf /var/lib/apt/lists/*

# Pre-trained weights fetched at build time from the project's object store.
ADD https://storage.makezenerator.com:9000/voice2face-public/model_resource/swimswap/79999_iter.pth ${base_path}/parsing_model/checkpoint

ADD https://storage.makezenerator.com:9000/voice2face-public/model_resource/swimswap/antelope.zip ${base_path}/insightface_func/models

# Extract the models and remove the archive so it does not linger in the
# final filesystem.
RUN unzip ${base_path}/insightface_func/models/antelope.zip -d ${base_path}/insightface_func/models/ && \
    rm ${base_path}/insightface_func/models/antelope.zip

# Serving + tracking dependencies; --no-cache-dir keeps the layer small.
# NOTE(review): only insightface is pinned — consider pinning the rest.
RUN pip install --no-cache-dir \
    imageio \
    imageio-ffmpeg \
    insightface==0.2.1 \
    onnxruntime \
    onnxruntime-gpu \
    mlflow \
    minio \
    boto3 \
    flask \
    flask_cors \
    gunicorn \
    python-dotenv

# ENTRYPOINT gunicorn -w 1 -b 0.0.0.0:3001 app:app --reload --timeout 90
# CMD [ "python", "inference.py" ]
46 changes: 46 additions & 0 deletions mlops/docker/pipeline/docker-compose_serving.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
version: "3"

services:
  # SF2F (voice-to-face) model server, GPU-backed, served by gunicorn.
  sf2f:
    build:
      context: ./sf2f/
      dockerfile: Dockerfile.sf2f
    container_name: sf2f-server
    # ports:
    #   - 3002:3002
    # Runtime credentials/settings come from an untracked env file.
    env_file:
      .env.serving
    # Long --timeout since a single inference can run for minutes.
    command: gunicorn -w 4 -b 0.0.0.0:3002 app:app --reload --timeout 2000
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    # networks:
    #   - inference
    # Host networking — presumably so the two servers reach each other on
    # localhost; verify against the inference pipeline's caller.
    network_mode: "host"

  # SimSwap face-swap model server.
  # NOTE(review): context is ./SwimSwap/ — confirm the directory really is
  # spelled "SwimSwap" (the upstream project is named "SimSwap").
  swimswap:
    build:
      context: ./SwimSwap/
      dockerfile: Dockerfile.swimswap
    container_name: swimswap-server
    # ports:
    #   - 3001:3001
    env_file:
      .env.serving
    command: gunicorn -w 4 -b 0.0.0.0:3001 app:app --timeout 2700
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    network_mode: "host"
    # networks:
    #   - inference

# networks:
#   inference:
#     driver: host
2 changes: 2 additions & 0 deletions mlops/mlflow/registry/Swimswap/checkpoints/people/iter.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
519
4062
Loading

0 comments on commit 855ffb0

Please sign in to comment.