Skip to content

Commit

Permalink
Merge pull request #448 from atheo89/sync-k-main
Browse files Browse the repository at this point in the history
Sync rhoai:main from odh:main
  • Loading branch information
atheo89 authored Nov 28, 2024
2 parents aed66a4 + ed770de commit c8bf0bd
Show file tree
Hide file tree
Showing 128 changed files with 14,624 additions and 13,851 deletions.
303 changes: 262 additions & 41 deletions .github/workflows/build-notebooks-TEMPLATE.yaml

Large diffs are not rendered by default.

26 changes: 23 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,3 +1,18 @@
# Makefile prologue, based on https://tech.davis-hansson.com/p/make/
# Run every recipe with bash instead of the default /bin/sh.
SHELL := bash
# todo: do not set .ONESHELL: for now
# http://redsymbol.net/articles/unofficial-bash-strict-mode/
# Strict-mode shell flags are intentionally left disabled (commented out) for now.
#.SHELLFLAGS := -eu -o pipefail -c
# Remove a target file when its recipe fails, so a half-written file never looks up to date.
.DELETE_ON_ERROR:
# Warn whenever an undefined variable is referenced (surfaces typos).
MAKEFLAGS += --warn-undefined-variables
# Disable GNU Make's built-in implicit rules.
MAKEFLAGS += --no-builtin-rules

# todo: leave the default recipe prefix for now
# Fail early with a readable message when this Make has no .RECIPEPREFIX variable.
ifeq ($(origin .RECIPEPREFIX), undefined)
$(error This Make does not support .RECIPEPREFIX. Please use GNU Make 4.0 or later)
endif
# An empty value keeps the default recipe prefix (a tab character).
.RECIPEPREFIX =

IMAGE_REGISTRY ?= quay.io/opendatahub/workbench-images
RELEASE ?= 2024b
# additional user-specified caching parameters for $(CONTAINER_ENGINE) build
Expand All @@ -16,6 +31,9 @@ else
WHERE_WHICH ?= which
endif

# Platform reported by the Go toolchain as <os>/<arch>, e.g. linux/amd64 or darwin/arm64.
# Requires `go` on PATH. The assignment is recursive (`=`), so both `go env` calls
# run each time OS_ARCH is expanded rather than once when the Makefile is parsed.
OS_ARCH=$(shell go env GOOS)/$(shell go env GOARCH)

IMAGE_TAG ?= $(RELEASE)_$(DATE)
KUBECTL_BIN ?= bin/kubectl
KUBECTL_VERSION ?= v1.23.11
Expand Down Expand Up @@ -392,7 +410,7 @@ rocm-runtime-tensorflow-ubi9-python-3.11: rocm-ubi9-python-3.11
bin/kubectl:
ifeq (,$(wildcard $(KUBECTL_BIN)))
@mkdir -p bin
@curl -sSL https://dl.k8s.io/release/$(KUBECTL_VERSION)/bin/linux/amd64/kubectl > \
@curl -sSL https://dl.k8s.io/release/$(KUBECTL_VERSION)/bin/$(OS_ARCH)/kubectl > \
$(KUBECTL_BIN)
@chmod +x $(KUBECTL_BIN)
endif
Expand Down Expand Up @@ -508,6 +526,7 @@ validate-runtime-image: bin/kubectl
$(info # Running tests for $(NOTEBOOK_NAME) runtime...)
$(KUBECTL_BIN) wait --for=condition=ready pod runtime-pod --timeout=300s
@required_commands=$(REQUIRED_RUNTIME_IMAGE_COMMANDS) ; \
fail=0 ; \
if [[ $$image == "" ]] ; then \
echo "Usage: make validate-runtime-image image=<container-image-name>" ; \
exit 1 ; \
Expand All @@ -522,11 +541,12 @@ validate-runtime-image: bin/kubectl
fi; \
if [ $$cmd == "python3" ]; then \
echo "=> Checking notebook execution..." ; \
$(KUBECTL_BIN) exec runtime-pod -- /bin/sh -c "python3 -m pip install -r /opt/app-root/elyra/requirements-elyra.txt && \
$(KUBECTL_BIN) exec runtime-pod -- /bin/sh -c "curl https://raw.githubusercontent.com/opendatahub-io/elyra/refs/heads/main/etc/generic/requirements-elyra.txt --output req.txt && \
python3 -m pip install -r req.txt > /dev/null && \
curl https://raw.githubusercontent.com/nteract/papermill/main/papermill/tests/notebooks/simple_execute.ipynb --output simple_execute.ipynb && \
python3 -m papermill simple_execute.ipynb output.ipynb > /dev/null" ; \
if [ $$? -ne 0 ]; then \
echo "ERROR: Image does not meet Python requirements criteria in requirements-elyra.txt" ; \
echo "ERROR: Image does not meet Python requirements criteria in pipfile" ; \
fail=1; \
fi; \
fi; \
Expand Down
22 changes: 22 additions & 0 deletions ci/cached-builds/11-crio-ipv4-bridge.conflist
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
{
"cniVersion": "1.0.0",
"name": "crio",
"plugins": [
{
"type": "bridge",
"bridge": "cni0",
"isGateway": true,
"ipMasq": true,
"hairpinMode": true,
"ipam": {
"type": "host-local",
"routes": [
{ "dst": "0.0.0.0/0" }
],
"ranges": [
[{ "subnet": "10.85.0.0/16" }]
]
}
}
]
}
5 changes: 5 additions & 0 deletions ci/cached-builds/containers.conf
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,13 @@ retry=100
# supposedly these images are faster to pull
compression_format="zstd:chunked"
compression_level=6

# defaults to /var/tmp, which is small
image_copy_tmp_dir="storage"
# setting image_copy_tmp_dir is not enough; the build can still fail with
# Error: creating build container: writing blob: storing blob to file "/var/tmp/container_images_storage2384030476/20": write /var/tmp/container_images_storage2384030476/20: no space left on device
# https://github.com/containers/podman/issues/5411, https://github.com/containers/podman/pull/5412
# Set the TMPDIR env variable, https://github.com/containers/podman/blob/d85ac938e60938369ff1337dccaf0943b7405f48/cmd/podman/images/load.go#L96

[machine]

Expand Down
17 changes: 17 additions & 0 deletions ci/cached-builds/crio.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# https://github.com/cri-o/cri-o/blob/main/docs/crio.conf.5.md

[crio]
storage_driver = "overlay"
# storage_option = [ "overlay.mountopt=nodev,metacopy=on" ]

# reuse podman's container storage because we have huge images that don't fit on disk twice
root = "/home/runner/.local/share/containers/storage"
# has to be the same as root!
runroot = "/home/runner/.local/share/containers/storage"

# https://stackoverflow.com/questions/62408028/kubelet-failed-to-createpodsandbox-for-coredns-failed-to-set-bridge-addr-c
[crio.network]
# the /etc/cni/net.d/11-crio-ipv4-bridge.conflist default IPs conflict with flannel;
# in older versions of kubernetes the kubelet was touching the cni, now only the container runtime touches it
# c.f. https://kubernetes.io/docs/concepts/extend-kubernetes/compute-storage-net/network-plugins/#installation
#network_dir = "/etc/cni/net.d-kube/"
4 changes: 3 additions & 1 deletion ci/cached-builds/gha_lvm_overlay.sh
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,9 @@ if [[ ${overprovision_lvm} == 'true' ]]; then
else
sudo mkfs.ext4 -Enodiscard -m0 "/dev/mapper/${VG_NAME}-buildlv"
fi
sudo mount "/dev/mapper/${VG_NAME}-buildlv" "${build_mount_path}"
mkdir -p "${build_mount_path}"
# https://www.alibabacloud.com/help/en/ecs/use-cases/mount-parameters-for-ext4-file-systems?spm=a2c63.p38356.help-menu-25365.d_5_10_12.48ce3be5RixoUB#8e740ed072m5o
sudo mount -o defaults,noatime,nodiratime,nobarrier,nodelalloc,data=writeback "/dev/mapper/${VG_NAME}-buildlv" "${build_mount_path}"
sudo chown -R "${build_mount_path_ownership}" "${build_mount_path}"

# if build mount path is a parent of $GITHUB_WORKSPACE, and has been deleted, recreate it
Expand Down
58 changes: 58 additions & 0 deletions ci/cached-builds/has_tests.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
#!/usr/bin/env python3
import argparse
import json
import os
import pathlib
import typing
import unittest

import gha_pr_changed_files

"""Determines whether we have deploy Makefile tests for this target or not
https://github.com/openshift/release/blob/master/ci-operator/config/opendatahub-io/notebooks/opendatahub-io-notebooks-main.yaml#L1485
"""


class Args(argparse.Namespace):
    """Typed namespace for the parsed command-line arguments (gives editors autocompletion)."""

    # value of the --target command-line option
    target: str


def main() -> None:
parser = argparse.ArgumentParser("make_test.py")
parser.add_argument("--target", type=str)
args = typing.cast(Args, parser.parse_args())

has_tests = check_tests(args.target)

if "GITHUB_ACTIONS" in os.environ:
with open(os.environ["GITHUB_OUTPUT"], "at") as f:
print(f"tests={json.dumps(has_tests)}", file=f)

print(f"{has_tests=}")


def check_tests(target: str) -> bool:
    """Tell whether deploy tests exist for the given build target.

    Targets known to have no tests are rejected up front. Otherwise the answer
    is whether the last build directory of the target (the top level layer)
    contains a ``kustomize/base/kustomization.yaml`` file.
    """
    untested_prefixes = ("rocm-jupyter-minimal-", "rocm-jupyter-datascience-")
    if target.startswith(untested_prefixes):
        # we don't have specific tests for -minimal-, ... in ci-operator/config
        return False
    if '-intel-' in target:
        # RHOAIENG-8388: Intel tensorflow notebook failed to get tested on OCP-CI
        return False

    build_dirs = gha_pr_changed_files.analyze_build_directories(target)
    # TODO: check only the last directory (the top level layer) for now
    for build_dir in reversed(build_dirs):  # (!)
        project_root = pathlib.Path(gha_pr_changed_files.PROJECT_ROOT)
        return (project_root / build_dir / "kustomize/base/kustomization.yaml").is_file()
    # no build directories at all, hence nothing to test
    return False


class TestCheckTests(unittest.TestCase):
    """Unit tests for ``check_tests``."""

    def test_has_tests(self):
        # unittest assertions keep working under ``python -O`` (which strips bare
        # ``assert`` statements) and report the offending value on failure
        self.assertIs(check_tests("base-c9s-python-3.11"), False)
        self.assertIs(check_tests("jupyter-minimal-ubi9-python-3.9"), True)


# Run the command-line entry point only when executed as a script, not on import.
if __name__ == "__main__":
    main()
11 changes: 0 additions & 11 deletions ci/cached-builds/homebrew.podman.service

This file was deleted.

45 changes: 45 additions & 0 deletions ci/cached-builds/kubeadm.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
---
# kubeadm config print init-defaults > kubeadm.yaml
# kubeadm init --cri-socket=/var/run/crio/crio.sock

# https://kubernetes.io/docs/reference/config-api/kubeadm-config.v1beta3/
# https://kubernetes.io/docs/reference/config-api/kubeadm-config.v1beta4/
apiVersion: kubeadm.k8s.io/v1beta3
bootstrapTokens:
- groups:
- system:bootstrappers:kubeadm:default-node-token
token: abcdef.0123456789abcdef
ttl: 24h0m0s
usages:
- signing
- authentication
kind: InitConfiguration
localAPIEndpoint:
bindPort: 6443
nodeRegistration:
kubeletExtraArgs:
# Need to have enough disk space for Kubelet, so move root-dir onto the LVM volume
# Note: the internet discourages changing the default because storage plugins may then struggle
# https://cep.dev/posts/adventure-trying-change-kubelet-rootdir/
root-dir: "/home/runner/.local/share/containers/kubelet-root-dir"
criSocket: unix:///var/run/crio/crio.sock
imagePullPolicy: IfNotPresent
taints: null
---
apiServer:
timeoutForControlPlane: 4m0s
apiVersion: kubeadm.k8s.io/v1beta3
certificatesDir: /etc/kubernetes/pki
clusterName: kubernetes
controllerManager: {}
dns: {}
etcd:
local:
dataDir: /var/lib/etcd
imageRepository: registry.k8s.io
kind: ClusterConfiguration
networking:
dnsDomain: cluster.local
# this matches the default in /etc/cni/net.d/11-crio-ipv4-bridge.conflist
podSubnet: 10.85.0.0/16
scheduler: {}
20 changes: 20 additions & 0 deletions ci/cached-builds/podman.service
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# https://docs.podman.io/en/latest/markdown/podman-system-service.1.html
# cat /usr/lib/systemd/system/podman.socket

[Unit]
Description=Podman API Service
Requires=podman.socket
After=podman.socket
Documentation=man:podman-system-service(1)
StartLimitIntervalSec=0

[Service]
Delegate=true
Type=exec
KillMode=process
Environment="PATH=/home/linuxbrew/.linuxbrew/bin:/home/linuxbrew/.linuxbrew/sbin:/usr/bin:/bin:/usr/sbin:/sbin"
WorkingDirectory=/home/linuxbrew/.linuxbrew
ExecStart=/home/linuxbrew/.linuxbrew/opt/podman/bin/podman --log-level=info system service

[Install]
WantedBy=default.target
12 changes: 12 additions & 0 deletions ci/cached-builds/podman.socket
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# cat /usr/lib/systemd/system/podman.socket

[Unit]
Description=Podman API Socket
Documentation=man:podman-system-service(1)

[Socket]
ListenStream=%t/podman/podman.sock
SocketMode=0666

[Install]
WantedBy=sockets.target
6 changes: 6 additions & 0 deletions ci/cached-builds/registries.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# https://github.com/containers/image/blob/main/docs/containers-registries.conf.5.md

# prevent this kyverno error
# Failed to pull image "bitnami/kubectl:1.26.4": reading manifest 1.26.4 in quay.io/bitnami/kubectl: unauthorized: access to the requested resource is not authorized
unqualified-search-registries = [ "docker.io" ]
short-name-mode = "enforcing"
10 changes: 9 additions & 1 deletion ci/cached-builds/storage.conf
Original file line number Diff line number Diff line change
@@ -1,7 +1,15 @@
# https://github.com/containers/storage/blob/main/docs/containers-storage.conf.5.md

# Multiple users sharing the same containers/storage is not "supported" as it tends to cause
# various permission issues on the host or wrong uid/gids in the containers. C.f.
# https://access.redhat.com/solutions/6986565
[storage]
driver="overlay"
driver = "overlay"

graphroot = "/home/runner/.local/share/containers/storage"
runroot = "/home/runner/.local/share/containers/storage"

transient_store = true

[storage.options]
# https://www.redhat.com/sysadmin/faster-container-image-pulls
Expand Down
5 changes: 5 additions & 0 deletions ci/yamllint-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,8 @@ rules:
line-length: disable
new-line-at-end-of-file:
level: warning

document-start:
ignore:
# generated file
- 'pnpm-lock.yaml'
2 changes: 1 addition & 1 deletion intel/runtimes/ml/ubi9-python-3.11/Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ plotly = "~=5.16.1"
scipy = "~=1.11.2"
scikit-learn = "~=1.3.1"
skl2onnx = "~=1.15.0"
codeflare-sdk = "~=0.23.1"
codeflare-sdk = "~=0.24.0"
# DB connectors
pymongo = "~=4.5.0"
psycopg = "~=3.1.10"
Expand Down
Loading

0 comments on commit c8bf0bd

Please sign in to comment.