diff --git a/build/ci/docker-deploy/docker_deploy.sh b/build/ci/docker-deploy/docker_deploy.sh index f3ac5db25..b9532f916 100644 --- a/build/ci/docker-deploy/docker_deploy.sh +++ b/build/ci/docker-deploy/docker_deploy.sh @@ -21,11 +21,11 @@ cd ${target_dir} tar -xzf confs-${target_party_id}.tar cd confs-${target_party_id} -docker-compose down +docker compose down docker volume rm -f confs-${target_party_id}_shared_dir_examples docker volume rm -f confs-${target_party_id}_shared_dir_federatedml # exclude client service to save time ! -docker-compose up -d +docker compose up -d cd ../ rm -f confs-${target_party_id}.tar @@ -34,8 +34,8 @@ echo "# party ${target_party_id} training cluster deploy is ok!" echo "# serving cluster deploy begin" tar -xzf serving-${target_party_id}.tar cd serving-${target_party_id} -docker-compose down -docker-compose up -d +docker compose down +docker compose up -d cd ../ rm -f serving-${target_party_id}.tar diff --git a/docker-deploy/.env b/docker-deploy/.env index cef87147c..a8c9bd219 100644 --- a/docker-deploy/.env +++ b/docker-deploy/.env @@ -1,5 +1,5 @@ RegistryURI= -TAG=1.10.0-release +TAG=1.11.1-release SERVING_TAG=2.1.6-release SSH_PORT=22 diff --git a/docker-deploy/README.md b/docker-deploy/README.md index 2a2ab5c36..71e04b4e1 100644 --- a/docker-deploy/README.md +++ b/docker-deploy/README.md @@ -7,8 +7,8 @@ This guide describes the process of deploying FATE using Docker Compose. The nodes (target nodes) to install FATE must meet the following requirements: 1. A Linux host -2. Docker: 18+ -3. Docker-Compose: 1.24+ +2. Docker: 19.03.0+ +3. Docker Compose: 1.27.0+ 4. The deployment machine have access to the Internet, so the hosts can communicate with each other; 5. Network connection to Internet to pull container images from Docker Hub. 
If network connection to Internet is not available, consider to set up [Harbor as a local registry](../registry/README.md) or use [offline images](https://github.com/FederatedAI/FATE/tree/master/build/docker-build). 6. A host running FATE is recommended to be with 8 CPUs and 16G RAM. @@ -117,6 +117,23 @@ bash ./generate_config.sh Now, tar files have been generated for each party including the exchange node (party). They are named as ```confs-.tar``` and ```serving-.tar```. +### GPU support + +Starting from v1.11.1, the Docker Compose deployment supports running FATE with GPUs. To use a GPU, you first need to set up a GPU-enabled Docker environment; refer to the official Docker documentation (). + +To use the GPU, modify the following configuration items; both `algorithm` and `device` need to be changed: + +```sh +algorithm=NN +device=GPU + +gpu_count=1 +``` + +Only the fateflow component uses the GPU, so each party needs at least one GPU. + +*`gpu_count` is mapped to `count`; see [Docker compose GPU support](https://docs.docker.com/compose/gpu-support/).* + ### Deploying FATE to target hosts **Note:** Before running the below commands, all target hosts must @@ -166,12 +183,12 @@ CONTAINER ID IMAGE COMMAND 3dca43f3c9d5 federatedai/serving-admin:2.1.5-release "/bin/sh -c 'java -c…" 5 minutes ago Up 5 minutes 0.0.0.0:8350->8350/tcp, :::8350->8350/tcp serving-9999_serving-admin_1 fe924918509b federatedai/serving-proxy:2.1.5-release "/bin/sh -c 'java -D…" 5 minutes ago Up 5 minutes 0.0.0.0:8059->8059/tcp, :::8059->8059/tcp, 0.0.0.0:8869->8869/tcp, :::8869->8869/tcp, 8879/tcp serving-9999_serving-proxy_1 b62ed8ba42b7 bitnami/zookeeper:3.7.0 "/opt/bitnami/script…" 5 minutes ago Up 5 minutes 0.0.0.0:2181->2181/tcp, :::2181->2181/tcp, 8080/tcp, 0.0.0.0:49226->2888/tcp, :::49226->2888/tcp, 0.0.0.0:49225->3888/tcp, :::49225->3888/tcp serving-9999_serving-zookeeper_1 -3c643324066f federatedai/client:1.10.0-release "/bin/sh -c 'flow in…" 5 minutes ago Up 5 minutes 
0.0.0.0:20000->20000/tcp, :::20000->20000/tcp confs-9999_client_1 -3fe0af1ebd71 federatedai/fateboard:1.10.0-release "/bin/sh -c 'java -D…" 5 minutes ago Up 5 minutes 0.0.0.0:8080->8080/tcp, :::8080->8080/tcp confs-9999_fateboard_1 -635b7d99357e federatedai/fateflow:1.10.0-release "container-entrypoin…" 5 minutes ago Up 5 minutes (healthy) 0.0.0.0:9360->9360/tcp, :::9360->9360/tcp, 8080/tcp, 0.0.0.0:9380->9380/tcp, :::9380->9380/tcp confs-9999_fateflow_1 -8b515f08add3 federatedai/eggroll:1.10.0-release "/tini -- bash -c 'j…" 5 minutes ago Up 5 minutes 8080/tcp, 0.0.0.0:9370->9370/tcp, :::9370->9370/tcp confs-9999_rollsite_1 -108cc061c191 federatedai/eggroll:1.10.0-release "/tini -- bash -c 'j…" 5 minutes ago Up 5 minutes 4670/tcp, 8080/tcp confs-9999_clustermanager_1 -f10575e76899 federatedai/eggroll:1.10.0-release "/tini -- bash -c 'j…" 5 minutes ago Up 5 minutes 4671/tcp, 8080/tcp confs-9999_nodemanager_1 +3c643324066f federatedai/client:1.11.1-release "/bin/sh -c 'flow in…" 5 minutes ago Up 5 minutes 0.0.0.0:20000->20000/tcp, :::20000->20000/tcp confs-9999_client_1 +3fe0af1ebd71 federatedai/fateboard:1.11.1-release "/bin/sh -c 'java -D…" 5 minutes ago Up 5 minutes 0.0.0.0:8080->8080/tcp, :::8080->8080/tcp confs-9999_fateboard_1 +635b7d99357e federatedai/fateflow:1.11.1-release "container-entrypoin…" 5 minutes ago Up 5 minutes (healthy) 0.0.0.0:9360->9360/tcp, :::9360->9360/tcp, 8080/tcp, 0.0.0.0:9380->9380/tcp, :::9380->9380/tcp confs-9999_fateflow_1 +8b515f08add3 federatedai/eggroll:1.11.1-release "/tini -- bash -c 'j…" 5 minutes ago Up 5 minutes 8080/tcp, 0.0.0.0:9370->9370/tcp, :::9370->9370/tcp confs-9999_rollsite_1 +108cc061c191 federatedai/eggroll:1.11.1-release "/tini -- bash -c 'j…" 5 minutes ago Up 5 minutes 4670/tcp, 8080/tcp confs-9999_clustermanager_1 +f10575e76899 federatedai/eggroll:1.11.1-release "/tini -- bash -c 'j…" 5 minutes ago Up 5 minutes 4671/tcp, 8080/tcp confs-9999_nodemanager_1 aa0a0002de93 mysql:8.0.28 "docker-entrypoint.s…" 5 minutes 
ago Up 5 minutes 3306/tcp, 33060/tcp confs-9999_mysql_1 ``` @@ -474,6 +491,6 @@ To delete the cluster completely, log in to each host and run the commands as fo ```bash cd /data/projects/fate/confs-/ # id of party -docker-compose down +docker compose down rm -rf ../confs-/ # delete the legacy files ``` diff --git a/docker-deploy/README_zh.md b/docker-deploy/README_zh.md index 2b222f4fa..20da699f2 100644 --- a/docker-deploy/README_zh.md +++ b/docker-deploy/README_zh.md @@ -17,8 +17,8 @@ Compose是用于定义和运行多容器Docker应用程序的工具。通过Comp ## 准备工作 1. 两个主机(物理机或者虚拟机,都是Centos7系统); -2. 所有主机安装Docker 版本 : 18+; -3. 所有主机安装Docker-Compose 版本: 1.24+; +2. 所有主机安装Docker 版本 : 19.03.0+; +3. 所有主机安装Docker Compose 版本: 1.27.0+; 4. 部署机可以联网,所以主机相互之间可以网络互通; 5. 运行机已经下载FATE的各组件镜像,如果无法连接dockerhub,请考虑使用harbor([Harbor 作为本地镜像源](../registry/README.md))或者使用离线部署(离线构建镜像参考文档[构建镜像](https://github.com/FederatedAI/FATE/tree/master/build/docker-build))。 6. 运行FATE的主机推荐配置8CPUs和16G RAM。 @@ -138,9 +138,9 @@ compute_core=4 # 设置用户密码 [user@localhost]$ sudo passwd fate # 创建docker-compose部署目录 -[user@localhost]$ sudo mkdir -p /data/projects/fate +[user@localhost]$ sudo mkdir -p /data/projects/fate /home/fate # 修改docker-compose部署目录对应用户和组 -[user@localhost]$ sudo chown -R fate:docker /data/projects/fate +[user@localhost]$ sudo chown -R fate:docker /data/projects/fate /home/fate # 选择用户 [user@localhost]$ sudo su fate # 查看是否拥有docker权限 @@ -152,6 +152,23 @@ total 0 drwxr-xr-x. 
2 fate docker 6 May 27 00:51 fate ``` +### GPU支持 + +从v1.11.1开始docker compose部署支持使用GPU的FATE部署,如果要使用GPU,你需要先搞定GPU的docker环境。可以参考docker的官方文档()。 + +要使用GPU需要修改配置,这两个都需要修改 + +```sh +algorithm=NN +device=GPU + +gpu_count=1 +``` + +FATE GPU的使用只有fateflow组件,所以每个Party最少需要有一个GPU。 + +*gpu_count会映射为count,参考 [Docker compose GPU support](https://docs.docker.com/compose/gpu-support/)* + ### 执行部署脚本 以下修改可在任意机器执行。 @@ -185,12 +202,12 @@ CONTAINER ID IMAGE COMMAND 3dca43f3c9d5 federatedai/serving-admin:2.1.5-release "/bin/sh -c 'java -c…" 5 minutes ago Up 5 minutes 0.0.0.0:8350->8350/tcp, :::8350->8350/tcp serving-9999_serving-admin_1 fe924918509b federatedai/serving-proxy:2.1.5-release "/bin/sh -c 'java -D…" 5 minutes ago Up 5 minutes 0.0.0.0:8059->8059/tcp, :::8059->8059/tcp, 0.0.0.0:8869->8869/tcp, :::8869->8869/tcp, 8879/tcp serving-9999_serving-proxy_1 b62ed8ba42b7 bitnami/zookeeper:3.7.0 "/opt/bitnami/script…" 5 minutes ago Up 5 minutes 0.0.0.0:2181->2181/tcp, :::2181->2181/tcp, 8080/tcp, 0.0.0.0:49226->2888/tcp, :::49226->2888/tcp, 0.0.0.0:49225->3888/tcp, :::49225->3888/tcp serving-9999_serving-zookeeper_1 -3c643324066f federatedai/client:1.10.0-release "/bin/sh -c 'flow in…" 5 minutes ago Up 5 minutes 0.0.0.0:20000->20000/tcp, :::20000->20000/tcp confs-9999_client_1 -3fe0af1ebd71 federatedai/fateboard:1.10.0-release "/bin/sh -c 'java -D…" 5 minutes ago Up 5 minutes 0.0.0.0:8080->8080/tcp, :::8080->8080/tcp confs-9999_fateboard_1 -635b7d99357e federatedai/fateflow:1.10.0-release "container-entrypoin…" 5 minutes ago Up 5 minutes (healthy) 0.0.0.0:9360->9360/tcp, :::9360->9360/tcp, 8080/tcp, 0.0.0.0:9380->9380/tcp, :::9380->9380/tcp confs-9999_fateflow_1 -8b515f08add3 federatedai/eggroll:1.10.0-release "/tini -- bash -c 'j…" 5 minutes ago Up 5 minutes 8080/tcp, 0.0.0.0:9370->9370/tcp, :::9370->9370/tcp confs-9999_rollsite_1 -108cc061c191 federatedai/eggroll:1.10.0-release "/tini -- bash -c 'j…" 5 minutes ago Up 5 minutes 4670/tcp, 8080/tcp confs-9999_clustermanager_1 -f10575e76899 
federatedai/eggroll:1.10.0-release "/tini -- bash -c 'j…" 5 minutes ago Up 5 minutes 4671/tcp, 8080/tcp confs-9999_nodemanager_1 +3c643324066f federatedai/client:1.11.1-release "/bin/sh -c 'flow in…" 5 minutes ago Up 5 minutes 0.0.0.0:20000->20000/tcp, :::20000->20000/tcp confs-9999_client_1 +3fe0af1ebd71 federatedai/fateboard:1.11.1-release "/bin/sh -c 'java -D…" 5 minutes ago Up 5 minutes 0.0.0.0:8080->8080/tcp, :::8080->8080/tcp confs-9999_fateboard_1 +635b7d99357e federatedai/fateflow:1.11.1-release "container-entrypoin…" 5 minutes ago Up 5 minutes (healthy) 0.0.0.0:9360->9360/tcp, :::9360->9360/tcp, 8080/tcp, 0.0.0.0:9380->9380/tcp, :::9380->9380/tcp confs-9999_fateflow_1 +8b515f08add3 federatedai/eggroll:1.11.1-release "/tini -- bash -c 'j…" 5 minutes ago Up 5 minutes 8080/tcp, 0.0.0.0:9370->9370/tcp, :::9370->9370/tcp confs-9999_rollsite_1 +108cc061c191 federatedai/eggroll:1.11.1-release "/tini -- bash -c 'j…" 5 minutes ago Up 5 minutes 4670/tcp, 8080/tcp confs-9999_clustermanager_1 +f10575e76899 federatedai/eggroll:1.11.1-release "/tini -- bash -c 'j…" 5 minutes ago Up 5 minutes 4671/tcp, 8080/tcp confs-9999_nodemanager_1 aa0a0002de93 mysql:8.0.28 "docker-entrypoint.s…" 5 minutes ago Up 5 minutes 3306/tcp, 33060/tcp confs-9999_mysql_1 ``` diff --git a/docker-deploy/docker_deploy.sh b/docker-deploy/docker_deploy.sh index 7ef6275c5..7c4c10a39 100755 --- a/docker-deploy/docker_deploy.sh +++ b/docker-deploy/docker_deploy.sh @@ -163,10 +163,10 @@ mv ~/confs-$target_party_id.tar $dir cd $dir tar -xzf confs-$target_party_id.tar cd confs-$target_party_id -docker-compose down +docker compose down docker volume rm -f confs-${target_party_id}_shared_dir_examples docker volume rm -f confs-${target_party_id}_shared_dir_federatedml -docker-compose up -d +docker compose up -d cd ../ rm -f confs-${target_party_id}.tar exit @@ -214,8 +214,8 @@ mv ~/serving-$target_party_id.tar $dir cd $dir tar -xzf serving-$target_party_id.tar cd serving-$target_party_id -docker-compose 
down -docker-compose up -d +docker compose down +docker compose up -d cd ../ rm -f serving-$target_party_id.tar exit @@ -250,7 +250,7 @@ DeleteCluster() { if [ "$cluster_type" == "--training" ]; then ssh -p ${SSH_PORT} -tt $user@$target_party_ip <) + +## How to configure + +The algorithm and device in cluster.yaml must be changed to NN and GPU. (Currently FATE only supports the use of GPU for NN algorithms) + +```yaml +algorithm: NN +device: GPU +``` + +Then allocate at least 1 GPU to the python pod in its resources. (GPU computing happens only in the fateflow pod) + +```yaml +python: + resources: + requests: + nvidia.com/gpu: 1 + limits: + nvidia.com/gpu: 1 +``` + +Here is an example [cluster-gpu.yaml](../k8s-deploy/examples/party-9999/cluster-gpu.yaml). + +Then deploy the cluster defined by cluster.yaml, and you can use FATE to run GPU tasks. diff --git a/docs/Manage_FATE_and_FATE-Serving_Version.md b/docs/Manage_FATE_and_FATE-Serving_Version.md index 084688989..e7f76c3ad 100644 --- a/docs/Manage_FATE_and_FATE-Serving_Version.md +++ b/docs/Manage_FATE_and_FATE-Serving_Version.md @@ -30,18 +30,18 @@ The chart can be downloaded in each KubeFATE release, with name `fate-{release_v Download it and copy it to the folder to upload. ``` -$ kubefate chart upload -f ./fate-v1.10.0.tgz +$ kubefate chart upload -f ./fate-v1.11.1.tgz Upload file success $ kubefate chart ls UUID NAME VERSION APPVERSION -ca3f7843-749a-4f69-9f6b-4c544a7623ac fate v1.10.0 v1.10.0 +ca3f7843-749a-4f69-9f6b-4c544a7623ac fate v1.11.1 v1.11.1 ``` -Then, we can deploy the fate cluster of v1.10.0 version. The detail of cluster.yaml please refer to: [FATE Cluster Configuration](./configurations/FATE_cluster_configuration.md) +Then, we can deploy the fate cluster of v1.11.1 version. 
The detail of cluster.yaml please refer to: [FATE Cluster Configuration](./configurations/FATE_cluster_configuration.md) ``` chartName: fate -chartVersion: v1.10.0 +chartVersion: v1.11.1 ``` We can delete the chart with: diff --git a/docs/tutorials/Build_Two_Parties_FATE_Cluster_in_One_Linux_Machine_with_MiniKube.md b/docs/tutorials/Build_Two_Parties_FATE_Cluster_in_One_Linux_Machine_with_MiniKube.md index 10b8e6b10..c032b1a81 100644 --- a/docs/tutorials/Build_Two_Parties_FATE_Cluster_in_One_Linux_Machine_with_MiniKube.md +++ b/docs/tutorials/Build_Two_Parties_FATE_Cluster_in_One_Linux_Machine_with_MiniKube.md @@ -21,14 +21,14 @@ After the tutorial, the deployment architecture looks like the following diagram 5. Network connectivity to dockerhub or 163 Docker Image Registry, and google gcr. 6. Setup the global KubeFATE version using in the tutorial and create a folder for the whole tutorial. ``` -export fate_version=v1.10.0 && export kubefate_version=v1.4.5 && cd ~ && mkdir demo && cd demo +export fate_version=v1.11.1 && export kubefate_version=v1.4.5 && cd ~ && mkdir demo && cd demo ``` Notes: * When talking about KubeFATE version, usually there are 3 notions: * The KubeFATE CLI version, in this tutorial, it is v1.4.5. * The KubeFATE service version, in this tutorial, it is v1.4.5. - * The FATE version, in this tutorial, it is v1.10.0, it also means the version of the helm chart of FATE, currently we use this version to tag the KubeFATE GitHub master branch. + * The FATE version, in this tutorial, it is v1.11.1, it also means the version of the helm chart of FATE, currently we use this version to tag the KubeFATE GitHub master branch. * **In this tutorial, the IP of the machine we used is 192.168.100.123. 
Please change it to your machine's IP in all the following commands and config files.** # Start Tutorial @@ -87,7 +87,7 @@ When all the pods are in the ready state, it means your Kubernetes cluster is re ## Setup Kubefate ### Install KubeFATE CLI Go to [KubeFATE Release](https://github.com/FederatedAI/KubeFATE/releases), and find the latest kubefate-k8s release -pack, which is `v1.10.0` as set to ENVs before. (replace ${fate_version} with the newest version available) +pack, which is `v1.11.1` as set to ENVs before. (replace ${fate_version} with the newest version available) ``` curl -LO https://github.com/FederatedAI/KubeFATE/releases/download/${fate_version}/kubefate-k8s-${fate_version}.tar.gz && tar -xzf ./kubefate-k8s-${fate_version}.tar.gz ``` @@ -256,7 +256,7 @@ For `/kubefate/examples/party-9999/cluster-spark-pulsar.yaml`, modify it as foll name: fate-9999 namespace: fate-9999 chartName: fate -chartVersion: v1.10.0 +chartVersion: v1.11.1 partyId: 9999 registry: "" pullPolicy: @@ -340,7 +340,7 @@ and for fate-10000: name: fate-10000 namespace: fate-10000 chartName: fate -chartVersion: v1.10.0 +chartVersion: v1.11.1 partyId: 10000 registry: "" pullPolicy: @@ -440,8 +440,8 @@ or watch the clusters till their STATUS changing to `Running`: ``` kubefate@machine:~/kubefate$ watch kubefate cluster ls UUID NAME NAMESPACE REVISION STATUS CHART ChartVERSION AGE -29878fa9-aeee-4ae5-a5b7-fd4e9eb7c1c3 fate-9999 fate-9999 1 Running fate v1.10.0 88s -dacc0549-b9fc-463f-837a-4e7316db2537 fate-10000 fate-10000 1 Running fate v1.10.0 69s +29878fa9-aeee-4ae5-a5b7-fd4e9eb7c1c3 fate-9999 fate-9999 1 Running fate v1.11.1 88s +dacc0549-b9fc-463f-837a-4e7316db2537 fate-10000 fate-10000 1 Running fate v1.11.1 69s ``` We have about 10G Docker images that need to be pulled, this step will take a while for the first time. An alternative way is offline loading the images to the local environment. 
@@ -479,13 +479,13 @@ UUID 29878fa9-aeee-4ae5-a5b7-fd4e9eb7c1c3 Name fate-9999 NameSpace fate-9999 ChartName fate -ChartVersion v1.10.0 +ChartVersion v1.11.1 Revision 1 Age 54m Status Running Spec algorithm: Basic chartName: fate - chartVersion: v1.10.0 + chartVersion: v1.11.1 computing: Spark device: CPU federation: Pulsar diff --git a/docs/tutorials/Build_Two_Parties_FATE_Cluster_in_One_Linux_Machine_with_MiniKube_zh.md b/docs/tutorials/Build_Two_Parties_FATE_Cluster_in_One_Linux_Machine_with_MiniKube_zh.md index de43ed832..24324d887 100644 --- a/docs/tutorials/Build_Two_Parties_FATE_Cluster_in_One_Linux_Machine_with_MiniKube_zh.md +++ b/docs/tutorials/Build_Two_Parties_FATE_Cluster_in_One_Linux_Machine_with_MiniKube_zh.md @@ -17,14 +17,14 @@ 5. 要保证安装机器可以正常访问Docker Hub或者网易云镜像仓库,以及Google gcr; 6. 预先创建一个目录,以便整个过程使用该目录作为工作目录,命令如下: ``` -export fate_version=v1.10.0 && export kubefate_version=v1.4.5 && cd ~ && mkdir demo && cd demo +export fate_version=v1.11.1 && export kubefate_version=v1.4.5 && cd ~ && mkdir demo && cd demo ``` Notes: * 当我们提到"KubeFATE的版本",通常来讲会有三个概念: * KubeFATE命令行工具的版本,在本教程中为v1.4.5。 * KubeFATE服务版本,在本教程中为v1.4.5。 - * FATE版本,在本教程中v1.10.0,它也意味着FATE的Helm Chart的版本, 值得注意的是我们用这个版本来给GitHub上的KubeFATE的发布打tag。 + * FATE版本,在本教程中v1.11.1,它也意味着FATE的Helm Chart的版本, 值得注意的是我们用这个版本来给GitHub上的KubeFATE的发布打tag。 * **下文介绍的MiniKube机器IP地址是192.168.100.123。请修改为你准备的实验机器IP地址** # 开始安装 @@ -77,7 +77,7 @@ sudo minikube addons enable ingress ## 安装Kubefate ### 下载KubeFATE命令行工具 -我们从Github上 [KubeFATE Release](https://github.com/FederatedAI/KubeFATE/releases)页面找到Kuberetes部署的下载包,并下载对应版本,如前面环境变量设置`v1.10.0`, +我们从Github上 [KubeFATE Release](https://github.com/FederatedAI/KubeFATE/releases)页面找到Kuberetes部署的下载包,并下载对应版本,如前面环境变量设置`v1.11.1`, ``` curl -LO https://github.com/FederatedAI/KubeFATE/releases/download/${fate_version}/kubefate-k8s-${fate_version}.tar.gz && tar -xzf ./kubefate-k8s-${fate_version}.tar.gz ``` @@ -237,7 +237,7 @@ kubectl -n fate-10000 create secret docker-registry myregistrykey \ 
name: fate-9999 namespace: fate-9999 chartName: fate -chartVersion: v1.10.0 +chartVersion: v1.11.1 partyId: 9999 registry: "" pullPolicy: @@ -322,7 +322,7 @@ pulsar: name: fate-10000 namespace: fate-10000 chartName: fate -chartVersion: v1.10.0 +chartVersion: v1.11.1 partyId: 10000 registry: "" pullPolicy: @@ -418,8 +418,8 @@ create job success, job id=7752db70-e368-41fa-8827-d39411728d1b ``` kubefate@machine:~/kubefate$ watch kubefate cluster ls UUID NAME NAMESPACE REVISION STATUS CHART ChartVERSION AGE -29878fa9-aeee-4ae5-a5b7-fd4e9eb7c1c3 fate-9999 fate-9999 1 Running fate v1.10.0 88s -dacc0549-b9fc-463f-837a-4e7316db2537 fate-10000 fate-10000 1 Running fate v1.10.0 69s +29878fa9-aeee-4ae5-a5b7-fd4e9eb7c1c3 fate-9999 fate-9999 1 Running fate v1.11.1 88s +dacc0549-b9fc-463f-837a-4e7316db2537 fate-10000 fate-10000 1 Running fate v1.11.1 69s ``` 因为这个步骤需要到网易云镜像仓库去下载约10G的镜像,所以第一次执行视乎你的网络情况需要一定时间。 检查下载的进度可以用 @@ -446,13 +446,13 @@ UUID 29878fa9-aeee-4ae5-a5b7-fd4e9eb7c1c3 Name fate-9999 NameSpace fate-9999 ChartName fate -ChartVersion v1.10.0 +ChartVersion v1.11.1 Revision 1 Age 54m Status Running Spec algorithm: Basic chartName: fate - chartVersion: v1.10.0 + chartVersion: v1.11.1 computing: Spark device: CPU federation: Pulsar diff --git a/helm-charts/FATE-Exchange/Chart.yaml b/helm-charts/FATE-Exchange/Chart.yaml index 253a79e81..c45169267 100644 --- a/helm-charts/FATE-Exchange/Chart.yaml +++ b/helm-charts/FATE-Exchange/Chart.yaml @@ -1,5 +1,5 @@ apiVersion: v1 -appVersion: v1.10.0 +appVersion: v1.11.1 description: A Helm chart for fate exchange name: fate-exchange -version: v1.10.0 +version: v1.11.1 diff --git a/helm-charts/FATE-Exchange/values-template-example.yaml b/helm-charts/FATE-Exchange/values-template-example.yaml index c711d9ac4..81fbb016d 100644 --- a/helm-charts/FATE-Exchange/values-template-example.yaml +++ b/helm-charts/FATE-Exchange/values-template-example.yaml @@ -1,7 +1,7 @@ name: fate-exchange namespace: fate-exchange chartName: fate-exchange 
-chartVersion: v1.10.0 +chartVersion: v1.11.1 partyId: 1 registry: "" pullPolicy: diff --git a/helm-charts/FATE-Exchange/values.yaml b/helm-charts/FATE-Exchange/values.yaml index 1c61523d8..2f508dbd2 100644 --- a/helm-charts/FATE-Exchange/values.yaml +++ b/helm-charts/FATE-Exchange/values.yaml @@ -4,7 +4,7 @@ partyName: fate-exchange image: registry: federatedai isThridParty: - tag: 1.10.0-release + tag: 1.11.1-release pullPolicy: IfNotPresent imagePullSecrets: # - name: diff --git a/helm-charts/FATE/Chart.yaml b/helm-charts/FATE/Chart.yaml index 85a11ebf7..18ea6706c 100644 --- a/helm-charts/FATE/Chart.yaml +++ b/helm-charts/FATE/Chart.yaml @@ -1,8 +1,8 @@ apiVersion: v1 -appVersion: v1.10.0 +appVersion: v1.11.1 description: A Helm chart for fate-training name: fate -version: v1.10.0 +version: v1.11.1 home: https://fate.fedai.org icon: https://aisp-1251170195.cos.ap-hongkong.myqcloud.com/wp-content/uploads/sites/12/2019/09/logo.png sources: diff --git a/helm-charts/FATE/templates/backends/eggroll/_helpers.tpl b/helm-charts/FATE/templates/backends/eggroll/_helpers.tpl index c35a3cc4d..8abd2d17b 100644 --- a/helm-charts/FATE/templates/backends/eggroll/_helpers.tpl +++ b/helm-charts/FATE/templates/backends/eggroll/_helpers.tpl @@ -19,4 +19,7 @@ {{- if eq .Values.device "IPCL" -}} -ipcl {{- end -}} +{{- if eq .Values.device "GPU" -}} +-gpu +{{- end -}} {{- end -}} diff --git a/helm-charts/FATE/templates/backends/spark/_helpers.tpl b/helm-charts/FATE/templates/backends/spark/_helpers.tpl index 6df82b044..80228c561 100644 --- a/helm-charts/FATE/templates/backends/spark/_helpers.tpl +++ b/helm-charts/FATE/templates/backends/spark/_helpers.tpl @@ -19,4 +19,7 @@ {{- if eq .Values.device "IPCL" -}} -ipcl {{- end -}} +{{- if eq .Values.device "GPU" -}} +-gpu +{{- end -}} {{- end -}} diff --git a/helm-charts/FATE/templates/core/_helpers.tpl b/helm-charts/FATE/templates/core/_helpers.tpl index 306c582a9..5fbd730ac 100644 --- a/helm-charts/FATE/templates/core/_helpers.tpl +++ 
b/helm-charts/FATE/templates/core/_helpers.tpl @@ -21,4 +21,7 @@ {{- if eq .Values.device "IPCL" -}} -ipcl {{- end -}} +{{- if eq .Values.device "GPU" -}} +-gpu +{{- end -}} {{- end -}} diff --git a/helm-charts/FATE/templates/core/fateflow/configmap.yaml b/helm-charts/FATE/templates/core/fateflow/configmap.yaml index edc8c4744..aba22c4ee 100644 --- a/helm-charts/FATE/templates/core/fateflow/configmap.yaml +++ b/helm-charts/FATE/templates/core/fateflow/configmap.yaml @@ -213,62 +213,7 @@ data: password: fate {{- end }} {{- end }} - transfer_conf.yaml: | - paths: # dir or path - - "python/federatedml/transfer_variable/auth_conf" - component_registry.json: | - { - "components": { - }, - "providers": { - }, - "default_settings": { - "fate_flow":{ - "default_version_key": "FATEFlow" - }, - "fate": { - "default_version_key": "FATE" - }, - "class_path": { - "interface": "components.components.Components", - "feature_instance": "feature.instance.Instance", - "feature_vector": "feature.sparse_vector.SparseVector", - "model": "protobuf.generated", - "model_migrate": "protobuf.model_migrate.model_migrate", - "homo_model_convert": "protobuf.homo_model_convert.homo_model_convert" - } - } - } - job_default_config.yaml: | - # component provider, relative path to get_fate_python_directory - default_component_provider_path: federatedml - - # resource - total_cores_overweight_percent: 1 # 1 means no overweight - total_memory_overweight_percent: 1 # 1 means no overweight - task_parallelism: 1 - task_cores: 4 - task_memory: 0 # mb - max_cores_percent_per_job: 1 # 1 means total - - # scheduling - job_timeout: 259200 # s - remote_request_timeout: 30000 # ms - federated_command_trys: 3 - end_status_job_scheduling_time_limit: 300000 # ms - end_status_job_scheduling_updates: 1 - auto_retries: {{ .Values.modules.python.failedTaskAutoRetryTimes }} - auto_retry_delay: {{ .Values.modules.python.failedTaskAutoRetryDelay }} #seconds - # It can also be specified in the job configuration using the 
federated_status_collect_type parameter - federated_status_collect_type: PUSH - detect_connect_max_retry_count: 3 - detect_connect_long_retry_count: 2 - - # upload - upload_max_bytes: 104857600 # bytes - - #component output - output_data_summary_count_limit: 100 + --- kind: ConfigMap apiVersion: v1 diff --git a/helm-charts/FATE/templates/core/python-spark.yaml b/helm-charts/FATE/templates/core/python-spark.yaml index 81663bb83..766a37b68 100644 --- a/helm-charts/FATE/templates/core/python-spark.yaml +++ b/helm-charts/FATE/templates/core/python-spark.yaml @@ -118,10 +118,7 @@ spec: - | set -x mkdir -p /data/projects/fate/conf/ - cp /data/projects/fate/conf-tmp/transfer_conf.yaml /data/projects/fate/conf/transfer_conf.yaml cp /data/projects/fate/conf-tmp/service_conf.yaml /data/projects/fate/conf/service_conf.yaml - cp /data/projects/fate/conf-tmp/component_registry.json /data/projects/fate/fateflow/conf/component_registry.json - cp /data/projects/fate/conf-tmp/job_default_config.yaml /data/projects/fate/fateflow/conf/job_default_config.yaml # fix fateflow conf must use IP sed -i "s/host: fateflow_ip/host: ${POD_IP}/g" /data/projects/fate/conf/service_conf.yaml diff --git a/helm-charts/FATE/values-template-example.yaml b/helm-charts/FATE/values-template-example.yaml index 3e4770d0e..591d881c5 100644 --- a/helm-charts/FATE/values-template-example.yaml +++ b/helm-charts/FATE/values-template-example.yaml @@ -1,7 +1,7 @@ name: fate-9999 namespace: fate-9999 chartName: fate -chartVersion: v1.10.0 +chartVersion: v1.11.1 partyId: 9999 registry: "" pullPolicy: @@ -35,7 +35,7 @@ federation: Eggroll storage: Eggroll # Algorithm: [Basic, NN] algorithm: Basic -# Device: [IPCL, CPU] +# Device: [IPCL, CPU, GPU] device: CPU skippedKeys: @@ -274,7 +274,7 @@ skippedKeys: # spark: # master: # Image: "federatedai/spark-master" - # ImageTag: "1.10.0-release" + # ImageTag: "1.11.1-release" # replicas: 1 # resources: # requests: @@ -290,7 +290,7 @@ skippedKeys: # nodePort: 30977 # worker: 
# Image: "federatedai/spark-worker" - # ImageTag: "1.10.0-release" + # ImageTag: "1.11.1-release" # replicas: 2 # resources: # requests: diff --git a/helm-charts/FATE/values.yaml b/helm-charts/FATE/values.yaml index 84515aec4..5ac93be7b 100644 --- a/helm-charts/FATE/values.yaml +++ b/helm-charts/FATE/values.yaml @@ -2,7 +2,7 @@ image: registry: federatedai isThridParty: - tag: 1.10.0-release + tag: 1.11.1-release pullPolicy: IfNotPresent imagePullSecrets: # - name: @@ -18,7 +18,7 @@ federation: Eggroll storage: Eggroll # Algorithm: Basic, NN algorithm: Basic -# Device: CPU, IPCL +# Device: CPU, IPCL, GPU device: IPCL istio: @@ -274,8 +274,8 @@ modules: type: ClusterIP resources: requests: - cpu: "2" - memory: "4Gi" + cpu: "4" + memory: "8Gi" hdfs: include: true namenode: diff --git a/helm-charts/UpgradeManager/values.yaml b/helm-charts/UpgradeManager/values.yaml index b49d24dbb..55d81da63 100644 --- a/helm-charts/UpgradeManager/values.yaml +++ b/helm-charts/UpgradeManager/values.yaml @@ -1,4 +1,4 @@ username: fate password: fate_dev -start: 1.10.0 -target: 1.10.0 \ No newline at end of file +start: v1.11.1 +target: v1.11.1 \ No newline at end of file diff --git a/k8s-deploy/README.md b/k8s-deploy/README.md index 13c33bf5d..d3b0fbb2e 100644 --- a/k8s-deploy/README.md +++ b/k8s-deploy/README.md @@ -188,13 +188,13 @@ UUID 24bb75ff-f636-4c64-8c04-1b9073f89a2f Name fate-9999 NameSpace fate-9999 ChartName fate -ChartVersion v1.10.0 +ChartVersion v1.11.1 Revision 1 Age 15m Status Running Spec algorithm: Basic chartName: fate - chartVersion: v1.10.0 + chartVersion: v1.11.1 computing: Eggroll device: CPU federation: Eggroll diff --git a/k8s-deploy/README_zh.md b/k8s-deploy/README_zh.md index af3d0e247..a477d51e1 100644 --- a/k8s-deploy/README_zh.md +++ b/k8s-deploy/README_zh.md @@ -142,6 +142,10 @@ create job success, job id=d92d7a56-7002-46a4-9363-da9c7346e05a 3. Rabbitmq。 4. 
Pulsar。 +### GPU 支持 + +从v1.11.1开始,KubeFATE可以部署支持GPU的FATE集群,部署支持GPU的FATE需要有一些特别准备工作和配置,可以查看这个文档[KubeFATE 部署支持GPU的FATE](../) + ### 检查安装集群任务的状态 上面的命令会创建一个安装FATE集群的任务,用于异步部署。使用```kubefate job describe```命令可以检查任务的状态,直到看到结果为`install success` @@ -187,13 +191,13 @@ UUID 24bb75ff-f636-4c64-8c04-1b9073f89a2f Name fate-9999 NameSpace fate-9999 ChartName fate -ChartVersion v1.10.0 +ChartVersion v1.11.1 Revision 1 Age 15m Status Running Spec algorithm: Basic chartName: fate - chartVersion: v1.10.0 + chartVersion: v1.11.1 computing: Eggroll device: CPU federation: Eggroll diff --git a/k8s-deploy/cluster-spark-pulsar.yaml b/k8s-deploy/cluster-spark-pulsar.yaml index df9b1ad16..28b79bd55 100644 --- a/k8s-deploy/cluster-spark-pulsar.yaml +++ b/k8s-deploy/cluster-spark-pulsar.yaml @@ -1,7 +1,7 @@ name: fate-9999 namespace: fate-9999 chartName: fate -chartVersion: v1.10.0 +chartVersion: v1.11.1 partyId: 9999 registry: "" pullPolicy: @@ -31,7 +31,7 @@ federation: Pulsar storage: HDFS # Algorithm: Basic, NN algorithm: Basic -# Device: CPU, IPCL +# Device: CPU, IPCL, GPU device: CPU # you can customize some keys which will be ignored in yaml validation @@ -72,9 +72,11 @@ skippedKeys: # requests: # cpu: "2" # memory: "4Gi" + # nvidia.com/gpu: 1 # limits: # cpu: "4" # memory: "8Gi" + # nvidia.com/gpu: 1 # spark: # cores_per_node: 20 # nodes: 2 @@ -126,7 +128,7 @@ skippedKeys: # spark: # master: # Image: "federatedai/spark-master" - # ImageTag: "1.6.1-release" + # ImageTag: "1.11.1-release" # replicas: 1 # resources: # requests: @@ -142,7 +144,7 @@ skippedKeys: # nodePort: 30977 # worker: # Image: "federatedai/spark-worker" - # ImageTag: "1.6.1-release" + # ImageTag: "1.11.1-release" # replicas: 2 # resources: # requests: diff --git a/k8s-deploy/cluster-spark-rabbitmq.yaml b/k8s-deploy/cluster-spark-rabbitmq.yaml index b4037cb12..025928409 100644 --- a/k8s-deploy/cluster-spark-rabbitmq.yaml +++ b/k8s-deploy/cluster-spark-rabbitmq.yaml @@ -1,7 +1,7 @@ name: fate-9999 namespace: fate-9999 
chartName: fate -chartVersion: v1.10.0 +chartVersion: v1.11.1 partyId: 9999 registry: "" pullPolicy: @@ -31,7 +31,7 @@ federation: RabbitMQ storage: HDFS # Algorithm: Basic, NN algorithm: Basic -# Device: CPU, IPCL +# Device: CPU, IPCL, GPU device: CPU # you can customize some keys which will be ignored in yaml validation @@ -56,9 +56,27 @@ skippedKeys: # Specify the fateflow service's properties # python: # type: NodePort - # nodePort: 30102 - # nodeSelector: + # httpNodePort: 30097 + # grpcNodePort: 30092 + # loadBalancerIP: + # serviceAccountName: "" + # nodeSelector: + # tolerations: + # affinity: # logLevel: INFO + # existingClaim: "" + # storageClass: "python" + # accessMode: ReadWriteMany + # size: 1Gi + # resources: + # requests: + # cpu: "2" + # memory: "4Gi" + # nvidia.com/gpu: 1 + # limits: + # cpu: "4" + # memory: "8Gi" + # nvidia.com/gpu: 1 # spark: # cores_per_node: 20 # nodes: 2 diff --git a/k8s-deploy/cluster-spark-slim.yaml b/k8s-deploy/cluster-spark-slim.yaml index 653c78065..eb2145b0f 100644 --- a/k8s-deploy/cluster-spark-slim.yaml +++ b/k8s-deploy/cluster-spark-slim.yaml @@ -1,7 +1,7 @@ name: fate-9999 namespace: fate-9999 chartName: fate -chartVersion: v1.10.0 +chartVersion: v1.11.1 partyId: 9999 registry: "" pullPolicy: @@ -29,7 +29,7 @@ federation: Pulsar storage: LocalFS # Algorithm: Basic, NN algorithm: Basic -# Device: CPU, IPCL +# Device: CPU, IPCL, GPU device: CPU # you can customize some keys which will be ignored in yaml validation @@ -54,9 +54,27 @@ skippedKeys: # Specify the fateflow service's properties # python: # type: NodePort - # nodePort: 30102 + # httpNodePort: 30097 + # grpcNodePort: 30092 + # loadBalancerIP: + # serviceAccountName: "" # nodeSelector: + # tolerations: + # affinity: # logLevel: INFO + # existingClaim: "" + # storageClass: "python" + # accessMode: ReadWriteMany + # size: 1Gi + # resources: + # requests: + # cpu: "2" + # memory: "4Gi" + # nvidia.com/gpu: 1 + # limits: + # cpu: "4" + # memory: "8Gi" + # 
nvidia.com/gpu: 1 # spark: # cores_per_node: 20 # nodes: 2 diff --git a/k8s-deploy/cluster.yaml b/k8s-deploy/cluster.yaml index e3858d754..6fef5aeec 100644 --- a/k8s-deploy/cluster.yaml +++ b/k8s-deploy/cluster.yaml @@ -1,7 +1,7 @@ name: fate-9999 namespace: fate-9999 chartName: fate -chartVersion: v1.10.0 +chartVersion: v1.11.1 partyId: 9999 registry: "" pullPolicy: @@ -30,7 +30,7 @@ federation: Eggroll storage: Eggroll # Algorithm: Basic, NN algorithm: Basic -# Device: CPU, IPCL +# Device: CPU, IPCL, GPU device: CPU # you can customize some keys which will be ignored in yaml validation @@ -138,9 +138,11 @@ skippedKeys: # requests: # cpu: "2" # memory: "4Gi" + # nvidia.com/gpu: 1 # limits: # cpu: "4" # memory: "8Gi" + # nvidia.com/gpu: 1 # clustermanager: # cores_per_node: 16 # nodes: 2 diff --git a/k8s-deploy/examples/config.sh b/k8s-deploy/examples/config.sh index 087a9d2bd..96a77fb24 100644 --- a/k8s-deploy/examples/config.sh +++ b/k8s-deploy/examples/config.sh @@ -18,70 +18,34 @@ if [[ "$unamestr" == "Darwin" ]] ; then exit 1; } fi -# 9999 config -$SED -i "s/chartVersion: .*/chartVersion: ${fate_chartVersion}/g" party-9999/cluster.yaml -$SED -i "s/chartVersion: .*/chartVersion: ${fate_serving_chartVersion}/g" ./party-9999/cluster-serving.yaml -$SED -i "s/chartVersion: .*/chartVersion: ${fate_chartVersion}/g" ./party-9999/cluster-spark-rabbitmq.yaml -$SED -i "s/chartVersion: .*/chartVersion: ${fate_chartVersion}/g" ./party-9999/cluster-spark-pulsar.yaml -$SED -i "s/chartVersion: .*/chartVersion: ${fate_chartVersion}/g" ./party-9999/cluster-spark-local-pulsar.yaml - -$SED -i "s/192.168.9.1/${party_9999_IP}/g" ./party-9999/cluster.yaml -$SED -i "s/192.168.9.1/${party_9999_IP}/g" ./party-9999/cluster-serving.yaml -$SED -i "s/192.168.9.1/${party_9999_IP}/g" ./party-9999/cluster-spark-rabbitmq.yaml -$SED -i "s/192.168.9.1/${party_9999_IP}/g" ./party-9999/cluster-spark-pulsar.yaml -$SED -i "s/192.168.9.1/${party_9999_IP}/g" ./party-9999/cluster-spark-local-pulsar.yaml
- -$SED -i "s/192.168.10.1/${party_10000_IP}/g" ./party-9999/cluster.yaml -$SED -i "s/192.168.10.1/${party_10000_IP}/g" ./party-9999/cluster-serving.yaml -$SED -i "s/192.168.10.1/${party_10000_IP}/g" ./party-9999/cluster-spark-rabbitmq.yaml -$SED -i "s/192.168.10.1/${party_10000_IP}/g" ./party-9999/cluster-spark-pulsar.yaml -$SED -i "s/192.168.10.1/${party_10000_IP}/g" ./party-9999/cluster-spark-local-pulsar.yaml -$SED -i "s/192.168.0.1/${party_exchange_IP}/g" ./party-9999/cluster.yaml -$SED -i "s/192.168.0.1/${party_exchange_IP}/g" ./party-9999/cluster-serving.yaml -$SED -i "s/192.168.0.1/${party_exchange_IP}/g" ./party-9999/cluster-spark-rabbitmq.yaml -$SED -i "s/192.168.0.1/${party_exchange_IP}/g" ./party-9999/cluster-spark-pulsar.yaml -$SED -i "s/192.168.0.1/${party_exchange_IP}/g" ./party-9999/cluster-spark-local-pulsar.yaml +# 9999 config for item in `ls party-9999` do - $SED -i "s/algorithm: .*/algorithm: ${algorithm}/g" ./party-9999/$item - $SED -i "s/device: .*/device: ${device}/g" ./party-9999/$item + $SED -i "s/chartVersion: .*/chartVersion: ${fate_chartVersion}/g" ./party-9999/$item + $SED -i "s/192.168.9.1/${party_9999_IP}/g" ./party-9999/$item + $SED -i "s/192.168.10.1/${party_10000_IP}/g" ./party-9999/$item + $SED -i "s/192.168.0.1/${party_exchange_IP}/g" ./party-9999/$item done -# 10000 config +$SED -i "s/chartVersion: .*/chartVersion: ${fate_serving_chartVersion}/g" ./party-9999/cluster-serving.yaml -$SED -i "s/chartVersion: .*/chartVersion: ${fate_chartVersion}/g" ./party-10000/cluster.yaml -$SED -i "s/chartVersion: .*/chartVersion: ${fate_serving_chartVersion}/g" ./party-10000/cluster-serving.yaml -$SED -i "s/chartVersion: .*/chartVersion: ${fate_chartVersion}/g" ./party-10000/cluster-spark-rabbitmq.yaml -$SED -i "s/chartVersion: .*/chartVersion: ${fate_chartVersion}/g" ./party-10000/cluster-spark-pulsar.yaml -$SED -i "s/chartVersion: .*/chartVersion: ${fate_chartVersion}/g" ./party-10000/cluster-spark-local-pulsar.yaml - -$SED -i
"s/192.168.9.1/${party_9999_IP}/g" ./party-10000/cluster.yaml -$SED -i "s/192.168.9.1/${party_9999_IP}/g" ./party-10000/cluster-serving.yaml -$SED -i "s/192.168.9.1/${party_9999_IP}/g" ./party-10000/cluster-spark-rabbitmq.yaml -$SED -i "s/192.168.9.1/${party_9999_IP}/g" ./party-10000/cluster-spark-pulsar.yaml -$SED -i "s/192.168.9.1/${party_9999_IP}/g" ./party-10000/cluster-spark-local-pulsar.yaml - -$SED -i "s/192.168.10.1/${party_10000_IP}/g" ./party-10000/cluster.yaml -$SED -i "s/192.168.10.1/${party_10000_IP}/g" ./party-10000/cluster-serving.yaml -$SED -i "s/192.168.10.1/${party_10000_IP}/g" ./party-10000/cluster-spark-rabbitmq.yaml -$SED -i "s/192.168.10.1/${party_10000_IP}/g" ./party-10000/cluster-spark-pulsar.yaml -$SED -i "s/192.168.10.1/${party_10000_IP}/g" ./party-10000/cluster-spark-local-pulsar.yaml - -$SED -i "s/192.168.0.1/${party_exchange_IP}/g" ./party-10000/cluster.yaml -$SED -i "s/192.168.0.1/${party_exchange_IP}/g" ./party-10000/cluster-serving.yaml -$SED -i "s/192.168.0.1/${party_exchange_IP}/g" ./party-10000/cluster-spark-rabbitmq.yaml -$SED -i "s/192.168.0.1/${party_exchange_IP}/g" ./party-10000/cluster-spark-pulsar.yaml -$SED -i "s/192.168.0.1/${party_exchange_IP}/g" ./party-10000/cluster-spark-local-pulsar.yaml + +# 10000 config for item in `ls party-10000` do - $SED -i "s/algorithm: .*/algorithm: ${algorithm}/g" ./party-10000/$item - $SED -i "s/device: .*/device: ${device}/g" ./party-10000/$item + $SED -i "s/chartVersion: .*/chartVersion: ${fate_chartVersion}/g" ./party-10000/$item + $SED -i "s/192.168.9.1/${party_9999_IP}/g" ./party-10000/$item + $SED -i "s/192.168.10.1/${party_10000_IP}/g" ./party-10000/$item + $SED -i "s/192.168.0.1/${party_exchange_IP}/g" ./party-10000/$item done +$SED -i "s/chartVersion: .*/chartVersion: ${fate_serving_chartVersion}/g" ./party-10000/cluster-serving.yaml + + # exchange config $SED -i "s/chartVersion: .*/chartVersion: ${fate_chartVersion}/g" ./party-exchange/rollsite.yaml diff --git 
a/k8s-deploy/examples/party-10000/cluster-gpu.yaml b/k8s-deploy/examples/party-10000/cluster-gpu.yaml new file mode 100644 index 000000000..6e2f8a318 --- /dev/null +++ b/k8s-deploy/examples/party-10000/cluster-gpu.yaml @@ -0,0 +1,59 @@ +name: fate-10000 +namespace: fate-10000 +chartName: fate +chartVersion: v1.11.1 +partyId: 10000 +registry: "" +pullPolicy: +imagePullSecrets: +- name: myregistrykey +persistence: false +istio: + enabled: false +podSecurityPolicy: + enabled: false +ingressClassName: nginx +modules: + - rollsite + - clustermanager + - nodemanager + - mysql + - python + - fateboard + - client + +computing: Eggroll +federation: Eggroll +storage: Eggroll +algorithm: NN +device: GPU + +ingress: + fateboard: + hosts: + - name: party10000.fateboard.example.com + client: + hosts: + - name: party10000.notebook.example.com + +rollsite: + type: NodePort + nodePort: 30101 + partyList: + - partyId: 9999 + partyIp: 192.168.9.1 + partyPort: 30091 + +python: + type: NodePort + httpNodePort: 30107 + grpcNodePort: 30102 + logLevel: INFO + resources: + requests: + nvidia.com/gpu: 1 + limits: + nvidia.com/gpu: 1 + +servingIp: 192.168.10.1 +servingPort: 30105 diff --git a/k8s-deploy/examples/party-10000/cluster-spark-local-pulsar.yaml b/k8s-deploy/examples/party-10000/cluster-spark-local-pulsar.yaml index df69d5caf..71dc3da91 100644 --- a/k8s-deploy/examples/party-10000/cluster-spark-local-pulsar.yaml +++ b/k8s-deploy/examples/party-10000/cluster-spark-local-pulsar.yaml @@ -1,7 +1,7 @@ name: fate-10000 namespace: fate-10000 chartName: fate -chartVersion: v1.10.0 +chartVersion: v1.11.1 partyId: 10000 registry: "" pullPolicy: diff --git a/k8s-deploy/examples/party-10000/cluster-spark-pulsar.yaml b/k8s-deploy/examples/party-10000/cluster-spark-pulsar.yaml index e798aadbe..d153fd64a 100644 --- a/k8s-deploy/examples/party-10000/cluster-spark-pulsar.yaml +++ b/k8s-deploy/examples/party-10000/cluster-spark-pulsar.yaml @@ -1,7 +1,7 @@ name: fate-10000 namespace: fate-10000 
chartName: fate -chartVersion: v1.10.0 +chartVersion: v1.11.1 partyId: 10000 registry: "" pullPolicy: diff --git a/k8s-deploy/examples/party-10000/cluster-spark-rabbitmq.yaml b/k8s-deploy/examples/party-10000/cluster-spark-rabbitmq.yaml index d1d4028a8..c89bbf221 100644 --- a/k8s-deploy/examples/party-10000/cluster-spark-rabbitmq.yaml +++ b/k8s-deploy/examples/party-10000/cluster-spark-rabbitmq.yaml @@ -1,7 +1,7 @@ name: fate-10000 namespace: fate-10000 chartName: fate -chartVersion: v1.10.0 +chartVersion: v1.11.1 partyId: 10000 registry: "" pullPolicy: diff --git a/k8s-deploy/examples/party-10000/cluster.yaml b/k8s-deploy/examples/party-10000/cluster.yaml index fc90e7033..236111484 100644 --- a/k8s-deploy/examples/party-10000/cluster.yaml +++ b/k8s-deploy/examples/party-10000/cluster.yaml @@ -1,7 +1,7 @@ name: fate-10000 namespace: fate-10000 chartName: fate -chartVersion: v1.10.0 +chartVersion: v1.11.1 partyId: 10000 registry: "" pullPolicy: diff --git a/k8s-deploy/examples/party-9999/cluster-gpu.yaml b/k8s-deploy/examples/party-9999/cluster-gpu.yaml new file mode 100644 index 000000000..4f1996e2a --- /dev/null +++ b/k8s-deploy/examples/party-9999/cluster-gpu.yaml @@ -0,0 +1,59 @@ +name: fate-9999 +namespace: fate-9999 +chartName: fate +chartVersion: v1.11.1 +partyId: 9999 +registry: "" +pullPolicy: +imagePullSecrets: +- name: myregistrykey +persistence: false +istio: + enabled: false +podSecurityPolicy: + enabled: false +ingressClassName: nginx +modules: + - rollsite + - clustermanager + - nodemanager + - mysql + - python + - fateboard + - client + +computing: Eggroll +federation: Eggroll +storage: Eggroll +algorithm: NN +device: GPU + +ingress: + fateboard: + hosts: + - name: party9999.fateboard.example.com + client: + hosts: + - name: party9999.notebook.example.com + +rollsite: + type: NodePort + nodePort: 30091 + partyList: + - partyId: 10000 + partyIp: 192.168.10.1 + partyPort: 30101 + +python: + type: NodePort + httpNodePort: 30097 + grpcNodePort: 30092 + 
logLevel: INFO + resources: + requests: + nvidia.com/gpu: 1 + limits: + nvidia.com/gpu: 1 + +servingIp: 192.168.9.1 +servingPort: 30095 diff --git a/k8s-deploy/examples/party-9999/cluster-spark-local-pulsar.yaml b/k8s-deploy/examples/party-9999/cluster-spark-local-pulsar.yaml index 8d83d6099..468ff102d 100644 --- a/k8s-deploy/examples/party-9999/cluster-spark-local-pulsar.yaml +++ b/k8s-deploy/examples/party-9999/cluster-spark-local-pulsar.yaml @@ -1,7 +1,7 @@ name: fate-9999 namespace: fate-9999 chartName: fate -chartVersion: v1.10.0 +chartVersion: v1.11.1 partyId: 9999 registry: "" pullPolicy: diff --git a/k8s-deploy/examples/party-9999/cluster-spark-pulsar.yaml b/k8s-deploy/examples/party-9999/cluster-spark-pulsar.yaml index 4c4912b76..b075dd1f1 100644 --- a/k8s-deploy/examples/party-9999/cluster-spark-pulsar.yaml +++ b/k8s-deploy/examples/party-9999/cluster-spark-pulsar.yaml @@ -1,7 +1,7 @@ name: fate-9999 namespace: fate-9999 chartName: fate -chartVersion: v1.10.0 +chartVersion: v1.11.1 partyId: 9999 registry: "" pullPolicy: diff --git a/k8s-deploy/examples/party-9999/cluster-spark-rabbitmq.yaml b/k8s-deploy/examples/party-9999/cluster-spark-rabbitmq.yaml index a744eee42..7de974510 100644 --- a/k8s-deploy/examples/party-9999/cluster-spark-rabbitmq.yaml +++ b/k8s-deploy/examples/party-9999/cluster-spark-rabbitmq.yaml @@ -1,7 +1,7 @@ name: fate-9999 namespace: fate-9999 chartName: fate -chartVersion: v1.10.0 +chartVersion: v1.11.1 partyId: 9999 registry: "" pullPolicy: diff --git a/k8s-deploy/examples/party-9999/cluster.yaml b/k8s-deploy/examples/party-9999/cluster.yaml index 69f6fc00b..a63b7e972 100644 --- a/k8s-deploy/examples/party-9999/cluster.yaml +++ b/k8s-deploy/examples/party-9999/cluster.yaml @@ -1,7 +1,7 @@ name: fate-9999 namespace: fate-9999 chartName: fate -chartVersion: v1.10.0 +chartVersion: v1.11.1 partyId: 9999 registry: "" pullPolicy: diff --git a/k8s-deploy/examples/party-exchange/rollsite.yaml 
b/k8s-deploy/examples/party-exchange/rollsite.yaml index d67a60fb8..74200191a 100644 --- a/k8s-deploy/examples/party-exchange/rollsite.yaml +++ b/k8s-deploy/examples/party-exchange/rollsite.yaml @@ -1,7 +1,7 @@ name: fate-exchange namespace: fate-exchange chartName: fate-exchange -chartVersion: v1.10.0 +chartVersion: v1.11.1 partyId: 1 registry: "" pullPolicy: diff --git a/k8s-deploy/examples/party-exchange/trafficServer.yaml b/k8s-deploy/examples/party-exchange/trafficServer.yaml index c497cd02e..a26587233 100644 --- a/k8s-deploy/examples/party-exchange/trafficServer.yaml +++ b/k8s-deploy/examples/party-exchange/trafficServer.yaml @@ -1,7 +1,7 @@ name: fate-exchange namespace: fate-exchange chartName: fate-exchange -chartVersion: v1.10.0 +chartVersion: v1.11.1 partyId: 1 registry: "" pullPolicy: diff --git a/k8s-deploy/examples/party.config b/k8s-deploy/examples/party.config index 89fc3c3ec..83762449a 100644 --- a/k8s-deploy/examples/party.config +++ b/k8s-deploy/examples/party.config @@ -1,10 +1,7 @@ -fate_chartVersion=v1.10.0 -fate_imageTAG=1.10.0-release +fate_chartVersion=v1.11.1 +fate_imageTAG=1.11.1-release fate_serving_chartVersion=v2.1.6 fate_serving_imageTAG=2.1.6-release party_9999_IP=192.168.9.1 party_10000_IP=192.168.10.1 party_exchange_IP=192.168.0.1 - -algorithm=Basic -device=CPU \ No newline at end of file