From 1a604a2982f20770b10df725986b6eeaa9fad5ed Mon Sep 17 00:00:00 2001 From: Brent Langston Date: Fri, 1 Feb 2019 21:32:40 +0000 Subject: [PATCH] initial commit --- .dockerignore | 1 + .gitignore | 1 + .travis.yml | 5 + Dockerfile | 108 ++++++++++++ README.md | 37 +++++ bashrc | 50 ++++++ build.sh | 12 ++ cluster-cpu-gpu.yml | 391 ++++++++++++++++++++++++++++++++++++++++++++ gpu-1-taskdef.json | 28 ++++ gpu-4-taskdef.json | 38 +++++ gpu-8-taskdef.json | 38 +++++ push.sh | 14 ++ task-cpu.yml | 70 ++++++++ 13 files changed, 793 insertions(+) create mode 100644 .dockerignore create mode 100644 .gitignore create mode 100644 .travis.yml create mode 100644 Dockerfile create mode 100644 README.md create mode 100644 bashrc create mode 100755 build.sh create mode 100644 cluster-cpu-gpu.yml create mode 100644 gpu-1-taskdef.json create mode 100644 gpu-4-taskdef.json create mode 100644 gpu-8-taskdef.json create mode 100755 push.sh create mode 100644 task-cpu.yml diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..6da4faa --- /dev/null +++ b/.dockerignore @@ -0,0 +1 @@ +fargate/* diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..6da4faa --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +fargate/* diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..8f2625a --- /dev/null +++ b/.travis.yml @@ -0,0 +1,5 @@ +services: + - docker + +script: +- ./build.sh && ./push.sh diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..7bb6ff2 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,108 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +# adapted from https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/dockerfiles/dockerfiles/gpu.Dockerfile + +ARG UBUNTU_VERSION=16.04 + +FROM nvidia/cuda:9.0-base-ubuntu${UBUNTU_VERSION} AS base + +# For CUDA profiling, TensorFlow requires CUPTI. +ENV LD_LIBRARY_PATH=/usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH + +ARG PYTHON=python3 + +ENV TF_NEED_CUDA=1 +ENV TF_NEED_TENSORRT=1 +ENV TF_CUDA_COMPUTE_CAPABILITIES=3.5,5.2,6.0,6.1,7.0 +ENV TF_CUDA_VERSION=9.0 +ENV TF_CUDNN_VERSION=7 + +# NCCL 2.x +ENV TF_NCCL_VERSION=2 + +# See http://bugs.python.org/issue19846 +ENV LANG=C.UTF-8 + +COPY bashrc /etc/bash.bashrc + +# Pick up some TF dependencies +RUN chmod a+rx /etc/bash.bashrc \ + && apt-get update && apt-get install -y --no-install-recommends \ + git \ + time \ + build-essential \ + cuda-command-line-tools-9-0 \ + cuda-cublas-9-0 \ + cuda-cufft-9-0 \ + cuda-curand-9-0 \ + cuda-cusolver-9-0 \ + cuda-cusparse-9-0 \ + curl \ + libcudnn7=7.2.1.38-1+cuda9.0 \ + libnccl2=2.2.13-1+cuda9.0 \ + libfreetype6-dev \ + libhdf5-serial-dev \ + libpng12-dev \ + libzmq3-dev \ + pkg-config \ + rsync \ + software-properties-common \ + unzip \ + && apt-get update \ + && apt-get install -y nvinfer-runtime-trt-repo-ubuntu1604-4.0.1-ga-cuda9.0 \ + && apt-get update \ + && apt-get install -y libnvinfer4=4.1.2-1+cuda9.0 \ + && apt-get update && apt-get install -y \ + ${PYTHON} \ + ${PYTHON}-pip \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* \ + && ln -s $(which ${PYTHON}) 
/usr/local/bin/python + +RUN git clone https://github.com/tensorflow/benchmarks.git \ + && cd /benchmarks/ \ + && git checkout cnn_tf_v1.9_compatible + +WORKDIR /benchmarks/scripts/tf_cnn_benchmarks/ +CMD time python tf_cnn_benchmarks.py --batch_size=32 --model=resnet50 --variable_update=parameter_server --data_format=NHWC --device=cpu --summary_verbosity=1 + +FROM base AS tensorflow-cpu +ARG PIP=pip3 + +# Options: +# tensorflow +# tensorflow-gpu +# tf-nightly +# tf-nightly-gpu +ARG TF_PACKAGE=tensorflow==1.12.0 +RUN ${PIP} install --no-cache-dir ${TF_PACKAGE} + +FROM base AS tensorflow-gpu +ARG PIP=pip3 + +ENV GPU=1 +ENV BATCH_SIZE=32 + +# Options: +# tensorflow +# tensorflow-gpu +# tf-nightly +# tf-nightly-gpu +ARG TF_PACKAGE=tensorflow-gpu==1.12.0 +RUN ${PIP} install --no-cache-dir ${TF_PACKAGE} + +WORKDIR /benchmarks/scripts/tf_cnn_benchmarks/ +CMD time python tf_cnn_benchmarks.py --num_gpus=$GPU --batch_size=$BATCH_SIZE --model=resnet50 --variable_update=parameter_server --data_format=NHWC --device=gpu --summary_verbosity=1 \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..00d7cbf --- /dev/null +++ b/README.md @@ -0,0 +1,37 @@ +[![Build Status](https://travis-ci.org/brentley/tensorflow-container.svg?branch=master)](https://travis-ci.org/brentley/tensorflow-container) + +This is a sample tensorflow container used to test cpu and gpu support on ECS. 
+ +See corresponding blog post: + +``` +echo 'export PATH=$HOME/.local/bin:$HOME/bin:$PATH' >> ~/.bash_profile +source ~/.bash_profile +pip install --user -U awscli +``` + +``` +aws cloudformation deploy --stack-name tensorflow-test --template-file cluster-cpu-gpu.yml --capabilities CAPABILITY_IAM +aws cloudformation deploy --stack-name tensorflow-cpu-taskdef --template-file task-cpu.yml +aws ecs register-task-definition --cli-input-json file://gpu-1-taskdef.json + +``` + +``` +export cluster=$(aws cloudformation describe-stacks --stack-name tensorflow-test --query 'Stacks[0].Outputs[?OutputKey==`ClusterName`].OutputValue' --output text) +echo $cluster +``` + +``` +aws ecs run-task --cluster $cluster --task-definition tensorflow-cpu +aws ecs run-task --cluster $cluster --task-definition tensorflow-1-gpu +``` + +``` +aws cloudformation deploy --stack-name tensorflow-test --template-file cluster-cpu-gpu.yml --parameter-overrides GPUInstanceType=p3.16xlarge --capabilities CAPABILITY_IAM +``` + +``` +aws ecs register-task-definition --cli-input-json file://gpu-4-taskdef.json +aws ecs register-task-definition --cli-input-json file://gpu-8-taskdef.json +``` \ No newline at end of file diff --git a/bashrc b/bashrc new file mode 100644 index 0000000..40f0927 --- /dev/null +++ b/bashrc @@ -0,0 +1,50 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# ============================================================================== + +export PS1="\[\e[31m\]tf-docker\[\e[m\] \[\e[33m\]\w\[\e[m\] > " +export TERM=xterm-256color +alias grep="grep --color=auto" +alias ls="ls --color=auto" + +echo -e "\e[1;31m" +cat< + Default: /aws/service/ecs/optimized-ami/amazon-linux-2/recommended/image_id + Description: The Amazon Machine Image ID used for the cluster, leave it as the default value to get the latest AMI + ECSGPUAMI: + Description: GPU AMI ID + Type: AWS::SSM::Parameter::Value + Default: /aws/service/ecs/optimized-ami/amazon-linux-2/gpu/recommended/image_id + Description: The Amazon Machine Image ID used for the gpu instances, leave it as the default value to get the latest AMI + +Mappings: + # Hard values for the subnet masks. These masks define + # the range of internal IP addresses that can be assigned. + # The VPC can have all IP's from 10.0.0.0 to 10.0.255.255 + # There are two subnets which cover the ranges: + # + # 10.0.0.0 - 10.0.0.255 + # 10.0.1.0 - 10.0.1.255 + # + # If you need more IP addresses (perhaps you have so many + # instances that you run out) then you can customize these + # ranges to add more + SubnetConfig: + VPC: + CIDR: '10.0.0.0/16' + PublicOne: + CIDR: '10.0.0.0/24' + PublicTwo: + CIDR: '10.0.1.0/24' +Resources: + # VPC in which containers will be networked. + # It has two public subnets + # We distribute the subnets across the first two available subnets + # for the region, for high availability. 
+ VPC: + Type: AWS::EC2::VPC + Properties: + EnableDnsSupport: true + EnableDnsHostnames: true + CidrBlock: !FindInMap ['SubnetConfig', 'VPC', 'CIDR'] + + # Two public subnets, where containers can have public IP addresses + PublicSubnetOne: + Type: AWS::EC2::Subnet + Properties: + AvailabilityZone: + Fn::Select: + - 0 + - Fn::GetAZs: {Ref: 'AWS::Region'} + VpcId: !Ref 'VPC' + CidrBlock: !FindInMap ['SubnetConfig', 'PublicOne', 'CIDR'] + MapPublicIpOnLaunch: true + PublicSubnetTwo: + Type: AWS::EC2::Subnet + Properties: + AvailabilityZone: + Fn::Select: + - 1 + - Fn::GetAZs: {Ref: 'AWS::Region'} + VpcId: !Ref 'VPC' + CidrBlock: !FindInMap ['SubnetConfig', 'PublicTwo', 'CIDR'] + MapPublicIpOnLaunch: true + + # Setup networking resources for the public subnets. Containers + # in the public subnets have public IP addresses and the routing table + # sends network traffic via the internet gateway. + InternetGateway: + Type: AWS::EC2::InternetGateway + GatewayAttachement: + Type: AWS::EC2::VPCGatewayAttachment + Properties: + VpcId: !Ref 'VPC' + InternetGatewayId: !Ref 'InternetGateway' + PublicRouteTable: + Type: AWS::EC2::RouteTable + Properties: + VpcId: !Ref 'VPC' + PublicRoute: + Type: AWS::EC2::Route + DependsOn: GatewayAttachement + Properties: + RouteTableId: !Ref 'PublicRouteTable' + DestinationCidrBlock: '0.0.0.0/0' + GatewayId: !Ref 'InternetGateway' + PublicSubnetOneRouteTableAssociation: + Type: AWS::EC2::SubnetRouteTableAssociation + Properties: + SubnetId: !Ref PublicSubnetOne + RouteTableId: !Ref PublicRouteTable + PublicSubnetTwoRouteTableAssociation: + Type: AWS::EC2::SubnetRouteTableAssociation + Properties: + SubnetId: !Ref PublicSubnetTwo + RouteTableId: !Ref PublicRouteTable + + # ECS Resources + ECSCluster: + Type: AWS::ECS::Cluster + + # A security group for the EC2 hosts that will run the containers. + # Rules will be added depending on what ingress is created. 
+ ContainerSecurityGroup: + Type: AWS::EC2::SecurityGroup + Properties: + GroupDescription: Access to the ECS hosts that run containers + VpcId: !Ref 'VPC' + + # Autoscaling group. This launches the actual EC2 instances that will register + # themselves as members of the cluster, and run the docker containers. + ECSAutoScalingGroup: + Type: AWS::AutoScaling::AutoScalingGroup + Properties: + VPCZoneIdentifier: + - !Ref PublicSubnetOne + - !Ref PublicSubnetTwo + LaunchConfigurationName: !Ref 'ContainerInstances' + MinSize: '1' + MaxSize: !Ref 'MaxSize' + DesiredCapacity: !Ref 'DesiredCapacity' + CreationPolicy: + ResourceSignal: + Timeout: PT15M + UpdatePolicy: + AutoScalingReplacingUpdate: + WillReplace: 'true' + ContainerInstances: + Type: AWS::AutoScaling::LaunchConfiguration + Properties: + ImageId: !Ref 'ECSAMI' + SecurityGroups: [!Ref 'ContainerSecurityGroup'] + InstanceType: !Ref 'InstanceType' + IamInstanceProfile: !Ref 'EC2InstanceProfile' + UserData: + Fn::Base64: !Sub | + #!/bin/bash -xe + echo ECS_CLUSTER=${ECSCluster} >> /etc/ecs/ecs.config + yum install -y aws-cfn-bootstrap + /opt/aws/bin/cfn-signal -e $? --stack ${AWS::StackName} --resource ECSAutoScalingGroup --region ${AWS::Region} + EC2InstanceProfile: + Type: AWS::IAM::InstanceProfile + Properties: + Path: / + Roles: [!Ref 'EC2Role'] + + # Autoscaling group for gpu instances. This launches the actual EC2 instances that will register + # themselves as members of the cluster, and run the docker containers. 
+ ECSGPUAutoScalingGroup: + Type: AWS::AutoScaling::AutoScalingGroup + Properties: + VPCZoneIdentifier: + - !Ref PublicSubnetOne + - !Ref PublicSubnetTwo + LaunchConfigurationName: !Ref 'GPUContainerInstances' + MinSize: '1' + MaxSize: !Ref 'MaxGPUSize' + DesiredCapacity: !Ref 'DesiredGPUCapacity' + CreationPolicy: + ResourceSignal: + Timeout: PT15M + UpdatePolicy: + AutoScalingReplacingUpdate: + WillReplace: 'true' + GPUContainerInstances: + Type: AWS::AutoScaling::LaunchConfiguration + Properties: + ImageId: !Ref 'ECSGPUAMI' + SecurityGroups: [!Ref 'ContainerSecurityGroup'] + InstanceType: !Ref 'GPUInstanceType' + IamInstanceProfile: !Ref 'EC2InstanceProfile' + UserData: + Fn::Base64: !Sub | + #!/bin/bash -xe + echo ECS_CLUSTER=${ECSCluster} >> /etc/ecs/ecs.config + yum install -y aws-cfn-bootstrap + /opt/aws/bin/cfn-signal -e $? --stack ${AWS::StackName} --resource ECSGPUAutoScalingGroup --region ${AWS::Region} + GPUEC2InstanceProfile: + Type: AWS::IAM::InstanceProfile + Properties: + Path: / + Roles: [!Ref 'EC2Role'] + + # A role used to allow AWS Autoscaling to inspect stats and adjust scalable targets + # on your AWS account + AutoscalingRole: + Type: AWS::IAM::Role + Properties: + AssumeRolePolicyDocument: + Statement: + - Effect: Allow + Principal: + Service: [application-autoscaling.amazonaws.com] + Action: ['sts:AssumeRole'] + Path: / + Policies: + - PolicyName: service-autoscaling + PolicyDocument: + Statement: + - Effect: Allow + Action: + - 'application-autoscaling:*' + - 'cloudwatch:DescribeAlarms' + - 'cloudwatch:PutMetricAlarm' + - 'ecs:DescribeServices' + - 'ecs:UpdateService' + Resource: '*' + + # Role for the EC2 hosts. This allows the ECS agent on the EC2 hosts + # to communicate with the ECS control plane, as well as download the docker + # images from ECR to run on your host. 
+ EC2Role: + Type: AWS::IAM::Role + Properties: + AssumeRolePolicyDocument: + Statement: + - Effect: Allow + Principal: + Service: [ec2.amazonaws.com] + Action: ['sts:AssumeRole'] + Path: / + Policies: + - PolicyName: ecs-service + PolicyDocument: + Statement: + - Effect: Allow + Action: + - 'ecs:CreateCluster' + - 'ecs:DeregisterContainerInstance' + - 'ecs:DiscoverPollEndpoint' + - 'ecs:Poll' + - 'ecs:RegisterContainerInstance' + - 'ecs:StartTelemetrySession' + - 'ecs:Submit*' + - 'logs:CreateLogStream' + - 'logs:PutLogEvents' + - 'ecr:GetAuthorizationToken' + - 'ecr:BatchGetImage' + - 'ecr:GetDownloadUrlForLayer' + Resource: '*' + + # This is an IAM role which authorizes ECS to manage resources on your + # account on your behalf, such as updating your load balancer with the + # details of where your containers are, so that traffic can reach your + # containers. + ECSRole: + Type: AWS::IAM::Role + Properties: + AssumeRolePolicyDocument: + Statement: + - Effect: Allow + Principal: + Service: [ecs.amazonaws.com] + Action: ['sts:AssumeRole'] + Path: / + Policies: + - PolicyName: ecs-service + PolicyDocument: + Statement: + - Effect: Allow + Action: + # Rules which allow ECS to attach network interfaces to instances + # on your behalf in order for awsvpc networking mode to work right + - 'ec2:AttachNetworkInterface' + - 'ec2:CreateNetworkInterface' + - 'ec2:CreateNetworkInterfacePermission' + - 'ec2:DeleteNetworkInterface' + - 'ec2:DeleteNetworkInterfacePermission' + - 'ec2:Describe*' + - 'ec2:DetachNetworkInterface' + + # Rules which allow ECS to update load balancers on your behalf + # with the information about how to send traffic to your containers + - 'elasticloadbalancing:DeregisterInstancesFromLoadBalancer' + - 'elasticloadbalancing:DeregisterTargets' + - 'elasticloadbalancing:Describe*' + - 'elasticloadbalancing:RegisterInstancesWithLoadBalancer' + - 'elasticloadbalancing:RegisterTargets' + Resource: '*' + # This is a role which is used by the ECS tasks 
themselves. + ECSTaskExecutionRole: + Type: AWS::IAM::Role + Properties: + AssumeRolePolicyDocument: + Statement: + - Effect: Allow + Principal: + Service: [ecs-tasks.amazonaws.com] + Action: ['sts:AssumeRole'] + Path: / + Policies: + - PolicyName: AmazonECSTaskExecutionRolePolicy + PolicyDocument: + Statement: + - Effect: Allow + Action: + # Allow the ECS Tasks to download images from ECR + - 'ecr:GetAuthorizationToken' + - 'ecr:BatchCheckLayerAvailability' + - 'ecr:GetDownloadUrlForLayer' + - 'ecr:BatchGetImage' + + # Allow the ECS tasks to upload logs to CloudWatch + - 'logs:CreateLogStream' + - 'logs:PutLogEvents' + Resource: '*' + +# These are the values output by the CloudFormation template. Be careful +# about changing any of them, because some of them are exported with specific +# names so that the other task related CF templates can use them. +Outputs: + ClusterName: + Description: The name of the ECS cluster + Value: !Ref 'ECSCluster' + Export: + Name: !Sub ${EnvironmentName}:ClusterName + AutoscalingRole: + Description: The ARN of the role used for autoscaling + Value: !GetAtt 'AutoscalingRole.Arn' + Export: + Name: !Sub ${EnvironmentName}:AutoscalingRole + ECSRole: + Description: The ARN of the ECS role + Value: !GetAtt 'ECSRole.Arn' + Export: + Name: !Sub ${EnvironmentName}:ECSRole + ECSTaskExecutionRole: + Description: The ARN of the ECS task execution role + Value: !GetAtt 'ECSTaskExecutionRole.Arn' + Export: + Name: !Sub ${EnvironmentName}:ECSTaskExecutionRole + VpcId: + Description: The ID of the VPC that this stack is deployed in + Value: !Ref 'VPC' + Export: + Name: !Sub ${EnvironmentName}:VpcId + PublicSubnetOne: + Description: Public subnet one + Value: !Ref 'PublicSubnetOne' + Export: + Name: !Sub ${EnvironmentName}:PublicSubnetOne + PublicSubnetTwo: + Description: Public subnet two + Value: !Ref 'PublicSubnetTwo' + Export: + Name: !Sub ${EnvironmentName}:PublicSubnetTwo + ContainerSecurityGroup: + Description: A security group used to allow containers to receive 
traffic + Value: !Ref 'ContainerSecurityGroup' + Export: + Name: !Sub ${EnvironmentName}:ContainerSecurityGroup diff --git a/gpu-1-taskdef.json b/gpu-1-taskdef.json new file mode 100644 index 0000000..0e42f31 --- /dev/null +++ b/gpu-1-taskdef.json @@ -0,0 +1,28 @@ +{ + "containerDefinitions": [ + { + "logConfiguration": { + "logDriver": "awslogs", + "options": { + "awslogs-group": "test-task-tensorflow-gpu", + "awslogs-region": "us-east-2", + "awslogs-stream-prefix": "tensorflow-gpu" + } + }, + "cpu": 1024, + "resourceRequirements": [ + { + "type": "GPU", + "value": "1" + } + ], + "image": "brentley/tensorflow-gpu:latest", + "essential": true, + "name": "tensorflow-gpu" + } + ], + "memory": "6144", + "family": "tensorflow-1-gpu", + "cpu": "1024", + "placementConstraints": [] +} diff --git a/gpu-4-taskdef.json b/gpu-4-taskdef.json new file mode 100644 index 0000000..d8db014 --- /dev/null +++ b/gpu-4-taskdef.json @@ -0,0 +1,38 @@ +{ + "containerDefinitions": [ + { + "logConfiguration": { + "logDriver": "awslogs", + "options": { + "awslogs-group": "test-task-tensorflow-gpu", + "awslogs-region": "us-east-2", + "awslogs-stream-prefix": "tensorflow-gpu" + } + }, + "cpu": 1024, + "resourceRequirements": [ + { + "type": "GPU", + "value": "4" + } + ], + "environment": [ + { + "name": "GPU", + "value": "4" + }, + { + "name": "BATCH_SIZE", + "value": "128" + } + ], + "image": "brentley/tensorflow-gpu:latest", + "essential": true, + "name": "tensorflow-gpu" + } + ], + "memory": "8192", + "family": "tensorflow-4-gpu", + "cpu": "1024", + "placementConstraints": [] +} diff --git a/gpu-8-taskdef.json b/gpu-8-taskdef.json new file mode 100644 index 0000000..9ca5820 --- /dev/null +++ b/gpu-8-taskdef.json @@ -0,0 +1,38 @@ +{ + "containerDefinitions": [ + { + "logConfiguration": { + "logDriver": "awslogs", + "options": { + "awslogs-group": "test-task-tensorflow-gpu", + "awslogs-region": "us-east-2", + "awslogs-stream-prefix": "tensorflow-gpu" + } + }, + "cpu": 1024, + 
"resourceRequirements": [ + { + "type": "GPU", + "value": "8" + } + ], + "environment": [ + { + "name": "GPU", + "value": "8" + }, + { + "name": "BATCH_SIZE", + "value": "128" + } + ], + "image": "brentley/tensorflow-gpu:latest", + "essential": true, + "name": "tensorflow-gpu" + } + ], + "memory": "131072", + "family": "tensorflow-8-gpu", + "cpu": "1024", + "placementConstraints": [] +} diff --git a/push.sh b/push.sh new file mode 100755 index 0000000..08768f4 --- /dev/null +++ b/push.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +set -e + +DOCKER_ID=${DOCKER_USERNAME} +GIT_TAG=$(git rev-parse --short HEAD) + +echo "${DOCKER_PASSWORD}" | docker login -u "${DOCKER_USERNAME}" --password-stdin + +for i in cpu gpu +do +docker push ${DOCKER_ID}/tensorflow-${i}:${GIT_TAG} +docker push ${DOCKER_ID}/tensorflow-${i}:latest +done diff --git a/task-cpu.yml b/task-cpu.yml new file mode 100644 index 0000000..1df160e --- /dev/null +++ b/task-cpu.yml @@ -0,0 +1,70 @@ +AWSTemplateFormatVersion: '2010-09-09' +Description: Deploy a task into an ECS cluster +Parameters: + EnvironmentName: + Type: String + Default: test + Description: The name of the environment to add this task to + TaskName: + Type: String + Default: tensorflow-cpu + Description: A name for the task + ImageUrl: + Type: String + Default: brentley/tensorflow-cpu:latest + Description: The url of a docker image + ContainerCpu: + Type: Number + Default: 1024 + Description: How much CPU to give the container. 
1024 is 1 CPU + ContainerMemory: + Type: Number + Default: 6144 + Description: How much memory in megabytes to give the container + DesiredCount: + Type: Number + Default: 1 + Description: How many copies of the task to run + Role: + Type: String + Default: "" + +Conditions: + HasCustomRole: !Not [ !Equals [!Ref 'Role', ''] ] + +Resources: + # A log group for storing the stdout logs from this task's containers + LogGroup: + Type: AWS::Logs::LogGroup + Properties: + LogGroupName: !Sub ${EnvironmentName}-task-${TaskName} + + # A log group for the gpu container + GPULogGroup: + Type: AWS::Logs::LogGroup + Properties: + LogGroupName: !Sub ${EnvironmentName}-task-tensorflow-gpu + + # The task definition. This is a simple metadata description of what + # container to run, and what resource requirements it has. + TaskDefinition: + Type: AWS::ECS::TaskDefinition + Properties: + Family: !Ref 'TaskName' + Cpu: !Ref 'ContainerCpu' + Memory: !Ref 'ContainerMemory' + TaskRoleArn: + Fn::If: + - 'HasCustomRole' + - !Ref 'Role' + - !Ref "AWS::NoValue" + ContainerDefinitions: + - Name: !Ref 'TaskName' + Image: !Ref 'ImageUrl' + LogConfiguration: + LogDriver: 'awslogs' + Options: + awslogs-group: !Sub ${EnvironmentName}-task-${TaskName} + awslogs-region: !Ref 'AWS::Region' + awslogs-stream-prefix: !Ref 'TaskName' +