From 1a604a2982f20770b10df725986b6eeaa9fad5ed Mon Sep 17 00:00:00 2001
From: Brent Langston <blangs@amazon.com>
Date: Fri, 1 Feb 2019 21:32:40 +0000
Subject: [PATCH] initial commit

---
 .dockerignore       |   1 +
 .gitignore          |   1 +
 .travis.yml         |   5 +
 Dockerfile          | 108 ++++++++++++
 README.md           |  37 +++++
 bashrc              |  50 ++++++
 build.sh            |  12 ++
 cluster-cpu-gpu.yml | 391 ++++++++++++++++++++++++++++++++++++++++++++
 gpu-1-taskdef.json  |  28 ++++
 gpu-4-taskdef.json  |  38 +++++
 gpu-8-taskdef.json  |  38 +++++
 push.sh             |  14 ++
 task-cpu.yml        |  70 ++++++++
 13 files changed, 793 insertions(+)
 create mode 100644 .dockerignore
 create mode 100644 .gitignore
 create mode 100644 .travis.yml
 create mode 100644 Dockerfile
 create mode 100644 README.md
 create mode 100644 bashrc
 create mode 100755 build.sh
 create mode 100644 cluster-cpu-gpu.yml
 create mode 100644 gpu-1-taskdef.json
 create mode 100644 gpu-4-taskdef.json
 create mode 100644 gpu-8-taskdef.json
 create mode 100755 push.sh
 create mode 100644 task-cpu.yml

diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000..6da4faa
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1 @@
+fargate/*
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..6da4faa
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+fargate/*
diff --git a/.travis.yml b/.travis.yml
new file mode 100644
index 0000000..8f2625a
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,5 @@
+services:
+  - docker
+
+script:
+- ./build.sh && ./push.sh
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..7bb6ff2
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,108 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+# adapted from https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/dockerfiles/dockerfiles/gpu.Dockerfile
+
+# Ubuntu release of the CUDA base image; override with --build-arg.
+ARG UBUNTU_VERSION=16.04
+FROM nvidia/cuda:9.0-base-ubuntu${UBUNTU_VERSION} AS base
+
+# Interpreter package installed by apt below and linked as /usr/local/bin/python.
+ARG PYTHON=python3
+
+# CUPTI must be on the loader path for TensorFlow's CUDA profiling.
+ENV LD_LIBRARY_PATH=/usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH
+
+# TensorFlow configuration variables (CUDA 9.0, cuDNN 7, NCCL 2.x, TensorRT).
+ENV TF_NEED_CUDA=1 \
+    TF_NEED_TENSORRT=1 \
+    TF_CUDA_COMPUTE_CAPABILITIES=3.5,5.2,6.0,6.1,7.0 \
+    TF_CUDA_VERSION=9.0 \
+    TF_CUDNN_VERSION=7 \
+    TF_NCCL_VERSION=2
+
+# UTF-8 locale; see http://bugs.python.org/issue19846
+ENV LANG=C.UTF-8
+
+COPY bashrc /etc/bash.bashrc
+
+# Pick up some TF dependencies (CUDA 9.0 libs, cuDNN, NCCL, TensorRT runtime, Python); every install passes -y so the build never waits on a prompt.
+RUN chmod a+rx /etc/bash.bashrc \
+        && apt-get update && apt-get install -y --no-install-recommends \
+        git \
+        time \
+        build-essential \
+        cuda-command-line-tools-9-0 \
+        cuda-cublas-9-0 \
+        cuda-cufft-9-0 \
+        cuda-curand-9-0 \
+        cuda-cusolver-9-0 \
+        cuda-cusparse-9-0 \
+        curl \
+        libcudnn7=7.2.1.38-1+cuda9.0 \
+        libnccl2=2.2.13-1+cuda9.0 \
+        libfreetype6-dev \
+        libhdf5-serial-dev \
+        libpng12-dev \
+        libzmq3-dev \
+        pkg-config \
+        rsync \
+        software-properties-common \
+        unzip \
+        && apt-get update \
+        && apt-get install -y nvinfer-runtime-trt-repo-ubuntu1604-4.0.1-ga-cuda9.0 \
+        && apt-get update \
+        && apt-get install -y libnvinfer4=4.1.2-1+cuda9.0 \
+        && apt-get update && apt-get install -y \
+            ${PYTHON} \
+            ${PYTHON}-pip \
+        && apt-get clean \
+        && rm -rf /var/lib/apt/lists/* \
+        && ln -s "$(which ${PYTHON})" /usr/local/bin/python
+        
+# Fetch the TensorFlow benchmark scripts and switch to the cnn_tf_v1.9_compatible ref.
+RUN git clone https://github.com/tensorflow/benchmarks.git /benchmarks \
+        && git -C /benchmarks checkout cnn_tf_v1.9_compatible
+
+WORKDIR /benchmarks/scripts/tf_cnn_benchmarks/
+CMD time python tf_cnn_benchmarks.py --batch_size=32 --model=resnet50 --variable_update=parameter_server --data_format=NHWC --device=cpu --summary_verbosity=1
+
+FROM base AS tensorflow-cpu
+ARG PIP=pip3
+
+# pip requirement to install; override with --build-arg TF_PACKAGE=..., e.g.
+#   tensorflow / tensorflow-gpu / tf-nightly / tf-nightly-gpu (optionally ==<version>)
+# Pinned by default so rebuilding the same commit yields the same TensorFlow release
+# instead of whatever is newest on PyPI at build time.
+# NOTE(review): 1.12.0 chosen to match the CUDA 9.0-era stack of this image; confirm.
+ARG TF_PACKAGE=tensorflow==1.12.0
+RUN     ${PIP} install --no-cache-dir ${TF_PACKAGE}
+
+FROM base AS tensorflow-gpu
+ARG PIP=pip3
+
+# Runtime defaults read by the CMD below; override per container (e.g. GPU=4, BATCH_SIZE=128).
+ENV GPU=1 \
+    BATCH_SIZE=32
+# pip requirement to install; override with --build-arg TF_PACKAGE=..., e.g.
+#   tensorflow / tensorflow-gpu / tf-nightly / tf-nightly-gpu (optionally ==<version>)
+# Pinned by default: this image ships CUDA 9.0 / cuDNN 7.2, and an unpinned install
+# can resolve to a TensorFlow build that expects a newer CUDA and fails at import.
+# NOTE(review): 1.12.0 is assumed to be the newest release built for CUDA 9.0; confirm.
+ARG TF_PACKAGE=tensorflow-gpu==1.12.0
+RUN     ${PIP} install --no-cache-dir ${TF_PACKAGE}
+
+WORKDIR /benchmarks/scripts/tf_cnn_benchmarks/
+CMD time python tf_cnn_benchmarks.py --num_gpus=$GPU --batch_size=$BATCH_SIZE --model=resnet50 --variable_update=parameter_server --data_format=NHWC --device=gpu --summary_verbosity=1
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..00d7cbf
--- /dev/null
+++ b/README.md
@@ -0,0 +1,37 @@
+[![Build Status](https://travis-ci.org/brentley/tensorflow-container.svg?branch=master)](https://travis-ci.org/brentley/tensorflow-container)
+
+This is a sample TensorFlow container used to test CPU and GPU support on Amazon ECS.
+
+See corresponding blog post: 
+
+```
+echo 'export PATH=$HOME/.local/bin:$HOME/bin:$PATH' >> ~/.bash_profile
+source ~/.bash_profile
+pip install --user -U awscli
+```
+
+```
+aws cloudformation deploy --stack-name tensorflow-test --template-file cluster-cpu-gpu.yml --capabilities CAPABILITY_IAM                            
+aws cloudformation deploy --stack-name tensorflow-cpu-taskdef --template-file task-cpu.yml
+aws ecs register-task-definition --cli-input-json file://gpu-1-taskdef.json
+
+```
+
+```
+export cluster=$(aws cloudformation describe-stacks --stack-name tensorflow-test --query 'Stacks[0].Outputs[?OutputKey==`ClusterName`].OutputValue' --output text) 
+echo $cluster
+```
+
+```
+aws ecs run-task --cluster $cluster --task-definition tensorflow-cpu
+aws ecs run-task --cluster $cluster --task-definition tensorflow-gpu
+```
+
+```
+aws cloudformation deploy --stack-name tensorflow-test --template-file cluster-cpu-gpu.yml --parameter-overrides GPUInstanceType=p3.16xlarge --capabilities CAPABILITY_IAM
+```
+
+```
+aws ecs register-task-definition --cli-input-json file://gpu-4-taskdef.json
+aws ecs register-task-definition --cli-input-json file://gpu-8-taskdef.json
+```
\ No newline at end of file
diff --git a/bashrc b/bashrc
new file mode 100644
index 0000000..40f0927
--- /dev/null
+++ b/bashrc
@@ -0,0 +1,50 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# ==============================================================================
+
+export PS1="\[\e[31m\]tf-docker\[\e[m\] \[\e[33m\]\w\[\e[m\] > "
+export TERM=xterm-256color
+alias grep="grep --color=auto"
+alias ls="ls --color=auto"
+
+echo -e "\e[1;31m"
+cat<<TF
+________                               _______________                
+___  __/__________________________________  ____/__  /________      __
+__  /  _  _ \_  __ \_  ___/  __ \_  ___/_  /_   __  /_  __ \_ | /| / /
+_  /   /  __/  / / /(__  )/ /_/ /  /   _  __/   _  / / /_/ /_ |/ |/ / 
+/_/    \___//_/ /_//____/ \____//_/    /_/      /_/  \____/____/|__/
+
+TF
+echo -e "\e[0;33m"
+
+if [[ $EUID -eq 0 ]]; then
+  cat <<WARN
+WARNING: You are running this container as root, which can cause new files in
+mounted volumes to be created as the root user on your host machine.
+
+To avoid this, run the container by specifying your user's userid:
+
+$ docker run -u \$(id -u):\$(id -g) args...
+WARN
+else
+  cat <<EXPL
+You are running this container as user with ID $(id -u) and group $(id -g),
+which should map to the ID and group for your user on the Docker host. Great!
+EXPL
+fi
+
+# Turn off colors
+echo -e "\e[m"
diff --git a/build.sh b/build.sh
new file mode 100755
index 0000000..5b9f606
--- /dev/null
+++ b/build.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+# Build the CPU and GPU image targets, tagging each with the short git SHA and "latest".
+set -e
+
+DOCKER_ID="${DOCKER_USERNAME:?DOCKER_USERNAME must be set}"
+GIT_TAG="$(git rev-parse --short HEAD)"
+
+for target in cpu gpu
+do
+    docker build -t "${DOCKER_ID}/tensorflow-${target}:${GIT_TAG}" --target "tensorflow-${target}" .
+    docker tag "${DOCKER_ID}/tensorflow-${target}:${GIT_TAG}" "${DOCKER_ID}/tensorflow-${target}:latest"
+done
diff --git a/cluster-cpu-gpu.yml b/cluster-cpu-gpu.yml
new file mode 100644
index 0000000..aaca78a
--- /dev/null
+++ b/cluster-cpu-gpu.yml
@@ -0,0 +1,391 @@
+AWSTemplateFormatVersion: '2010-09-09'
+Description: EC2 ECS cluster running containers in a public subnet. Only supports
+             public facing load balancer, and public service discovery namespaces.
+Parameters:
+  EnvironmentName:
+    Type: String
+    Default: test
+    Description: "A friendly environment name that will be used for namespacing all cluster resources. Example: staging, qa, or production"
+  InstanceType:
+    # EC2 instance type for the CPU container hosts (single Description key; duplicates are invalid YAML).
+    Type: String
+    Default: c5.2xlarge
+    Description: Class of EC2 instance used to host containers. Choose t2 for testing, m5 for general purpose, c5 for CPU intensive services, and r5 for memory intensive services
+    AllowedValues: [ t2.micro, t2.small, t2.medium, t2.large, t2.xlarge, t2.2xlarge,
+     m5.large, m5.xlarge, m5.2xlarge, m5.4xlarge, m5.12xlarge, m5.24xlarge,
+     c5.large, c5.xlarge, c5.2xlarge, c5.4xlarge, c5.9xlarge, c5.18xlarge,
+     r5.large, r5.xlarge, r5.2xlarge, r5.4xlarge, r5.12xlarge, r5.24xlarge ]
+    ConstraintDescription: Please choose a valid instance type.
+  GPUInstanceType:
+    # EC2 instance type for the GPU container hosts.
+    Type: String
+    Default: p3.2xlarge
+    Description: Class of GPU EC2 instance used to host GPU containers. Choose from the p2 or p3 instance families
+    AllowedValues: [ p2.xlarge, p2.8xlarge, p2.16xlarge, p3.2xlarge, p3.8xlarge, p3.16xlarge, p3dn.24xlarge ]
+    ConstraintDescription: Please choose a valid instance type.
+  DesiredCapacity:
+    Type: Number
+    Default: '2'
+    Description: Number of EC2 instances to launch in your ECS cluster.
+  DesiredGPUCapacity:
+    Type: Number
+    Default: '2'
+    Description: Number of GPU EC2 instances to launch in your ECS cluster.
+  MaxSize:
+    Type: Number
+    Default: '2'
+    Description: Maximum number of EC2 instances that can be launched in your ECS cluster.
+  MaxGPUSize:
+    Type: Number
+    Default: '2'
+    Description: Maximum number of GPU EC2 instances that can be launched in your ECS cluster.
+  ECSAMI:
+    # AMI ID for the CPU hosts, resolved from the SSM parameter named in Default.
+    Type: AWS::SSM::Parameter::Value<AWS::EC2::Image::Id>
+    Default: /aws/service/ecs/optimized-ami/amazon-linux-2/recommended/image_id
+    Description: The Amazon Machine Image ID used for the cluster, leave it as the default value to get the latest AMI
+  ECSGPUAMI:
+    # AMI ID for the GPU hosts, resolved from the SSM parameter named in Default.
+    Type: AWS::SSM::Parameter::Value<AWS::EC2::Image::Id>
+    Default: /aws/service/ecs/optimized-ami/amazon-linux-2/gpu/recommended/image_id
+    Description: The Amazon Machine Image ID used for the gpu instances, leave it as the default value to get the latest AMI
+
+Mappings:
+  # Hard values for the subnet masks. These masks define
+  # the range of internal IP addresses that can be assigned.
+  # The VPC can have all IP's from 10.0.0.0 to 10.0.255.255
+  # There are two subnets which cover the ranges:
+  #
+  # 10.0.0.0 - 10.0.0.255
+  # 10.0.1.0 - 10.0.1.255
+  #
+  # If you need more IP addresses (perhaps you have so many
+  # instances that you run out) then you can customize these
+  # ranges to add more
+  SubnetConfig:
+    VPC:
+      CIDR: '10.0.0.0/16'
+    PublicOne:
+      CIDR: '10.0.0.0/24'
+    PublicTwo:
+      CIDR: '10.0.1.0/24'
+Resources:
+  # VPC in which containers will be networked.
+  # It has two public subnets
+  # We distribute the subnets across the first two available subnets
+  # for the region, for high availability.
+  VPC:
+    Type: AWS::EC2::VPC
+    Properties:
+      EnableDnsSupport: true
+      EnableDnsHostnames: true
+      CidrBlock: !FindInMap ['SubnetConfig', 'VPC', 'CIDR']
+
+  # Two public subnets, where containers can have public IP addresses
+  PublicSubnetOne:
+    Type: AWS::EC2::Subnet
+    Properties:
+      AvailabilityZone:
+         Fn::Select:
+         - 0
+         - Fn::GetAZs: {Ref: 'AWS::Region'}
+      VpcId: !Ref 'VPC'
+      CidrBlock: !FindInMap ['SubnetConfig', 'PublicOne', 'CIDR']
+      MapPublicIpOnLaunch: true
+  PublicSubnetTwo:
+    Type: AWS::EC2::Subnet
+    Properties:
+      AvailabilityZone:
+         Fn::Select:
+         - 1
+         - Fn::GetAZs: {Ref: 'AWS::Region'}
+      VpcId: !Ref 'VPC'
+      CidrBlock: !FindInMap ['SubnetConfig', 'PublicTwo', 'CIDR']
+      MapPublicIpOnLaunch: true
+
+  # Setup networking resources for the public subnets. Containers
+  # in the public subnets have public IP addresses and the routing table
+  # sends network traffic via the internet gateway.
+  InternetGateway:
+    Type: AWS::EC2::InternetGateway
+  GatewayAttachement:
+    Type: AWS::EC2::VPCGatewayAttachment
+    Properties:
+      VpcId: !Ref 'VPC'
+      InternetGatewayId: !Ref 'InternetGateway'
+  PublicRouteTable:
+    Type: AWS::EC2::RouteTable
+    Properties:
+      VpcId: !Ref 'VPC'
+  PublicRoute:
+    Type: AWS::EC2::Route
+    DependsOn: GatewayAttachement
+    Properties:
+      RouteTableId: !Ref 'PublicRouteTable'
+      DestinationCidrBlock: '0.0.0.0/0'
+      GatewayId: !Ref 'InternetGateway'
+  PublicSubnetOneRouteTableAssociation:
+    Type: AWS::EC2::SubnetRouteTableAssociation
+    Properties:
+      SubnetId: !Ref PublicSubnetOne
+      RouteTableId: !Ref PublicRouteTable
+  PublicSubnetTwoRouteTableAssociation:
+    Type: AWS::EC2::SubnetRouteTableAssociation
+    Properties:
+      SubnetId: !Ref PublicSubnetTwo
+      RouteTableId: !Ref PublicRouteTable
+
+  # ECS Resources
+  ECSCluster:
+    Type: AWS::ECS::Cluster
+
+  # A security group for the EC2 hosts that will run the containers.
+  # Rules will be added depending on what ingress is created.
+  ContainerSecurityGroup:
+    Type: AWS::EC2::SecurityGroup
+    Properties:
+      GroupDescription: Access to the ECS hosts that run containers
+      VpcId: !Ref 'VPC'
+
+  # Autoscaling group. This launches the actual EC2 instances that will register
+  # themselves as members of the cluster, and run the docker containers.
+  ECSAutoScalingGroup:
+    Type: AWS::AutoScaling::AutoScalingGroup
+    Properties:
+      VPCZoneIdentifier:
+        - !Ref PublicSubnetOne
+        - !Ref PublicSubnetTwo
+      LaunchConfigurationName: !Ref 'ContainerInstances'
+      MinSize: '1'
+      MaxSize: !Ref 'MaxSize'
+      DesiredCapacity: !Ref 'DesiredCapacity'
+    CreationPolicy:
+      ResourceSignal:
+        Timeout: PT15M
+    UpdatePolicy:
+      AutoScalingReplacingUpdate:
+        WillReplace: 'true'
+  ContainerInstances:
+    Type: AWS::AutoScaling::LaunchConfiguration
+    Properties:
+      ImageId: !Ref 'ECSAMI'
+      SecurityGroups: [!Ref 'ContainerSecurityGroup']
+      InstanceType: !Ref 'InstanceType'
+      IamInstanceProfile: !Ref 'EC2InstanceProfile'
+      UserData:
+        Fn::Base64: !Sub |
+          #!/bin/bash -xe
+          echo ECS_CLUSTER=${ECSCluster} >> /etc/ecs/ecs.config
+          yum install -y aws-cfn-bootstrap
+          /opt/aws/bin/cfn-signal -e $? --stack ${AWS::StackName} --resource ECSAutoScalingGroup --region ${AWS::Region}
+  EC2InstanceProfile:
+    Type: AWS::IAM::InstanceProfile
+    Properties:
+      Path: /
+      Roles: [!Ref 'EC2Role']
+
+  # Autoscaling group for gpu instances. This launches the actual EC2 instances that will register
+  # themselves as members of the cluster, and run the docker containers.
+  ECSGPUAutoScalingGroup:
+    Type: AWS::AutoScaling::AutoScalingGroup
+    Properties:
+      VPCZoneIdentifier:
+        - !Ref PublicSubnetOne
+        - !Ref PublicSubnetTwo
+      LaunchConfigurationName: !Ref 'GPUContainerInstances'
+      MinSize: '1'
+      MaxSize: !Ref 'MaxGPUSize'
+      DesiredCapacity: !Ref 'DesiredGPUCapacity'
+    CreationPolicy:
+      ResourceSignal:
+        Timeout: PT15M
+    UpdatePolicy:
+      AutoScalingReplacingUpdate:
+        WillReplace: 'true'
+  GPUContainerInstances:
+    Type: AWS::AutoScaling::LaunchConfiguration
+    Properties:
+      ImageId: !Ref 'ECSGPUAMI'
+      SecurityGroups: [!Ref 'ContainerSecurityGroup']
+      InstanceType: !Ref 'GPUInstanceType'
+      IamInstanceProfile: !Ref 'GPUEC2InstanceProfile'  # dedicated GPU profile; wraps the same EC2Role
+      UserData:
+        Fn::Base64: !Sub |
+          #!/bin/bash -xe
+          echo ECS_CLUSTER=${ECSCluster} >> /etc/ecs/ecs.config
+          yum install -y aws-cfn-bootstrap
+          /opt/aws/bin/cfn-signal -e $? --stack ${AWS::StackName} --resource ECSGPUAutoScalingGroup --region ${AWS::Region}
+  GPUEC2InstanceProfile:
+    Type: AWS::IAM::InstanceProfile
+    Properties:
+      Path: /
+      Roles: [!Ref 'EC2Role']
+
+  # A role used to allow AWS Autoscaling to inspect stats and adjust scaleable targets
+  # on your AWS account
+  AutoscalingRole:
+    Type: AWS::IAM::Role
+    Properties:
+      AssumeRolePolicyDocument:
+        Statement:
+        - Effect: Allow
+          Principal:
+            Service: [application-autoscaling.amazonaws.com]
+          Action: ['sts:AssumeRole']
+      Path: /
+      Policies:
+      - PolicyName: service-autoscaling
+        PolicyDocument:
+          Statement:
+          - Effect: Allow
+            Action:
+              - 'application-autoscaling:*'
+              - 'cloudwatch:DescribeAlarms'
+              - 'cloudwatch:PutMetricAlarm'
+              - 'ecs:DescribeServices'
+              - 'ecs:UpdateService'
+            Resource: '*'
+
+  # Role for the EC2 hosts. This allows the ECS agent on the EC2 hosts
+  # to communicate with the ECS control plane, as well as download the docker
+  # images from ECR to run on your host.
+  EC2Role:
+    Type: AWS::IAM::Role
+    Properties:
+      AssumeRolePolicyDocument:
+        Statement:
+        - Effect: Allow
+          Principal:
+            Service: [ec2.amazonaws.com]
+          Action: ['sts:AssumeRole']
+      Path: /
+      Policies:
+      - PolicyName: ecs-service
+        PolicyDocument:
+          Statement:
+          - Effect: Allow
+            Action:
+              - 'ecs:CreateCluster'
+              - 'ecs:DeregisterContainerInstance'
+              - 'ecs:DiscoverPollEndpoint'
+              - 'ecs:Poll'
+              - 'ecs:RegisterContainerInstance'
+              - 'ecs:StartTelemetrySession'
+              - 'ecs:Submit*'
+              - 'logs:CreateLogStream'
+              - 'logs:PutLogEvents'
+              - 'ecr:GetAuthorizationToken'
+              - 'ecr:BatchGetImage'
+              - 'ecr:GetDownloadUrlForLayer'
+            Resource: '*'
+
+  # This is an IAM role which authorizes ECS to manage resources on your
+  # account on your behalf, such as updating your load balancer with the
+  # details of where your containers are, so that traffic can reach your
+  # containers.
+  ECSRole:
+    Type: AWS::IAM::Role
+    Properties:
+      AssumeRolePolicyDocument:
+        Statement:
+        - Effect: Allow
+          Principal:
+            Service: [ecs.amazonaws.com]
+          Action: ['sts:AssumeRole']
+      Path: /
+      Policies:
+      - PolicyName: ecs-service
+        PolicyDocument:
+          Statement:
+          - Effect: Allow
+            Action:
+              # Rules which allow ECS to attach network interfaces to instances
+              # on your behalf in order for awsvpc networking mode to work right
+              - 'ec2:AttachNetworkInterface'
+              - 'ec2:CreateNetworkInterface'
+              - 'ec2:CreateNetworkInterfacePermission'
+              - 'ec2:DeleteNetworkInterface'
+              - 'ec2:DeleteNetworkInterfacePermission'
+              - 'ec2:Describe*'
+              - 'ec2:DetachNetworkInterface'
+
+              # Rules which allow ECS to update load balancers on your behalf
+              # with the information about how to send traffic to your containers
+              - 'elasticloadbalancing:DeregisterInstancesFromLoadBalancer'
+              - 'elasticloadbalancing:DeregisterTargets'
+              - 'elasticloadbalancing:Describe*'
+              - 'elasticloadbalancing:RegisterInstancesWithLoadBalancer'
+              - 'elasticloadbalancing:RegisterTargets'
+            Resource: '*'
+  # This is a role which is used by the ECS tasks themselves.
+  ECSTaskExecutionRole:
+    Type: AWS::IAM::Role
+    Properties:
+      AssumeRolePolicyDocument:
+        Statement:
+        - Effect: Allow
+          Principal:
+            Service: [ecs-tasks.amazonaws.com]
+          Action: ['sts:AssumeRole']
+      Path: /
+      Policies:
+        - PolicyName: AmazonECSTaskExecutionRolePolicy
+          PolicyDocument:
+            Statement:
+            - Effect: Allow
+              Action:
+                # Allow the ECS Tasks to download images from ECR
+                - 'ecr:GetAuthorizationToken'
+                - 'ecr:BatchCheckLayerAvailability'
+                - 'ecr:GetDownloadUrlForLayer'
+                - 'ecr:BatchGetImage'
+
+                # Allow the ECS tasks to upload logs to CloudWatch
+                - 'logs:CreateLogStream'
+                - 'logs:PutLogEvents'
+              Resource: '*'
+              
+# These are the values output by the CloudFormation template. Be careful
+# about changing any of them, because some of them are exported with specific
+# names so that the other task related CF templates can use them.
+Outputs:
+  ClusterName:
+    Description: The name of the ECS cluster
+    Value: !Ref 'ECSCluster'
+    Export:
+      Name: !Sub ${EnvironmentName}:ClusterName
+  AutoscalingRole:
+    Description: The ARN of the role used for autoscaling
+    Value: !GetAtt 'AutoscalingRole.Arn'
+    Export:
+      Name: !Sub ${EnvironmentName}:AutoscalingRole
+  ECSRole:
+    Description: The ARN of the ECS role
+    Value: !GetAtt 'ECSRole.Arn'
+    Export:
+      Name: !Sub ${EnvironmentName}:ECSRole
+  ECSTaskExecutionRole:
+    Description: The ARN of the ECS role
+    Value: !GetAtt 'ECSTaskExecutionRole.Arn'
+    Export:
+      Name: !Sub ${EnvironmentName}:ECSTaskExecutionRole
+  VpcId:
+    Description: The ID of the VPC that this stack is deployed in
+    Value: !Ref 'VPC'
+    Export:
+      Name: !Sub ${EnvironmentName}:VpcId
+  PublicSubnetOne:
+    Description: Public subnet one
+    Value: !Ref 'PublicSubnetOne'
+    Export:
+      Name: !Sub ${EnvironmentName}:PublicSubnetOne
+  PublicSubnetTwo:
+    Description: Public subnet two
+    Value: !Ref 'PublicSubnetTwo'
+    Export:
+      Name: !Sub ${EnvironmentName}:PublicSubnetTwo
+  ContainerSecurityGroup:
+    Description: A security group used to allow containers to receive traffic
+    Value: !Ref 'ContainerSecurityGroup'
+    Export:
+      Name: !Sub ${EnvironmentName}:ContainerSecurityGroup
diff --git a/gpu-1-taskdef.json b/gpu-1-taskdef.json
new file mode 100644
index 0000000..0e42f31
--- /dev/null
+++ b/gpu-1-taskdef.json
@@ -0,0 +1,28 @@
+{
+    "containerDefinitions": [
+        {
+            "logConfiguration": {
+                "logDriver": "awslogs",
+                "options": {
+                    "awslogs-group": "test-task-tensorflow-gpu",
+                    "awslogs-region": "us-east-2",
+                    "awslogs-stream-prefix": "tensorflow-gpu"
+                }
+            },
+            "cpu": 1024,
+            "resourceRequirements": [
+                {
+                    "type": "GPU",
+                    "value": "1"
+                }
+            ],
+            "image": "brentley/tensorflow-gpu:latest",
+            "essential": true,
+            "name": "tensorflow-gpu"
+        }
+    ],
+    "memory": "6144",
+    "family": "tensorflow-1-gpu",
+    "cpu": "1024",
+    "placementConstraints": []
+}
diff --git a/gpu-4-taskdef.json b/gpu-4-taskdef.json
new file mode 100644
index 0000000..d8db014
--- /dev/null
+++ b/gpu-4-taskdef.json
@@ -0,0 +1,38 @@
+{
+    "containerDefinitions": [
+        {
+            "logConfiguration": {
+                "logDriver": "awslogs",
+                "options": {
+                    "awslogs-group": "test-task-tensorflow-gpu",
+                    "awslogs-region": "us-east-2",
+                    "awslogs-stream-prefix": "tensorflow-gpu"
+                }
+            },
+            "cpu": 1024,
+            "resourceRequirements": [
+                {
+                    "type": "GPU",
+                    "value": "4"
+                }
+            ],
+            "environment": [
+                {
+                    "name": "GPU",
+                    "value": "4"
+                },
+                {
+                    "name": "BATCH_SIZE",
+                    "value": "128"
+                }
+            ],
+            "image": "brentley/tensorflow-gpu:latest",
+            "essential": true,
+            "name": "tensorflow-gpu"
+        }
+    ],
+    "memory": "8192",
+    "family": "tensorflow-4-gpu",
+    "cpu": "1024",
+    "placementConstraints": []
+}
diff --git a/gpu-8-taskdef.json b/gpu-8-taskdef.json
new file mode 100644
index 0000000..9ca5820
--- /dev/null
+++ b/gpu-8-taskdef.json
@@ -0,0 +1,38 @@
+{
+    "containerDefinitions": [
+        {
+            "logConfiguration": {
+                "logDriver": "awslogs",
+                "options": {
+                    "awslogs-group": "test-task-tensorflow-gpu",
+                    "awslogs-region": "us-east-2",
+                    "awslogs-stream-prefix": "tensorflow-gpu"
+                }
+            },
+            "cpu": 1024,
+            "resourceRequirements": [
+                {
+                    "type": "GPU",
+                    "value": "8"
+                }
+            ],
+            "environment": [
+                {
+                    "name": "GPU",
+                    "value": "8"
+                },
+                {
+                    "name": "BATCH_SIZE",
+                    "value": "128"
+                }
+            ],
+            "image": "brentley/tensorflow-gpu:latest",
+            "essential": true,
+            "name": "tensorflow-gpu"
+        }
+    ],
+    "memory": "131072",
+    "family": "tensorflow-8-gpu",
+    "cpu": "1024",
+    "placementConstraints": []
+}
diff --git a/push.sh b/push.sh
new file mode 100755
index 0000000..08768f4
--- /dev/null
+++ b/push.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+# Log in to the Docker registry and push the SHA-tagged and "latest" CPU/GPU images.
+set -e
+
+DOCKER_ID="${DOCKER_USERNAME:?DOCKER_USERNAME must be set}"
+GIT_TAG="$(git rev-parse --short HEAD)"
+
+echo "${DOCKER_PASSWORD}" | docker login -u "${DOCKER_ID}" --password-stdin
+
+for target in cpu gpu
+do
+    docker push "${DOCKER_ID}/tensorflow-${target}:${GIT_TAG}"
+    docker push "${DOCKER_ID}/tensorflow-${target}:latest"
+done
diff --git a/task-cpu.yml b/task-cpu.yml
new file mode 100644
index 0000000..1df160e
--- /dev/null
+++ b/task-cpu.yml
@@ -0,0 +1,70 @@
+AWSTemplateFormatVersion: '2010-09-09'
+Description: Deploy a task into an ECS cluster 
+Parameters:
+  EnvironmentName:
+    Type: String
+    Default: test
+    Description: The name of the environment to add this task to
+  TaskName:
+    Type: String
+    Default: tensorflow-cpu
+    Description: A name for the task
+  ImageUrl:
+    Type: String
+    Default: brentley/tensorflow-cpu:latest
+    Description: The url of a docker image
+  ContainerCpu:
+    Type: Number
+    Default: 1024
+    Description: How much CPU to give the container. 1024 is 1 CPU
+  ContainerMemory:
+    Type: Number
+    Default: 6144
+    Description: How much memory in megabytes to give the container
+  DesiredCount:
+    Type: Number
+    Default: 1
+    Description: How many copies of the task to run
+  Role:
+    Type: String
+    Default: ""
+
+Conditions:
+  HasCustomRole: !Not [ !Equals [!Ref 'Role', ''] ]
+
+Resources:
+  # A log group for storing the stdout logs from this task's containers
+  LogGroup:
+    Type: AWS::Logs::LogGroup
+    Properties:
+      LogGroupName: !Sub ${EnvironmentName}-task-${TaskName}
+
+  # A log group for the gpu container
+  GPULogGroup:
+    Type: AWS::Logs::LogGroup
+    Properties:
+      LogGroupName: !Sub ${EnvironmentName}-task-tensorflow-gpu
+      
+  # The task definition. This is a simple metadata description of what
+  # container to run, and what resource requirements it has.
+  TaskDefinition:
+    Type: AWS::ECS::TaskDefinition
+    Properties:
+      Family: !Ref 'TaskName'
+      Cpu: !Ref 'ContainerCpu'
+      Memory: !Ref 'ContainerMemory'
+      TaskRoleArn:
+        Fn::If:
+          - 'HasCustomRole'
+          - !Ref 'Role'
+          - !Ref "AWS::NoValue"
+      ContainerDefinitions:
+        - Name: !Ref 'TaskName'
+          Image: !Ref 'ImageUrl'
+          LogConfiguration:
+            LogDriver: 'awslogs'
+            Options:
+              awslogs-group: !Sub ${EnvironmentName}-task-${TaskName}
+              awslogs-region: !Ref 'AWS::Region'
+              awslogs-stream-prefix: !Ref 'TaskName'
+