Skip to content
This repository has been archived by the owner on Feb 3, 2021. It is now read-only.

Commit

Permalink
Feature: monitor tick (#508)
Browse files Browse the repository at this point in the history
* Wip

* Fix issues

* more tweaks

* FIx more

* More env renaming

* Start

* Docker runs now

* Wip docker run on node

* fix issues

* More fix

* Starts

* Starting spark again

* Works

* Fix

* More fixes

* Running plugins on the host works

* tweak

* Fix: tests

* Define plugin docs

* Added types

* Fix jupyterlab

* initial commit with grafana and influxdb

* changes to find .env files

* start refactor into single plugin

* remove unused plugins and add required files to resource_mon

* make it work with multiple nodes

* remove sudo calls and update default dashboard

* fix merge issue

* updates to make plugins work in container again

* remove bad characters from previous checking

* Added test for invalid target and target role

* Fix pylint

* Rename

* Added docs for debug plugins

* add docs for resource_monitor plugin

* surface passwords in metrics plugin config

* surface passwords in metrics plugin config p2

* updated comments

* initial work for TICK stack

* try getting telegraf working

* use tvm name as hostname

* update run_on to target_role

* update sources to only use tick stack

* remove unused external port

* update start script

* docs

* PR feedback

* update readme with a warning that data is only local

* change chronograf port to use 8890

* remove jars

* remove unused port

* update docs with new port info
  • Loading branch information
paselem authored May 1, 2018
1 parent 779bffb commit 23c97de
Show file tree
Hide file tree
Showing 14 changed files with 284 additions and 4 deletions.
5 changes: 2 additions & 3 deletions aztk/internal/cluster_data/node_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,8 +70,7 @@ def add_dir(self, path: str, dest: str = None, exclude: List[str] = None):
relative_folder = os.path.relpath(base, path)
for file in files:
if self._includeFile(file, exclude):
with io.open(os.path.join(base, file), 'r', encoding='UTF-8') as f:
self.zipf.writestr(os.path.join(dest, relative_folder, file), f.read().replace('\r\n', '\n'))
self.add_file(os.path.join(base, file), os.path.join(dest, relative_folder), binary = False)

def _add_custom_scripts(self):
data = []
Expand Down Expand Up @@ -155,7 +154,7 @@ def _add_plugins(self):
return zipf

def _add_node_scripts(self):
self.add_dir(os.path.join(ROOT_PATH, NODE_SCRIPT_FOLDER), NODE_SCRIPT_FOLDER, exclude=['*.pyc*'])
self.add_dir(os.path.join(ROOT_PATH, NODE_SCRIPT_FOLDER), NODE_SCRIPT_FOLDER, exclude=['*.pyc*', '*.png'])

def _includeFile(self, filename: str, exclude: List[str]) -> bool:
exclude = exclude or []
Expand Down
1 change: 1 addition & 0 deletions aztk/models/plugins/internal/plugin_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ class PluginManager:
plugins = dict(
jupyter=plugins.JupyterPlugin,
jupyterlab=plugins.JupyterLabPlugin,
resource_monitor=plugins.ResourceMonitorPlugin,
rstudio_server=plugins.RStudioServerPlugin,
hdfs=plugins.HDFSPlugin,
simple=plugins.SimplePlugin,
Expand Down
11 changes: 10 additions & 1 deletion aztk/node_scripts/install/plugins.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,15 @@ def _plugins_dir():


def _run_on_this_node(plugin_obj, target: PluginTarget, is_master, is_worker):

print("Loading plugin {} in {} on {}".format(
plugin_obj["execute"],
plugin_obj['target'],
plugin_obj['target_role']
))

if plugin_obj['target'] != target.value:
print("Ignoring ", plugin_obj["execute"], " as target is for ", plugin_obj['target'], " but is currently running in ", target.value)
print("Ignoring ", plugin_obj["execute"], "as target is for ", plugin_obj['target'], "but is currently running in ", target.value)
return False

if plugin_obj['target_role'] == PluginTargetRole.Master.value and is_master is True:
Expand All @@ -49,6 +56,8 @@ def _run_on_this_node(plugin_obj, target: PluginTarget, is_master, is_worker):
if plugin_obj['target_role'] == PluginTargetRole.All.value:
return True

print("Ignoring plugin", plugin_obj["execute"], "as target role is ", plugin_obj['target_role'], "and node is master: ", is_master, is_worker)

return False


Expand Down
1 change: 1 addition & 0 deletions aztk/spark/models/plugins/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from .hdfs import HDFSPlugin
from .jupyter import JupyterPlugin
from .jupyter_lab import JupyterLabPlugin
from .resource_monitor import ResourceMonitorPlugin
from .rstudio_server import RStudioServerPlugin
from .simple import SimplePlugin
from .spark_ui_proxy import SparkUIProxyPlugin
1 change: 1 addition & 0 deletions aztk/spark/models/plugins/resource_monitor/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .configuration import *
26 changes: 26 additions & 0 deletions aztk/spark/models/plugins/resource_monitor/configuration.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
import os
from aztk.models.plugins.plugin_configuration import PluginConfiguration, PluginPort, PluginTarget, PluginTargetRole
from aztk.models.plugins.plugin_file import PluginFile
from aztk.utils import constants

dir_path = os.path.dirname(os.path.realpath(__file__))

class ResourceMonitorPlugin(PluginConfiguration):
    """Plugin configuration for the TICK-stack resource monitor.

    Executes ``start_monitor.sh`` on the host (all node roles) and ships
    the telegraf configuration plus a docker-compose file alongside it.
    Internal port 8890 is exposed publicly (the monitoring UI).
    """

    def __init__(self):
        # (destination-on-node, source-file-in-this-directory) pairs
        shipped_files = [
            ("start_monitor.sh", "start_monitor.sh"),
            ("etc/telegraf.conf", "telegraf.conf"),
            ("docker-compose.yml", "docker-compose.yml"),
        ]
        super().__init__(
            name="resource_monitor",
            ports=[PluginPort(internal=8890, public=True)],
            target=PluginTarget.Host,
            target_role=PluginTargetRole.All,
            execute="start_monitor.sh",
            files=[
                PluginFile(dest, os.path.join(dir_path, src))
                for dest, src in shipped_files
            ],
        )
31 changes: 31 additions & 0 deletions aztk/spark/models/plugins/resource_monitor/docker-compose.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
version: '3'

services:
# Define an InfluxDB service
influxdb:
image: influxdb:1.3.5
volumes:
- ./data/influxdb:/var/lib/influxdb
ports:
- "8086:8086"
# Define a Chronograf service
chronograf:
image: chronograf:1.3.8
environment:
INFLUXDB_URL: http://influxdb:8086
KAPACITOR_URL: http://kapacitor:9092
ports:
- "8890:8888"
links:
- influxdb
- kapacitor
# Define a Kapacitor service
kapacitor:
image: kapacitor:1.3.3
environment:
KAPACITOR_HOSTNAME: kapacitor
KAPACITOR_INFLUXDB_0_URLS_0: http://influxdb:8086
links:
- influxdb
ports:
- "9092:9092"
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
61 changes: 61 additions & 0 deletions aztk/spark/models/plugins/resource_monitor/readme.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# Using the Resource Monitor Plugin

The resource monitor plugin is useful for tracking performance counters on the cluster. These include counters such as Percent CPU used per core, Disk Read, Disk Write, Network In, Network out, and several others. Simply enabling the plugin in your cluster.yaml will deploy all the necessary components to start tracking metrics.

This plugin takes advantage of the TICK monitoring stack. For more information please visit the [influx data](https://www.influxdata.com/time-series-platform/) web page.

> **IMPORTANT** All of the data is collected on the cluster's master node and will be lost once the cluster is thrown away. To persist data we recommend pushing to an off-cluster InfluxDB instance. Currently there is no supported way to persist the data from this plugin.
## Setup

Update your cluster.yaml file to include the plugin as follows:

```yaml
...

plugins:
- name: resource_monitor

...


```

Once the cluster is created, simply run the cluster ssh command and all of the ports will automatically be forwarded.

```sh
aztk spark cluster ssh --id <my_cluster>
```

### Ports
url | description
--- | ---
http://localhost:8890 | Chronograf UI

## Visualize data in Chronograf

All data will automatically be published to the InfluxDB. In addition, this plugin also configures Chronograf to ingest the data for queries and visualization. After running **aztk spark cluster ssh --id <cluster_id>** navigate to http://localhost:8890 to open the UI.


### Host metrics
Each node (a.k.a. 'host' in Chronograf) in the cluster will register itself with the InfluxDB and start pushing metrics.
![Chronograf hosts](./images/chronograf_hosts.png)

Clicking on any individual host will give you more detail and historical metrics.

![Chronograf single host](./images/chronograf_single_host.png)

### Dashboards
Creating a dashboard is reasonably straight forward in Chronograf. Open up the dashboards panel.

![Chronograf create dashboard](./images/chronograf_create_dashboard.png)

Click on create new dashboard and add in the sources and metrics you wish to monitor.

![Chronograf build dashboard](./images/chronograf_build_dashboard.png)

>
> Much more information on using Chronograf is available on their official site at https://docs.influxdata.com/chronograf/
>

22 changes: 22 additions & 0 deletions aztk/spark/models/plugins/resource_monitor/start_monitor.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#!/bin/bash
# Bootstrap the resource-monitor (TICK stack) plugin on a cluster node:
#   - install telegraf on every node and point it at the shipped config
#   - on the master node only, bring up the monitoring containers
#     defined in docker-compose.yml
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
cd "$DIR"

# -p: do not fail if the directory already exists (e.g. on a re-run)
mkdir -p /etc/telegraf
cp ./etc/telegraf.conf /etc/telegraf/telegraf.conf

echo "Install telegraf"
curl -sL https://repos.influxdata.com/influxdb.key | apt-key add -
source /etc/lsb-release
echo "deb https://repos.influxdata.com/${DISTRIB_ID,,} ${DISTRIB_CODENAME} stable" | tee /etc/apt/sources.list.d/influxdb.list
# -y: this script runs unattended; without it apt-get prompts and aborts
apt-get update && apt-get install -y telegraf

if [ "$AZTK_IS_MASTER" = "true" ]; then
    echo "Create docker containers"
    sudo docker-compose up --no-start
    echo "Run the containers"
    sudo docker-compose start
fi

echo "Run telegraf"
# Background the agent so the plugin runner is not blocked by it.
telegraf --config ./etc/telegraf.conf &
129 changes: 129 additions & 0 deletions aztk/spark/models/plugins/resource_monitor/telegraf.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
# Telegraf configuration

# Telegraf is entirely plugin driven. All metrics are gathered from the
# declared inputs, and sent to the declared outputs.

# Plugins must be declared in here to be active.
# To deactivate a plugin, comment out the name and any variables.

# Use 'telegraf -config telegraf.conf -test' to see what metrics a config
# file would generate.

# Global tags can be specified here in key="value" format.
[global_tags]
# dc = "us-east-1" # will tag all metrics with dc=us-east-1
# rack = "1a"

# Configuration for telegraf agent
[agent]
## Default data collection interval for all inputs
interval = "10s"
## Rounds collection interval to 'interval'
## ie, if interval="10s" then always collect on :00, :10, :20, etc.
round_interval = true

## Telegraf will cache metric_buffer_limit metrics for each output, and will
## flush this buffer on a successful write.
metric_buffer_limit = 10000
## Flush the buffer whenever full, regardless of flush_interval.
flush_buffer_when_full = true

## Collection jitter is used to jitter the collection by a random amount.
## Each plugin will sleep for a random time within jitter before collecting.
## This can be used to avoid many plugins querying things like sysfs at the
## same time, which can have a measurable effect on the system.
collection_jitter = "0s"

## Default flushing interval for all outputs. You shouldn't set this below
## interval. Maximum flush_interval will be flush_interval + flush_jitter
flush_interval = "10s"
## Jitter the flush interval by a random amount. This is primarily to avoid
## large write spikes for users running a large number of telegraf instances.
## ie, a jitter of 5s and interval 10s means flushes will happen every 10-15s
flush_jitter = "0s"

## Run telegraf in debug mode
debug = false
## Run telegraf in quiet mode
quiet = false
## Override default hostname, if empty use os.Hostname()
hostname = "$AZ_BATCH_NODE_ID"


###############################################################################
# OUTPUTS #
###############################################################################

# Configuration for influxdb server to send metrics to
[[outputs.influxdb]]
# The full HTTP or UDP endpoint URL for your InfluxDB instance.
# Multiple urls can be specified but it is assumed that they are part of the same
# cluster, this means that only ONE of the urls will be written to each interval.
# urls = ["udp://localhost:8089"] # UDP endpoint example
# urls = ["http://influxdb:8086"] # required
urls = ["http://$AZTK_MASTER_IP:8086"]
# The target database for metrics (telegraf will create it if not exists)
database = "telegraf" # required
# Precision of writes, valid values are "ns", "us" (or "µs"), "ms", "s", "m", "h".
# note: using second precision greatly helps InfluxDB compression
precision = "s"

## Write timeout (for the InfluxDB client), formatted as a string.
## If not provided, will default to 5s. 0s means no timeout (not recommended).
timeout = "5s"
# username = "telegraf"
# password = "metricsmetricsmetricsmetrics"
# Set the user agent for HTTP POSTs (can be useful for log differentiation)
# user_agent = "telegraf"
# Set UDP payload size, defaults to InfluxDB UDP Client default (512 bytes)
# udp_payload = 512


###############################################################################
# INPUTS #
###############################################################################

# Read metrics about cpu usage
[[inputs.cpu]]
# Whether to report per-cpu stats or not
percpu = true
# Whether to report total system cpu stats or not
totalcpu = true
# Comment this line if you want the raw CPU time metrics
fielddrop = ["time_*"]

# Read metrics about disk usage by mount point
[[inputs.disk]]
# By default, telegraf gather stats for all mountpoints.
# Setting mountpoints will restrict the stats to the specified mountpoints.
# mount_points=["/"]

# Ignore some mountpoints by filesystem type. For example (dev)tmpfs (usually
# present on /run, /var/run, /dev/shm or /dev).
ignore_fs = ["tmpfs", "devtmpfs"]

# Read metrics about disk IO by device
[[inputs.diskio]]
# By default, telegraf will gather stats for all devices including
# disk partitions.
# Setting devices will restrict the stats to the specified devices.
# devices = ["sda", "sdb"]
# Uncomment the following line if you do not need disk serial numbers.
# skip_serial_number = true

# Read metrics about memory usage
[[inputs.mem]]
# no configuration

# Read metrics about swap memory usage
[[inputs.swap]]
# no configuration

# Read metrics about system load & uptime
[[inputs.system]]
# no configuration


###############################################################################
# SERVICE INPUTS #
###############################################################################

0 comments on commit 23c97de

Please sign in to comment.