Skip to content
This repository has been archived by the owner on Feb 3, 2021. It is now read-only.

Commit

Permalink
Feature: monitor tick (#508)
Browse files Browse the repository at this point in the history
* Wip

* Fix issues

* more tweaks

* FIx more

* More env renaming

* Start

* Docker runs now

* Wip docker run on node

* fix issues

* More fix

* Starts

* Starting spark again

* Works

* Fix

* More fixes

* Running plugins on the host works

* tweak

* Fix: tests

* Define plugin docs

* Added types

* Fix jupyterlab

* initial commit with grafana and influxdb

* changes to find .env files

* start refactor into single plugin

* remove unused plugins and add required files to resource_mon

* make it work with multiple nodes

* remove sudo calls and update default dashboard

* fix merge issue

* updates to make plugins work in container again

* remove bad characters from previous checking

* Added test for invalid target and target role

* Fix pylint

* Rename

* Added docs for debug plugins

* add docs for resource_monitor plugin

* surface passwords in metrics plugin config

* surface passwords in metrics plugin config p2

* updated comments

* initial work for TICK stack

* try getting telegraf working

* use tvm name as hostname

* update run_on to target_role

* update sources to only use tick stack

* remove unused external port

* update start script

* docs

* PR feedback

* update readme with a warning that data is only local

* change chronograf port to use 8890

* remove jars

* remove unused port

* update docs with new port info
  • Loading branch information
paselem authored May 1, 2018
1 parent 779bffb commit 23c97de
Show file tree
Hide file tree
Showing 14 changed files with 284 additions and 4 deletions.
5 changes: 2 additions & 3 deletions aztk/internal/cluster_data/node_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,8 +70,7 @@ def add_dir(self, path: str, dest: str = None, exclude: List[str] = None):
relative_folder = os.path.relpath(base, path)
for file in files:
if self._includeFile(file, exclude):
with io.open(os.path.join(base, file), 'r', encoding='UTF-8') as f:
self.zipf.writestr(os.path.join(dest, relative_folder, file), f.read().replace('\r\n', '\n'))
self.add_file(os.path.join(base, file), os.path.join(dest, relative_folder), binary = False)

def _add_custom_scripts(self):
data = []
Expand Down Expand Up @@ -155,7 +154,7 @@ def _add_plugins(self):
return zipf

def _add_node_scripts(self):
self.add_dir(os.path.join(ROOT_PATH, NODE_SCRIPT_FOLDER), NODE_SCRIPT_FOLDER, exclude=['*.pyc*'])
self.add_dir(os.path.join(ROOT_PATH, NODE_SCRIPT_FOLDER), NODE_SCRIPT_FOLDER, exclude=['*.pyc*', '*.png'])

def _includeFile(self, filename: str, exclude: List[str]) -> bool:
exclude = exclude or []
Expand Down
1 change: 1 addition & 0 deletions aztk/models/plugins/internal/plugin_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ class PluginManager:
plugins = dict(
jupyter=plugins.JupyterPlugin,
jupyterlab=plugins.JupyterLabPlugin,
resource_monitor=plugins.ResourceMonitorPlugin,
rstudio_server=plugins.RStudioServerPlugin,
hdfs=plugins.HDFSPlugin,
simple=plugins.SimplePlugin,
Expand Down
11 changes: 10 additions & 1 deletion aztk/node_scripts/install/plugins.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,15 @@ def _plugins_dir():


def _run_on_this_node(plugin_obj, target: PluginTarget, is_master, is_worker):

print("Loading plugin {} in {} on {}".format(
plugin_obj["execute"],
plugin_obj['target'],
plugin_obj['target_role']
))

if plugin_obj['target'] != target.value:
print("Ignoring ", plugin_obj["execute"], " as target is for ", plugin_obj['target'], " but is currently running in ", target.value)
print("Ignoring ", plugin_obj["execute"], "as target is for ", plugin_obj['target'], "but is currently running in ", target.value)
return False

if plugin_obj['target_role'] == PluginTargetRole.Master.value and is_master is True:
Expand All @@ -49,6 +56,8 @@ def _run_on_this_node(plugin_obj, target: PluginTarget, is_master, is_worker):
if plugin_obj['target_role'] == PluginTargetRole.All.value:
return True

print("Ignoring plugin", plugin_obj["execute"], "as target role is ", plugin_obj['target_role'], "and node is master: ", is_master, is_worker)

return False


Expand Down
1 change: 1 addition & 0 deletions aztk/spark/models/plugins/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from .hdfs import HDFSPlugin
from .jupyter import JupyterPlugin
from .jupyter_lab import JupyterLabPlugin
from .resource_monitor import ResourceMonitorPlugin
from .rstudio_server import RStudioServerPlugin
from .simple import SimplePlugin
from .spark_ui_proxy import SparkUIProxyPlugin
1 change: 1 addition & 0 deletions aztk/spark/models/plugins/resource_monitor/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .configuration import *
26 changes: 26 additions & 0 deletions aztk/spark/models/plugins/resource_monitor/configuration.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
import os
from aztk.models.plugins.plugin_configuration import PluginConfiguration, PluginPort, PluginTarget, PluginTargetRole
from aztk.models.plugins.plugin_file import PluginFile
from aztk.utils import constants

dir_path = os.path.dirname(os.path.realpath(__file__))

class ResourceMonitorPlugin(PluginConfiguration):
    """Plugin configuration for the TICK-stack resource monitor.

    Executes ``start_monitor.sh`` on the host (all node roles) and ships
    the telegraf configuration plus a docker-compose file alongside it.
    Internal port 8890 is exposed publicly (the monitoring UI).
    """

    def __init__(self):
        # (destination-on-node, source-file-in-this-directory) pairs
        shipped_files = [
            ("start_monitor.sh", "start_monitor.sh"),
            ("etc/telegraf.conf", "telegraf.conf"),
            ("docker-compose.yml", "docker-compose.yml"),
        ]
        super().__init__(
            name="resource_monitor",
            ports=[PluginPort(internal=8890, public=True)],
            target=PluginTarget.Host,
            target_role=PluginTargetRole.All,
            execute="start_monitor.sh",
            files=[
                PluginFile(dest, os.path.join(dir_path, src))
                for dest, src in shipped_files
            ],
        )
31 changes: 31 additions & 0 deletions aztk/spark/models/plugins/resource_monitor/docker-compose.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
version: '3'

services:
# Define an InfluxDB service
influxdb:
image: influxdb:1.3.5
volumes:
- ./data/influxdb:/var/lib/influxdb
ports:
- "8086:8086"
# Define a Chronograf service
chronograf:
image: chronograf:1.3.8
environment:
INFLUXDB_URL: http://influxdb:8086
KAPACITOR_URL: http://kapacitor:9092
ports:
- "8890:8888"
links:
- influxdb
- kapacitor
# Define a Kapacitor service
kapacitor:
image: kapacitor:1.3.3
environment:
KAPACITOR_HOSTNAME: kapacitor
KAPACITOR_INFLUXDB_0_URLS_0: http://influxdb:8086
links:
- influxdb
ports:
- "9092:9092"
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
61 changes: 61 additions & 0 deletions aztk/spark/models/plugins/resource_monitor/readme.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# Using the Resource Monitor Plugin

The resource monitor plugin is useful for tracking performance counters on the cluster. These include counters such as Percent CPU used per core, Disk Read, Disk Write, Network In, Network out, and several others. Simply enabling the plugin in your cluster.yaml will deploy all the necessary components to start tracking metrics.

This plugin takes advantage of the TICK monitoring stack. For more information please visit the [influx data](https://www.influxdata.com/time-series-platform/) web page.

> **IMPORTANT** All of the data is collected on the cluster's master node and will be lost once the cluster is thrown away. To persist data we recommend pushing to an off-cluster InfluxDB instance. Currently there is no supported way to persist the data from this plugin.
## Setup

Update your cluster.yaml file to include the plugin as follows:

```yaml
...

plugins:
- name: resource_monitor

...


```

Once the cluster is created, simply run the cluster ssh command and all of the ports will automatically be forwarded.

```sh
aztk spark cluster ssh --id <my_cluster>
```

### Ports
url | description
--- | ---
http://localhost:8890 | Chronograf UI

## Visualize data in Chronograf

All data will automatically be published to the InfluxDB. In addition, this plugin also configures Chronograf to ingest the data for queries and visualization. After running **aztk spark cluster ssh --id <cluster_id>** navigate to http://localhost:8890 to open the UI.


### Host metrics
Each node (a.k.a. 'host' in Chronograf) in the cluster will register itself with the InfluxDB and start pushing metrics.
![Chronograf hosts](./images/chronograf_hosts.png)

Clicking on any individual host will give you more detail and historical metrics.

![Chronograf single host](./images/chronograf_single_host.png)

### Dashboards
Creating a dashboard is reasonably straight forward in Chronograf. Open up the dashboards panel.

![Chronograf create dashboard](./images/chronograf_create_dashboard.png)

Click on create new dashboard and add in the sources and metrics you wish to monitor.

![Chronograf build dashboard](./images/chronograf_build_dashboard.png)

>
> Much more information on using Chronograf is available on their official site at https://docs.influxdata.com/chronograf/
>

22 changes: 22 additions & 0 deletions aztk/spark/models/plugins/resource_monitor/start_monitor.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#!/bin/bash
# Bootstrap the resource-monitor (TICK stack) plugin on a cluster node:
#   - install telegraf on every node and point it at the shipped config
#   - on the master node only, bring up the monitoring containers
#     defined in docker-compose.yml
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
cd "$DIR"

# -p: do not fail if the directory already exists (e.g. on a re-run)
mkdir -p /etc/telegraf
cp ./etc/telegraf.conf /etc/telegraf/telegraf.conf

echo "Install telegraf"
curl -sL https://repos.influxdata.com/influxdb.key | apt-key add -
source /etc/lsb-release
echo "deb https://repos.influxdata.com/${DISTRIB_ID,,} ${DISTRIB_CODENAME} stable" | tee /etc/apt/sources.list.d/influxdb.list
# -y: this script runs unattended; without it apt-get prompts and aborts
apt-get update && apt-get install -y telegraf

if [ "$AZTK_IS_MASTER" = "true" ]; then
    echo "Create docker containers"
    sudo docker-compose up --no-start
    echo "Run the containers"
    sudo docker-compose start
fi

echo "Run telegraf"
# Background the agent so the plugin runner is not blocked by it.
telegraf --config ./etc/telegraf.conf &
129 changes: 129 additions & 0 deletions aztk/spark/models/plugins/resource_monitor/telegraf.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
# Telegraf configuration

# Telegraf is entirely plugin driven. All metrics are gathered from the
# declared inputs, and sent to the declared outputs.

# Plugins must be declared in here to be active.
# To deactivate a plugin, comment out the name and any variables.

# Use 'telegraf -config telegraf.conf -test' to see what metrics a config
# file would generate.

# Global tags can be specified here in key="value" format.
[global_tags]
# dc = "us-east-1" # will tag all metrics with dc=us-east-1
# rack = "1a"

# Configuration for telegraf agent
[agent]
## Default data collection interval for all inputs
interval = "10s"
## Rounds collection interval to 'interval'
## ie, if interval="10s" then always collect on :00, :10, :20, etc.
round_interval = true

## Telegraf will cache metric_buffer_limit metrics for each output, and will
## flush this buffer on a successful write.
metric_buffer_limit = 10000
## Flush the buffer whenever full, regardless of flush_interval.
flush_buffer_when_full = true

## Collection jitter is used to jitter the collection by a random amount.
## Each plugin will sleep for a random time within jitter before collecting.
## This can be used to avoid many plugins querying things like sysfs at the
## same time, which can have a measurable effect on the system.
collection_jitter = "0s"

## Default flushing interval for all outputs. You shouldn't set this below
## interval. Maximum flush_interval will be flush_interval + flush_jitter
flush_interval = "10s"
## Jitter the flush interval by a random amount. This is primarily to avoid
## large write spikes for users running a large number of telegraf instances.
## ie, a jitter of 5s and interval 10s means flushes will happen every 10-15s
flush_jitter = "0s"

## Run telegraf in debug mode
debug = false
## Run telegraf in quiet mode
quiet = false
## Override default hostname, if empty use os.Hostname()
hostname = "$AZ_BATCH_NODE_ID"


###############################################################################
# OUTPUTS #
###############################################################################

# Configuration for influxdb server to send metrics to
[[outputs.influxdb]]
# The full HTTP or UDP endpoint URL for your InfluxDB instance.
# Multiple urls can be specified but it is assumed that they are part of the same
# cluster, this means that only ONE of the urls will be written to each interval.
# urls = ["udp://localhost:8089"] # UDP endpoint example
# urls = ["http://influxdb:8086"] # required
urls = ["http://$AZTK_MASTER_IP:8086"]
# The target database for metrics (telegraf will create it if not exists)
database = "telegraf" # required
# Precision of writes, valid values are "ns", "us" (or "µs"), "ms", "s", "m", "h".
# note: using second precision greatly helps InfluxDB compression
precision = "s"

## Write timeout (for the InfluxDB client), formatted as a string.
## If not provided, will default to 5s. 0s means no timeout (not recommended).
timeout = "5s"
# username = "telegraf"
# password = "metricsmetricsmetricsmetrics"
# Set the user agent for HTTP POSTs (can be useful for log differentiation)
# user_agent = "telegraf"
# Set UDP payload size, defaults to InfluxDB UDP Client default (512 bytes)
# udp_payload = 512


###############################################################################
# INPUTS #
###############################################################################

# Read metrics about cpu usage
[[inputs.cpu]]
# Whether to report per-cpu stats or not
percpu = true
# Whether to report total system cpu stats or not
totalcpu = true
# Comment this line if you want the raw CPU time metrics
fielddrop = ["time_*"]

# Read metrics about disk usage by mount point
[[inputs.disk]]
# By default, telegraf gather stats for all mountpoints.
# Setting mountpoints will restrict the stats to the specified mountpoints.
# mount_points=["/"]

# Ignore some mountpoints by filesystem type. For example (dev)tmpfs (usually
# present on /run, /var/run, /dev/shm or /dev).
ignore_fs = ["tmpfs", "devtmpfs"]

# Read metrics about disk IO by device
[[inputs.diskio]]
# By default, telegraf will gather stats for all devices including
# disk partitions.
# Setting devices will restrict the stats to the specified devices.
# devices = ["sda", "sdb"]
# Uncomment the following line if you do not need disk serial numbers.
# skip_serial_number = true

# Read metrics about memory usage
[[inputs.mem]]
# no configuration

# Read metrics about swap memory usage
[[inputs.swap]]
# no configuration

# Read metrics about system load & uptime
[[inputs.system]]
# no configuration


###############################################################################
# SERVICE INPUTS #
###############################################################################

0 comments on commit 23c97de

Please sign in to comment.