This repository has been archived by the owner on Feb 3, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 66
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* Wip * Fix issues * more tweaks * FIx more * More env renaming * Start * Docker runs now * Wip docker run on node * fix issues * More fix * Starts * Starting spark again * Works * Fix * More fixes * Running plugins on the host works * tweak * Fix: tests * Define plugin docs * Added types * Fix jupyterlab * initial commit with grafana and influxdb * changes to find .env files * start refactor into single plugin * remove unused plugins and add required files to resource_mon * make it work with multiple nodes * remove sudo calls and update default dashboard * fix merge issue * updates to make plugins work in container again * remove bad characters from previous checking * Added test for invalid target and target role * Fix pylint * Rename * Added docs for debug plugins * add docs for resource_monitor plugin * surface passwords in metrics plugin config * surface passwords in metrics plugin config p2 * updated comments * initial work for TICK stack * try getting telegraf working * use tvm name as hostname * update run_on to target_role * update sources to only use tick stack * remove unused external port * update start script * docs * PR feedback * update readme with a warning that data is only local * change chronograf port to use 8890 * remove jars * remove unused port * update docs with new port info
- Loading branch information
Showing
14 changed files
with
284 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,7 @@ | ||
from .hdfs import HDFSPlugin | ||
from .jupyter import JupyterPlugin | ||
from .jupyter_lab import JupyterLabPlugin | ||
from .resource_monitor import ResourceMonitorPlugin | ||
from .rstudio_server import RStudioServerPlugin | ||
from .simple import SimplePlugin | ||
from .spark_ui_proxy import SparkUIProxyPlugin |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
from .configuration import * |
26 changes: 26 additions & 0 deletions
26
aztk/spark/models/plugins/resource_monitor/configuration.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
import os | ||
from aztk.models.plugins.plugin_configuration import PluginConfiguration, PluginPort, PluginTarget, PluginTargetRole | ||
from aztk.models.plugins.plugin_file import PluginFile | ||
from aztk.utils import constants | ||
|
||
dir_path = os.path.dirname(os.path.realpath(__file__)) | ||
|
||
class ResourceMonitorPlugin(PluginConfiguration):
    """Plugin that deploys the resource monitoring stack on the cluster.

    Ships the start script, telegraf config, and docker-compose file to every
    node, runs on the host (not inside the spark container), and publishes
    port 8890 (the Chronograf UI).
    """

    def __init__(self):
        # Files staged onto each node next to the start script.
        plugin_files = [
            PluginFile("start_monitor.sh", os.path.join(dir_path, "start_monitor.sh")),
            PluginFile("etc/telegraf.conf", os.path.join(dir_path, "telegraf.conf")),
            PluginFile("docker-compose.yml", os.path.join(dir_path, "docker-compose.yml")),
        ]
        super().__init__(
            name="resource_monitor",
            ports=[PluginPort(internal=8890, public=True)],
            target=PluginTarget.Host,
            target_role=PluginTargetRole.All,
            execute="start_monitor.sh",
            files=plugin_files,
        )
31 changes: 31 additions & 0 deletions
31
aztk/spark/models/plugins/resource_monitor/docker-compose.yml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
# Docker Compose stack for the resource_monitor plugin.
# Started by start_monitor.sh on the master node only.
version: '3'

services:
  # Define an InfluxDB service: time-series database receiving telegraf metrics.
  influxdb:
    image: influxdb:1.3.5
    volumes:
      # NOTE(review): data lives on the node's local disk only — it is lost
      # when the cluster is deleted (see the plugin README).
      - ./data/influxdb:/var/lib/influxdb
    ports:
      - "8086:8086"
  # Define a Chronograf service: web UI for querying/visualizing InfluxDB data.
  chronograf:
    image: chronograf:1.3.8
    environment:
      INFLUXDB_URL: http://influxdb:8086
      KAPACITOR_URL: http://kapacitor:9092
    ports:
      # Container port 8888 is published on host port 8890.
      - "8890:8888"
    links:
      - influxdb
      - kapacitor
  # Define a Kapacitor service: stream processing/alerting engine for InfluxDB.
  kapacitor:
    image: kapacitor:1.3.3
    environment:
      KAPACITOR_HOSTNAME: kapacitor
      KAPACITOR_INFLUXDB_0_URLS_0: http://influxdb:8086
    links:
      - influxdb
    ports:
      - "9092:9092"
Binary file added
BIN
+121 KB
aztk/spark/models/plugins/resource_monitor/images/chronograf_build_dashboard.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added
BIN
+34.7 KB
aztk/spark/models/plugins/resource_monitor/images/chronograf_create_dashboard.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added
BIN
+97.4 KB
aztk/spark/models/plugins/resource_monitor/images/chronograf_hosts.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added
BIN
+68.8 KB
aztk/spark/models/plugins/resource_monitor/images/chronograf_single_host.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
# Using the Resource Monitor Plugin
|
||
The resource monitor plugin is useful for tracking performance counters on the cluster. These include counters such as Percent CPU used per core, Disk Read, Disk Write, Network In, Network out, and several others. Simply enabling the plugin in your cluster.yaml will deploy all the necessary components to start tracking metrics. | ||
|
||
This plugin takes advantage of the TICK monitoring stack. For more information please visit the [influx data](https://www.influxdata.com/time-series-platform/) web page.
|
||
> **IMPORTANT** All of the data is collected on the cluster's master node and will be lost once the cluster is thrown away. To persist data we recommend pushing to an off-cluster InfluxDB instance. Currently there is no supported way to persist the data from this plugin. | ||
## Setup | ||
|
||
Update your cluster.yaml file to include the plugin as follows: | ||
|
||
```yaml | ||
... | ||
|
||
plugins: | ||
- name: resource_monitor | ||
|
||
... | ||
|
||
|
||
``` | ||
|
||
Once the cluster is created, simply run the cluster ssh command and all of the ports will automatically get forwarded.
|
||
```sh | ||
aztk spark cluster ssh --id <my_cluster> | ||
``` | ||
|
||
### Ports | ||
url | description
--- | ---
http://localhost:8890 | Chronograf UI
|
||
## Visualize data in Chronograf | ||
|
||
All data will automatically be published to the InfluxDB. In addition, this plugin also configures Chronograf to ingest the data for queries and visualization. After running **aztk spark cluster ssh --id <cluster_id>** navigate to http://localhost:8890 to open the UI.
|
||
|
||
### Host metrics | ||
Each node (a.k.a. 'host' in Chronograf) in the cluster will register itself with the InfluxDB and start pushing metrics. | ||
![Chronograf hosts](./images/chronograf_hosts.png) | ||
|
||
Clicking on any individual host will give you more detail and historical metrics. | ||
|
||
![Chronograf single host](./images/chronograf_single_host.png) | ||
|
||
### Dashboards | ||
Creating a dashboard is reasonably straight forward in Chronograf. Open up the dashboards panel. | ||
|
||
![Chronograf create dashboard](./images/chronograf_create_dashboard.png) | ||
|
||
Click on create new dashboard and add in the sources and metrics you wish to monitor. | ||
|
||
![Chronograf build dashboard](./images/chronograf_build_dashboard.png) | ||
|
||
> | ||
> Much more information on using Chronograf is available on their official site at https://docs.influxdata.com/chronograf/ | ||
> | ||
|
22 changes: 22 additions & 0 deletions
22
aztk/spark/models/plugins/resource_monitor/start_monitor.sh
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
#!/bin/bash
# Installs telegraf on every node and, on the master only, brings up the
# InfluxDB/Chronograf/Kapacitor containers defined in docker-compose.yml.
# Runs on the host (PluginTarget.Host) during cluster setup.
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
# Quote $DIR (may contain spaces) and bail out if the cd fails.
cd "$DIR" || exit 1

# -p: don't fail if the directory already exists (e.g. on node re-provision).
mkdir -p /etc/telegraf
cp ./etc/telegraf.conf /etc/telegraf/telegraf.conf

echo "Install telegraf"
curl -sL https://repos.influxdata.com/influxdb.key | apt-key add -
source /etc/lsb-release
echo "deb https://repos.influxdata.com/${DISTRIB_ID,,} ${DISTRIB_CODENAME} stable" | tee /etc/apt/sources.list.d/influxdb.list
# -y: run non-interactively; without it apt-get prompts and hangs the deployment.
apt-get update && apt-get install -y telegraf

# Only the master hosts the TICK-stack containers; workers just push metrics.
if [ "$AZTK_IS_MASTER" = "true" ]; then
    echo "Create docker containers"
    sudo docker-compose up --no-start
    echo "Run the containers"
    sudo docker-compose start
fi

echo "Run telegraf"
# Background telegraf so the plugin script returns and setup can continue.
telegraf --config ./etc/telegraf.conf &
129 changes: 129 additions & 0 deletions
129
aztk/spark/models/plugins/resource_monitor/telegraf.conf
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,129 @@ | ||
# Telegraf configuration for the aztk resource_monitor plugin.
# Deployed to /etc/telegraf/telegraf.conf on every node by start_monitor.sh.
# NOTE(review): relies on telegraf substituting the $AZ_BATCH_NODE_ID and
# $AZTK_MASTER_IP environment variables below — confirm the telegraf version
# in use performs env-var expansion in config files.

# Telegraf is entirely plugin driven. All metrics are gathered from the
# declared inputs, and sent to the declared outputs.

# Plugins must be declared in here to be active.
# To deactivate a plugin, comment out the name and any variables.

# Use 'telegraf -config telegraf.conf -test' to see what metrics a config
# file would generate.

# Global tags can be specified here in key="value" format.
[global_tags]
# dc = "us-east-1" # will tag all metrics with dc=us-east-1
# rack = "1a"

# Configuration for telegraf agent
[agent]
## Default data collection interval for all inputs
interval = "10s"
## Rounds collection interval to 'interval'
## ie, if interval="10s" then always collect on :00, :10, :20, etc.
round_interval = true

## Telegraf will cache metric_buffer_limit metrics for each output, and will
## flush this buffer on a successful write.
metric_buffer_limit = 10000
## Flush the buffer whenever full, regardless of flush_interval.
flush_buffer_when_full = true

## Collection jitter is used to jitter the collection by a random amount.
## Each plugin will sleep for a random time within jitter before collecting.
## This can be used to avoid many plugins querying things like sysfs at the
## same time, which can have a measurable effect on the system.
collection_jitter = "0s"

## Default flushing interval for all outputs. You shouldn't set this below
## interval. Maximum flush_interval will be flush_interval + flush_jitter
flush_interval = "10s"
## Jitter the flush interval by a random amount. This is primarily to avoid
## large write spikes for users running a large number of telegraf instances.
## ie, a jitter of 5s and interval 10s means flushes will happen every 10-15s
flush_jitter = "0s"

## Run telegraf in debug mode
debug = false
## Run telegraf in quiet mode
quiet = false
## Override default hostname, if empty use os.Hostname()
## Use the Batch node id so each node shows up as a distinct host in Chronograf.
hostname = "$AZ_BATCH_NODE_ID"


###############################################################################
#                                  OUTPUTS                                    #
###############################################################################

# Configuration for influxdb server to send metrics to
[[outputs.influxdb]]
# The full HTTP or UDP endpoint URL for your InfluxDB instance.
# Multiple urls can be specified but it is assumed that they are part of the same
# cluster, this means that only ONE of the urls will be written to each interval.
# urls = ["udp://localhost:8089"] # UDP endpoint example
# urls = ["http://influxdb:8086"] # required
# All nodes push to the InfluxDB running on the cluster's master node.
urls = ["http://$AZTK_MASTER_IP:8086"]
# The target database for metrics (telegraf will create it if not exists)
database = "telegraf" # required
# Precision of writes, valid values are "ns", "us" (or "µs"), "ms", "s", "m", "h".
# note: using second precision greatly helps InfluxDB compression
precision = "s"

## Write timeout (for the InfluxDB client), formatted as a string.
## If not provided, will default to 5s. 0s means no timeout (not recommended).
timeout = "5s"
# username = "telegraf"
# password = "metricsmetricsmetricsmetrics"
# Set the user agent for HTTP POSTs (can be useful for log differentiation)
# user_agent = "telegraf"
# Set UDP payload size, defaults to InfluxDB UDP Client default (512 bytes)
# udp_payload = 512


###############################################################################
#                                  INPUTS                                     #
###############################################################################

# Read metrics about cpu usage
[[inputs.cpu]]
# Whether to report per-cpu stats or not
percpu = true
# Whether to report total system cpu stats or not
totalcpu = true
# Comment this line if you want the raw CPU time metrics
fielddrop = ["time_*"]

# Read metrics about disk usage by mount point
[[inputs.disk]]
# By default, telegraf gather stats for all mountpoints.
# Setting mountpoints will restrict the stats to the specified mountpoints.
# mount_points=["/"]

# Ignore some mountpoints by filesystem type. For example (dev)tmpfs (usually
# present on /run, /var/run, /dev/shm or /dev).
ignore_fs = ["tmpfs", "devtmpfs"]

# Read metrics about disk IO by device
[[inputs.diskio]]
# By default, telegraf will gather stats for all devices including
# disk partitions.
# Setting devices will restrict the stats to the specified devices.
# devices = ["sda", "sdb"]
# Uncomment the following line if you do not need disk serial numbers.
# skip_serial_number = true

# Read metrics about memory usage
[[inputs.mem]]
# no configuration

# Read metrics about swap memory usage
[[inputs.swap]]
# no configuration

# Read metrics about system load & uptime
[[inputs.system]]
# no configuration


###############################################################################
#                              SERVICE INPUTS                                 #
###############################################################################