From 766e41a00c26efef349c652973b4855cd336308a Mon Sep 17 00:00:00 2001
From: Aditya Purang <44022838+aditya-purang@users.noreply.github.com>
Date: Wed, 21 Aug 2024 17:29:10 +0100
Subject: [PATCH] add integration tests for AWS Neuron (#416)
---
.../resources/dummy-neuron-monitor/Dockerfile | 32 +
.../dummy_neuron_monitor.py | 890 ++++++++++++++++++
generator/test_case_generator.go | 4 +
terraform/eks/daemon/awsneuron/main.tf | 826 ++++++++++++++++
terraform/eks/daemon/awsneuron/providers.tf | 17 +
terraform/eks/daemon/awsneuron/variables.tf | 28 +
test/awsneuron/neuron_metrics_test.go | 63 ++
test/awsneuron/neuron_test.go | 77 ++
test/awsneuron/resources/config.json | 16 +
test/awsneuron/resources/httpd-ssl.conf | 43 +
test/awsneuron/resources/httpd.conf | 101 ++
test/awsneuron/resources/metrics_list.go | 48 +
test/metric/container_insights_util.go | 46 +
.../test_schemas/container_neuroncore.json | 50 +
.../test_schemas/node_neuron.json | 33 +
.../test_schemas/node_neuroncore.json | 42 +
.../test_schemas/node_neurondevice.json | 34 +
.../test_schemas/pod_neuroncore.json | 49 +
.../eks_resources/util.go | 57 +-
util/awsservice/cloudwatchlogs.go | 26 +
20 files changed, 2464 insertions(+), 18 deletions(-)
create mode 100644 docs/resources/dummy-neuron-monitor/Dockerfile
create mode 100644 docs/resources/dummy-neuron-monitor/dummy_neuron_monitor.py
create mode 100644 terraform/eks/daemon/awsneuron/main.tf
create mode 100644 terraform/eks/daemon/awsneuron/providers.tf
create mode 100644 terraform/eks/daemon/awsneuron/variables.tf
create mode 100644 test/awsneuron/neuron_metrics_test.go
create mode 100644 test/awsneuron/neuron_test.go
create mode 100644 test/awsneuron/resources/config.json
create mode 100644 test/awsneuron/resources/httpd-ssl.conf
create mode 100644 test/awsneuron/resources/httpd.conf
create mode 100644 test/awsneuron/resources/metrics_list.go
create mode 100644 test/metric_value_benchmark/eks_resources/test_schemas/container_neuroncore.json
create mode 100644 test/metric_value_benchmark/eks_resources/test_schemas/node_neuron.json
create mode 100644 test/metric_value_benchmark/eks_resources/test_schemas/node_neuroncore.json
create mode 100644 test/metric_value_benchmark/eks_resources/test_schemas/node_neurondevice.json
create mode 100644 test/metric_value_benchmark/eks_resources/test_schemas/pod_neuroncore.json
diff --git a/docs/resources/dummy-neuron-monitor/Dockerfile b/docs/resources/dummy-neuron-monitor/Dockerfile
new file mode 100644
index 000000000..47b06ba33
--- /dev/null
+++ b/docs/resources/dummy-neuron-monitor/Dockerfile
@@ -0,0 +1,32 @@
+# Set the base image (Ubuntu 20.04 pulled from public.ecr.aws)
+FROM public.ecr.aws/docker/library/ubuntu:20.04
+
+# Set the working directory in the container
+WORKDIR /root
+
+# Neuron SDK component version pins (build args, overridable with --build-arg)
+ARG NEURONX_RUNTIME_LIB_VERSION=2.19.*
+ARG NEURONX_COLLECTIVES_LIB_VERSION=2.19.*
+ARG NEURONX_TOOLS_VERSION=2.17.*
+
+# Install base utilities and pip
+RUN apt-get update && apt-get install vim wget zip unzip sudo python3-pip -y
+
+# Register the Neuron apt repository, then import its GPG public key
+RUN echo "deb https://apt.repos.neuron.amazonaws.com focal main" > /etc/apt/sources.list.d/neuron.list
+RUN wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | apt-key add -
+
+
+# Install Neuron Tools, Collectives and Runtime libraries at the pinned versions, then remove apt lists and temp files
+RUN apt-get update \
+ && apt-get install -y \
+ aws-neuronx-tools=$NEURONX_TOOLS_VERSION \
+ aws-neuronx-collectives=$NEURONX_COLLECTIVES_LIB_VERSION \
+ aws-neuronx-runtime-lib=$NEURONX_RUNTIME_LIB_VERSION \
+ && rm -rf /var/lib/apt/lists/* \
+ && rm -rf /tmp/tmp* \
+ && apt-get clean
+# Add the dummy monitor script, make it executable, and install the Python packages listed below
+COPY dummy_neuron_monitor.py /opt/aws/neuron/bin/dummy_neuron_monitor.py
+RUN chmod 755 /opt/aws/neuron/bin/dummy_neuron_monitor.py
+RUN pip3 install prometheus_client boto3 requests
\ No newline at end of file
diff --git a/docs/resources/dummy-neuron-monitor/dummy_neuron_monitor.py b/docs/resources/dummy-neuron-monitor/dummy_neuron_monitor.py
new file mode 100644
index 000000000..22ccf7e74
--- /dev/null
+++ b/docs/resources/dummy-neuron-monitor/dummy_neuron_monitor.py
@@ -0,0 +1,890 @@
+#!/usr/bin/env python3
+
+import sys
+import json
+import argparse
+import signal
+import hashlib
+import time
+from prometheus_client import start_http_server, Gauge, Counter, Info
+
+
+def get_instance_labels(instance_info):  # Build a label dict from instance_info, renaming two of its keys (see below).
+ instance_labels = {
+ 'instance_name': instance_info['instance_name'],
+ 'instance_id': instance_info['instance_id'],
+ 'instance_type': instance_info['instance_type'],
+ 'availability_zone': instance_info['instance_availability_zone'],  # 'instance_availability_zone' -> 'availability_zone'
+ 'region': instance_info['instance_region'],  # 'instance_region' -> 'region'
+ 'subnet_id': instance_info['subnet_id']
+ }
+ return instance_labels
+
+
+def get_runtime_labels(instance_info, runtime_tag):  # Return a copy of instance_info with a 'runtime_tag' entry added.
+ label_dict = instance_info.copy()  # shallow copy, so the caller's dict is not modified
+ label_dict['runtime_tag'] = runtime_tag
+ return label_dict
+
+
+def process_neuroncore_counters(group_obj, data, labels):  # Set the 'neuroncore_utilization_ratio' gauge for every entry of data['neuroncores_in_use'].
+ gauge_name = 'neuroncore_utilization_ratio'
+ labels['neuroncore'] = None  # placeholder so labels.keys() contains 'neuroncore' when the Gauge is declared
+ if gauge_name not in group_obj:  # group_obj caches the Gauge by name; declare it only on first use
+ group_obj[gauge_name] = Gauge(gauge_name, 'NeuronCore utilization ratio', labels.keys())
+ for nc_idx, nc_data in data['neuroncores_in_use'].items():
+ labels['neuroncore'] = int(nc_idx)  # the dict key is converted to int for the label value
+ group_obj[gauge_name].labels(**labels).set(nc_data['neuroncore_utilization'] / 100.0)  # value / 100.0 — presumably percent -> ratio
+
+
+def process_neuron_runtime_vcpu_usage(group_obj, data, labels):  # Set the 'neuron_runtime_vcpu_usage_ratio' gauge for the 'user' and 'system' usage types.
+ gauge_name = 'neuron_runtime_vcpu_usage_ratio'
+ labels['usage_type'] = None  # placeholder so 'usage_type' is among the declared label names
+ if gauge_name not in group_obj:  # declare the Gauge once and cache it in group_obj
+ group_obj[gauge_name] = Gauge(gauge_name, 'Runtime vCPU utilization ratio', labels.keys())
+ cpu_usage_fields = ['user', 'system']
+ for field in cpu_usage_fields:
+ labels['usage_type'] = field
+ group_obj[gauge_name].labels(**labels).set(data['vcpu_usage'][field] / 100.0)  # value / 100.0 — presumably percent -> ratio
+
+
+def process_memory_used(group_obj, data, labels):  # Export runtime memory totals plus one gauge per NeuronCore memory-usage category.
+ gauge_name = 'neuron_runtime_memory_used_bytes'
+ labels['memory_location'] = None  # placeholder so 'memory_location' is among the declared label names
+ if gauge_name not in group_obj:  # declare the Gauge once and cache it in group_obj
+ group_obj[gauge_name] = Gauge(gauge_name, 'Runtime memory used bytes', labels.keys())
+ mem_locations = ['host', 'neuron_device']
+ for mem_location_type in mem_locations:
+ labels['memory_location'] = mem_location_type
+ group_obj[gauge_name].labels(**labels).set(data['neuron_runtime_used_bytes'][mem_location_type])
+ # Second family: gauges named 'neuroncore_memory_usage_<category>', labelled per NeuronCore.
+ gauge_name_prefix = 'neuroncore_memory_usage_{}'
+ labels['neuroncore'] = None  # placeholder for the per-core label
+ labels['memory_location'] = None  # stays None below, so these gauges receive memory_location=None in .labels()
+ neuroncore_memory_usage_type = ['constants', 'model_code', 'model_shared_scratchpad', 'runtime_memory', 'tensors']
+ for memory_usage_type in neuroncore_memory_usage_type:
+ gauge_name = gauge_name_prefix.format(memory_usage_type)
+ if gauge_name not in group_obj:  # one Gauge per category, declared on first use
+ group_obj[gauge_name] = Gauge(gauge_name, 'NeuronCore memory utilization for {}'.format(memory_usage_type), labels.keys())
+ for nc_idx, nc_data in data['neuron_runtime_used_bytes']['usage_breakdown']['neuroncore_memory_usage'].items():
+ labels['neuroncore'] = int(nc_idx)  # the dict key is converted to int for the label value
+ group_obj[gauge_name].labels(**labels).set(nc_data[memory_usage_type])
+
+
+def process_execution_stats(group_obj, data, labels):  # Export error counts, execution status counts and latency percentiles from one execution_stats report.
+ counter_name = 'execution_errors_total'
+ err_labels = labels.copy()  # each metric gets its own label dict so the extra label names do not leak between metrics
+ err_labels['error_type'] = None  # placeholder so 'error_type' is among the declared label names
+ if counter_name not in group_obj:  # declare the Counter once and cache it in group_obj
+ group_obj[counter_name] = Counter(counter_name, 'Execution errors total', err_labels.keys())
+ error_summary = data['error_summary']
+ for error_type in error_summary:
+ err_labels['error_type'] = error_type
+ group_obj[counter_name].labels(**err_labels).inc(error_summary[error_type])  # inc() adds the reported value to the existing counter
+ # Execution outcomes, one counter child per key of data['execution_summary'].
+ counter_name = 'execution_status_total'
+ status_labels = labels.copy()
+ status_labels['status_type'] = None  # placeholder so 'status_type' is among the declared label names
+ if counter_name not in group_obj:
+ group_obj[counter_name] = Counter(counter_name, 'Execution status total', status_labels.keys())
+ execution_summary = data['execution_summary']
+ for execution_outcome in execution_summary:
+ status_labels['status_type'] = execution_outcome
+ group_obj[counter_name].labels(**status_labels).inc(execution_summary[execution_outcome])
+ # Latency gauge, one child per key of latency_stats['total_latency'].
+ gauge_name = 'execution_latency_seconds'
+ latency_labels = labels.copy()
+ latency_labels['percentile'] = None  # placeholder so 'percentile' is among the declared label names
+ if gauge_name not in group_obj:
+ group_obj[gauge_name] = Gauge(gauge_name, 'Execution latency in seconds', latency_labels.keys())
+ latency_stats = data['latency_stats']
+ if latency_stats['total_latency'] is not None:  # total_latency may be None, in which case no latency sample is set
+ for percentile in latency_stats['total_latency']:
+ latency_labels['percentile'] = percentile
+ group_obj[gauge_name].labels(**latency_labels).set(latency_stats['total_latency'][percentile])
+
+
+def process_neuron_hw_counters(group_obj, data, labels):  # Add each device's four ECC event counts to the 'hardware_ecc_events_total' counter.
+ counter_name = 'hardware_ecc_events_total'
+ labels['event_type'] = None  # placeholders so both extra label names are declared on the Counter
+ labels['neuron_device_index'] = None
+ if counter_name not in group_obj:  # declare the Counter once and cache it in group_obj
+ group_obj[counter_name] = Counter(counter_name, 'Hardware ecc events total', labels.keys())
+ hw_counters = ['mem_ecc_corrected', 'mem_ecc_uncorrected', 'sram_ecc_corrected', 'sram_ecc_uncorrected']
+ for device in data['neuron_devices']:
+ for counter in hw_counters:
+ labels['event_type'] = counter
+ labels['neuron_device_index'] = device['neuron_device_index']
+ group_obj[counter_name].labels(**labels).inc(device[counter])  # inc() adds the reported value to the existing counter
+
+
+def process_vcpu_usage(group_obj, data, labels):  # Export the system vCPU count and aggregated 'user'/'system' CPU usage ratios.
+ cpu_usage_aggregation = {  # exported usage_type -> fields of data['average_usage'] that are summed into it
+ 'user': ['user', 'nice'],
+ 'system': ['system', 'io_wait', 'irq', 'soft_irq']
+ }
+ gauge_name = 'system_vcpu_count'
+ if gauge_name not in group_obj:  # declare the Gauge once and cache it in group_obj
+ group_obj[gauge_name] = Gauge(gauge_name, 'System vCPU count', labels.keys())
+ group_obj[gauge_name].labels(**labels).set(len(data['usage_data']))  # the count is the number of entries in data['usage_data']
+ # The usage gauge declares one more label ('usage_type') than the count gauge above.
+ labels['usage_type'] = None
+ gauge_name = 'system_vcpu_usage_ratio'
+ if gauge_name not in group_obj:
+ group_obj[gauge_name] = Gauge(gauge_name, 'System CPU utilization ratio', labels.keys())
+ for field, aggregated in cpu_usage_aggregation.items():
+ aggregate_value = sum([data['average_usage'][item] for item in aggregated])
+ aggregate_value = min(aggregate_value, 100.0)  # cap the sum at 100.0 so the stored ratio never exceeds 1.0
+ labels['usage_type'] = field
+ group_obj[gauge_name].labels(**labels).set(aggregate_value / 100.0)
+
+
+def process_memory_info(group_obj, data, labels):  # Export system_memory_* and system_swap_* total/used byte gauges.
+ for entries in [('memory', 'system_memory'), ('swap', 'system_swap')]:  # (prefix of the key in data, prefix of the gauge name)
+ for stat in ['total_bytes', 'used_bytes']:
+ gauge_name = '{}_{}'.format(entries[1], stat)  # e.g. 'system_memory_total_bytes'
+ if gauge_name not in group_obj:  # declare the Gauge once and cache it in group_obj
+ group_obj[gauge_name] = Gauge(gauge_name,
+ 'System {} {} bytes'.format(entries[0], stat), labels.keys())
+ src_entry = '{}_{}'.format(entries[0], stat)  # e.g. 'memory_total_bytes'
+ group_obj[gauge_name].labels(**labels).set(data[src_entry])
+
+
+def process_neuron_hardware_info(metric_objects, data, instance_data):  # Publish the 'neuron_hardware' Info metric (device counts plus instance_data).
+ if 'neuron_hardware_info' not in metric_objects:  # the Info object is cached under the key 'neuron_hardware_info'
+ neuron_labels = {
+ 'neuron_device_count': str(data['neuron_device_count']),  # counts are converted to str before being passed to Info.info()
+ 'neuroncore_per_device_count': str(data['neuroncore_per_device_count'])
+ }
+ neuron_labels.update(instance_data)  # instance_data entries are merged in and override the two keys above on collision
+ # The Info metric itself is named 'neuron_hardware'.
+ metric_objects['neuron_hardware_info'] = Info('neuron_hardware', 'Neuron Hardware Information')
+ metric_objects['neuron_hardware_info'].info(neuron_labels)
+
+
+def process_instance_info(metric_objects, instance_data):  # Publish instance_data through the 'instance' Info metric.
+ if 'instance_info' not in metric_objects:  # the Info object is cached under the key 'instance_info'
+ metric_objects['instance_info'] = Info('instance', 'EC2 instance information')
+ metric_objects['instance_info'].info(instance_data)
+
+
+def process_report_entries(metric_objects, report_entries, labels, runtime_tag=None):  # Dispatch each report group to its module-level 'process_<group name>' handler.
+ for metric_group_name, metric_group_data in report_entries.items():
+ handler_name = 'process_{}'.format(metric_group_name)
+ if handler_name in globals():  # only groups with a matching handler function in this module are handled
+ crt_error = metric_group_data['error']
+ if crt_error == '':  # an empty 'error' string means the group reported successfully
+ if metric_group_name not in metric_objects:
+ metric_objects[metric_group_name] = {}  # per-group cache of metric objects, filled in by the handler
+ metric_group_object = metric_objects[metric_group_name]
+ globals()[handler_name](metric_group_object, metric_group_data, labels.copy())  # handlers mutate labels, so pass a copy
+ else:
+ if runtime_tag is not None:  # include the runtime tag in the message when one was supplied
+ print('Error getting {} for runtime tag {}: {}'.format(
+ metric_group_name, runtime_tag, crt_error), file=sys.stderr)
+ else:
+ print('Error getting {}: {}'.format(metric_group_name, crt_error), file=sys.stderr)
+
+
+def process_data(metric_objects, monitor_data, instance_info):  # Update all metrics from one parsed monitor report.
+ if monitor_data.get('neuron_runtime_data', []):  # a missing or empty runtime list takes the reset branch below
+ for runtime in monitor_data['neuron_runtime_data']:
+ runtime_tag = runtime['neuron_runtime_tag']
+ # A runtime that reports an error is logged to stderr and skipped.
+ if runtime['error'] != '':
+ print('Runtime {} error: {}'.format(runtime_tag, runtime['error']), file=sys.stderr)
+ continue
+ # Runtime metrics carry the instance labels plus 'runtime_tag'.
+ process_report_entries(metric_objects, runtime['report'],
+ get_runtime_labels(instance_info, runtime_tag), runtime_tag)
+ else: # Reset gauges if no neuron_runtime is running
+ clear_gauges_from_metric_objects(metric_objects)
+ # NOTE: 'system_data' and 'neuron_hardware_info' are indexed directly, unlike the .get() used for 'neuron_runtime_data'.
+ if monitor_data['system_data'] is not None:
+ process_report_entries(metric_objects, monitor_data['system_data'], instance_info)
+ process_instance_info(metric_objects, instance_info)
+ if monitor_data['neuron_hardware_info'] is not None:
+ process_neuron_hardware_info(metric_objects, monitor_data['neuron_hardware_info'], instance_info)
+
+def clear_gauges_from_metric_objects(all_metric_objects):  # Clear stored samples of cached metrics; despite the name, counters are cleared as well as gauges.
+ for _, metricGroupedObjects in all_metric_objects.items():
+ if(isinstance(metricGroupedObjects,dict)):  # only dict-valued entries (per-group caches) are walked; other values are left alone
+ for _, metrics in metricGroupedObjects.items():
+ if metrics._type == 'gauge' or metrics._type == 'counter':  # relies on the private _type attribute of the metric object
+ metrics._metrics.clear()  # relies on the private _metrics attribute — presumably the labelled children; TODO confirm
+
+def _calculate_file_hash(file_path):  # Return the SHA-256 hex digest of the file at file_path.
+ with open(file_path, "rb") as f:  # binary mode; the whole file is read into memory at once
+ file_hash = hashlib.sha256(f.read()).hexdigest()
+ return file_hash
+
+def _update_ssl_cxt(certfile, keyfile):  # Load the given certificate/key pair into the module-level ssl_cxt object.
+ global ssl_cxt  # ssl_cxt is not defined in this part of the file; it must exist before this is called
+ ssl_cxt.load_cert_chain(certfile=certfile, keyfile=keyfile)
+ print("Refreshing TLS certificates")  # printed after load_cert_chain() has returned
+
+def _watch_file_and_update_ssl_cxt(original_hash, certfile, keyfile):  # Reload the TLS cert/key when either file's hash differs from original_hash.
+ current_certfile_hash = _calculate_file_hash(certfile)  # both files are re-read and re-hashed on every call
+ current_keyfile_hash = _calculate_file_hash(keyfile)
+ if (original_hash["certfile"] != current_certfile_hash) or (original_hash["keyfile"] != current_keyfile_hash):
+ _update_ssl_cxt(certfile, keyfile)
+ original_hash["certfile"] = current_certfile_hash  # original_hash is updated in place, so the caller's dict holds the new hashes
+ original_hash["keyfile"] = current_keyfile_hash
+
+def update_loop(certfile, keyfile):
+ running = True
+
+ def signal_handler(*_):
+ nonlocal running
+ running = False
+ signal.signal(signal.SIGINT, signal_handler)
+
+ """ Dictionary containing all prometheus client objects, first by metric group and
+ then by metric, for example, for neuroncore_counters:
+ all_metric_objects['neuroncore_counters']['neuroncore_utilization_ratio'] = Gauge(...)
+ """
+ all_metric_objects = {}
+ original_file_hash = {}
+ instance_labels = None
+ if certfile and keyfile:
+ certfile_hash = _calculate_file_hash(certfile)
+ keyfile_hash = _calculate_file_hash(keyfile)
+ original_file_hash = {"certfile":certfile_hash,"keyfile":keyfile_hash}
+
+ while running:
+ line = ('{"neuron_runtime_data":[{"pid":457402,"neuron_runtime_tag":"367","error":"","report":{'
+ '"execution_stats":{"period":4.999666547,"error_summary":{"generic":2,"numerical":0,"transient":0,'
+ '"model":0,"runtime":0,"hardware":0},"execution_summary":{"completed":2,"completed_with_err":0,'
+ '"completed_with_num_err":0,"timed_out":0,"incorrect_input":0,"failed_to_queue":0},"latency_stats":{'
+ '"total_latency":null,"device_latency":null},"error":""},"memory_used":{"period":4.999671285,'
+ '"neuron_runtime_used_bytes":{"host":9043968,"neuron_device":3541303936,"usage_breakdown":{"host":{'
+ '"application_memory":655360,"constants":0,"dma_buffers":8388608,"tensors":0},'
+ '"neuroncore_memory_usage":{"0":{"constants":0,"model_code":100752896,"model_shared_scratchpad":0,'
+ '"runtime_memory":0,"tensors":9912852},"1":{"constants":0,"model_code":100752896,'
+ '"model_shared_scratchpad":0,"runtime_memory":0,"tensors":9912852},"2":{"constants":0,'
+ '"model_code":100752896,"model_shared_scratchpad":0,"runtime_memory":0,"tensors":9912852},'
+ '"3":{"constants":0,"model_code":100752896,"model_shared_scratchpad":0,"runtime_memory":0,'
+ '"tensors":9912852},"4":{"constants":0,"model_code":100752896,"model_shared_scratchpad":0,'
+ '"runtime_memory":0,"tensors":9912852},"5":{"constants":0,"model_code":100752896,'
+ '"model_shared_scratchpad":0,"runtime_memory":0,"tensors":9912852},"6":{"constants":0,'
+ '"model_code":100752896,"model_shared_scratchpad":0,"runtime_memory":0,"tensors":9912852},'
+ '"7":{"constants":0,"model_code":100752896,"model_shared_scratchpad":0,"runtime_memory":0,'
+ '"tensors":9912852},"8":{"constants":0,"model_code":100752896,"model_shared_scratchpad":0,'
+ '"runtime_memory":0,"tensors":9912852},"9":{"constants":0,"model_code":100752896,'
+ '"model_shared_scratchpad":0,"runtime_memory":0,"tensors":9912852},"10":{"constants":0,'
+ '"model_code":100752896,"model_shared_scratchpad":0,"runtime_memory":0,"tensors":9912852},'
+ '"11":{"constants":0,"model_code":100752896,"model_shared_scratchpad":0,"runtime_memory":0,'
+ '"tensors":9912852},"12":{"constants":0,"model_code":100752896,"model_shared_scratchpad":0,'
+ '"runtime_memory":0,"tensors":9912852},"13":{"constants":0,"model_code":100752896,'
+ '"model_shared_scratchpad":0,"runtime_memory":0,"tensors":9912852},"14":{"constants":0,'
+ '"model_code":100752896,"model_shared_scratchpad":0,"runtime_memory":0,"tensors":9912852},'
+ '"15":{"constants":0,"model_code":100752896,"model_shared_scratchpad":0,"runtime_memory":0,'
+ '"tensors":9912852},"16":{"constants":0,"model_code":100752896,"model_shared_scratchpad":0,'
+ '"runtime_memory":0,"tensors":9912852},"17":{"constants":0,"model_code":100752896,'
+ '"model_shared_scratchpad":0,"runtime_memory":0,"tensors":9912852},"18":{"constants":0,'
+ '"model_code":100752896,"model_shared_scratchpad":0,"runtime_memory":0,"tensors":9912852},'
+ '"19":{"constants":0,"model_code":100752896,"model_shared_scratchpad":0,"runtime_memory":0,'
+ '"tensors":9912852},"20":{"constants":0,"model_code":100752896,"model_shared_scratchpad":0,'
+ '"runtime_memory":0,"tensors":9912852},"21":{"constants":0,"model_code":100752896,'
+ '"model_shared_scratchpad":0,"runtime_memory":0,"tensors":9912852},"22":{"constants":0,'
+ '"model_code":100752896,"model_shared_scratchpad":0,"runtime_memory":0,"tensors":9912852},'
+ '"23":{"constants":0,"model_code":100752896,"model_shared_scratchpad":0,"runtime_memory":0,'
+ '"tensors":9912852},"24":{"constants":0,"model_code":100752896,"model_shared_scratchpad":0,'
+ '"runtime_memory":0,"tensors":9912852},"25":{"constants":0,"model_code":100752896,'
+ '"model_shared_scratchpad":0,"runtime_memory":0,"tensors":9912852},"26":{"constants":0,'
+ '"model_code":100752896,"model_shared_scratchpad":0,"runtime_memory":0,"tensors":9912852},'
+ '"27":{"constants":0,"model_code":100752896,"model_shared_scratchpad":0,"runtime_memory":0,'
+ '"tensors":9912852},"28":{"constants":0,"model_code":100752896,"model_shared_scratchpad":0,'
+ '"runtime_memory":0,"tensors":9912852},"29":{"constants":0,"model_code":100752896,'
+ '"model_shared_scratchpad":0,"runtime_memory":0,"tensors":9912852},"30":{"constants":0,'
+ '"model_code":100752896,"model_shared_scratchpad":0,"runtime_memory":0,"tensors":9912852},'
+ '"31":{"constants":0,"model_code":100752896,"model_shared_scratchpad":0,"runtime_memory":0,'
+ '"tensors":9912852}}}},"loaded_models":[{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",'
+ '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10019,"is_running":false,"subgraphs":{"sg_00":{'
+ '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{'
+ '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{'
+ '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":1,'
+ '"neuron_device_index":5}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",'
+ '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10005,"is_running":false,"subgraphs":{"sg_00":{'
+ '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{'
+ '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{'
+ '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":0,'
+ '"neuron_device_index":5}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",'
+ '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10007,"is_running":false,"subgraphs":{"sg_00":{'
+ '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{'
+ '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{'
+ '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":0,'
+ '"neuron_device_index":10}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",'
+ '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10013,"is_running":false,"subgraphs":{"sg_00":{'
+ '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{'
+ '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{'
+ '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":1,'
+ '"neuron_device_index":10}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",'
+ '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10029,"is_running":false,"subgraphs":{"sg_00":{'
+ '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{'
+ '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{'
+ '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":0,'
+ '"neuron_device_index":2}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",'
+ '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10032,"is_running":false,"subgraphs":{"sg_00":{'
+ '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{'
+ '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{'
+ '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":1,'
+ '"neuron_device_index":2}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",'
+ '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10004,"is_running":false,"subgraphs":{"sg_00":{'
+ '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{'
+ '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{'
+ '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":1,'
+ '"neuron_device_index":0}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",'
+ '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10012,"is_running":false,"subgraphs":{"sg_00":{'
+ '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{'
+ '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{'
+ '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":0,'
+ '"neuron_device_index":0}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",'
+ '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10001,"is_running":false,"subgraphs":{"sg_00":{'
+ '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{'
+ '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{'
+ '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":1,'
+ '"neuron_device_index":3}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",'
+ '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10016,"is_running":false,"subgraphs":{"sg_00":{'
+ '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{'
+ '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{'
+ '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":0,'
+ '"neuron_device_index":3}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",'
+ '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10022,"is_running":false,"subgraphs":{"sg_00":{'
+ '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{'
+ '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{'
+ '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":1,'
+ '"neuron_device_index":11}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",'
+ '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10024,"is_running":false,"subgraphs":{"sg_00":{'
+ '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{'
+ '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{'
+ '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":0,'
+ '"neuron_device_index":11}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",'
+ '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10026,"is_running":false,"subgraphs":{"sg_00":{'
+ '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{'
+ '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{'
+ '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":1,'
+ '"neuron_device_index":13}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",'
+ '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10031,"is_running":false,"subgraphs":{"sg_00":{'
+ '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{'
+ '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{'
+ '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":0,'
+ '"neuron_device_index":13}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",'
+ '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10025,"is_running":false,"subgraphs":{"sg_00":{'
+ '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{'
+ '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{'
+ '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":1,'
+ '"neuron_device_index":15}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",'
+ '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10021,"is_running":false,"subgraphs":{"sg_00":{'
+ '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{'
+ '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{'
+ '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":0,'
+ '"neuron_device_index":15}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",'
+ '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10006,"is_running":false,"subgraphs":{"sg_00":{'
+ '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{'
+ '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{'
+ '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":0,'
+ '"neuron_device_index":12}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",'
+ '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10011,"is_running":false,"subgraphs":{"sg_00":{'
+ '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{'
+ '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{'
+ '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":1,'
+ '"neuron_device_index":12}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",'
+ '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10010,"is_running":false,"subgraphs":{"sg_00":{'
+ '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{'
+ '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{'
+ '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":0,'
+ '"neuron_device_index":6}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",'
+ '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10015,"is_running":false,"subgraphs":{"sg_00":{'
+ '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{'
+ '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{'
+ '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":1,'
+ '"neuron_device_index":6}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",'
+ '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10008,"is_running":false,"subgraphs":{"sg_00":{'
+ '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{'
+ '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{'
+ '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":1,'
+ '"neuron_device_index":1}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",'
+ '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10027,"is_running":false,"subgraphs":{"sg_00":{'
+ '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{'
+ '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{'
+ '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":0,'
+ '"neuron_device_index":1}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",'
+ '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10018,"is_running":false,"subgraphs":{"sg_00":{'
+ '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{'
+ '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{'
+ '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":1,'
+ '"neuron_device_index":14}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",'
+ '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10030,"is_running":false,"subgraphs":{"sg_00":{'
+ '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{'
+ '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{'
+ '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":0,'
+ '"neuron_device_index":14}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",'
+ '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10017,"is_running":false,"subgraphs":{"sg_00":{'
+ '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{'
+ '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{'
+ '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":0,'
+ '"neuron_device_index":9}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",'
+ '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10003,"is_running":false,"subgraphs":{"sg_00":{'
+ '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{'
+ '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{'
+ '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":1,'
+ '"neuron_device_index":9}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",'
+ '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10002,"is_running":false,"subgraphs":{"sg_00":{'
+ '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{'
+ '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{'
+ '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":1,'
+ '"neuron_device_index":4}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",'
+ '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10009,"is_running":false,"subgraphs":{"sg_00":{'
+ '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{'
+ '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{'
+ '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":0,'
+ '"neuron_device_index":4}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",'
+ '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10020,"is_running":false,"subgraphs":{"sg_00":{'
+ '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{'
+ '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{'
+ '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":0,'
+ '"neuron_device_index":7}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",'
+ '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10028,"is_running":false,"subgraphs":{"sg_00":{'
+ '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{'
+ '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{'
+ '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":1,'
+ '"neuron_device_index":7}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",'
+ '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10014,"is_running":false,"subgraphs":{"sg_00":{'
+ '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{'
+ '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{'
+ '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":1,'
+ '"neuron_device_index":8}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",'
+ '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10023,"is_running":false,"subgraphs":{"sg_00":{'
+ '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{'
+ '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{'
+ '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":0,'
+ '"neuron_device_index":8}}}],"error":""},"neuron_runtime_vcpu_usage":{"period":4.999647382,'
+ '"vcpu_usage":{"user":0,"system":0},"error":"open/proc/457402/stat:nosuchfileordirectory"},'
+ '"neuroncore_counters":{"period":4.999667932,"neuroncores_in_use":{"0":{"neuroncore_utilization":0},'
+ '"1":{"neuroncore_utilization":0},"2":{"neuroncore_utilization":0},"3":{"neuroncore_utilization":0},'
+ '"4":{"neuroncore_utilization":0},"5":{"neuroncore_utilization":0},"6":{"neuroncore_utilization": 41.4},'
+ '"7":{"neuroncore_utilization":0},"8":{"neuroncore_utilization":0},"9":{"neuroncore_utilization":0},'
+ '"10":{"neuroncore_utilization":0},"11":{"neuroncore_utilization":0},'
+ '"12":{"neuroncore_utilization":0},"13":{"neuroncore_utilization":0},'
+ '"14":{"neuroncore_utilization":0},"15":{"neuroncore_utilization":0},'
+ '"16":{"neuroncore_utilization":0},"17":{"neuroncore_utilization":0},'
+ '"18":{"neuroncore_utilization":0},"19":{"neuroncore_utilization":0},'
+ '"20":{"neuroncore_utilization":0},"21":{"neuroncore_utilization":0},'
+ '"22":{"neuroncore_utilization":0},"23":{"neuroncore_utilization":0},'
+ '"24":{"neuroncore_utilization":0},"25":{"neuroncore_utilization":0},'
+ '"26":{"neuroncore_utilization":0},"27":{"neuroncore_utilization":0},'
+ '"28":{"neuroncore_utilization":0},"29":{"neuroncore_utilization":0},'
+ '"30":{"neuroncore_utilization":0},"31":{"neuroncore_utilization":0}},"error":""}}}, {"pid":457402,"neuron_runtime_tag":"123","error":"","report":{'
+ '"execution_stats":{"period":4.999666547,"error_summary":{"generic":2,"numerical":0,"transient":0,'
+ '"model":0,"runtime":0,"hardware":0},"execution_summary":{"completed":2,"completed_with_err":0,'
+ '"completed_with_num_err":0,"timed_out":0,"incorrect_input":0,"failed_to_queue":0},"latency_stats":{'
+ '"total_latency":null,"device_latency":null},"error":""},"memory_used":{"period":4.999671285,'
+ '"neuron_runtime_used_bytes":{"host":9043968,"neuron_device":3541303936,"usage_breakdown":{"host":{'
+ '"application_memory":655360,"constants":0,"dma_buffers":8388608,"tensors":0},'
+ '"neuroncore_memory_usage":{"0":{"constants":0,"model_code":100752896,"model_shared_scratchpad":0,'
+ '"runtime_memory":0,"tensors":9912852},"1":{"constants":0,"model_code":100752896,'
+ '"model_shared_scratchpad":0,"runtime_memory":0,"tensors":9912852},"2":{"constants":0,'
+ '"model_code":100752896,"model_shared_scratchpad":0,"runtime_memory":0,"tensors":9912852},'
+ '"3":{"constants":0,"model_code":100752896,"model_shared_scratchpad":0,"runtime_memory":0,'
+ '"tensors":9912852},"4":{"constants":0,"model_code":100752896,"model_shared_scratchpad":0,'
+ '"runtime_memory":0,"tensors":9912852},"5":{"constants":0,"model_code":100752896,'
+ '"model_shared_scratchpad":0,"runtime_memory":0,"tensors":9912852},"6":{"constants":0,'
+ '"model_code":100752896,"model_shared_scratchpad":0,"runtime_memory":0,"tensors":9912852},'
+ '"7":{"constants":0,"model_code":100752896,"model_shared_scratchpad":0,"runtime_memory":0,'
+ '"tensors":9912852},"8":{"constants":0,"model_code":100752896,"model_shared_scratchpad":0,'
+ '"runtime_memory":0,"tensors":9912852},"9":{"constants":0,"model_code":100752896,'
+ '"model_shared_scratchpad":0,"runtime_memory":0,"tensors":9912852},"10":{"constants":0,'
+ '"model_code":100752896,"model_shared_scratchpad":0,"runtime_memory":0,"tensors":9912852},'
+ '"11":{"constants":0,"model_code":100752896,"model_shared_scratchpad":0,"runtime_memory":0,'
+ '"tensors":9912852},"12":{"constants":0,"model_code":100752896,"model_shared_scratchpad":0,'
+ '"runtime_memory":0,"tensors":9912852},"13":{"constants":0,"model_code":100752896,'
+ '"model_shared_scratchpad":0,"runtime_memory":0,"tensors":9912852},"14":{"constants":0,'
+ '"model_code":100752896,"model_shared_scratchpad":0,"runtime_memory":0,"tensors":9912852},'
+ '"15":{"constants":0,"model_code":100752896,"model_shared_scratchpad":0,"runtime_memory":0,'
+ '"tensors":9912852},"16":{"constants":0,"model_code":100752896,"model_shared_scratchpad":0,'
+ '"runtime_memory":0,"tensors":9912852},"17":{"constants":0,"model_code":100752896,'
+ '"model_shared_scratchpad":0,"runtime_memory":0,"tensors":9912852},"18":{"constants":0,'
+ '"model_code":100752896,"model_shared_scratchpad":0,"runtime_memory":0,"tensors":9912852},'
+ '"19":{"constants":0,"model_code":100752896,"model_shared_scratchpad":0,"runtime_memory":0,'
+ '"tensors":9912852},"20":{"constants":0,"model_code":100752896,"model_shared_scratchpad":0,'
+ '"runtime_memory":0,"tensors":9912852},"21":{"constants":0,"model_code":100752896,'
+ '"model_shared_scratchpad":0,"runtime_memory":0,"tensors":9912852},"22":{"constants":0,'
+ '"model_code":100752896,"model_shared_scratchpad":0,"runtime_memory":0,"tensors":9912852},'
+ '"23":{"constants":0,"model_code":100752896,"model_shared_scratchpad":0,"runtime_memory":0,'
+ '"tensors":9912852},"24":{"constants":0,"model_code":100752896,"model_shared_scratchpad":0,'
+ '"runtime_memory":0,"tensors":9912852},"25":{"constants":0,"model_code":100752896,'
+ '"model_shared_scratchpad":0,"runtime_memory":0,"tensors":9912852},"26":{"constants":0,'
+ '"model_code":100752896,"model_shared_scratchpad":0,"runtime_memory":0,"tensors":9912852},'
+ '"27":{"constants":0,"model_code":100752896,"model_shared_scratchpad":0,"runtime_memory":0,'
+ '"tensors":9912852},"28":{"constants":0,"model_code":100752896,"model_shared_scratchpad":0,'
+ '"runtime_memory":0,"tensors":9912852},"29":{"constants":0,"model_code":100752896,'
+ '"model_shared_scratchpad":0,"runtime_memory":0,"tensors":9912852},"30":{"constants":0,'
+ '"model_code":100752896,"model_shared_scratchpad":0,"runtime_memory":0,"tensors":9912852},'
+ '"31":{"constants":0,"model_code":100752896,"model_shared_scratchpad":0,"runtime_memory":0,'
+ '"tensors":9912852}}}},"loaded_models":[{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",'
+ '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10019,"is_running":false,"subgraphs":{"sg_00":{'
+ '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{'
+ '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{'
+ '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":1,'
+ '"neuron_device_index":5}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",'
+ '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10005,"is_running":false,"subgraphs":{"sg_00":{'
+ '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{'
+ '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{'
+ '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":0,'
+ '"neuron_device_index":5}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",'
+ '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10007,"is_running":false,"subgraphs":{"sg_00":{'
+ '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{'
+ '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{'
+ '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":0,'
+ '"neuron_device_index":10}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",'
+ '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10013,"is_running":false,"subgraphs":{"sg_00":{'
+ '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{'
+ '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{'
+ '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":1,'
+ '"neuron_device_index":10}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",'
+ '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10029,"is_running":false,"subgraphs":{"sg_00":{'
+ '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{'
+ '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{'
+ '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":0,'
+ '"neuron_device_index":2}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",'
+ '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10032,"is_running":false,"subgraphs":{"sg_00":{'
+ '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{'
+ '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{'
+ '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":1,'
+ '"neuron_device_index":2}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",'
+ '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10004,"is_running":false,"subgraphs":{"sg_00":{'
+ '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{'
+ '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{'
+ '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":1,'
+ '"neuron_device_index":0}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",'
+ '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10012,"is_running":false,"subgraphs":{"sg_00":{'
+ '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{'
+ '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{'
+ '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":0,'
+ '"neuron_device_index":0}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",'
+ '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10001,"is_running":false,"subgraphs":{"sg_00":{'
+ '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{'
+ '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{'
+ '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":1,'
+ '"neuron_device_index":3}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",'
+ '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10016,"is_running":false,"subgraphs":{"sg_00":{'
+ '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{'
+ '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{'
+ '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":0,'
+ '"neuron_device_index":3}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",'
+ '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10022,"is_running":false,"subgraphs":{"sg_00":{'
+ '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{'
+ '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{'
+ '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":1,'
+ '"neuron_device_index":11}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",'
+ '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10024,"is_running":false,"subgraphs":{"sg_00":{'
+ '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{'
+ '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{'
+ '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":0,'
+ '"neuron_device_index":11}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",'
+ '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10026,"is_running":false,"subgraphs":{"sg_00":{'
+ '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{'
+ '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{'
+ '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":1,'
+ '"neuron_device_index":13}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",'
+ '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10031,"is_running":false,"subgraphs":{"sg_00":{'
+ '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{'
+ '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{'
+ '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":0,'
+ '"neuron_device_index":13}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",'
+ '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10025,"is_running":false,"subgraphs":{"sg_00":{'
+ '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{'
+ '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{'
+ '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":1,'
+ '"neuron_device_index":15}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",'
+ '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10021,"is_running":false,"subgraphs":{"sg_00":{'
+ '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{'
+ '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{'
+ '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":0,'
+ '"neuron_device_index":15}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",'
+ '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10006,"is_running":false,"subgraphs":{"sg_00":{'
+ '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{'
+ '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{'
+ '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":0,'
+ '"neuron_device_index":12}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",'
+ '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10011,"is_running":false,"subgraphs":{"sg_00":{'
+ '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{'
+ '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{'
+ '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":1,'
+ '"neuron_device_index":12}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",'
+ '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10010,"is_running":false,"subgraphs":{"sg_00":{'
+ '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{'
+ '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{'
+ '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":0,'
+ '"neuron_device_index":6}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",'
+ '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10015,"is_running":false,"subgraphs":{"sg_00":{'
+ '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{'
+ '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{'
+ '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":1,'
+ '"neuron_device_index":6}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",'
+ '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10008,"is_running":false,"subgraphs":{"sg_00":{'
+ '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{'
+ '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{'
+ '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":1,'
+ '"neuron_device_index":1}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",'
+ '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10027,"is_running":false,"subgraphs":{"sg_00":{'
+ '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{'
+ '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{'
+ '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":0,'
+ '"neuron_device_index":1}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",'
+ '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10018,"is_running":false,"subgraphs":{"sg_00":{'
+ '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{'
+ '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{'
+ '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":1,'
+ '"neuron_device_index":14}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",'
+ '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10030,"is_running":false,"subgraphs":{"sg_00":{'
+ '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{'
+ '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{'
+ '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":0,'
+ '"neuron_device_index":14}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",'
+ '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10017,"is_running":false,"subgraphs":{"sg_00":{'
+ '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{'
+ '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{'
+ '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":0,'
+ '"neuron_device_index":9}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",'
+ '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10003,"is_running":false,"subgraphs":{"sg_00":{'
+ '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{'
+ '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{'
+ '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":1,'
+ '"neuron_device_index":9}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",'
+ '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10002,"is_running":false,"subgraphs":{"sg_00":{'
+ '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{'
+ '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{'
+ '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":1,'
+ '"neuron_device_index":4}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",'
+ '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10009,"is_running":false,"subgraphs":{"sg_00":{'
+ '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{'
+ '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{'
+ '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":0,'
+ '"neuron_device_index":4}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",'
+ '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10020,"is_running":false,"subgraphs":{"sg_00":{'
+ '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{'
+ '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{'
+ '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":0,'
+ '"neuron_device_index":7}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",'
+ '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10028,"is_running":false,"subgraphs":{"sg_00":{'
+ '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{'
+ '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{'
+ '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":1,'
+ '"neuron_device_index":7}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",'
+ '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10014,"is_running":false,"subgraphs":{"sg_00":{'
+ '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{'
+ '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{'
+ '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":1,'
+ '"neuron_device_index":8}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",'
+ '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10023,"is_running":false,"subgraphs":{"sg_00":{'
+ '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{'
+ '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{'
+ '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":0,'
+ '"neuron_device_index":8}}}],"error":""},"neuron_runtime_vcpu_usage":{"period":4.999647382,'
+ '"vcpu_usage":{"user":0,"system":0},"error":"open/proc/457402/stat:nosuchfileordirectory"},'
+ '"neuroncore_counters":{"period":4.999667932,"neuroncores_in_use":{"0":{"neuroncore_utilization":0},'
+ '"1":{"neuroncore_utilization":0},"2":{"neuroncore_utilization":0},"3":{"neuroncore_utilization":0},'
+ '"4":{"neuroncore_utilization":0},"5":{"neuroncore_utilization":0},"6":{"neuroncore_utilization":0},'
+ '"7":{"neuroncore_utilization":0},"8":{"neuroncore_utilization":0},"9":{"neuroncore_utilization":0},'
+ '"10":{"neuroncore_utilization":0},"11":{"neuroncore_utilization":0},'
+ '"12":{"neuroncore_utilization":0},"13":{"neuroncore_utilization":0},'
+ '"14":{"neuroncore_utilization":0},"15":{"neuroncore_utilization":0},'
+ '"16":{"neuroncore_utilization":0},"17":{"neuroncore_utilization":0},'
+ '"18":{"neuroncore_utilization":0},"19":{"neuroncore_utilization":0},'
+ '"20":{"neuroncore_utilization":0},"21":{"neuroncore_utilization":0},'
+ '"22":{"neuroncore_utilization":0},"23":{"neuroncore_utilization":0},'
+ '"24":{"neuroncore_utilization":0},"25":{"neuroncore_utilization":0},'
+ '"26":{"neuroncore_utilization":0},"27":{"neuroncore_utilization":0},'
+ '"28":{"neuroncore_utilization":0},"29":{"neuroncore_utilization":0},'
+ '"30":{"neuroncore_utilization":0},"31":{"neuroncore_utilization":0}},"error":""}}}],"system_data":{'
+ '"memory_info":{"period":4.9997283150000005,"memory_total_bytes":532523487232,'
+ '"memory_used_bytes":81207975936,"swap_total_bytes":0,"swap_used_bytes":0,"error":""},"vcpu_usage":{'
+ '"period":4.999737702,"average_usage":{"user":19.66,"nice":0,"system":1.67,"idle":78.67,"io_wait":0,'
+ '"irq":0,"soft_irq":0},"usage_data":{"0":{"user":51.31,"nice":0,"system":0,"idle":48.69,"io_wait":0,'
+ '"irq":0,"soft_irq":0},"1":{"user":52.91,"nice":0,"system":6.21,"idle":40.88,"io_wait":0,"irq":0,'
+ '"soft_irq":0},"2":{"user":25.6,"nice":0,"system":0,"idle":74.4,"io_wait":0,"irq":0,"soft_irq":0},'
+ '"3":{"user":0.6,"nice":0,"system":0,"idle":99.4,"io_wait":0,"irq":0,"soft_irq":0},"4":{"user":0.2,'
+ '"nice":0,"system":0,"idle":99.8,"io_wait":0,"irq":0,"soft_irq":0},"5":{"user":0.8,"nice":0,'
+ '"system":0,"idle":99.2,"io_wait":0,"irq":0,"soft_irq":0},"6":{"user":1,"nice":0,"system":2.99,'
+ '"idle":96.01,"io_wait":0,"irq":0,"soft_irq":0},"7":{"user":0,"nice":0,"system":0.2,"idle":99.8,'
+ '"io_wait":0,"irq":0,"soft_irq":0},"8":{"user":1.8,"nice":0,"system":0,"idle":98.2,"io_wait":0,'
+ '"irq":0,"soft_irq":0},"9":{"user":0.4,"nice":0,"system":0.8,"idle":98.8,"io_wait":0,"irq":0,'
+ '"soft_irq":0},"10":{"user":0.2,"nice":0,"system":0,"idle":99.8,"io_wait":0,"irq":0,"soft_irq":0},'
+ '"11":{"user":0,"nice":0,"system":0,"idle":100,"io_wait":0,"irq":0,"soft_irq":0},"12":{"user":0,'
+ '"nice":0,"system":0,"idle":100,"io_wait":0,"irq":0,"soft_irq":0},"13":{"user":0,"nice":0,"system":0,'
+ '"idle":100,"io_wait":0,"irq":0,"soft_irq":0},"14":{"user":0,"nice":0,"system":0.2,"idle":99.8,'
+ '"io_wait":0,"irq":0,"soft_irq":0},"15":{"user":0.2,"nice":0,"system":0,"idle":99.8,"io_wait":0,'
+ '"irq":0,"soft_irq":0},"16":{"user":0.2,"nice":0,"system":0,"idle":99.8,"io_wait":0,"irq":0,'
+ '"soft_irq":0},"17":{"user":0.2,"nice":0,"system":0.2,"idle":99.6,"io_wait":0,"irq":0,"soft_irq":0},'
+ '"18":{"user":0,"nice":0,"system":0,"idle":100,"io_wait":0,"irq":0,"soft_irq":0},"19":{"user":0,'
+ '"nice":0,"system":0,"idle":100,"io_wait":0,"irq":0,"soft_irq":0},"20":{"user":0.4,"nice":0,'
+ '"system":0.4,"idle":99.2,"io_wait":0,"irq":0,"soft_irq":0},"21":{"user":0.2,"nice":0,"system":0.2,'
+ '"idle":99.6,"io_wait":0,"irq":0,"soft_irq":0},"22":{"user":0.2,"nice":0,"system":0,"idle":99.8,'
+ '"io_wait":0,"irq":0,"soft_irq":0},"23":{"user":0,"nice":0,"system":0.2,"idle":99.8,"io_wait":0,'
+ '"irq":0,"soft_irq":0},"24":{"user":0.2,"nice":0,"system":0.2,"idle":99.6,"io_wait":0,"irq":0,'
+ '"soft_irq":0},"25":{"user":0,"nice":0,"system":0,"idle":100,"io_wait":0,"irq":0,"soft_irq":0},'
+ '"26":{"user":0.2,"nice":0,"system":0.2,"idle":99.6,"io_wait":0,"irq":0,"soft_irq":0},'
+ '"27":{"user":0.2,"nice":0,"system":0.4,"idle":99.4,"io_wait":0,"irq":0,"soft_irq":0},'
+ '"28":{"user":0.2,"nice":0,"system":0.8,"idle":99,"io_wait":0,"irq":0,"soft_irq":0},"29":{"user":0.2,'
+ '"nice":0,"system":0,"idle":99.8,"io_wait":0,"irq":0,"soft_irq":0},"30":{"user":0,"nice":0,'
+ '"system":0.4,"idle":99.6,"io_wait":0,"irq":0,"soft_irq":0},"31":{"user":0,"nice":0,"system":0,'
+ '"idle":100,"io_wait":0,"irq":0,"soft_irq":0},"32":{"user":0.2,"nice":0,"system":1,"idle":98.8,'
+ '"io_wait":0,"irq":0,"soft_irq":0},"33":{"user":1.81,"nice":0,"system":4.44,"idle":93.75,"io_wait":0,'
+ '"irq":0,"soft_irq":0},"34":{"user":5.22,"nice":0,"system":0.2,"idle":94.58,"io_wait":0,"irq":0,'
+ '"soft_irq":0},"35":{"user":22,"nice":0,"system":0,"idle":78,"io_wait":0,"irq":0,"soft_irq":0},'
+ '"36":{"user":47.31,"nice":0,"system":2.79,"idle":49.9,"io_wait":0,"irq":0,"soft_irq":0},'
+ '"37":{"user":0.2,"nice":0,"system":0.2,"idle":99.6,"io_wait":0,"irq":0,"soft_irq":0},'
+ '"38":{"user":5.2,"nice":0,"system":3.8,"idle":91,"io_wait":0,"irq":0,"soft_irq":0},'
+ '"39":{"user":72.69,"nice":0,"system":5.42,"idle":21.89,"io_wait":0,"irq":0,"soft_irq":0},'
+ '"40":{"user":75.85,"nice":0,"system":2.4,"idle":21.76,"io_wait":0,"irq":0,"soft_irq":0},'
+ '"41":{"user":1,"nice":0,"system":3.6,"idle":95.4,"io_wait":0,"irq":0,"soft_irq":0},'
+ '"42":{"user":4.58,"nice":0,"system":3.78,"idle":91.63,"io_wait":0,"irq":0,"soft_irq":0},'
+ '"43":{"user":7.62,"nice":0,"system":5.21,"idle":87.17,"io_wait":0,"irq":0,"soft_irq":0},'
+ '"44":{"user":6.22,"nice":0,"system":2.81,"idle":90.96,"io_wait":0,"irq":0,"soft_irq":0},'
+ '"45":{"user":1.81,"nice":0,"system":4.62,"idle":93.57,"io_wait":0,"irq":0,"soft_irq":0},'
+ '"46":{"user":1.8,"nice":0,"system":6.21,"idle":91.98,"io_wait":0,"irq":0,"soft_irq":0},'
+ '"47":{"user":1.6,"nice":0,"system":5,"idle":93.4,"io_wait":0,"irq":0,"soft_irq":0},"48":{"user":0,'
+ '"nice":0,"system":0.2,"idle":99.8,"io_wait":0,"irq":0,"soft_irq":0},"49":{"user":0.2,"nice":0,'
+ '"system":0.2,"idle":99.6,"io_wait":0,"irq":0,"soft_irq":0},"50":{"user":0,"nice":0,"system":1.4,'
+ '"idle":98.6,"io_wait":0,"irq":0,"soft_irq":0},"51":{"user":76.15,"nice":0,"system":2.81,'
+ '"idle":21.04,"io_wait":0,"irq":0,"soft_irq":0},"52":{"user":8.2,"nice":0,"system":3.4,"idle":88.4,'
+ '"io_wait":0,"irq":0,"soft_irq":0},"53":{"user":8.62,"nice":0,"system":3.61,"idle":87.78,"io_wait":0,'
+ '"irq":0,"soft_irq":0},"54":{"user":7.62,"nice":0,"system":1,"idle":91.38,"io_wait":0,"irq":0,'
+ '"soft_irq":0},"55":{"user":75.3,"nice":0,"system":0.6,"idle":24.1,"io_wait":0,"irq":0,"soft_irq":0},'
+ '"56":{"user":0,"nice":0,"system":0,"idle":100,"io_wait":0,"irq":0,"soft_irq":0},"57":{"user":0,'
+ '"nice":0,"system":0,"idle":100,"io_wait":0,"irq":0,"soft_irq":0},"58":{"user":0,"nice":0,"system":0,'
+ '"idle":100,"io_wait":0,"irq":0,"soft_irq":0},"59":{"user":75.2,"nice":0,"system":0.6,"idle":24.2,'
+ '"io_wait":0,"irq":0,"soft_irq":0},"60":{"user":70.46,"nice":0,"system":0,"idle":29.54,"io_wait":0,'
+ '"irq":0,"soft_irq":0},"61":{"user":70.34,"nice":0,"system":0,"idle":29.66,"io_wait":0,"irq":0,'
+ '"soft_irq":0},"62":{"user":72.8,"nice":0,"system":0,"idle":27.2,"io_wait":0,"irq":0,"soft_irq":0},'
+ '"63":{"user":73.2,"nice":0,"system":3,"idle":23.8,"io_wait":0,"irq":0,"soft_irq":0},'
+ '"64":{"user":19.8,"nice":0,"system":1,"idle":79.2,"io_wait":0,"irq":0,"soft_irq":0},'
+ '"65":{"user":0.8,"nice":0,"system":0,"idle":99.2,"io_wait":0,"irq":0,"soft_irq":0},"66":{"user":0.2,'
+ '"nice":0,"system":0,"idle":99.8,"io_wait":0,"irq":0,"soft_irq":0},"67":{"user":0.2,"nice":0,'
+ '"system":0,"idle":99.8,"io_wait":0,"irq":0,"soft_irq":0},"68":{"user":6.24,"nice":0,"system":1.61,'
+ '"idle":92.15,"io_wait":0,"irq":0,"soft_irq":0},"69":{"user":0.2,"nice":0,"system":0,"idle":99.8,'
+ '"io_wait":0,"irq":0,"soft_irq":0},"70":{"user":0.6,"nice":0,"system":2.59,"idle":96.81,"io_wait":0,'
+ '"irq":0,"soft_irq":0},"71":{"user":2.79,"nice":0,"system":5.38,"idle":91.83,"io_wait":0,"irq":0,'
+ '"soft_irq":0},"72":{"user":1.6,"nice":0,"system":6.01,"idle":92.38,"io_wait":0,"irq":0,'
+ '"soft_irq":0},"73":{"user":0.2,"nice":0,"system":0.2,"idle":99.6,"io_wait":0,"irq":0,"soft_irq":0},'
+ '"74":{"user":0.2,"nice":0,"system":0,"idle":99.8,"io_wait":0,"irq":0,"soft_irq":0},"75":{"user":0.2,'
+ '"nice":0,"system":0,"idle":99.8,"io_wait":0,"irq":0,"soft_irq":0},"76":{"user":0.2,"nice":0,'
+ '"system":0.2,"idle":99.6,"io_wait":0,"irq":0,"soft_irq":0},"77":{"user":2.2,"nice":0,"system":5.4,'
+ '"idle":92.4,"io_wait":0,"irq":0,"soft_irq":0},"78":{"user":1.4,"nice":0,"system":5.01,"idle":93.59,'
+ '"io_wait":0,"irq":0,"soft_irq":0},"79":{"user":0.4,"nice":0,"system":0,"idle":99.6,"io_wait":0,'
+ '"irq":0,"soft_irq":0},"80":{"user":0,"nice":0,"system":0.4,"idle":99.6,"io_wait":0,"irq":0,'
+ '"soft_irq":0},"81":{"user":2,"nice":0,"system":5.59,"idle":92.42,"io_wait":0,"irq":0,"soft_irq":0},'
+ '"82":{"user":2.6,"nice":0,"system":6.4,"idle":91,"io_wait":0,"irq":0,"soft_irq":0},'
+ '"83":{"user":2.79,"nice":0,"system":6.37,"idle":90.84,"io_wait":0,"irq":0,"soft_irq":0},'
+ '"84":{"user":2.2,"nice":0,"system":5.2,"idle":92.6,"io_wait":0,"irq":0,"soft_irq":0},'
+ '"85":{"user":0.2,"nice":0,"system":1.4,"idle":98.4,"io_wait":0,"irq":0,"soft_irq":0},'
+ '"86":{"user":2.4,"nice":0,"system":5.21,"idle":92.38,"io_wait":0,"irq":0,"soft_irq":0},'
+ '"87":{"user":2.4,"nice":0,"system":6.61,"idle":90.98,"io_wait":0,"irq":0,"soft_irq":0},'
+ '"88":{"user":2.2,"nice":0,"system":6.61,"idle":91.18,"io_wait":0,"irq":0,"soft_irq":0},'
+ '"89":{"user":0,"nice":0,"system":0.4,"idle":99.6,"io_wait":0,"irq":0,"soft_irq":0},"90":{"user":2.4,'
+ '"nice":0,"system":5,"idle":92.6,"io_wait":0,"irq":0,"soft_irq":0},"91":{"user":0.2,"nice":0,'
+ '"system":0,"idle":99.8,"io_wait":0,"irq":0,"soft_irq":0},"92":{"user":0,"nice":0,"system":0.2,'
+ '"idle":99.8,"io_wait":0,"irq":0,"soft_irq":0},"93":{"user":0,"nice":0,"system":0,"idle":100,'
+ '"io_wait":0,"irq":0,"soft_irq":0},"94":{"user":0,"nice":0,"system":0.2,"idle":99.8,"io_wait":0,'
+ '"irq":0,"soft_irq":0},"95":{"user":2,"nice":0,"system":5.19,"idle":92.81,"io_wait":0,"irq":0,'
+ '"soft_irq":0},"96":{"user":76.2,"nice":0,"system":6.8,"idle":17,"io_wait":0,"irq":0,"soft_irq":0},'
+ '"97":{"user":75.15,"nice":0,"system":1.6,"idle":23.25,"io_wait":0,"irq":0,"soft_irq":0},'
+ '"98":{"user":1.4,"nice":0,"system":4.81,"idle":93.79,"io_wait":0,"irq":0,"soft_irq":0},'
+ '"99":{"user":1.39,"nice":0,"system":4.78,"idle":93.82,"io_wait":0,"irq":0,"soft_irq":0},'
+ '"100":{"user":2.19,"nice":0,"system":5.38,"idle":92.43,"io_wait":0,"irq":0,"soft_irq":0},'
+ '"101":{"user":75.05,"nice":0,"system":0.6,"idle":24.35,"io_wait":0,"irq":0,"soft_irq":0},'
+ '"102":{"user":77.4,"nice":0,"system":3.8,"idle":18.8,"io_wait":0,"irq":0,"soft_irq":0},'
+ '"103":{"user":0,"nice":0,"system":0,"idle":100,"io_wait":0,"irq":0,"soft_irq":0},"104":{"user":0,'
+ '"nice":0,"system":0,"idle":100,"io_wait":0,"irq":0,"soft_irq":0},"105":{"user":74.95,"nice":0,'
+ '"system":1.2,"idle":23.85,"io_wait":0,"irq":0,"soft_irq":0},"106":{"user":76.8,"nice":0,'
+ '"system":0.4,"idle":22.8,"io_wait":0,"irq":0,"soft_irq":0},"107":{"user":76.95,"nice":0,'
+ '"system":1.2,"idle":21.84,"io_wait":0,"irq":0,"soft_irq":0},"108":{"user":78.71,"nice":0,'
+ '"system":7.43,"idle":13.86,"io_wait":0,"irq":0,"soft_irq":0},"109":{"user":75.05,"nice":0,'
+ '"system":0.4,"idle":24.55,"io_wait":0,"irq":0,"soft_irq":0},"110":{"user":75.15,"nice":0,'
+ '"system":0.4,"idle":24.45,"io_wait":0,"irq":0,"soft_irq":0},"111":{"user":75.15,"nice":0,'
+ '"system":0.6,"idle":24.25,"io_wait":0,"irq":0,"soft_irq":0},"112":{"user":75.15,"nice":0,'
+ '"system":0.6,"idle":24.25,"io_wait":0,"irq":0,"soft_irq":0},"113":{"user":74.85,"nice":0,'
+ '"system":1.2,"idle":23.95,"io_wait":0,"irq":0,"soft_irq":0},"114":{"user":74.85,"nice":0,"system":1,'
+ '"idle":24.15,"io_wait":0,"irq":0,"soft_irq":0},"115":{"user":0,"nice":0,"system":0,"idle":100,'
+ '"io_wait":0,"irq":0,"soft_irq":0},"116":{"user":77.84,"nice":0,"system":0.8,"idle":21.36,'
+ '"io_wait":0,"irq":0,"soft_irq":0},"117":{"user":78.2,"nice":0,"system":0.4,"idle":21.4,"io_wait":0,'
+ '"irq":0,"soft_irq":0},"118":{"user":77.8,"nice":0,"system":1,"idle":21.2,"io_wait":0,"irq":0,'
+ '"soft_irq":0},"119":{"user":0.4,"nice":0,"system":0,"idle":99.6,"io_wait":0,"irq":0,"soft_irq":0},'
+ '"120":{"user":75.2,"nice":0,"system":0.4,"idle":24.4,"io_wait":0,"irq":0,"soft_irq":0},'
+ '"121":{"user":75.15,"nice":0,"system":0.6,"idle":24.25,"io_wait":0,"irq":0,"soft_irq":0},'
+ '"122":{"user":74.8,"nice":0,"system":1,"idle":24,"io_wait":0,"irq":0,"soft_irq":0.2},'
+ '"123":{"user":0.2,"nice":0,"system":0,"idle":99.8,"io_wait":0,"irq":0,"soft_irq":0},"124":{"user":0,'
+ '"nice":0,"system":0.2,"idle":99.8,"io_wait":0,"irq":0,"soft_irq":0},"125":{"user":0.2,"nice":0,'
+ '"system":0,"idle":99.8,"io_wait":0,"irq":0,"soft_irq":0},"126":{"user":0,"nice":0,"system":0.2,'
+ '"idle":99.8,"io_wait":0,"irq":0,"soft_irq":0},"127":{"user":1.2,"nice":0,"system":3,"idle":95.8,'
+ '"io_wait":0,"irq":0,"soft_irq":0}},"context_switch_count":171386,"error":""},"neuron_hw_counters":{'
+ '"period":1.000142057,"neuron_devices":[{"neuron_device_index":0,"mem_ecc_corrected":1,'
+ '"mem_ecc_uncorrected":0,"sram_ecc_uncorrected":0,"sram_ecc_corrected":0},{"neuron_device_index":1,'
+ '"mem_ecc_corrected":0,"mem_ecc_uncorrected":0,"sram_ecc_uncorrected":0,"sram_ecc_corrected":0},'
+ '{"neuron_device_index":2,"mem_ecc_corrected":1,"mem_ecc_uncorrected":0,"sram_ecc_uncorrected":0,'
+ '"sram_ecc_corrected":0},{"neuron_device_index":3,"mem_ecc_corrected":2,"mem_ecc_uncorrected":0,'
+ '"sram_ecc_uncorrected":0,"sram_ecc_corrected":0},{"neuron_device_index":4,"mem_ecc_corrected":0,'
+ '"mem_ecc_uncorrected":0,"sram_ecc_uncorrected":0,"sram_ecc_corrected":0},{"neuron_device_index":5,'
+ '"mem_ecc_corrected":1,"mem_ecc_uncorrected":0,"sram_ecc_uncorrected":0,"sram_ecc_corrected":0},'
+ '{"neuron_device_index":6,"mem_ecc_corrected":0,"mem_ecc_uncorrected":1,"sram_ecc_uncorrected":0,'
+ '"sram_ecc_corrected":0},{"neuron_device_index":7,"mem_ecc_corrected":0,"mem_ecc_uncorrected":0,'
+ '"sram_ecc_uncorrected":1,"sram_ecc_corrected":0},{"neuron_device_index":8,"mem_ecc_corrected":0,'
+ '"mem_ecc_uncorrected":0,"sram_ecc_uncorrected":0,"sram_ecc_corrected":1},{"neuron_device_index":9,'
+ '"mem_ecc_corrected":0,"mem_ecc_uncorrected":1,"sram_ecc_uncorrected":0,"sram_ecc_corrected":0},'
+ '{"neuron_device_index":10,"mem_ecc_corrected":0,"mem_ecc_uncorrected":0,"sram_ecc_uncorrected":0,'
+ '"sram_ecc_corrected":0},{"neuron_device_index":11,"mem_ecc_corrected":0,"mem_ecc_uncorrected":0,'
+ '"sram_ecc_uncorrected":0,"sram_ecc_corrected":0},{"neuron_device_index":12,"mem_ecc_corrected":0,'
+ '"mem_ecc_uncorrected":0,"sram_ecc_uncorrected":1,"sram_ecc_corrected":0},{"neuron_device_index":13,'
+ '"mem_ecc_corrected":0,"mem_ecc_uncorrected":0,"sram_ecc_uncorrected":0,"sram_ecc_corrected":1},'
+ '{"neuron_device_index":14,"mem_ecc_corrected":0,"mem_ecc_uncorrected":1,"sram_ecc_uncorrected":1,'
+ '"sram_ecc_corrected":0},{"neuron_device_index":15,"mem_ecc_corrected":0,"mem_ecc_uncorrected":1,'
+ '"sram_ecc_uncorrected":0,"sram_ecc_corrected":0}],"error":""}},"instance_info":{'
+ '"instance_name":"DummyNodeName",'
+ '"instance_id":"i-09db9b55e0095612f","instance_type":"trn1n.32xlarge",'
+ '"instance_availability_zone":"us-east-1c","instance_availability_zone_id":"use1-az6",'
+ '"instance_region":"us-east-1","ami_id":"ami-030686a4e905e98d3",'
+ '"subnet_id":"subnet-06a7754948e8a000f","error":""},"neuron_hardware_info":{"neuron_device_count":16,'
+ '"neuroncore_per_device_count":2,"error":""}}')
+ if len(line) == 0:
+ continue
+ if original_file_hash:
+ _watch_file_and_update_ssl_cxt(original_file_hash, certfile=certfile, keyfile=keyfile)
+ try:
+ monitor_data = json.loads(line)
+ except Exception as exc:
+ print('Unable to decode JSON {}'.format(exc))
+ continue
+ if instance_labels is None:
+ instance_labels = get_instance_labels(monitor_data['instance_info'])
+ process_data(all_metric_objects, monitor_data, instance_labels)
+ time.sleep(5)
+
+def main():
+ global ssl_cxt
+ arg_parser = argparse.ArgumentParser()
+ arg_parser.add_argument('-p', '--port', default=8000,
+ type=int, help='HTTP port on which to run the server')
+ arg_parser.add_argument('--key-file', help='Path to SSL private key file (only for HTTPS)')
+ arg_parser.add_argument('--cert-file', help='Path to SSL certificate file (only for HTTPS)')
+ args = arg_parser.parse_args()
+
+ if args.key_file and args.cert_file:
+ if sys.version_info < (3, 8):
+ print("""Python version 3.8 or greater is requried for https/tls support.
+ Also upgrade your prometheus_client version to 0.19.0 or greater if required
+ https://github.com/prometheus/client_python/releases""")
+ sys.exit(1)
+ httpd, _t = start_http_server(port=args.port, keyfile=args.key_file, certfile=args.cert_file)
+ ssl_cxt = httpd.socket.context
+ print("Running HTTPS prometheus server at port {}".format(args.port))
+ else:
+ start_http_server(port=args.port)
+ print("Running HTTP prometheus server at port {}".format(args.port))
+
+ update_loop(certfile = args.cert_file or None, keyfile=args.key_file or None)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/generator/test_case_generator.go b/generator/test_case_generator.go
index f7f042267..a0021e438 100644
--- a/generator/test_case_generator.go
+++ b/generator/test_case_generator.go
@@ -223,6 +223,10 @@ var testTypeToTestConfig = map[string][]testConfig{
testDir: "./test/gpu", terraformDir: "terraform/eks/daemon/gpu",
targets: map[string]map[string]struct{}{"arc": {"amd64": {}}},
},
+ {
+ testDir: "./test/awsneuron", terraformDir: "terraform/eks/daemon/awsneuron",
+ targets: map[string]map[string]struct{}{"arc": {"amd64": {}}},
+ },
},
"eks_deployment": {
{testDir: "./test/metric_value_benchmark"},
diff --git a/terraform/eks/daemon/awsneuron/main.tf b/terraform/eks/daemon/awsneuron/main.tf
new file mode 100644
index 000000000..87ea02b0a
--- /dev/null
+++ b/terraform/eks/daemon/awsneuron/main.tf
@@ -0,0 +1,826 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: MIT
+
+module "common" {
+ source = "../../../common"
+ cwagent_image_repo = var.cwagent_image_repo
+ cwagent_image_tag = var.cwagent_image_tag
+}
+
+module "basic_components" {
+ source = "../../../basic_components"
+
+ region = var.region
+}
+
+data "aws_eks_cluster_auth" "this" {
+ name = aws_eks_cluster.this.name
+}
+
+resource "aws_eks_cluster" "this" {
+ name = "cwagent-eks-integ-${module.common.testing_id}"
+ role_arn = module.basic_components.role_arn
+ version = var.k8s_version
+ enabled_cluster_log_types = [
+ "api",
+ "audit",
+ "authenticator",
+ "controllerManager",
+ "scheduler"
+ ]
+ vpc_config {
+ subnet_ids = module.basic_components.public_subnet_ids
+ security_group_ids = [module.basic_components.security_group]
+ }
+}
+
+# EKS Node Groups
+resource "aws_eks_node_group" "this" {
+ cluster_name = aws_eks_cluster.this.name
+ node_group_name = "cwagent-eks-integ-node"
+ node_role_arn = aws_iam_role.node_role.arn
+ subnet_ids = module.basic_components.public_subnet_ids
+
+ scaling_config {
+ desired_size = 1
+ max_size = 1
+ min_size = 1
+ }
+
+ ami_type = "AL2_x86_64"
+ capacity_type = "ON_DEMAND"
+ disk_size = 20
+ instance_types = ["t3.medium"]
+
+ depends_on = [
+ aws_iam_role_policy_attachment.node_AmazonEC2ContainerRegistryReadOnly,
+ aws_iam_role_policy_attachment.node_AmazonEKS_CNI_Policy,
+ aws_iam_role_policy_attachment.node_AmazonEKSWorkerNodePolicy,
+ aws_iam_role_policy_attachment.node_CloudWatchAgentServerPolicy
+ ]
+}
+
+# EKS Node IAM Role
+resource "aws_iam_role" "node_role" {
+ name = "cwagent-eks-Worker-Role-${module.common.testing_id}"
+ assume_role_policy = jsonencode({
+ Version = "2012-10-17",
+ Statement = [
+ {
+ Effect = "Allow",
+ Principal = {
+ Service = "ec2.amazonaws.com"
+ },
+ Action = "sts:AssumeRole"
+ }
+ ]
+ })
+
+}
+
+resource "aws_iam_role_policy_attachment" "node_AmazonEKSWorkerNodePolicy" {
+ policy_arn = "arn:aws:iam::aws:policy/AmazonEKSWorkerNodePolicy"
+ role = aws_iam_role.node_role.name
+}
+
+resource "aws_iam_role_policy_attachment" "node_AmazonEKS_CNI_Policy" {
+ policy_arn = "arn:aws:iam::aws:policy/AmazonEKS_CNI_Policy"
+ role = aws_iam_role.node_role.name
+}
+
+resource "aws_iam_role_policy_attachment" "node_AmazonEC2ContainerRegistryReadOnly" {
+ policy_arn = "arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly"
+ role = aws_iam_role.node_role.name
+}
+
+resource "aws_iam_role_policy_attachment" "node_CloudWatchAgentServerPolicy" {
+ policy_arn = "arn:aws:iam::aws:policy/CloudWatchAgentServerPolicy"
+ role = aws_iam_role.node_role.name
+}
+
+# TODO: these security groups should be created once and then reused
+# EKS Cluster Security Group
+resource "aws_security_group" "eks_cluster_sg" {
+ name = "cwagent-eks-cluster-sg-${module.common.testing_id}"
+ description = "Cluster communication with worker nodes"
+ vpc_id = module.basic_components.vpc_id
+}
+
+resource "aws_security_group_rule" "cluster_inbound" {
+ description = "Allow worker nodes to communicate with the cluster API Server"
+ from_port = 443
+ protocol = "tcp"
+ security_group_id = aws_security_group.eks_cluster_sg.id
+ source_security_group_id = aws_security_group.eks_nodes_sg.id
+ to_port = 443
+ type = "ingress"
+}
+
+resource "aws_security_group_rule" "cluster_outbound" {
+ description = "Allow cluster API Server to communicate with the worker nodes"
+ from_port = 1024
+ protocol = "tcp"
+ security_group_id = aws_security_group.eks_cluster_sg.id
+ source_security_group_id = aws_security_group.eks_nodes_sg.id
+ to_port = 65535
+ type = "egress"
+}
+
+
+# EKS Node Security Group
+resource "aws_security_group" "eks_nodes_sg" {
+ name = "cwagent-eks-node-sg-${module.common.testing_id}"
+ description = "Security group for all nodes in the cluster"
+ vpc_id = module.basic_components.vpc_id
+
+ egress {
+ from_port = 0
+ to_port = 0
+ protocol = "-1"
+ cidr_blocks = ["0.0.0.0/0"]
+ }
+}
+
+resource "aws_security_group_rule" "nodes_internal" {
+ description = "Allow nodes to communicate with each other"
+ from_port = 0
+ protocol = "-1"
+ security_group_id = aws_security_group.eks_nodes_sg.id
+ source_security_group_id = aws_security_group.eks_nodes_sg.id
+ to_port = 65535
+ type = "ingress"
+}
+
+resource "aws_security_group_rule" "nodes_cluster_inbound" {
+ description = "Allow worker Kubelets and pods to receive communication from the cluster control plane"
+ from_port = 1025
+ protocol = "tcp"
+ security_group_id = aws_security_group.eks_nodes_sg.id
+ source_security_group_id = aws_security_group.eks_cluster_sg.id
+ to_port = 65535
+ type = "ingress"
+}
+
+
+# create cert for communication between agent and neuron monitor
+resource "tls_private_key" "private_key" {
+ algorithm = "RSA"
+}
+
+resource "local_file" "ca_key" {
+ content = tls_private_key.private_key.private_key_pem
+ filename = "${path.module}/certs/ca.key"
+}
+
+resource "tls_self_signed_cert" "ca_cert" {
+ private_key_pem = tls_private_key.private_key.private_key_pem
+ is_ca_certificate = true
+ subject {
+ common_name = "neuron-monitor-service.amazon-cloudwatch.svc"
+ organization = "Amazon CloudWatch Agent"
+ }
+ validity_period_hours = 24
+ allowed_uses = [
+ "digital_signature",
+ "key_encipherment",
+ "cert_signing",
+ "crl_signing",
+ "server_auth",
+ "client_auth",
+ ]
+}
+
+resource "local_file" "ca_cert_file" {
+ content = tls_self_signed_cert.ca_cert.cert_pem
+ filename = "${path.module}/certs/ca.cert"
+}
+
+resource "tls_private_key" "server_private_key" {
+ algorithm = "RSA"
+}
+
+resource "local_file" "server_key" {
+ content = tls_private_key.server_private_key.private_key_pem
+ filename = "${path.module}/certs/server.key"
+}
+
+resource "tls_cert_request" "local_csr" {
+ private_key_pem = tls_private_key.server_private_key.private_key_pem
+ dns_names = ["localhost", "127.0.0.1", "neuron-monitor-service.amazon-cloudwatch.svc"]
+ subject {
+ common_name = "neuron-monitor-service.amazon-cloudwatch.svc"
+ organization = "Amazon CloudWatch Agent"
+ }
+}
+
+resource "tls_locally_signed_cert" "server_cert" {
+ cert_request_pem = tls_cert_request.local_csr.cert_request_pem
+ ca_private_key_pem = tls_private_key.private_key.private_key_pem
+ ca_cert_pem = tls_self_signed_cert.ca_cert.cert_pem
+ validity_period_hours = 12
+ allowed_uses = [
+ "digital_signature",
+ "key_encipherment",
+ "server_auth",
+ "client_auth",
+ ]
+}
+
+resource "local_file" "server_cert_file" {
+ content = tls_locally_signed_cert.server_cert.cert_pem
+ filename = "${path.module}/certs/server.cert"
+}
+
+# TLS material: tls.crt/tls.key are mounted by the neuron monitor, ca.crt by the CloudWatch agent.
+resource "kubernetes_secret" "agent_cert" {
+  metadata {
+    name      = "amazon-cloudwatch-observability-agent-cert"
+    namespace = kubernetes_namespace.namespace.metadata[0].name # reference orders the secret after the namespace
+  }
+  data = {
+    "ca.crt"  = tls_self_signed_cert.ca_cert.cert_pem              # CA certificate (PEM)
+    "tls.crt" = tls_locally_signed_cert.server_cert.cert_pem       # server certificate signed by that CA (PEM)
+    "tls.key" = tls_private_key.server_private_key.private_key_pem # server private key (PEM)
+  }
+}
+
+resource "kubernetes_namespace" "namespace" {
+ metadata {
+ name = "amazon-cloudwatch"
+ }
+}
+
+resource "kubernetes_config_map" "neuron_monitor_config_map" {
+ depends_on = [
+ kubernetes_namespace.namespace
+ ]
+
+ metadata {
+ name = "neuron-monitor-config-map"
+ namespace = "amazon-cloudwatch"
+ }
+
+ data = {
+ "monitor.json" = jsonencode({
+ period = "5s"
+ neuron_runtimes = [
+ {
+ tag_filter : ".*"
+ metrics = [
+ {
+ type = "neuroncore_counters"
+ },
+ {
+ type = "memory_used"
+ },
+ {
+ type = "neuron_runtime_vcpu_usage"
+ },
+ {
+ type = "execution_stats"
+ }
+ ]
+ }
+ ]
+ system_metrics = [
+ {
+ type = "memory_info"
+ },
+ {
+ period = "5s"
+ type = "neuron_hw_counters"
+ }
+ ]
+ })
+ }
+}
+
+resource "kubernetes_service_account" "neuron_monitor_service_account" {
+ depends_on = [
+ kubernetes_namespace.namespace
+ ]
+ metadata {
+ name = "neuron-monitor-service-acct"
+ namespace = "amazon-cloudwatch"
+ }
+}
+
+resource "kubernetes_role" "neuron_monitor_role" {
+ depends_on = [
+ kubernetes_namespace.namespace,
+ kubernetes_service_account.neuron_monitor_service_account,
+ kubernetes_config_map.neuron_monitor_config_map
+ ]
+ metadata {
+ name = "neuron-monitor-role"
+ namespace = "amazon-cloudwatch"
+ }
+
+ rule {
+ api_groups = [""]
+ resources = ["configmaps"]
+ resource_names = ["neuron-monitor-config-map"]
+ verbs = ["get"]
+ }
+}
+
+resource "kubernetes_role_binding" "neuron_monitor_role_binding" {
+ depends_on = [
+ kubernetes_namespace.namespace,
+ kubernetes_service_account.neuron_monitor_service_account,
+ kubernetes_role.neuron_monitor_role
+ ]
+
+ metadata {
+ namespace = "amazon-cloudwatch"
+ name = "neuron-monitor-role-binding"
+ }
+
+ role_ref {
+ kind = "Role"
+ name = "neuron-monitor-role"
+ api_group = "rbac.authorization.k8s.io"
+ }
+
+ subject {
+ kind = "ServiceAccount"
+ name = "neuron-monitor-service-acct"
+ namespace = "amazon-cloudwatch"
+ }
+}
+
+resource "kubernetes_daemonset" "neuron_monitor" {
+ depends_on = [
+ kubernetes_namespace.namespace,
+ kubernetes_service_account.neuron_monitor_service_account,
+ kubernetes_role.neuron_monitor_role,
+ kubernetes_role_binding.neuron_monitor_role_binding,
+ kubernetes_config_map.neuron_monitor_config_map
+ ]
+
+ metadata {
+ name = "neuron-monitor"
+ namespace = "amazon-cloudwatch"
+ labels = {
+ k8s-app = "neuron-monitor"
+ version = "v1"
+ }
+ }
+ spec {
+ selector {
+ match_labels = {
+ k8s-app = "neuron-monitor"
+ }
+ }
+ template {
+ metadata {
+ labels = {
+ k8s-app = "neuron-monitor"
+ version = "v1"
+ }
+ }
+ spec {
+ affinity {
+ node_affinity {
+ required_during_scheduling_ignored_during_execution {
+ node_selector_term {
+ match_expressions {
+ key = "kubernetes.io/os"
+ operator = "In"
+ values = ["linux"]
+ }
+ }
+ }
+ }
+ }
+ container {
+ name = "neuron-monitor-prometheus"
+ image = "506463145083.dkr.ecr.us-west-2.amazonaws.com/mocked-neuron-monitor:v2"
+ port {
+ container_port = 8000
+ }
+ command = [
+ "/bin/sh",
+ "-c",
+ "/opt/aws/neuron/bin/dummy_neuron_monitor.py --port 8000 --cert-file /etc/amazon-cloudwatch-observability-neuron-cert/server.crt --key-file /etc/amazon-cloudwatch-observability-neuron-cert/server.key"
+ ]
+ resources {
+ limits = {
+ cpu = "500m"
+ memory = "256Mi"
+ }
+ requests = {
+ cpu = "256m"
+ memory = "128Mi"
+ }
+ }
+ security_context {
+ privileged = true
+ }
+ env {
+ name = "NODE_NAME"
+ value_from {
+ field_ref {
+ field_path = "spec.nodeName"
+ }
+ }
+ }
+ env {
+ name = "PATH"
+ value = "/usr/local/bin:/usr/bin:/bin:/opt/aws/neuron/bin"
+ }
+ volume_mount {
+ mount_path = "/etc/amazon-cloudwatch-observability-neuron-cert/"
+ name = "neurontls"
+ read_only = true
+ }
+ volume_mount {
+ mount_path = "/etc/neuron-monitor-config/"
+ name = "neuron-monitor-config"
+ read_only = true
+ }
+ }
+ volume {
+ name = "neurontls"
+ secret {
+ secret_name = "amazon-cloudwatch-observability-agent-cert"
+ items {
+ key = "tls.crt"
+ path = "server.crt"
+ }
+ items {
+ key = "tls.key"
+ path = "server.key"
+ }
+ }
+ }
+ volume {
+ name = "neuron-monitor-config"
+ config_map {
+ name = "neuron-monitor-config-map"
+ }
+ }
+ service_account_name = "neuron-monitor-service-acct"
+ }
+ }
+ }
+}
+
+resource "kubernetes_service" "neuron_monitor_service" {
+ depends_on = [
+ kubernetes_namespace.namespace,
+ kubernetes_service_account.cwagentservice,
+ aws_eks_node_group.this,
+ kubernetes_daemonset.neuron_monitor
+ ]
+ metadata {
+ name = "neuron-monitor-service"
+ namespace = "amazon-cloudwatch"
+ labels = {
+ "k8s-app" : "neuron-monitor-service"
+ }
+ annotations = {
+ "prometheus.io/scrape" : "true"
+ }
+ }
+ spec {
+ type = "ClusterIP"
+ selector = {
+ k8s-app = "neuron-monitor"
+ }
+ port {
+ name = "metrics"
+ port = 8000
+ target_port = 8000
+ protocol = "TCP"
+ }
+ internal_traffic_policy = "Local"
+ }
+}
+
+resource "kubernetes_daemonset" "service" {
+ depends_on = [
+ kubernetes_namespace.namespace,
+ kubernetes_service_account.cwagentservice,
+ aws_eks_node_group.this,
+ kubernetes_daemonset.neuron_monitor
+ ]
+ metadata {
+ name = "cloudwatch-agent"
+ namespace = "amazon-cloudwatch"
+ }
+ spec {
+ selector {
+ match_labels = {
+ "name" : "cloudwatch-agent"
+ }
+ }
+ template {
+ metadata {
+ labels = {
+ "name" : "cloudwatch-agent"
+ }
+ }
+ spec {
+ node_selector = {
+ "kubernetes.io/os" : "linux"
+ }
+ container {
+ name = "cwagent"
+ image = "${var.cwagent_image_repo}:${var.cwagent_image_tag}"
+ image_pull_policy = "Always"
+ resources {
+ limits = {
+ "cpu" : "200m",
+ "memory" : "200Mi"
+ }
+ requests = {
+ "cpu" : "200m",
+ "memory" : "200Mi"
+ }
+ }
+ port {
+ container_port = 25888
+ host_port = 25888
+ protocol = "UDP"
+ }
+ env {
+ name = "HOST_IP"
+ value_from {
+ field_ref {
+ field_path = "status.hostIP"
+ }
+ }
+ }
+ env {
+ name = "HOST_NAME"
+ value_from {
+ field_ref {
+ field_path = "spec.nodeName"
+ }
+ }
+ }
+ env {
+ name = "K8S_NAMESPACE"
+ value_from {
+ field_ref {
+ field_path = "metadata.namespace"
+ }
+ }
+ }
+ volume_mount {
+ mount_path = "/etc/cwagentconfig"
+ name = "cwagentconfig"
+ }
+ volume_mount {
+ mount_path = "/rootfs"
+ name = "rootfs"
+ read_only = true
+ }
+ volume_mount {
+ mount_path = "/var/run/docker.sock"
+ name = "dockersock"
+ read_only = true
+ }
+ volume_mount {
+ mount_path = "/var/lib/docker"
+ name = "varlibdocker"
+ read_only = true
+ }
+ volume_mount {
+ mount_path = "/run/containerd/containerd.sock"
+ name = "containerdsock"
+ read_only = true
+ }
+ volume_mount {
+ mount_path = "/sys"
+ name = "sys"
+ read_only = true
+ }
+ volume_mount {
+ mount_path = "/dev/disk"
+ name = "devdisk"
+ read_only = true
+ }
+ volume_mount {
+ mount_path = "/etc/amazon-cloudwatch-observability-agent-cert"
+ name = "agenttls"
+ read_only = true
+ }
+ volume_mount {
+ mount_path = "/var/lib/kubelet/pod-resources"
+ name = "kubelet-podresources"
+ read_only = true
+ }
+ }
+ volume {
+ name = "cwagentconfig"
+ config_map {
+ name = "cwagentconfig"
+ }
+ }
+ volume {
+ name = "rootfs"
+ host_path {
+ path = "/"
+ }
+ }
+ volume {
+ name = "dockersock"
+ host_path {
+ path = "/var/run/docker.sock"
+ }
+ }
+ volume {
+ name = "varlibdocker"
+ host_path {
+ path = "/var/lib/docker"
+ }
+ }
+ volume {
+ name = "containerdsock"
+ host_path {
+ path = "/run/containerd/containerd.sock"
+ }
+ }
+ volume {
+ name = "sys"
+ host_path {
+ path = "/sys"
+ }
+ }
+ volume {
+ name = "devdisk"
+ host_path {
+ path = "/dev/disk"
+ }
+ }
+ volume {
+ name = "kubelet-podresources"
+ host_path {
+ path = "/var/lib/kubelet/pod-resources"
+ }
+ }
+ volume {
+ name = "agenttls"
+ secret {
+ secret_name = "amazon-cloudwatch-observability-agent-cert"
+ items {
+ key = "ca.crt"
+ path = "tls-ca.crt"
+ }
+ }
+ }
+ service_account_name = "cloudwatch-agent"
+ termination_grace_period_seconds = 60
+ }
+ }
+ }
+}
+
+##########################################
+# Template Files
+##########################################
+locals {
+ httpd_config = "../../../../${var.test_dir}/resources/httpd.conf"
+ httpd_ssl_config = "../../../../${var.test_dir}/resources/httpd-ssl.conf"
+ cwagent_config = fileexists("../../../../${var.test_dir}/resources/config.json") ? "../../../../${var.test_dir}/resources/config.json" : "../default_resources/default_amazon_cloudwatch_agent.json"
+}
+
+data "template_file" "cwagent_config" {
+ template = file(local.cwagent_config)
+ vars = {
+ }
+}
+
+resource "kubernetes_config_map" "cwagentconfig" {
+ depends_on = [
+ kubernetes_namespace.namespace,
+ kubernetes_service_account.cwagentservice
+ ]
+ metadata {
+ name = "cwagentconfig"
+ namespace = "amazon-cloudwatch"
+ }
+ data = {
+ "cwagentconfig.json" : data.template_file.cwagent_config.rendered
+ }
+}
+
+data "template_file" "httpd_config" {
+ template = file(local.httpd_config)
+ vars = {}
+}
+data "template_file" "httpd_ssl_config" {
+ template = file(local.httpd_ssl_config)
+ vars = {}
+}
+
+resource "kubernetes_config_map" "httpdconfig" {
+ depends_on = [
+ kubernetes_namespace.namespace,
+ kubernetes_service_account.cwagentservice
+ ]
+ metadata {
+ name = "httpdconfig"
+ namespace = "amazon-cloudwatch"
+ }
+ data = {
+ "httpd.conf" : data.template_file.httpd_config.rendered
+ "httpd-ssl.conf" : data.template_file.httpd_ssl_config.rendered
+ }
+}
+
+resource "kubernetes_service_account" "cwagentservice" {
+ depends_on = [kubernetes_namespace.namespace]
+ metadata {
+ name = "cloudwatch-agent"
+ namespace = "amazon-cloudwatch"
+ }
+}
+
+resource "kubernetes_cluster_role" "clusterrole" {
+ depends_on = [kubernetes_namespace.namespace]
+ metadata {
+ name = "cloudwatch-agent-role"
+ }
+ rule {
+ verbs = ["get", "list", "watch"]
+ resources = ["pods", "pods/logs", "nodes", "nodes/proxy", "namespaces", "endpoints"]
+ api_groups = [""]
+ }
+ rule {
+ verbs = ["list", "watch"]
+ resources = ["replicasets"]
+ api_groups = ["apps"]
+ }
+ rule {
+ verbs = ["list", "watch"]
+ resources = ["jobs"]
+ api_groups = ["batch"]
+ }
+ rule {
+ verbs = ["get"]
+ resources = ["nodes/proxy"]
+ api_groups = [""]
+ }
+ rule {
+ verbs = ["create"]
+ resources = ["nodes/stats", "configmaps", "events"]
+ api_groups = [""]
+ }
+ rule {
+ verbs = ["get", "update"]
+ resource_names = ["cwagent-clusterleader"]
+ resources = ["configmaps"]
+ api_groups = [""]
+ }
+ rule {
+ verbs = ["get"]
+ resource_names = ["neuron-monitor-config-map"]
+ resources = ["configmaps"]
+ api_groups = [""]
+ }
+ rule {
+ verbs = ["list", "watch"]
+ resources = ["services"]
+ api_groups = [""]
+ }
+ rule {
+ non_resource_urls = ["/metrics"]
+ verbs = ["get", "list", "watch"]
+ }
+}
+
+resource "kubernetes_cluster_role_binding" "rolebinding" {
+ depends_on = [kubernetes_namespace.namespace]
+ metadata {
+ name = "cloudwatch-agent-role-binding"
+ }
+ role_ref {
+ api_group = "rbac.authorization.k8s.io"
+ kind = "ClusterRole"
+ name = "cloudwatch-agent-role"
+ }
+ subject {
+ kind = "ServiceAccount"
+ name = "cloudwatch-agent"
+ namespace = "amazon-cloudwatch"
+ }
+}
+
+resource "null_resource" "validator" {
+ depends_on = [
+ aws_eks_node_group.this,
+ kubernetes_daemonset.service,
+ kubernetes_cluster_role_binding.rolebinding,
+ kubernetes_service_account.cwagentservice,
+ ]
+ provisioner "local-exec" {
+ command = <<-EOT
+ echo "Validating EKS metrics/logs for AWS Neuron"
+ cd ../../../..
+ go test -timeout 30m ${var.test_dir} -eksClusterName=${aws_eks_cluster.this.name} -computeType=EKS -v -eksDeploymentStrategy=DAEMON
+ EOT
+ }
+}
diff --git a/terraform/eks/daemon/awsneuron/providers.tf b/terraform/eks/daemon/awsneuron/providers.tf
new file mode 100644
index 000000000..9bd2885f5
--- /dev/null
+++ b/terraform/eks/daemon/awsneuron/providers.tf
@@ -0,0 +1,17 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: MIT
+
+provider "aws" {
+ region = var.region
+}
+
+provider "kubernetes" {
+ exec {
+ api_version = "client.authentication.k8s.io/v1beta1"
+ command = "aws"
+ args = ["eks", "get-token", "--cluster-name", aws_eks_cluster.this.name]
+ }
+ host = aws_eks_cluster.this.endpoint
+ cluster_ca_certificate = base64decode(aws_eks_cluster.this.certificate_authority.0.data)
+ token = data.aws_eks_cluster_auth.this.token
+}
\ No newline at end of file
diff --git a/terraform/eks/daemon/awsneuron/variables.tf b/terraform/eks/daemon/awsneuron/variables.tf
new file mode 100644
index 000000000..4cb2a7904
--- /dev/null
+++ b/terraform/eks/daemon/awsneuron/variables.tf
@@ -0,0 +1,28 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: MIT
+
+variable "region" {
+ type = string
+ default = "us-west-2"
+}
+
+variable "test_dir" {
+ type = string
+ default = "./test/awsneuron"
+}
+
+variable "cwagent_image_repo" {
+ type = string
+ default = "public.ecr.aws/cloudwatch-agent/cloudwatch-agent"
+}
+
+variable "cwagent_image_tag" {
+ type = string
+ default = "latest"
+}
+
+variable "k8s_version" {
+ type = string
+ default = "1.28"
+}
+
diff --git a/test/awsneuron/neuron_metrics_test.go b/test/awsneuron/neuron_metrics_test.go
new file mode 100644
index 000000000..da9ac577f
--- /dev/null
+++ b/test/awsneuron/neuron_metrics_test.go
@@ -0,0 +1,63 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: MIT
+
+//go:build !windows
+
+package awsneuron
+
+import (
+ "time"
+
+ "github.com/aws/amazon-cloudwatch-agent-test/environment"
+ . "github.com/aws/amazon-cloudwatch-agent-test/test/awsneuron/resources"
+ "github.com/aws/amazon-cloudwatch-agent-test/test/metric"
+ "github.com/aws/amazon-cloudwatch-agent-test/test/status"
+ "github.com/aws/amazon-cloudwatch-agent-test/test/test_runner"
+)
+
+const (
+ awsNeuronMetricIndicator = "_neuron"
+)
+
+var expectedDimsToMetrics = map[string][]string{
+ "ClusterName": {
+ NodeNeuronCoreUtil, NodeNeuronCoreMemUsageConstants, NodeNeuronCoreMemUsageModel, NodeNeuronCoreMemUsageScratchpad,
+ NodeNeuronCoreMemUsageRuntime, NodeNeuronCoreMemUsageTensors, NodeNeuronCoreMemUsageTotal, NodeNeuronDeviceHwEccEvents,
+ NodeExecutionErrorsTotal, NodeNeuronDeviceRuntimeMemoryUsed, NodeNeuronExecutionLatency,
+ },
+}
+
+type AwsNeuronTestRunner struct {
+ test_runner.BaseTestRunner
+ testName string
+ env *environment.MetaData
+}
+
+var _ test_runner.ITestRunner = (*AwsNeuronTestRunner)(nil)
+
+func (t *AwsNeuronTestRunner) Validate() status.TestGroupResult {
+ var testResults []status.TestResult
+ testResults = append(testResults, metric.ValidateMetrics(t.env, awsNeuronMetricIndicator, expectedDimsToMetrics)...)
+ testResults = append(testResults, metric.ValidateLogs(t.env))
+ testResults = append(testResults, metric.ValidateLogsFrequency(t.env))
+ return status.TestGroupResult{
+ Name: t.GetTestName(),
+ TestResults: testResults,
+ }
+}
+
+// GetTestName returns the runner's testName field; Validate uses it as the
+// name of the test group it returns.
+func (t *AwsNeuronTestRunner) GetTestName() string { return t.testName }
+
+// GetAgentConfigFileName always returns the empty string: this runner does not
+// name an agent configuration file.
+func (t *AwsNeuronTestRunner) GetAgentConfigFileName() string { return "" }
+
+// GetAgentRunDuration returns the fixed run duration reported by this runner,
+// 25 minutes.
+func (t *AwsNeuronTestRunner) GetAgentRunDuration() time.Duration { return 25 * time.Minute }
+
+// GetMeasuredMetrics returns nil; this runner reports no metric names through
+// this method.
+func (t *AwsNeuronTestRunner) GetMeasuredMetrics() []string { return nil }
diff --git a/test/awsneuron/neuron_test.go b/test/awsneuron/neuron_test.go
new file mode 100644
index 000000000..6ccc4f599
--- /dev/null
+++ b/test/awsneuron/neuron_test.go
@@ -0,0 +1,77 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: MIT
+
+//go:build !windows
+
+package awsneuron
+
+import (
+ "fmt"
+ "testing"
+
+ "github.com/stretchr/testify/suite"
+
+ "github.com/aws/amazon-cloudwatch-agent-test/environment"
+ "github.com/aws/amazon-cloudwatch-agent-test/environment/computetype"
+ "github.com/aws/amazon-cloudwatch-agent-test/test/metric/dimension"
+ "github.com/aws/amazon-cloudwatch-agent-test/test/status"
+ "github.com/aws/amazon-cloudwatch-agent-test/test/test_runner"
+)
+
+type AwsNeuronTestSuite struct {
+ suite.Suite
+ test_runner.TestSuite
+}
+
+func (suite *AwsNeuronTestSuite) SetupSuite() {
+ fmt.Println(">>>> Starting AWS Neuron Container Insights TestSuite")
+}
+
+func (suite *AwsNeuronTestSuite) TearDownSuite() {
+ suite.Result.Print()
+ fmt.Println(">>>> Finished AWS Neuron Container Insights TestSuite")
+}
+
+// init calls environment.RegisterEnvironmentMetaDataFlags when the package is
+// initialized, before any test in it runs.
+func init() { environment.RegisterEnvironmentMetaDataFlags() }
+
+var (
+ eksTestRunners []*test_runner.EKSTestRunner
+)
+
+// getEksTestRunners builds the package-level eksTestRunners slice on the first call and returns the same slice afterwards.
+func getEksTestRunners(env *environment.MetaData) []*test_runner.EKSTestRunner {
+	if eksTestRunners != nil {
+		return eksTestRunners
+	}
+	runner := &AwsNeuronTestRunner{
+		BaseTestRunner: test_runner.BaseTestRunner{DimensionFactory: dimension.GetDimensionFactory(*env)},
+		testName:       "EKS_AWS_NEURON",
+		env:            env,
+	}
+	eksTestRunners = []*test_runner.EKSTestRunner{{Runner: runner, Env: *env}}
+	return eksTestRunners
+}
+
+// TestAllInSuite runs every EKS test runner when the compute type is EKS and
+// then asserts the suite result is successful; other compute types do nothing.
+func (suite *AwsNeuronTestSuite) TestAllInSuite() {
+	env := environment.GetEnvironmentMetaData()
+	if env.ComputeType != computetype.EKS {
+		return
+	}
+	for _, runner := range getEksTestRunners(env) {
+		runner.Run(suite, env)
+	}
+
+	suite.Assert().Equal(status.SUCCESSFUL, suite.Result.GetStatus(), "AWS Neuron Container Test Suite Failed")
+}
+
+func (suite *AwsNeuronTestSuite) AddToSuiteResult(r status.TestGroupResult) {
+ suite.Result.TestGroupResults = append(suite.Result.TestGroupResults, r)
+}
+
+// TestAWSNeuronSuite is the go test entry point; it passes a new
+// AwsNeuronTestSuite to suite.Run.
+func TestAWSNeuronSuite(t *testing.T) { suite.Run(t, new(AwsNeuronTestSuite)) }
diff --git a/test/awsneuron/resources/config.json b/test/awsneuron/resources/config.json
new file mode 100644
index 000000000..6f37e43ed
--- /dev/null
+++ b/test/awsneuron/resources/config.json
@@ -0,0 +1,16 @@
+{
+ "agent": {
+ "metrics_collection_interval": 15,
+ "run_as_user": "root",
+ "debug": true,
+ "logfile": ""
+ },
+ "logs": {
+ "metrics_collected": {
+ "kubernetes": {
+ "enhanced_container_insights": true
+ }
+ },
+ "force_flush_interval": 5
+ }
+}
\ No newline at end of file
diff --git a/test/awsneuron/resources/httpd-ssl.conf b/test/awsneuron/resources/httpd-ssl.conf
new file mode 100644
index 000000000..18c33f0bd
--- /dev/null
+++ b/test/awsneuron/resources/httpd-ssl.conf
@@ -0,0 +1,43 @@
+Listen 8000
+
+SSLCipherSuite HIGH:MEDIUM:!MD5:!RC4:!3DES
+SSLProxyCipherSuite HIGH:MEDIUM:!MD5:!RC4:!3DES
+
+SSLHonorCipherOrder on
+
+SSLProtocol all -SSLv3
+SSLProxyProtocol all -SSLv3
+
+SSLPassPhraseDialog builtin
+
+SSLSessionCache "shmcb:/usr/local/apache2/logs/ssl_scache(512000)"
+SSLSessionCacheTimeout 300
+
+
+
+
+DocumentRoot "/usr/local/apache2/htdocs"
+ServerName neuron-monitor-service.amazon-cloudwatch.svc:8000
+ServerAdmin you@example.com
+ErrorLog /proc/self/fd/2
+TransferLog /proc/self/fd/1
+
+SSLEngine on
+SSLCertificateFile "/etc/amazon-cloudwatch-observability-neuron-cert/server.crt"
+SSLCertificateKeyFile "/etc/amazon-cloudwatch-observability-neuron-cert/server.key"
+
+
+ SSLOptions +StdEnvVars
+
+
+ SSLOptions +StdEnvVars
+
+
+BrowserMatch "MSIE [2-5]" \
+ nokeepalive ssl-unclean-shutdown \
+ downgrade-1.0 force-response-1.0
+
+CustomLog /proc/self/fd/1 \
+ "%t %h %%{SSL_PROTOCOL}x $%{SSL_CIPHER}x \"%r\" %b"
+
+
\ No newline at end of file
diff --git a/test/awsneuron/resources/httpd.conf b/test/awsneuron/resources/httpd.conf
new file mode 100644
index 000000000..122b16b17
--- /dev/null
+++ b/test/awsneuron/resources/httpd.conf
@@ -0,0 +1,101 @@
+
+ServerRoot "/usr/local/apache2"
+
+#Listen 8000
+
+LoadModule mpm_event_module modules/mod_mpm_event.so
+LoadModule authn_file_module modules/mod_authn_file.so
+LoadModule authn_core_module modules/mod_authn_core.so
+LoadModule authz_host_module modules/mod_authz_host.so
+LoadModule authz_groupfile_module modules/mod_authz_groupfile.so
+LoadModule authz_user_module modules/mod_authz_user.so
+LoadModule authz_core_module modules/mod_authz_core.so
+LoadModule access_compat_module modules/mod_access_compat.so
+LoadModule auth_basic_module modules/mod_auth_basic.so
+LoadModule socache_shmcb_module modules/mod_socache_shmcb.so
+LoadModule reqtimeout_module modules/mod_reqtimeout.so
+LoadModule filter_module modules/mod_filter.so
+LoadModule mime_module modules/mod_mime.so
+LoadModule log_config_module modules/mod_log_config.so
+LoadModule env_module modules/mod_env.so
+LoadModule headers_module modules/mod_headers.so
+LoadModule setenvif_module modules/mod_setenvif.so
+LoadModule version_module modules/mod_version.so
+LoadModule ssl_module modules/mod_ssl.so
+LoadModule unixd_module modules/mod_unixd.so
+LoadModule status_module modules/mod_status.so
+LoadModule autoindex_module modules/mod_autoindex.so
+LoadModule dir_module modules/mod_dir.so
+LoadModule alias_module modules/mod_alias.so
+
+
+User www-data
+Group www-data
+
+
+
+ AllowOverride none
+ Require all denied
+
+
+DocumentRoot "/usr/local/apache2/htdocs"
+
+ Options Indexes FollowSymLinks
+ AllowOverride None
+ Require all granted
+
+
+
+ DirectoryIndex index.html
+
+
+
+ Require all denied
+
+
+ErrorLog /proc/self/fd/2
+
+LogLevel warn
+
+
+ LogFormat "%h %l %u %t \"%r\" %>s %b \"%%{Referer}i\" \"%%{User-Agent}i\"" combined
+ LogFormat "%h %l %u %t \"%r\" %>s %b" common
+
+
+ # You need to enable mod_logio.c to use %I and %O
+ LogFormat "%h %l %u %t \"%r\" %>s %b \"%%{Referer}i\" \"%%{User-Agent}i\" %I %O" combinedio
+
+
+ CustomLog /proc/self/fd/1 common
+
+
+
+ ScriptAlias /cgi-bin/ "/usr/local/apache2/cgi-bin/"
+
+
+
+ AllowOverride None
+ Options None
+ Require all granted
+
+
+
+ RequestHeader unset Proxy early
+
+
+
+ TypesConfig conf/mime.types
+ AddType application/x-compress .Z
+ AddType application/x-gzip .gz .tgz
+
+
+
+Include conf/extra/proxy-html.conf
+
+
+# Secure (SSL/TLS) connections
+Include conf/extra/httpd-ssl.conf
+
+SSLRandomSeed startup builtin
+SSLRandomSeed connect builtin
+
\ No newline at end of file
diff --git a/test/awsneuron/resources/metrics_list.go b/test/awsneuron/resources/metrics_list.go
new file mode 100644
index 000000000..ce4094597
--- /dev/null
+++ b/test/awsneuron/resources/metrics_list.go
@@ -0,0 +1,48 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: MIT
+
+package resources
+
+const ( // Neuron metric name strings, grouped by their prefix: container_, pod_, node_
+ ContainerNeuronCoreUtil = "container_neuroncore_utilization" // container_* names start here
+ ContainerNeuronCoreMemUsageConstants = "container_neuroncore_memory_usage_constants"
+ ContainerNeuronCoreMemUsageModel = "container_neuroncore_memory_usage_model_code"
+ ContainerNeuronCoreMemUsageScratchpad = "container_neuroncore_memory_usage_model_shared_scratchpad"
+ ContainerNeuronCoreMemUsageRuntime = "container_neuroncore_memory_usage_runtime_memory"
+ ContainerNeuronCoreMemUsageTensors = "container_neuroncore_memory_usage_tensors"
+ ContainerNeuronCoreMemUsageTotal = "container_neuroncore_memory_usage_total"
+ ContainerNeuronDeviceHwEccEvents = "container_neurondevice_hw_ecc_events_total"
+
+ PodNeuronCoreUtil = "pod_neuroncore_utilization" // pod_* names start here
+ PodNeuronCoreMemUsageConstants = "pod_neuroncore_memory_usage_constants"
+ PodNeuronCoreMemUsageModel = "pod_neuroncore_memory_usage_model_code"
+ PodNeuronCoreMemUsageScratchpad = "pod_neuroncore_memory_usage_model_shared_scratchpad"
+ PodNeuronCoreMemUsageRuntime = "pod_neuroncore_memory_usage_runtime_memory"
+ PodNeuronCoreMemUsageTensors = "pod_neuroncore_memory_usage_tensors"
+ PodNeuronCoreMemUsageTotal = "pod_neuroncore_memory_usage_total"
+ PodNeuronDeviceHwEccEvents = "pod_neurondevice_hw_ecc_events_total"
+
+ NodeNeuronCoreUtil = "node_neuroncore_utilization" // node_* names start here
+ NodeNeuronCoreMemUsageConstants = "node_neuroncore_memory_usage_constants"
+ NodeNeuronCoreMemUsageModel = "node_neuroncore_memory_usage_model_code"
+ NodeNeuronCoreMemUsageScratchpad = "node_neuroncore_memory_usage_model_shared_scratchpad"
+ NodeNeuronCoreMemUsageRuntime = "node_neuroncore_memory_usage_runtime_memory"
+ NodeNeuronCoreMemUsageTensors = "node_neuroncore_memory_usage_tensors"
+ NodeNeuronCoreMemUsageTotal = "node_neuroncore_memory_usage_total"
+ NodeNeuronDeviceHwEccEvents = "node_neurondevice_hw_ecc_events_total"
+ NodeExecutionErrorsTotal = "node_neuron_execution_errors_total"
+ NodeExecutionErrorsGeneric = "node_neuron_execution_errors_generic"
+ NodeExecutionErrorsNumerical = "node_neuron_execution_errors_numerical"
+ NodeExecutionErrorsTransient = "node_neuron_execution_errors_transient"
+ NodeExecutionErrorsModel = "node_neuron_execution_errors_model"
+ NodeExecutionErrorsRuntime = "node_neuron_execution_errors_runtime"
+ NodeExecutionErrorsHardware = "node_neuron_execution_errors_hardware"
+ NodeExecutionStatusCompleted = "node_neuron_execution_status_completed"
+ NodeExecutionStatusTimedOut = "node_neuron_execution_status_timed_out"
+ NodeExecutionStatusCompletedWithErr = "node_neuron_execution_status_completed_with_err"
+ NodeExecutionStatusCompletedWithNumErr = "node_neuron_execution_status_completed_with_num_err"
+ NodeExecutionStatusIncorrectInput = "node_neuron_execution_status_incorrect_input"
+ NodeExecutionStatusFailedToQueue = "node_neuron_execution_status_failed_to_queue"
+ NodeNeuronDeviceRuntimeMemoryUsed = "node_neurondevice_runtime_memory_used_bytes"
+ NodeNeuronExecutionLatency = "node_neuron_execution_latency"
+)
diff --git a/test/metric/container_insights_util.go b/test/metric/container_insights_util.go
index ba4ed33bc..a61a3daf5 100644
--- a/test/metric/container_insights_util.go
+++ b/test/metric/container_insights_util.go
@@ -228,3 +228,49 @@ func ValidateLogs(env *environment.MetaData) status.TestResult {
testResult.Status = status.SUCCESSFUL
return testResult
}
+
+// ValidateLogsFrequency checks, for each EKS instance's log stream, that every log type listed in
+// eks_resources.EksClusterFrequencyValidationMap occurs exactly the expected number of times in a
+// one-minute window. It returns a FAILED result on the first lookup error, missing type or mismatch.
+func ValidateLogsFrequency(env *environment.MetaData) status.TestResult {
+	testResult := status.TestResult{
+		Name:   "emf-logs-frequency",
+		Status: status.FAILED,
+	}
+
+	// Window: the one-minute interval that ends at (now - 2 minutes), truncated to the minute.
+	end := time.Now().Add(time.Duration(-2) * time.Minute).Truncate(time.Minute)
+	start := end.Add(time.Duration(-1) * time.Minute)
+	group := fmt.Sprintf("/aws/containerinsights/%s/performance", env.EKSClusterName)
+
+	eKSInstances, err := awsservice.GetEKSInstances(env.EKSClusterName)
+	if err != nil {
+		log.Println("failed to get EKS instances", err)
+		return testResult
+	}
+
+	for _, instance := range eKSInstances {
+		stream := *instance.InstanceName
+		frequencyMap, err := awsservice.GetLogEventCountPerType(group, stream, &start, &end)
+		if err != nil { // checked before any lookup: on failure the map is empty and every type would look missing
+			log.Printf("log validation (%s/%s) failed: %v, start time : %s", group, stream, err, start)
+			return testResult
+		}
+		log.Printf("logs with no logtype : %d", frequencyMap[awsservice.NoLogTypeFound])
+
+		for logType, expectedFrequency := range eks_resources.EksClusterFrequencyValidationMap {
+			actualFrequency, ok := frequencyMap[logType]
+			if !ok {
+				log.Printf("no log with the expected logtype found : %s, start time : %s", logType, start.GoString())
+				return testResult
+			}
+			if actualFrequency != expectedFrequency {
+				log.Printf("log frequency validation failed for type: %s, expected: %d, actual: %d, start time: %s", logType, expectedFrequency, actualFrequency, start.GoString())
+				return testResult
+			}
+		}
+	}
+
+	testResult.Status = status.SUCCESSFUL
+	return testResult
+}
diff --git a/test/metric_value_benchmark/eks_resources/test_schemas/container_neuroncore.json b/test/metric_value_benchmark/eks_resources/test_schemas/container_neuroncore.json
new file mode 100644
index 000000000..c253fee52
--- /dev/null
+++ b/test/metric_value_benchmark/eks_resources/test_schemas/container_neuroncore.json
@@ -0,0 +1,50 @@
+{
+ "$schema": "http://json-schema.org/draft-04/schema#",
+ "title": "structured log schema",
+ "description": "json schema for the cloudwatch agent k8s structured log",
+ "type": "object",
+ "properties": {
+ "CloudWatchMetrics": {},
+ "ClusterName": {},
+ "ContainerName": {},
+ "FullPodName": {},
+ "InstanceId": {},
+ "InstanceType": {},
+ "K8sPodName": {},
+ "NeuronCore": {},
+ "NeuronDevice": {},
+ "NodeName": {},
+ "PodName": {},
+ "Service": {},
+ "Timestamp": {},
+ "Type": {},
+ "Version": {},
+ "availability_zone": {},
+ "kubernetes": {},
+ "region": {},
+ "subnet_id": {},
+ "container_neuroncore_memory_usage_constants": {},
+ "container_neuroncore_memory_usage_model_code": {},
+ "container_neuroncore_memory_usage_model_shared_scratchpad": {},
+ "container_neuroncore_memory_usage_runtime_memory": {},
+ "container_neuroncore_memory_usage_tensors": {},
+ "container_neuroncore_memory_usage_total": {},
+ "container_neuroncore_utilization": {}
+ },
+ "required": [
+ "ClusterName",
+ "ContainerName",
+ "FullPodName",
+ "InstanceId",
+ "InstanceType",
+ "NeuronCore",
+ "NeuronDevice",
+ "NodeName",
+ "PodName",
+ "Service",
+ "Timestamp",
+ "Type",
+ "Version",
+ "CloudWatchMetrics"
+ ]
+}
\ No newline at end of file
diff --git a/test/metric_value_benchmark/eks_resources/test_schemas/node_neuron.json b/test/metric_value_benchmark/eks_resources/test_schemas/node_neuron.json
new file mode 100644
index 000000000..b6c30b2da
--- /dev/null
+++ b/test/metric_value_benchmark/eks_resources/test_schemas/node_neuron.json
@@ -0,0 +1,33 @@
+{
+ "$schema": "http://json-schema.org/draft-04/schema#",
+ "title": "structured log schema",
+ "description": "json schema for the cloudwatch agent k8s structured log",
+ "type": "object",
+ "properties": {
+ "CloudWatchMetrics": {},
+ "ClusterName": {},
+ "InstanceId": {},
+ "InstanceType": {},
+ "NodeName": {},
+ "Timestamp": {},
+ "Type": {},
+ "Version": {},
+ "availability_zone": {},
+ "kubernetes": {},
+ "region": {},
+ "subnet_id": {},
+ "node_neuron_execution_errors_total": {},
+ "node_neuron_execution_latency": {},
+ "node_neurondevice_runtime_memory_used_bytes": {}
+ },
+ "required": [
+ "ClusterName",
+ "InstanceId",
+ "InstanceType",
+ "NodeName",
+ "Timestamp",
+ "Type",
+ "Version",
+ "CloudWatchMetrics"
+ ]
+}
\ No newline at end of file
diff --git a/test/metric_value_benchmark/eks_resources/test_schemas/node_neuroncore.json b/test/metric_value_benchmark/eks_resources/test_schemas/node_neuroncore.json
new file mode 100644
index 000000000..3e9a04319
--- /dev/null
+++ b/test/metric_value_benchmark/eks_resources/test_schemas/node_neuroncore.json
@@ -0,0 +1,42 @@
+{
+ "$schema": "http://json-schema.org/draft-04/schema#",
+ "title": "structured log schema",
+ "description": "json schema for the cloudwatch agent k8s structured log",
+ "type": "object",
+ "properties": {
+ "CloudWatchMetrics": {},
+ "ClusterName": {},
+ "InstanceId": {},
+ "InstanceType": {},
+ "Namespace": {},
+ "NeuronCore": {},
+ "NeuronDevice": {},
+ "NodeName": {},
+ "Timestamp": {},
+ "Type": {},
+ "Version": {},
+ "availability_zone": {},
+ "kubernetes": {},
+ "region": {},
+ "subnet_id": {},
+ "node_neuroncore_memory_usage_constants": {},
+ "node_neuroncore_memory_usage_model_code": {},
+ "node_neuroncore_memory_usage_model_shared_scratchpad": {},
+ "node_neuroncore_memory_usage_runtime_memory": {},
+ "node_neuroncore_memory_usage_tensors": {},
+ "node_neuroncore_memory_usage_total": {},
+ "node_neuroncore_utilization": {}
+ },
+ "required": [
+ "ClusterName",
+ "InstanceId",
+ "InstanceType",
+ "NeuronCore",
+ "NeuronDevice",
+ "NodeName",
+ "Timestamp",
+ "Type",
+ "Version",
+ "CloudWatchMetrics"
+ ]
+}
\ No newline at end of file
diff --git a/test/metric_value_benchmark/eks_resources/test_schemas/node_neurondevice.json b/test/metric_value_benchmark/eks_resources/test_schemas/node_neurondevice.json
new file mode 100644
index 000000000..30b642531
--- /dev/null
+++ b/test/metric_value_benchmark/eks_resources/test_schemas/node_neurondevice.json
@@ -0,0 +1,34 @@
+{
+ "$schema": "http://json-schema.org/draft-04/schema#",
+ "title": "structured log schema",
+ "description": "json schema for the cloudwatch agent k8s structured log",
+ "type": "object",
+ "properties": {
+ "CloudWatchMetrics": {},
+ "ClusterName": {},
+ "InstanceId": {},
+ "InstanceType": {},
+ "NeuronDevice": {},
+ "NodeName": {},
+ "Timestamp": {},
+ "Type": {},
+ "Version": {},
+ "availability_zone": {},
+ "kubernetes": {},
+ "node_neurondevice_hw_ecc_events_mem_ecc_corrected": {},
+ "region": {},
+ "subnet_id": {},
+ "node_neurondevice_hw_ecc_events_total": {}
+ },
+ "required": [
+ "ClusterName",
+ "InstanceId",
+ "InstanceType",
+ "NeuronDevice",
+ "NodeName",
+ "Timestamp",
+ "Type",
+ "Version",
+ "CloudWatchMetrics"
+ ]
+}
\ No newline at end of file
diff --git a/test/metric_value_benchmark/eks_resources/test_schemas/pod_neuroncore.json b/test/metric_value_benchmark/eks_resources/test_schemas/pod_neuroncore.json
new file mode 100644
index 000000000..9ba632799
--- /dev/null
+++ b/test/metric_value_benchmark/eks_resources/test_schemas/pod_neuroncore.json
@@ -0,0 +1,49 @@
+{
+ "$schema": "http://json-schema.org/draft-04/schema#",
+ "title": "structured log schema",
+ "description": "json schema for the cloudwatch agent k8s structured log",
+ "type": "object",
+ "properties": {
+ "CloudWatchMetrics": {},
+ "ClusterName": {},
+ "FullPodName": {},
+ "InstanceId": {},
+ "InstanceType": {},
+ "K8sPodName": {},
+ "NeuronCore": {},
+ "NeuronDevice": {},
+ "NodeName": {},
+ "PodName": {},
+ "Service": {},
+ "Timestamp": {},
+ "Type": {},
+ "Version": {},
+ "availability_zone": {},
+ "kubernetes": {},
+ "region": {},
+ "subnet_id": {},
+ "pod_neuroncore_memory_usage_constants": {},
+ "pod_neuroncore_memory_usage_model_code": {},
+ "pod_neuroncore_memory_usage_model_shared_scratchpad": {},
+ "pod_neuroncore_memory_usage_runtime_memory": {},
+ "pod_neuroncore_memory_usage_tensors": {},
+ "pod_neuroncore_memory_usage_total": {},
+ "pod_neuroncore_utilization": {}
+ },
+ "required": [
+ "ClusterName",
+ "ContainerName",
+ "FullPodName",
+ "InstanceId",
+ "InstanceType",
+ "NeuronCore",
+ "NeuronDevice",
+ "NodeName",
+ "PodName",
+ "Service",
+ "Timestamp",
+ "Type",
+ "Version",
+ "CloudWatchMetrics"
+ ]
+}
\ No newline at end of file
diff --git a/test/metric_value_benchmark/eks_resources/util.go b/test/metric_value_benchmark/eks_resources/util.go
index 3ea59a1a0..bd737f85b 100644
--- a/test/metric_value_benchmark/eks_resources/util.go
+++ b/test/metric_value_benchmark/eks_resources/util.go
@@ -46,26 +46,47 @@ var (
eksNodeGpuSchema string
//go:embed test_schemas/cluster_gpu.json
eksClusterGpuSchema string
+ //go:embed test_schemas/container_neuroncore.json
+ eksContainerNeuronCoreSchema string
+ //go:embed test_schemas/pod_neuroncore.json
+ eksPodNeuronCoreSchema string
+ //go:embed test_schemas/node_neuroncore.json
+ eksNodeNeuronCoreSchema string
+ //go:embed test_schemas/node_neurondevice.json
+ eksNodeNeuronDeviceSchema string
+ //go:embed test_schemas/node_neuron.json
+ eksNodeNeuronSchema string
EksClusterValidationMap = map[string]string{
- "Cluster": eksClusterSchema,
- "ClusterDaemonSet": eksClusterDaemonsetSchema,
- "ClusterDeployment": eksClusterDeploymentSchema,
- "ClusterNamespace": eksClusterNamespaceSchema,
- "ClusterService": eksClusterServiceSchema,
- "Container": eksContainerSchema,
- "ContainerFS": eksContainerFSSchema,
- "ControlPlane": eksControlPlaneSchema,
- "Node": eksNodeSchema,
- "NodeDiskIO": eksNodeDiskIOSchema,
- "NodeFS": eksNodeFSSchema,
- "NodeNet": eksNodeNetSchema,
- "Pod": eksPodSchema,
- "PodNet": eksPodNetSchema,
- "ContainerGPU": eksContainerGpuSchema,
- "PodGPU": eksPodGpuSchema,
- "NodeGPU": eksNodeGpuSchema,
- "ClusterGPU": eksClusterGpuSchema,
+ "Cluster": eksClusterSchema,
+ "ClusterDaemonSet": eksClusterDaemonsetSchema,
+ "ClusterDeployment": eksClusterDeploymentSchema,
+ "ClusterNamespace": eksClusterNamespaceSchema,
+ "ClusterService": eksClusterServiceSchema,
+ "Container": eksContainerSchema,
+ "ContainerFS": eksContainerFSSchema,
+ "ControlPlane": eksControlPlaneSchema,
+ "Node": eksNodeSchema,
+ "NodeDiskIO": eksNodeDiskIOSchema,
+ "NodeFS": eksNodeFSSchema,
+ "NodeNet": eksNodeNetSchema,
+ "Pod": eksPodSchema,
+ "PodNet": eksPodNetSchema,
+ "ContainerGPU": eksContainerGpuSchema,
+ "PodGPU": eksPodGpuSchema,
+ "NodeGPU": eksNodeGpuSchema,
+ "ClusterGPU": eksClusterGpuSchema,
+ "ContainerAWSNeuronCore": eksContainerNeuronCoreSchema,
+ "PodAWSNeuronCore": eksPodNeuronCoreSchema,
+ "NodeAWSNeuronCore": eksNodeNeuronCoreSchema,
+ "NodeAWSNeuronDevice": eksNodeNeuronDeviceSchema,
+ "NodeAWSNeuron": eksNodeNeuronSchema,
+ }
+
+ EksClusterFrequencyValidationMap = map[string]int{
+ "NodeAWSNeuronCore": 32,
+ "NodeAWSNeuronDevice": 16,
+ "NodeAWSNeuron": 1,
}
)
diff --git a/util/awsservice/cloudwatchlogs.go b/util/awsservice/cloudwatchlogs.go
index 10a9f830b..c801b7880 100644
--- a/util/awsservice/cloudwatchlogs.go
+++ b/util/awsservice/cloudwatchlogs.go
@@ -5,6 +5,7 @@ package awsservice
import (
"context"
+ "encoding/json"
"errors"
"fmt"
"log"
@@ -20,6 +21,7 @@ import (
const (
logStreamRetry = 20
retryInterval = 10 * time.Second
+ NoLogTypeFound = "NoLogTypeFound"
)
// catch ResourceNotFoundException when deleting the log group and log stream, as these
@@ -320,3 +322,27 @@ func AssertNoDuplicateLogs() LogEventsValidator {
return nil
}
}
+
+// GetLogEventCountPerType fetches the events of logStream between since and until and counts them
+// by the "Type" decoded from each message. Events with a nil message, a message that fails to
+// unmarshal, or an empty Type are counted once under NoLogTypeFound and nowhere else.
+func GetLogEventCountPerType(logGroup, logStream string, since, until *time.Time) (map[string]int, error) {
+	typeFrequency := make(map[string]int)
+	events, err := getLogsSince(logGroup, logStream, since, until)
+	if err != nil {
+		return typeFrequency, err // the empty map is returned together with the error
+	}
+
+	typeFrequency[NoLogTypeFound] = 0
+	for _, event := range events {
+		var eksClusterType EKSClusterType
+		// Skip to the next event after counting an untyped one, so it is never also tallied under "".
+		if event.Message == nil || json.Unmarshal([]byte(*event.Message), &eksClusterType) != nil || eksClusterType.Type == "" {
+			typeFrequency[NoLogTypeFound]++
+			continue
+		}
+		typeFrequency[eksClusterType.Type]++
+	}
+
+	return typeFrequency, nil
+}