From 766e41a00c26efef349c652973b4855cd336308a Mon Sep 17 00:00:00 2001 From: Aditya Purang <44022838+aditya-purang@users.noreply.github.com> Date: Wed, 21 Aug 2024 17:29:10 +0100 Subject: [PATCH] add integration tests for AWS Neuron (#416) --- .../resources/dummy-neuron-monitor/Dockerfile | 32 + .../dummy_neuron_monitor.py | 890 ++++++++++++++++++ generator/test_case_generator.go | 4 + terraform/eks/daemon/awsneuron/main.tf | 826 ++++++++++++++++ terraform/eks/daemon/awsneuron/providers.tf | 17 + terraform/eks/daemon/awsneuron/variables.tf | 28 + test/awsneuron/neuron_metrics_test.go | 63 ++ test/awsneuron/neuron_test.go | 77 ++ test/awsneuron/resources/config.json | 16 + test/awsneuron/resources/httpd-ssl.conf | 43 + test/awsneuron/resources/httpd.conf | 101 ++ test/awsneuron/resources/metrics_list.go | 48 + test/metric/container_insights_util.go | 46 + .../test_schemas/container_neuroncore.json | 50 + .../test_schemas/node_neuron.json | 33 + .../test_schemas/node_neuroncore.json | 42 + .../test_schemas/node_neurondevice.json | 34 + .../test_schemas/pod_neuroncore.json | 49 + .../eks_resources/util.go | 57 +- util/awsservice/cloudwatchlogs.go | 26 + 20 files changed, 2464 insertions(+), 18 deletions(-) create mode 100644 docs/resources/dummy-neuron-monitor/Dockerfile create mode 100644 docs/resources/dummy-neuron-monitor/dummy_neuron_monitor.py create mode 100644 terraform/eks/daemon/awsneuron/main.tf create mode 100644 terraform/eks/daemon/awsneuron/providers.tf create mode 100644 terraform/eks/daemon/awsneuron/variables.tf create mode 100644 test/awsneuron/neuron_metrics_test.go create mode 100644 test/awsneuron/neuron_test.go create mode 100644 test/awsneuron/resources/config.json create mode 100644 test/awsneuron/resources/httpd-ssl.conf create mode 100644 test/awsneuron/resources/httpd.conf create mode 100644 test/awsneuron/resources/metrics_list.go create mode 100644 test/metric_value_benchmark/eks_resources/test_schemas/container_neuroncore.json create mode 
100644 test/metric_value_benchmark/eks_resources/test_schemas/node_neuron.json create mode 100644 test/metric_value_benchmark/eks_resources/test_schemas/node_neuroncore.json create mode 100644 test/metric_value_benchmark/eks_resources/test_schemas/node_neurondevice.json create mode 100644 test/metric_value_benchmark/eks_resources/test_schemas/pod_neuroncore.json diff --git a/docs/resources/dummy-neuron-monitor/Dockerfile b/docs/resources/dummy-neuron-monitor/Dockerfile new file mode 100644 index 000000000..47b06ba33 --- /dev/null +++ b/docs/resources/dummy-neuron-monitor/Dockerfile @@ -0,0 +1,32 @@ +# set the base image +FROM public.ecr.aws/docker/library/ubuntu:20.04 + +# Set the working directory in the container +WORKDIR /root + +# Neuron SDK components version numbers +ARG NEURONX_RUNTIME_LIB_VERSION=2.19.* +ARG NEURONX_COLLECTIVES_LIB_VERSION=2.19.* +ARG NEURONX_TOOLS_VERSION=2.17.* + +# Install any necessary dependencies or tools +RUN apt-get update && apt-get install vim wget zip unzip sudo python3-pip -y + +# Add the Neuron apt repository and import its GPG public key +RUN echo "deb https://apt.repos.neuron.amazonaws.com focal main" > /etc/apt/sources.list.d/neuron.list +RUN wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | apt-key add - + + +# Install Neuron Runtime and Neuron Tools +RUN apt-get update \ + && apt-get install -y \ + aws-neuronx-tools=$NEURONX_TOOLS_VERSION \ + aws-neuronx-collectives=$NEURONX_COLLECTIVES_LIB_VERSION \ + aws-neuronx-runtime-lib=$NEURONX_RUNTIME_LIB_VERSION \ + && rm -rf /var/lib/apt/lists/* \ + && rm -rf /tmp/tmp* \ + && apt-get clean + +COPY dummy_neuron_monitor.py /opt/aws/neuron/bin/dummy_neuron_monitor.py +RUN chmod 755 /opt/aws/neuron/bin/dummy_neuron_monitor.py +RUN pip3 install prometheus_client boto3 requests \ No newline at end of file diff --git a/docs/resources/dummy-neuron-monitor/dummy_neuron_monitor.py b/docs/resources/dummy-neuron-monitor/dummy_neuron_monitor.py new file mode 100644 index 
000000000..22ccf7e74 --- /dev/null +++ b/docs/resources/dummy-neuron-monitor/dummy_neuron_monitor.py @@ -0,0 +1,890 @@ +#!/usr/bin/env python3 + +import sys +import json +import argparse +import signal +import hashlib +import time +from prometheus_client import start_http_server, Gauge, Counter, Info + + +def get_instance_labels(instance_info): + instance_labels = { + 'instance_name': instance_info['instance_name'], + 'instance_id': instance_info['instance_id'], + 'instance_type': instance_info['instance_type'], + 'availability_zone': instance_info['instance_availability_zone'], + 'region': instance_info['instance_region'], + 'subnet_id': instance_info['subnet_id'] + } + return instance_labels + + +def get_runtime_labels(instance_info, runtime_tag): + label_dict = instance_info.copy() + label_dict['runtime_tag'] = runtime_tag + return label_dict + + +def process_neuroncore_counters(group_obj, data, labels): + gauge_name = 'neuroncore_utilization_ratio' + labels['neuroncore'] = None + if gauge_name not in group_obj: + group_obj[gauge_name] = Gauge(gauge_name, 'NeuronCore utilization ratio', labels.keys()) + for nc_idx, nc_data in data['neuroncores_in_use'].items(): + labels['neuroncore'] = int(nc_idx) + group_obj[gauge_name].labels(**labels).set(nc_data['neuroncore_utilization'] / 100.0) + + +def process_neuron_runtime_vcpu_usage(group_obj, data, labels): + gauge_name = 'neuron_runtime_vcpu_usage_ratio' + labels['usage_type'] = None + if gauge_name not in group_obj: + group_obj[gauge_name] = Gauge(gauge_name, 'Runtime vCPU utilization ratio', labels.keys()) + cpu_usage_fields = ['user', 'system'] + for field in cpu_usage_fields: + labels['usage_type'] = field + group_obj[gauge_name].labels(**labels).set(data['vcpu_usage'][field] / 100.0) + + +def process_memory_used(group_obj, data, labels): + gauge_name = 'neuron_runtime_memory_used_bytes' + labels['memory_location'] = None + if gauge_name not in group_obj: + group_obj[gauge_name] = Gauge(gauge_name, 'Runtime 
memory used bytes', labels.keys()) + mem_locations = ['host', 'neuron_device'] + for mem_location_type in mem_locations: + labels['memory_location'] = mem_location_type + group_obj[gauge_name].labels(**labels).set(data['neuron_runtime_used_bytes'][mem_location_type]) + + gauge_name_prefix = 'neuroncore_memory_usage_{}' + labels['neuroncore'] = None + labels['memory_location'] = None + neuroncore_memory_usage_type = ['constants', 'model_code', 'model_shared_scratchpad', 'runtime_memory', 'tensors'] + for memory_usage_type in neuroncore_memory_usage_type: + gauge_name = gauge_name_prefix.format(memory_usage_type) + if gauge_name not in group_obj: + group_obj[gauge_name] = Gauge(gauge_name, 'NeuronCore memory utilization for {}'.format(memory_usage_type), labels.keys()) + for nc_idx, nc_data in data['neuron_runtime_used_bytes']['usage_breakdown']['neuroncore_memory_usage'].items(): + labels['neuroncore'] = int(nc_idx) + group_obj[gauge_name].labels(**labels).set(nc_data[memory_usage_type]) + + +def process_execution_stats(group_obj, data, labels): + counter_name = 'execution_errors_total' + err_labels = labels.copy() + err_labels['error_type'] = None + if counter_name not in group_obj: + group_obj[counter_name] = Counter(counter_name, 'Execution errors total', err_labels.keys()) + error_summary = data['error_summary'] + for error_type in error_summary: + err_labels['error_type'] = error_type + group_obj[counter_name].labels(**err_labels).inc(error_summary[error_type]) + + counter_name = 'execution_status_total' + status_labels = labels.copy() + status_labels['status_type'] = None + if counter_name not in group_obj: + group_obj[counter_name] = Counter(counter_name, 'Execution status total', status_labels.keys()) + execution_summary = data['execution_summary'] + for execution_outcome in execution_summary: + status_labels['status_type'] = execution_outcome + group_obj[counter_name].labels(**status_labels).inc(execution_summary[execution_outcome]) + + gauge_name = 
'execution_latency_seconds' + latency_labels = labels.copy() + latency_labels['percentile'] = None + if gauge_name not in group_obj: + group_obj[gauge_name] = Gauge(gauge_name, 'Execution latency in seconds', latency_labels.keys()) + latency_stats = data['latency_stats'] + if latency_stats['total_latency'] is not None: + for percentile in latency_stats['total_latency']: + latency_labels['percentile'] = percentile + group_obj[gauge_name].labels(**latency_labels).set(latency_stats['total_latency'][percentile]) + + +def process_neuron_hw_counters(group_obj, data, labels): + counter_name = 'hardware_ecc_events_total' + labels['event_type'] = None + labels['neuron_device_index'] = None + if counter_name not in group_obj: + group_obj[counter_name] = Counter(counter_name, 'Hardware ecc events total', labels.keys()) + hw_counters = ['mem_ecc_corrected', 'mem_ecc_uncorrected', 'sram_ecc_corrected', 'sram_ecc_uncorrected'] + for device in data['neuron_devices']: + for counter in hw_counters: + labels['event_type'] = counter + labels['neuron_device_index'] = device['neuron_device_index'] + group_obj[counter_name].labels(**labels).inc(device[counter]) + + +def process_vcpu_usage(group_obj, data, labels): + cpu_usage_aggregation = { + 'user': ['user', 'nice'], + 'system': ['system', 'io_wait', 'irq', 'soft_irq'] + } + gauge_name = 'system_vcpu_count' + if gauge_name not in group_obj: + group_obj[gauge_name] = Gauge(gauge_name, 'System vCPU count', labels.keys()) + group_obj[gauge_name].labels(**labels).set(len(data['usage_data'])) + + labels['usage_type'] = None + gauge_name = 'system_vcpu_usage_ratio' + if gauge_name not in group_obj: + group_obj[gauge_name] = Gauge(gauge_name, 'System CPU utilization ratio', labels.keys()) + for field, aggregated in cpu_usage_aggregation.items(): + aggregate_value = sum([data['average_usage'][item] for item in aggregated]) + aggregate_value = min(aggregate_value, 100.0) + labels['usage_type'] = field + 
group_obj[gauge_name].labels(**labels).set(aggregate_value / 100.0) + + +def process_memory_info(group_obj, data, labels): + for entries in [('memory', 'system_memory'), ('swap', 'system_swap')]: + for stat in ['total_bytes', 'used_bytes']: + gauge_name = '{}_{}'.format(entries[1], stat) + if gauge_name not in group_obj: + group_obj[gauge_name] = Gauge(gauge_name, + 'System {} {} bytes'.format(entries[0], stat), labels.keys()) + src_entry = '{}_{}'.format(entries[0], stat) + group_obj[gauge_name].labels(**labels).set(data[src_entry]) + + +def process_neuron_hardware_info(metric_objects, data, instance_data): + if 'neuron_hardware_info' not in metric_objects: + neuron_labels = { + 'neuron_device_count': str(data['neuron_device_count']), + 'neuroncore_per_device_count': str(data['neuroncore_per_device_count']) + } + neuron_labels.update(instance_data) + + metric_objects['neuron_hardware_info'] = Info('neuron_hardware', 'Neuron Hardware Information') + metric_objects['neuron_hardware_info'].info(neuron_labels) + + +def process_instance_info(metric_objects, instance_data): + if 'instance_info' not in metric_objects: + metric_objects['instance_info'] = Info('instance', 'EC2 instance information') + metric_objects['instance_info'].info(instance_data) + + +def process_report_entries(metric_objects, report_entries, labels, runtime_tag=None): + for metric_group_name, metric_group_data in report_entries.items(): + handler_name = 'process_{}'.format(metric_group_name) + if handler_name in globals(): + crt_error = metric_group_data['error'] + if crt_error == '': + if metric_group_name not in metric_objects: + metric_objects[metric_group_name] = {} + metric_group_object = metric_objects[metric_group_name] + globals()[handler_name](metric_group_object, metric_group_data, labels.copy()) + else: + if runtime_tag is not None: + print('Error getting {} for runtime tag {}: {}'.format( + metric_group_name, runtime_tag, crt_error), file=sys.stderr) + else: + print('Error getting {}: 
{}'.format(metric_group_name, crt_error), file=sys.stderr) + + +def process_data(metric_objects, monitor_data, instance_info): + if monitor_data.get('neuron_runtime_data', []): + for runtime in monitor_data['neuron_runtime_data']: + runtime_tag = runtime['neuron_runtime_tag'] + + if runtime['error'] != '': + print('Runtime {} error: {}'.format(runtime_tag, runtime['error']), file=sys.stderr) + continue + + process_report_entries(metric_objects, runtime['report'], + get_runtime_labels(instance_info, runtime_tag), runtime_tag) + else: # Reset gauges if no neuron_runtime is running + clear_gauges_from_metric_objects(metric_objects) + + if monitor_data['system_data'] is not None: + process_report_entries(metric_objects, monitor_data['system_data'], instance_info) + process_instance_info(metric_objects, instance_info) + if monitor_data['neuron_hardware_info'] is not None: + process_neuron_hardware_info(metric_objects, monitor_data['neuron_hardware_info'], instance_info) + +def clear_gauges_from_metric_objects(all_metric_objects): + for _, metricGroupedObjects in all_metric_objects.items(): + if(isinstance(metricGroupedObjects,dict)): + for _, metrics in metricGroupedObjects.items(): + if metrics._type == 'gauge' or metrics._type == 'counter': + metrics._metrics.clear() + +def _calculate_file_hash(file_path): + with open(file_path, "rb") as f: + file_hash = hashlib.sha256(f.read()).hexdigest() + return file_hash + +def _update_ssl_cxt(certfile, keyfile): + global ssl_cxt + ssl_cxt.load_cert_chain(certfile=certfile, keyfile=keyfile) + print("Refreshing TLS certificates") + +def _watch_file_and_update_ssl_cxt(original_hash, certfile, keyfile): + current_certfile_hash = _calculate_file_hash(certfile) + current_keyfile_hash = _calculate_file_hash(keyfile) + if (original_hash["certfile"] != current_certfile_hash) or (original_hash["keyfile"] != current_keyfile_hash): + _update_ssl_cxt(certfile, keyfile) + original_hash["certfile"] = current_certfile_hash + 
original_hash["keyfile"] = current_keyfile_hash + +def update_loop(certfile, keyfile): + running = True + + def signal_handler(*_): + nonlocal running + running = False + signal.signal(signal.SIGINT, signal_handler) + + """ Dictionary containing all prometheus client objects, first by metric group and + then by metric, for example, for neuroncore_counters: + all_metric_objects['neuroncore_counters']['neuroncore_utilization_ratio'] = Gauge(...) + """ + all_metric_objects = {} + original_file_hash = {} + instance_labels = None + if certfile and keyfile: + certfile_hash = _calculate_file_hash(certfile) + keyfile_hash = _calculate_file_hash(keyfile) + original_file_hash = {"certfile":certfile_hash,"keyfile":keyfile_hash} + + while running: + line = ('{"neuron_runtime_data":[{"pid":457402,"neuron_runtime_tag":"367","error":"","report":{' + '"execution_stats":{"period":4.999666547,"error_summary":{"generic":2,"numerical":0,"transient":0,' + '"model":0,"runtime":0,"hardware":0},"execution_summary":{"completed":2,"completed_with_err":0,' + '"completed_with_num_err":0,"timed_out":0,"incorrect_input":0,"failed_to_queue":0},"latency_stats":{' + '"total_latency":null,"device_latency":null},"error":""},"memory_used":{"period":4.999671285,' + '"neuron_runtime_used_bytes":{"host":9043968,"neuron_device":3541303936,"usage_breakdown":{"host":{' + '"application_memory":655360,"constants":0,"dma_buffers":8388608,"tensors":0},' + '"neuroncore_memory_usage":{"0":{"constants":0,"model_code":100752896,"model_shared_scratchpad":0,' + '"runtime_memory":0,"tensors":9912852},"1":{"constants":0,"model_code":100752896,' + '"model_shared_scratchpad":0,"runtime_memory":0,"tensors":9912852},"2":{"constants":0,' + '"model_code":100752896,"model_shared_scratchpad":0,"runtime_memory":0,"tensors":9912852},' + '"3":{"constants":0,"model_code":100752896,"model_shared_scratchpad":0,"runtime_memory":0,' + '"tensors":9912852},"4":{"constants":0,"model_code":100752896,"model_shared_scratchpad":0,' + 
'"runtime_memory":0,"tensors":9912852},"5":{"constants":0,"model_code":100752896,' + '"model_shared_scratchpad":0,"runtime_memory":0,"tensors":9912852},"6":{"constants":0,' + '"model_code":100752896,"model_shared_scratchpad":0,"runtime_memory":0,"tensors":9912852},' + '"7":{"constants":0,"model_code":100752896,"model_shared_scratchpad":0,"runtime_memory":0,' + '"tensors":9912852},"8":{"constants":0,"model_code":100752896,"model_shared_scratchpad":0,' + '"runtime_memory":0,"tensors":9912852},"9":{"constants":0,"model_code":100752896,' + '"model_shared_scratchpad":0,"runtime_memory":0,"tensors":9912852},"10":{"constants":0,' + '"model_code":100752896,"model_shared_scratchpad":0,"runtime_memory":0,"tensors":9912852},' + '"11":{"constants":0,"model_code":100752896,"model_shared_scratchpad":0,"runtime_memory":0,' + '"tensors":9912852},"12":{"constants":0,"model_code":100752896,"model_shared_scratchpad":0,' + '"runtime_memory":0,"tensors":9912852},"13":{"constants":0,"model_code":100752896,' + '"model_shared_scratchpad":0,"runtime_memory":0,"tensors":9912852},"14":{"constants":0,' + '"model_code":100752896,"model_shared_scratchpad":0,"runtime_memory":0,"tensors":9912852},' + '"15":{"constants":0,"model_code":100752896,"model_shared_scratchpad":0,"runtime_memory":0,' + '"tensors":9912852},"16":{"constants":0,"model_code":100752896,"model_shared_scratchpad":0,' + '"runtime_memory":0,"tensors":9912852},"17":{"constants":0,"model_code":100752896,' + '"model_shared_scratchpad":0,"runtime_memory":0,"tensors":9912852},"18":{"constants":0,' + '"model_code":100752896,"model_shared_scratchpad":0,"runtime_memory":0,"tensors":9912852},' + '"19":{"constants":0,"model_code":100752896,"model_shared_scratchpad":0,"runtime_memory":0,' + '"tensors":9912852},"20":{"constants":0,"model_code":100752896,"model_shared_scratchpad":0,' + '"runtime_memory":0,"tensors":9912852},"21":{"constants":0,"model_code":100752896,' + 
'"model_shared_scratchpad":0,"runtime_memory":0,"tensors":9912852},"22":{"constants":0,' + '"model_code":100752896,"model_shared_scratchpad":0,"runtime_memory":0,"tensors":9912852},' + '"23":{"constants":0,"model_code":100752896,"model_shared_scratchpad":0,"runtime_memory":0,' + '"tensors":9912852},"24":{"constants":0,"model_code":100752896,"model_shared_scratchpad":0,' + '"runtime_memory":0,"tensors":9912852},"25":{"constants":0,"model_code":100752896,' + '"model_shared_scratchpad":0,"runtime_memory":0,"tensors":9912852},"26":{"constants":0,' + '"model_code":100752896,"model_shared_scratchpad":0,"runtime_memory":0,"tensors":9912852},' + '"27":{"constants":0,"model_code":100752896,"model_shared_scratchpad":0,"runtime_memory":0,' + '"tensors":9912852},"28":{"constants":0,"model_code":100752896,"model_shared_scratchpad":0,' + '"runtime_memory":0,"tensors":9912852},"29":{"constants":0,"model_code":100752896,' + '"model_shared_scratchpad":0,"runtime_memory":0,"tensors":9912852},"30":{"constants":0,' + '"model_code":100752896,"model_shared_scratchpad":0,"runtime_memory":0,"tensors":9912852},' + '"31":{"constants":0,"model_code":100752896,"model_shared_scratchpad":0,"runtime_memory":0,' + '"tensors":9912852}}}},"loaded_models":[{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",' + '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10019,"is_running":false,"subgraphs":{"sg_00":{' + '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{' + '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{' + '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":1,' + '"neuron_device_index":5}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",' + '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10005,"is_running":false,"subgraphs":{"sg_00":{' + '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{' + 
'"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{' + '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":0,' + '"neuron_device_index":5}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",' + '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10007,"is_running":false,"subgraphs":{"sg_00":{' + '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{' + '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{' + '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":0,' + '"neuron_device_index":10}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",' + '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10013,"is_running":false,"subgraphs":{"sg_00":{' + '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{' + '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{' + '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":1,' + '"neuron_device_index":10}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",' + '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10029,"is_running":false,"subgraphs":{"sg_00":{' + '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{' + '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{' + '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":0,' + '"neuron_device_index":2}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",' + '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10032,"is_running":false,"subgraphs":{"sg_00":{' + '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{' + '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{' + 
'"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":1,' + '"neuron_device_index":2}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",' + '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10004,"is_running":false,"subgraphs":{"sg_00":{' + '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{' + '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{' + '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":1,' + '"neuron_device_index":0}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",' + '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10012,"is_running":false,"subgraphs":{"sg_00":{' + '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{' + '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{' + '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":0,' + '"neuron_device_index":0}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",' + '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10001,"is_running":false,"subgraphs":{"sg_00":{' + '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{' + '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{' + '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":1,' + '"neuron_device_index":3}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",' + '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10016,"is_running":false,"subgraphs":{"sg_00":{' + '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{' + '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{' + '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":0,' + 
'"neuron_device_index":3}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",' + '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10022,"is_running":false,"subgraphs":{"sg_00":{' + '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{' + '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{' + '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":1,' + '"neuron_device_index":11}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",' + '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10024,"is_running":false,"subgraphs":{"sg_00":{' + '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{' + '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{' + '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":0,' + '"neuron_device_index":11}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",' + '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10026,"is_running":false,"subgraphs":{"sg_00":{' + '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{' + '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{' + '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":1,' + '"neuron_device_index":13}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",' + '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10031,"is_running":false,"subgraphs":{"sg_00":{' + '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{' + '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{' + '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":0,' + '"neuron_device_index":13}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",' + 
'"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10025,"is_running":false,"subgraphs":{"sg_00":{' + '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{' + '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{' + '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":1,' + '"neuron_device_index":15}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",' + '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10021,"is_running":false,"subgraphs":{"sg_00":{' + '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{' + '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{' + '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":0,' + '"neuron_device_index":15}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",' + '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10006,"is_running":false,"subgraphs":{"sg_00":{' + '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{' + '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{' + '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":0,' + '"neuron_device_index":12}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",' + '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10011,"is_running":false,"subgraphs":{"sg_00":{' + '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{' + '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{' + '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":1,' + '"neuron_device_index":12}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",' + '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10010,"is_running":false,"subgraphs":{"sg_00":{' + 
'"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{' + '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{' + '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":0,' + '"neuron_device_index":6}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",' + '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10015,"is_running":false,"subgraphs":{"sg_00":{' + '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{' + '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{' + '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":1,' + '"neuron_device_index":6}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",' + '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10008,"is_running":false,"subgraphs":{"sg_00":{' + '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{' + '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{' + '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":1,' + '"neuron_device_index":1}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",' + '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10027,"is_running":false,"subgraphs":{"sg_00":{' + '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{' + '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{' + '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":0,' + '"neuron_device_index":1}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",' + '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10018,"is_running":false,"subgraphs":{"sg_00":{' + '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{' + 
'"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{' + '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":1,' + '"neuron_device_index":14}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",' + '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10030,"is_running":false,"subgraphs":{"sg_00":{' + '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{' + '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{' + '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":0,' + '"neuron_device_index":14}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",' + '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10017,"is_running":false,"subgraphs":{"sg_00":{' + '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{' + '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{' + '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":0,' + '"neuron_device_index":9}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",' + '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10003,"is_running":false,"subgraphs":{"sg_00":{' + '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{' + '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{' + '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":1,' + '"neuron_device_index":9}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",' + '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10002,"is_running":false,"subgraphs":{"sg_00":{' + '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{' + '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{' + 
'"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":1,' + '"neuron_device_index":4}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",' + '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10009,"is_running":false,"subgraphs":{"sg_00":{' + '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{' + '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{' + '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":0,' + '"neuron_device_index":4}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",' + '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10020,"is_running":false,"subgraphs":{"sg_00":{' + '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{' + '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{' + '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":0,' + '"neuron_device_index":7}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",' + '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10028,"is_running":false,"subgraphs":{"sg_00":{' + '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{' + '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{' + '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":1,' + '"neuron_device_index":7}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",' + '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10014,"is_running":false,"subgraphs":{"sg_00":{' + '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{' + '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{' + '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":1,' + 
'"neuron_device_index":8}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",' + '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10023,"is_running":false,"subgraphs":{"sg_00":{' + '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{' + '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{' + '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":0,' + '"neuron_device_index":8}}}],"error":""},"neuron_runtime_vcpu_usage":{"period":4.999647382,' + '"vcpu_usage":{"user":0,"system":0},"error":"open/proc/457402/stat:nosuchfileordirectory"},' + '"neuroncore_counters":{"period":4.999667932,"neuroncores_in_use":{"0":{"neuroncore_utilization":0},' + '"1":{"neuroncore_utilization":0},"2":{"neuroncore_utilization":0},"3":{"neuroncore_utilization":0},' + '"4":{"neuroncore_utilization":0},"5":{"neuroncore_utilization":0},"6":{"neuroncore_utilization": 41.4},' + '"7":{"neuroncore_utilization":0},"8":{"neuroncore_utilization":0},"9":{"neuroncore_utilization":0},' + '"10":{"neuroncore_utilization":0},"11":{"neuroncore_utilization":0},' + '"12":{"neuroncore_utilization":0},"13":{"neuroncore_utilization":0},' + '"14":{"neuroncore_utilization":0},"15":{"neuroncore_utilization":0},' + '"16":{"neuroncore_utilization":0},"17":{"neuroncore_utilization":0},' + '"18":{"neuroncore_utilization":0},"19":{"neuroncore_utilization":0},' + '"20":{"neuroncore_utilization":0},"21":{"neuroncore_utilization":0},' + '"22":{"neuroncore_utilization":0},"23":{"neuroncore_utilization":0},' + '"24":{"neuroncore_utilization":0},"25":{"neuroncore_utilization":0},' + '"26":{"neuroncore_utilization":0},"27":{"neuroncore_utilization":0},' + '"28":{"neuroncore_utilization":0},"29":{"neuroncore_utilization":0},' + '"30":{"neuroncore_utilization":0},"31":{"neuroncore_utilization":0}},"error":""}}}, {"pid":457402,"neuron_runtime_tag":"123","error":"","report":{' + 
'"execution_stats":{"period":4.999666547,"error_summary":{"generic":2,"numerical":0,"transient":0,' + '"model":0,"runtime":0,"hardware":0},"execution_summary":{"completed":2,"completed_with_err":0,' + '"completed_with_num_err":0,"timed_out":0,"incorrect_input":0,"failed_to_queue":0},"latency_stats":{' + '"total_latency":null,"device_latency":null},"error":""},"memory_used":{"period":4.999671285,' + '"neuron_runtime_used_bytes":{"host":9043968,"neuron_device":3541303936,"usage_breakdown":{"host":{' + '"application_memory":655360,"constants":0,"dma_buffers":8388608,"tensors":0},' + '"neuroncore_memory_usage":{"0":{"constants":0,"model_code":100752896,"model_shared_scratchpad":0,' + '"runtime_memory":0,"tensors":9912852},"1":{"constants":0,"model_code":100752896,' + '"model_shared_scratchpad":0,"runtime_memory":0,"tensors":9912852},"2":{"constants":0,' + '"model_code":100752896,"model_shared_scratchpad":0,"runtime_memory":0,"tensors":9912852},' + '"3":{"constants":0,"model_code":100752896,"model_shared_scratchpad":0,"runtime_memory":0,' + '"tensors":9912852},"4":{"constants":0,"model_code":100752896,"model_shared_scratchpad":0,' + '"runtime_memory":0,"tensors":9912852},"5":{"constants":0,"model_code":100752896,' + '"model_shared_scratchpad":0,"runtime_memory":0,"tensors":9912852},"6":{"constants":0,' + '"model_code":100752896,"model_shared_scratchpad":0,"runtime_memory":0,"tensors":9912852},' + '"7":{"constants":0,"model_code":100752896,"model_shared_scratchpad":0,"runtime_memory":0,' + '"tensors":9912852},"8":{"constants":0,"model_code":100752896,"model_shared_scratchpad":0,' + '"runtime_memory":0,"tensors":9912852},"9":{"constants":0,"model_code":100752896,' + '"model_shared_scratchpad":0,"runtime_memory":0,"tensors":9912852},"10":{"constants":0,' + '"model_code":100752896,"model_shared_scratchpad":0,"runtime_memory":0,"tensors":9912852},' + '"11":{"constants":0,"model_code":100752896,"model_shared_scratchpad":0,"runtime_memory":0,' + 
'"tensors":9912852},"12":{"constants":0,"model_code":100752896,"model_shared_scratchpad":0,' + '"runtime_memory":0,"tensors":9912852},"13":{"constants":0,"model_code":100752896,' + '"model_shared_scratchpad":0,"runtime_memory":0,"tensors":9912852},"14":{"constants":0,' + '"model_code":100752896,"model_shared_scratchpad":0,"runtime_memory":0,"tensors":9912852},' + '"15":{"constants":0,"model_code":100752896,"model_shared_scratchpad":0,"runtime_memory":0,' + '"tensors":9912852},"16":{"constants":0,"model_code":100752896,"model_shared_scratchpad":0,' + '"runtime_memory":0,"tensors":9912852},"17":{"constants":0,"model_code":100752896,' + '"model_shared_scratchpad":0,"runtime_memory":0,"tensors":9912852},"18":{"constants":0,' + '"model_code":100752896,"model_shared_scratchpad":0,"runtime_memory":0,"tensors":9912852},' + '"19":{"constants":0,"model_code":100752896,"model_shared_scratchpad":0,"runtime_memory":0,' + '"tensors":9912852},"20":{"constants":0,"model_code":100752896,"model_shared_scratchpad":0,' + '"runtime_memory":0,"tensors":9912852},"21":{"constants":0,"model_code":100752896,' + '"model_shared_scratchpad":0,"runtime_memory":0,"tensors":9912852},"22":{"constants":0,' + '"model_code":100752896,"model_shared_scratchpad":0,"runtime_memory":0,"tensors":9912852},' + '"23":{"constants":0,"model_code":100752896,"model_shared_scratchpad":0,"runtime_memory":0,' + '"tensors":9912852},"24":{"constants":0,"model_code":100752896,"model_shared_scratchpad":0,' + '"runtime_memory":0,"tensors":9912852},"25":{"constants":0,"model_code":100752896,' + '"model_shared_scratchpad":0,"runtime_memory":0,"tensors":9912852},"26":{"constants":0,' + '"model_code":100752896,"model_shared_scratchpad":0,"runtime_memory":0,"tensors":9912852},' + '"27":{"constants":0,"model_code":100752896,"model_shared_scratchpad":0,"runtime_memory":0,' + '"tensors":9912852},"28":{"constants":0,"model_code":100752896,"model_shared_scratchpad":0,' + 
'"runtime_memory":0,"tensors":9912852},"29":{"constants":0,"model_code":100752896,' + '"model_shared_scratchpad":0,"runtime_memory":0,"tensors":9912852},"30":{"constants":0,' + '"model_code":100752896,"model_shared_scratchpad":0,"runtime_memory":0,"tensors":9912852},' + '"31":{"constants":0,"model_code":100752896,"model_shared_scratchpad":0,"runtime_memory":0,' + '"tensors":9912852}}}},"loaded_models":[{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",' + '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10019,"is_running":false,"subgraphs":{"sg_00":{' + '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{' + '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{' + '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":1,' + '"neuron_device_index":5}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",' + '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10005,"is_running":false,"subgraphs":{"sg_00":{' + '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{' + '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{' + '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":0,' + '"neuron_device_index":5}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",' + '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10007,"is_running":false,"subgraphs":{"sg_00":{' + '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{' + '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{' + '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":0,' + '"neuron_device_index":10}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",' + '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10013,"is_running":false,"subgraphs":{"sg_00":{' + 
'"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{' + '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{' + '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":1,' + '"neuron_device_index":10}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",' + '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10029,"is_running":false,"subgraphs":{"sg_00":{' + '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{' + '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{' + '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":0,' + '"neuron_device_index":2}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",' + '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10032,"is_running":false,"subgraphs":{"sg_00":{' + '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{' + '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{' + '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":1,' + '"neuron_device_index":2}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",' + '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10004,"is_running":false,"subgraphs":{"sg_00":{' + '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{' + '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{' + '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":1,' + '"neuron_device_index":0}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",' + '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10012,"is_running":false,"subgraphs":{"sg_00":{' + '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{' + 
'"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{' + '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":0,' + '"neuron_device_index":0}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",' + '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10001,"is_running":false,"subgraphs":{"sg_00":{' + '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{' + '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{' + '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":1,' + '"neuron_device_index":3}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",' + '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10016,"is_running":false,"subgraphs":{"sg_00":{' + '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{' + '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{' + '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":0,' + '"neuron_device_index":3}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",' + '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10022,"is_running":false,"subgraphs":{"sg_00":{' + '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{' + '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{' + '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":1,' + '"neuron_device_index":11}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",' + '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10024,"is_running":false,"subgraphs":{"sg_00":{' + '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{' + '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{' + 
'"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":0,' + '"neuron_device_index":11}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",' + '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10026,"is_running":false,"subgraphs":{"sg_00":{' + '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{' + '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{' + '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":1,' + '"neuron_device_index":13}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",' + '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10031,"is_running":false,"subgraphs":{"sg_00":{' + '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{' + '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{' + '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":0,' + '"neuron_device_index":13}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",' + '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10025,"is_running":false,"subgraphs":{"sg_00":{' + '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{' + '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{' + '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":1,' + '"neuron_device_index":15}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",' + '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10021,"is_running":false,"subgraphs":{"sg_00":{' + '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{' + '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{' + '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":0,' + 
'"neuron_device_index":15}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",' + '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10006,"is_running":false,"subgraphs":{"sg_00":{' + '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{' + '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{' + '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":0,' + '"neuron_device_index":12}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",' + '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10011,"is_running":false,"subgraphs":{"sg_00":{' + '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{' + '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{' + '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":1,' + '"neuron_device_index":12}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",' + '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10010,"is_running":false,"subgraphs":{"sg_00":{' + '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{' + '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{' + '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":0,' + '"neuron_device_index":6}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",' + '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10015,"is_running":false,"subgraphs":{"sg_00":{' + '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{' + '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{' + '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":1,' + '"neuron_device_index":6}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",' + 
'"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10008,"is_running":false,"subgraphs":{"sg_00":{' + '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{' + '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{' + '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":1,' + '"neuron_device_index":1}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",' + '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10027,"is_running":false,"subgraphs":{"sg_00":{' + '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{' + '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{' + '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":0,' + '"neuron_device_index":1}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",' + '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10018,"is_running":false,"subgraphs":{"sg_00":{' + '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{' + '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{' + '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":1,' + '"neuron_device_index":14}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",' + '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10030,"is_running":false,"subgraphs":{"sg_00":{' + '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{' + '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{' + '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":0,' + '"neuron_device_index":14}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",' + '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10017,"is_running":false,"subgraphs":{"sg_00":{' + 
'"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{' + '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{' + '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":0,' + '"neuron_device_index":9}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",' + '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10003,"is_running":false,"subgraphs":{"sg_00":{' + '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{' + '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{' + '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":1,' + '"neuron_device_index":9}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",' + '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10002,"is_running":false,"subgraphs":{"sg_00":{' + '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{' + '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{' + '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":1,' + '"neuron_device_index":4}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",' + '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10009,"is_running":false,"subgraphs":{"sg_00":{' + '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{' + '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{' + '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":0,' + '"neuron_device_index":4}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",' + '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10020,"is_running":false,"subgraphs":{"sg_00":{' + '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{' + 
'"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{' + '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":0,' + '"neuron_device_index":7}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",' + '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10028,"is_running":false,"subgraphs":{"sg_00":{' + '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{' + '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{' + '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":1,' + '"neuron_device_index":7}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",' + '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10014,"is_running":false,"subgraphs":{"sg_00":{' + '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{' + '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{' + '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":1,' + '"neuron_device_index":8}}},{"name":"2.2.0.73+0af5a171c-/neuronxcc-y_d15g2g",' + '"uuid":"302e6ea0c77b11ee846d26b89fc7ffab","model_id":10023,"is_running":false,"subgraphs":{"sg_00":{' + '"memory_used_bytes":{"host":20480,"neuron_device":24064,"usage_breakdown":{"host":{' + '"application_memory":20480,"constants":0,"dma_buffers":0,"tensors":0},"neuron_device":{' + '"constants":0,"model_code":24064,"runtime_memory":0,"tensors":0}}},"neuroncore_index":0,' + '"neuron_device_index":8}}}],"error":""},"neuron_runtime_vcpu_usage":{"period":4.999647382,' + '"vcpu_usage":{"user":0,"system":0},"error":"open/proc/457402/stat:nosuchfileordirectory"},' + '"neuroncore_counters":{"period":4.999667932,"neuroncores_in_use":{"0":{"neuroncore_utilization":0},' + '"1":{"neuroncore_utilization":0},"2":{"neuroncore_utilization":0},"3":{"neuroncore_utilization":0},' + 
'"4":{"neuroncore_utilization":0},"5":{"neuroncore_utilization":0},"6":{"neuroncore_utilization":0},' + '"7":{"neuroncore_utilization":0},"8":{"neuroncore_utilization":0},"9":{"neuroncore_utilization":0},' + '"10":{"neuroncore_utilization":0},"11":{"neuroncore_utilization":0},' + '"12":{"neuroncore_utilization":0},"13":{"neuroncore_utilization":0},' + '"14":{"neuroncore_utilization":0},"15":{"neuroncore_utilization":0},' + '"16":{"neuroncore_utilization":0},"17":{"neuroncore_utilization":0},' + '"18":{"neuroncore_utilization":0},"19":{"neuroncore_utilization":0},' + '"20":{"neuroncore_utilization":0},"21":{"neuroncore_utilization":0},' + '"22":{"neuroncore_utilization":0},"23":{"neuroncore_utilization":0},' + '"24":{"neuroncore_utilization":0},"25":{"neuroncore_utilization":0},' + '"26":{"neuroncore_utilization":0},"27":{"neuroncore_utilization":0},' + '"28":{"neuroncore_utilization":0},"29":{"neuroncore_utilization":0},' + '"30":{"neuroncore_utilization":0},"31":{"neuroncore_utilization":0}},"error":""}}}],"system_data":{' + '"memory_info":{"period":4.9997283150000005,"memory_total_bytes":532523487232,' + '"memory_used_bytes":81207975936,"swap_total_bytes":0,"swap_used_bytes":0,"error":""},"vcpu_usage":{' + '"period":4.999737702,"average_usage":{"user":19.66,"nice":0,"system":1.67,"idle":78.67,"io_wait":0,' + '"irq":0,"soft_irq":0},"usage_data":{"0":{"user":51.31,"nice":0,"system":0,"idle":48.69,"io_wait":0,' + '"irq":0,"soft_irq":0},"1":{"user":52.91,"nice":0,"system":6.21,"idle":40.88,"io_wait":0,"irq":0,' + '"soft_irq":0},"2":{"user":25.6,"nice":0,"system":0,"idle":74.4,"io_wait":0,"irq":0,"soft_irq":0},' + '"3":{"user":0.6,"nice":0,"system":0,"idle":99.4,"io_wait":0,"irq":0,"soft_irq":0},"4":{"user":0.2,' + '"nice":0,"system":0,"idle":99.8,"io_wait":0,"irq":0,"soft_irq":0},"5":{"user":0.8,"nice":0,' + '"system":0,"idle":99.2,"io_wait":0,"irq":0,"soft_irq":0},"6":{"user":1,"nice":0,"system":2.99,' + 
'"idle":96.01,"io_wait":0,"irq":0,"soft_irq":0},"7":{"user":0,"nice":0,"system":0.2,"idle":99.8,' + '"io_wait":0,"irq":0,"soft_irq":0},"8":{"user":1.8,"nice":0,"system":0,"idle":98.2,"io_wait":0,' + '"irq":0,"soft_irq":0},"9":{"user":0.4,"nice":0,"system":0.8,"idle":98.8,"io_wait":0,"irq":0,' + '"soft_irq":0},"10":{"user":0.2,"nice":0,"system":0,"idle":99.8,"io_wait":0,"irq":0,"soft_irq":0},' + '"11":{"user":0,"nice":0,"system":0,"idle":100,"io_wait":0,"irq":0,"soft_irq":0},"12":{"user":0,' + '"nice":0,"system":0,"idle":100,"io_wait":0,"irq":0,"soft_irq":0},"13":{"user":0,"nice":0,"system":0,' + '"idle":100,"io_wait":0,"irq":0,"soft_irq":0},"14":{"user":0,"nice":0,"system":0.2,"idle":99.8,' + '"io_wait":0,"irq":0,"soft_irq":0},"15":{"user":0.2,"nice":0,"system":0,"idle":99.8,"io_wait":0,' + '"irq":0,"soft_irq":0},"16":{"user":0.2,"nice":0,"system":0,"idle":99.8,"io_wait":0,"irq":0,' + '"soft_irq":0},"17":{"user":0.2,"nice":0,"system":0.2,"idle":99.6,"io_wait":0,"irq":0,"soft_irq":0},' + '"18":{"user":0,"nice":0,"system":0,"idle":100,"io_wait":0,"irq":0,"soft_irq":0},"19":{"user":0,' + '"nice":0,"system":0,"idle":100,"io_wait":0,"irq":0,"soft_irq":0},"20":{"user":0.4,"nice":0,' + '"system":0.4,"idle":99.2,"io_wait":0,"irq":0,"soft_irq":0},"21":{"user":0.2,"nice":0,"system":0.2,' + '"idle":99.6,"io_wait":0,"irq":0,"soft_irq":0},"22":{"user":0.2,"nice":0,"system":0,"idle":99.8,' + '"io_wait":0,"irq":0,"soft_irq":0},"23":{"user":0,"nice":0,"system":0.2,"idle":99.8,"io_wait":0,' + '"irq":0,"soft_irq":0},"24":{"user":0.2,"nice":0,"system":0.2,"idle":99.6,"io_wait":0,"irq":0,' + '"soft_irq":0},"25":{"user":0,"nice":0,"system":0,"idle":100,"io_wait":0,"irq":0,"soft_irq":0},' + '"26":{"user":0.2,"nice":0,"system":0.2,"idle":99.6,"io_wait":0,"irq":0,"soft_irq":0},' + '"27":{"user":0.2,"nice":0,"system":0.4,"idle":99.4,"io_wait":0,"irq":0,"soft_irq":0},' + '"28":{"user":0.2,"nice":0,"system":0.8,"idle":99,"io_wait":0,"irq":0,"soft_irq":0},"29":{"user":0.2,' + 
'"nice":0,"system":0,"idle":99.8,"io_wait":0,"irq":0,"soft_irq":0},"30":{"user":0,"nice":0,' + '"system":0.4,"idle":99.6,"io_wait":0,"irq":0,"soft_irq":0},"31":{"user":0,"nice":0,"system":0,' + '"idle":100,"io_wait":0,"irq":0,"soft_irq":0},"32":{"user":0.2,"nice":0,"system":1,"idle":98.8,' + '"io_wait":0,"irq":0,"soft_irq":0},"33":{"user":1.81,"nice":0,"system":4.44,"idle":93.75,"io_wait":0,' + '"irq":0,"soft_irq":0},"34":{"user":5.22,"nice":0,"system":0.2,"idle":94.58,"io_wait":0,"irq":0,' + '"soft_irq":0},"35":{"user":22,"nice":0,"system":0,"idle":78,"io_wait":0,"irq":0,"soft_irq":0},' + '"36":{"user":47.31,"nice":0,"system":2.79,"idle":49.9,"io_wait":0,"irq":0,"soft_irq":0},' + '"37":{"user":0.2,"nice":0,"system":0.2,"idle":99.6,"io_wait":0,"irq":0,"soft_irq":0},' + '"38":{"user":5.2,"nice":0,"system":3.8,"idle":91,"io_wait":0,"irq":0,"soft_irq":0},' + '"39":{"user":72.69,"nice":0,"system":5.42,"idle":21.89,"io_wait":0,"irq":0,"soft_irq":0},' + '"40":{"user":75.85,"nice":0,"system":2.4,"idle":21.76,"io_wait":0,"irq":0,"soft_irq":0},' + '"41":{"user":1,"nice":0,"system":3.6,"idle":95.4,"io_wait":0,"irq":0,"soft_irq":0},' + '"42":{"user":4.58,"nice":0,"system":3.78,"idle":91.63,"io_wait":0,"irq":0,"soft_irq":0},' + '"43":{"user":7.62,"nice":0,"system":5.21,"idle":87.17,"io_wait":0,"irq":0,"soft_irq":0},' + '"44":{"user":6.22,"nice":0,"system":2.81,"idle":90.96,"io_wait":0,"irq":0,"soft_irq":0},' + '"45":{"user":1.81,"nice":0,"system":4.62,"idle":93.57,"io_wait":0,"irq":0,"soft_irq":0},' + '"46":{"user":1.8,"nice":0,"system":6.21,"idle":91.98,"io_wait":0,"irq":0,"soft_irq":0},' + '"47":{"user":1.6,"nice":0,"system":5,"idle":93.4,"io_wait":0,"irq":0,"soft_irq":0},"48":{"user":0,' + '"nice":0,"system":0.2,"idle":99.8,"io_wait":0,"irq":0,"soft_irq":0},"49":{"user":0.2,"nice":0,' + '"system":0.2,"idle":99.6,"io_wait":0,"irq":0,"soft_irq":0},"50":{"user":0,"nice":0,"system":1.4,' + 
'"idle":98.6,"io_wait":0,"irq":0,"soft_irq":0},"51":{"user":76.15,"nice":0,"system":2.81,' + '"idle":21.04,"io_wait":0,"irq":0,"soft_irq":0},"52":{"user":8.2,"nice":0,"system":3.4,"idle":88.4,' + '"io_wait":0,"irq":0,"soft_irq":0},"53":{"user":8.62,"nice":0,"system":3.61,"idle":87.78,"io_wait":0,' + '"irq":0,"soft_irq":0},"54":{"user":7.62,"nice":0,"system":1,"idle":91.38,"io_wait":0,"irq":0,' + '"soft_irq":0},"55":{"user":75.3,"nice":0,"system":0.6,"idle":24.1,"io_wait":0,"irq":0,"soft_irq":0},' + '"56":{"user":0,"nice":0,"system":0,"idle":100,"io_wait":0,"irq":0,"soft_irq":0},"57":{"user":0,' + '"nice":0,"system":0,"idle":100,"io_wait":0,"irq":0,"soft_irq":0},"58":{"user":0,"nice":0,"system":0,' + '"idle":100,"io_wait":0,"irq":0,"soft_irq":0},"59":{"user":75.2,"nice":0,"system":0.6,"idle":24.2,' + '"io_wait":0,"irq":0,"soft_irq":0},"60":{"user":70.46,"nice":0,"system":0,"idle":29.54,"io_wait":0,' + '"irq":0,"soft_irq":0},"61":{"user":70.34,"nice":0,"system":0,"idle":29.66,"io_wait":0,"irq":0,' + '"soft_irq":0},"62":{"user":72.8,"nice":0,"system":0,"idle":27.2,"io_wait":0,"irq":0,"soft_irq":0},' + '"63":{"user":73.2,"nice":0,"system":3,"idle":23.8,"io_wait":0,"irq":0,"soft_irq":0},' + '"64":{"user":19.8,"nice":0,"system":1,"idle":79.2,"io_wait":0,"irq":0,"soft_irq":0},' + '"65":{"user":0.8,"nice":0,"system":0,"idle":99.2,"io_wait":0,"irq":0,"soft_irq":0},"66":{"user":0.2,' + '"nice":0,"system":0,"idle":99.8,"io_wait":0,"irq":0,"soft_irq":0},"67":{"user":0.2,"nice":0,' + '"system":0,"idle":99.8,"io_wait":0,"irq":0,"soft_irq":0},"68":{"user":6.24,"nice":0,"system":1.61,' + '"idle":92.15,"io_wait":0,"irq":0,"soft_irq":0},"69":{"user":0.2,"nice":0,"system":0,"idle":99.8,' + '"io_wait":0,"irq":0,"soft_irq":0},"70":{"user":0.6,"nice":0,"system":2.59,"idle":96.81,"io_wait":0,' + '"irq":0,"soft_irq":0},"71":{"user":2.79,"nice":0,"system":5.38,"idle":91.83,"io_wait":0,"irq":0,' + '"soft_irq":0},"72":{"user":1.6,"nice":0,"system":6.01,"idle":92.38,"io_wait":0,"irq":0,' + 
'"soft_irq":0},"73":{"user":0.2,"nice":0,"system":0.2,"idle":99.6,"io_wait":0,"irq":0,"soft_irq":0},' + '"74":{"user":0.2,"nice":0,"system":0,"idle":99.8,"io_wait":0,"irq":0,"soft_irq":0},"75":{"user":0.2,' + '"nice":0,"system":0,"idle":99.8,"io_wait":0,"irq":0,"soft_irq":0},"76":{"user":0.2,"nice":0,' + '"system":0.2,"idle":99.6,"io_wait":0,"irq":0,"soft_irq":0},"77":{"user":2.2,"nice":0,"system":5.4,' + '"idle":92.4,"io_wait":0,"irq":0,"soft_irq":0},"78":{"user":1.4,"nice":0,"system":5.01,"idle":93.59,' + '"io_wait":0,"irq":0,"soft_irq":0},"79":{"user":0.4,"nice":0,"system":0,"idle":99.6,"io_wait":0,' + '"irq":0,"soft_irq":0},"80":{"user":0,"nice":0,"system":0.4,"idle":99.6,"io_wait":0,"irq":0,' + '"soft_irq":0},"81":{"user":2,"nice":0,"system":5.59,"idle":92.42,"io_wait":0,"irq":0,"soft_irq":0},' + '"82":{"user":2.6,"nice":0,"system":6.4,"idle":91,"io_wait":0,"irq":0,"soft_irq":0},' + '"83":{"user":2.79,"nice":0,"system":6.37,"idle":90.84,"io_wait":0,"irq":0,"soft_irq":0},' + '"84":{"user":2.2,"nice":0,"system":5.2,"idle":92.6,"io_wait":0,"irq":0,"soft_irq":0},' + '"85":{"user":0.2,"nice":0,"system":1.4,"idle":98.4,"io_wait":0,"irq":0,"soft_irq":0},' + '"86":{"user":2.4,"nice":0,"system":5.21,"idle":92.38,"io_wait":0,"irq":0,"soft_irq":0},' + '"87":{"user":2.4,"nice":0,"system":6.61,"idle":90.98,"io_wait":0,"irq":0,"soft_irq":0},' + '"88":{"user":2.2,"nice":0,"system":6.61,"idle":91.18,"io_wait":0,"irq":0,"soft_irq":0},' + '"89":{"user":0,"nice":0,"system":0.4,"idle":99.6,"io_wait":0,"irq":0,"soft_irq":0},"90":{"user":2.4,' + '"nice":0,"system":5,"idle":92.6,"io_wait":0,"irq":0,"soft_irq":0},"91":{"user":0.2,"nice":0,' + '"system":0,"idle":99.8,"io_wait":0,"irq":0,"soft_irq":0},"92":{"user":0,"nice":0,"system":0.2,' + '"idle":99.8,"io_wait":0,"irq":0,"soft_irq":0},"93":{"user":0,"nice":0,"system":0,"idle":100,' + '"io_wait":0,"irq":0,"soft_irq":0},"94":{"user":0,"nice":0,"system":0.2,"idle":99.8,"io_wait":0,' + 
'"irq":0,"soft_irq":0},"95":{"user":2,"nice":0,"system":5.19,"idle":92.81,"io_wait":0,"irq":0,' + '"soft_irq":0},"96":{"user":76.2,"nice":0,"system":6.8,"idle":17,"io_wait":0,"irq":0,"soft_irq":0},' + '"97":{"user":75.15,"nice":0,"system":1.6,"idle":23.25,"io_wait":0,"irq":0,"soft_irq":0},' + '"98":{"user":1.4,"nice":0,"system":4.81,"idle":93.79,"io_wait":0,"irq":0,"soft_irq":0},' + '"99":{"user":1.39,"nice":0,"system":4.78,"idle":93.82,"io_wait":0,"irq":0,"soft_irq":0},' + '"100":{"user":2.19,"nice":0,"system":5.38,"idle":92.43,"io_wait":0,"irq":0,"soft_irq":0},' + '"101":{"user":75.05,"nice":0,"system":0.6,"idle":24.35,"io_wait":0,"irq":0,"soft_irq":0},' + '"102":{"user":77.4,"nice":0,"system":3.8,"idle":18.8,"io_wait":0,"irq":0,"soft_irq":0},' + '"103":{"user":0,"nice":0,"system":0,"idle":100,"io_wait":0,"irq":0,"soft_irq":0},"104":{"user":0,' + '"nice":0,"system":0,"idle":100,"io_wait":0,"irq":0,"soft_irq":0},"105":{"user":74.95,"nice":0,' + '"system":1.2,"idle":23.85,"io_wait":0,"irq":0,"soft_irq":0},"106":{"user":76.8,"nice":0,' + '"system":0.4,"idle":22.8,"io_wait":0,"irq":0,"soft_irq":0},"107":{"user":76.95,"nice":0,' + '"system":1.2,"idle":21.84,"io_wait":0,"irq":0,"soft_irq":0},"108":{"user":78.71,"nice":0,' + '"system":7.43,"idle":13.86,"io_wait":0,"irq":0,"soft_irq":0},"109":{"user":75.05,"nice":0,' + '"system":0.4,"idle":24.55,"io_wait":0,"irq":0,"soft_irq":0},"110":{"user":75.15,"nice":0,' + '"system":0.4,"idle":24.45,"io_wait":0,"irq":0,"soft_irq":0},"111":{"user":75.15,"nice":0,' + '"system":0.6,"idle":24.25,"io_wait":0,"irq":0,"soft_irq":0},"112":{"user":75.15,"nice":0,' + '"system":0.6,"idle":24.25,"io_wait":0,"irq":0,"soft_irq":0},"113":{"user":74.85,"nice":0,' + '"system":1.2,"idle":23.95,"io_wait":0,"irq":0,"soft_irq":0},"114":{"user":74.85,"nice":0,"system":1,' + '"idle":24.15,"io_wait":0,"irq":0,"soft_irq":0},"115":{"user":0,"nice":0,"system":0,"idle":100,' + 
'"io_wait":0,"irq":0,"soft_irq":0},"116":{"user":77.84,"nice":0,"system":0.8,"idle":21.36,' + '"io_wait":0,"irq":0,"soft_irq":0},"117":{"user":78.2,"nice":0,"system":0.4,"idle":21.4,"io_wait":0,' + '"irq":0,"soft_irq":0},"118":{"user":77.8,"nice":0,"system":1,"idle":21.2,"io_wait":0,"irq":0,' + '"soft_irq":0},"119":{"user":0.4,"nice":0,"system":0,"idle":99.6,"io_wait":0,"irq":0,"soft_irq":0},' + '"120":{"user":75.2,"nice":0,"system":0.4,"idle":24.4,"io_wait":0,"irq":0,"soft_irq":0},' + '"121":{"user":75.15,"nice":0,"system":0.6,"idle":24.25,"io_wait":0,"irq":0,"soft_irq":0},' + '"122":{"user":74.8,"nice":0,"system":1,"idle":24,"io_wait":0,"irq":0,"soft_irq":0.2},' + '"123":{"user":0.2,"nice":0,"system":0,"idle":99.8,"io_wait":0,"irq":0,"soft_irq":0},"124":{"user":0,' + '"nice":0,"system":0.2,"idle":99.8,"io_wait":0,"irq":0,"soft_irq":0},"125":{"user":0.2,"nice":0,' + '"system":0,"idle":99.8,"io_wait":0,"irq":0,"soft_irq":0},"126":{"user":0,"nice":0,"system":0.2,' + '"idle":99.8,"io_wait":0,"irq":0,"soft_irq":0},"127":{"user":1.2,"nice":0,"system":3,"idle":95.8,' + '"io_wait":0,"irq":0,"soft_irq":0}},"context_switch_count":171386,"error":""},"neuron_hw_counters":{' + '"period":1.000142057,"neuron_devices":[{"neuron_device_index":0,"mem_ecc_corrected":1,' + '"mem_ecc_uncorrected":0,"sram_ecc_uncorrected":0,"sram_ecc_corrected":0},{"neuron_device_index":1,' + '"mem_ecc_corrected":0,"mem_ecc_uncorrected":0,"sram_ecc_uncorrected":0,"sram_ecc_corrected":0},' + '{"neuron_device_index":2,"mem_ecc_corrected":1,"mem_ecc_uncorrected":0,"sram_ecc_uncorrected":0,' + '"sram_ecc_corrected":0},{"neuron_device_index":3,"mem_ecc_corrected":2,"mem_ecc_uncorrected":0,' + '"sram_ecc_uncorrected":0,"sram_ecc_corrected":0},{"neuron_device_index":4,"mem_ecc_corrected":0,' + '"mem_ecc_uncorrected":0,"sram_ecc_uncorrected":0,"sram_ecc_corrected":0},{"neuron_device_index":5,' + '"mem_ecc_corrected":1,"mem_ecc_uncorrected":0,"sram_ecc_uncorrected":0,"sram_ecc_corrected":0},' + 
'{"neuron_device_index":6,"mem_ecc_corrected":0,"mem_ecc_uncorrected":1,"sram_ecc_uncorrected":0,' + '"sram_ecc_corrected":0},{"neuron_device_index":7,"mem_ecc_corrected":0,"mem_ecc_uncorrected":0,' + '"sram_ecc_uncorrected":1,"sram_ecc_corrected":0},{"neuron_device_index":8,"mem_ecc_corrected":0,' + '"mem_ecc_uncorrected":0,"sram_ecc_uncorrected":0,"sram_ecc_corrected":1},{"neuron_device_index":9,' + '"mem_ecc_corrected":0,"mem_ecc_uncorrected":1,"sram_ecc_uncorrected":0,"sram_ecc_corrected":0},' + '{"neuron_device_index":10,"mem_ecc_corrected":0,"mem_ecc_uncorrected":0,"sram_ecc_uncorrected":0,' + '"sram_ecc_corrected":0},{"neuron_device_index":11,"mem_ecc_corrected":0,"mem_ecc_uncorrected":0,' + '"sram_ecc_uncorrected":0,"sram_ecc_corrected":0},{"neuron_device_index":12,"mem_ecc_corrected":0,' + '"mem_ecc_uncorrected":0,"sram_ecc_uncorrected":1,"sram_ecc_corrected":0},{"neuron_device_index":13,' + '"mem_ecc_corrected":0,"mem_ecc_uncorrected":0,"sram_ecc_uncorrected":0,"sram_ecc_corrected":1},' + '{"neuron_device_index":14,"mem_ecc_corrected":0,"mem_ecc_uncorrected":1,"sram_ecc_uncorrected":1,' + '"sram_ecc_corrected":0},{"neuron_device_index":15,"mem_ecc_corrected":0,"mem_ecc_uncorrected":1,' + '"sram_ecc_uncorrected":0,"sram_ecc_corrected":0}],"error":""}},"instance_info":{' + '"instance_name":"DummyNodeName",' + '"instance_id":"i-09db9b55e0095612f","instance_type":"trn1n.32xlarge",' + '"instance_availability_zone":"us-east-1c","instance_availability_zone_id":"use1-az6",' + '"instance_region":"us-east-1","ami_id":"ami-030686a4e905e98d3",' + '"subnet_id":"subnet-06a7754948e8a000f","error":""},"neuron_hardware_info":{"neuron_device_count":16,' + '"neuroncore_per_device_count":2,"error":""}}') + if len(line) == 0: + continue + if original_file_hash: + _watch_file_and_update_ssl_cxt(original_file_hash, certfile=certfile, keyfile=keyfile) + try: + monitor_data = json.loads(line) + except Exception as exc: + print('Unable to decode JSON {}'.format(exc)) + continue + 
def main():
    """Parse the command line, start the Prometheus endpoint and run the update loop.

    Command-line options:
        -p / --port   TCP port the metrics server listens on (default 8000).
        --key-file    Path to the TLS private key file.
        --cert-file   Path to the TLS certificate file.

    HTTPS is served only when both ``--key-file`` and ``--cert-file`` are
    given.  Supplying exactly one of them is treated as a usage error
    (argparse exits with status 2) instead of silently serving plaintext
    HTTP.  With neither option the server runs over plain HTTP.

    Side effects:
        Rebinds the module-level ``ssl_cxt`` to the HTTPS server socket's
        SSL context when TLS is enabled.  Never returns normally on the
        happy path: it hands control to ``update_loop``.
    """
    global ssl_cxt
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('-p', '--port', default=8000,
                            type=int, help='HTTP port on which to run the server')
    arg_parser.add_argument('--key-file', help='Path to SSL private key file (only for HTTPS)')
    arg_parser.add_argument('--cert-file', help='Path to SSL certificate file (only for HTTPS)')
    args = arg_parser.parse_args()

    # A half-specified TLS configuration is almost certainly an operator
    # mistake; refuse it rather than downgrade to unencrypted HTTP.
    if bool(args.key_file) != bool(args.cert_file):
        arg_parser.error('--key-file and --cert-file must be provided together')

    if args.key_file and args.cert_file:
        if sys.version_info < (3, 8):
            print("""Python version 3.8 or greater is required for https/tls support.
            Also upgrade your prometheus_client version to 0.19.0 or greater if required
            https://github.com/prometheus/client_python/releases""")
            sys.exit(1)
        # The first element of the returned pair is the server object; its
        # socket's SSL context is kept in the module global.
        # NOTE(review): presumably so the update loop can reload rotated
        # certificates -- confirm against _watch_file_and_update_ssl_cxt.
        httpd, _t = start_http_server(port=args.port, keyfile=args.key_file, certfile=args.cert_file)
        ssl_cxt = httpd.socket.context
        print("Running HTTPS prometheus server at port {}".format(args.port))
    else:
        start_http_server(port=args.port)
        print("Running HTTP prometheus server at port {}".format(args.port))

    # Empty/absent paths are normalised to None before being handed on.
    update_loop(certfile=args.cert_file or None, keyfile=args.key_file or None)
# EKS control plane for the AWS Neuron integration test. The cluster name is
# suffixed with the per-run testing id taken from the common module.
resource "aws_eks_cluster" "this" {
  name     = "cwagent-eks-integ-${module.common.testing_id}"
  version  = var.k8s_version
  role_arn = module.basic_components.role_arn

  # Networking comes from the shared basic_components module.
  vpc_config {
    security_group_ids = [module.basic_components.security_group]
    subnet_ids         = module.basic_components.public_subnet_ids
  }

  # Enable every control-plane log type listed here.
  enabled_cluster_log_types = ["api", "audit", "authenticator", "controllerManager", "scheduler"]
}
# TODO: these security groups should be created once and then reused.
# EKS cluster security group; its rules are declared as separate
# aws_security_group_rule resources.
resource "aws_security_group" "eks_cluster_sg" {
  vpc_id      = module.basic_components.vpc_id
  name        = "cwagent-eks-cluster-sg-${module.common.testing_id}"
  description = "Cluster communication with worker nodes"
}
# Writes the CA private key next to the module under certs/ca.key.
# local_file defaults to mode 0777, which would leave the private key
# readable and writable by every local user; restrict it to the owner.
resource "local_file" "ca_key" {
  content         = tls_private_key.private_key.private_key_pem
  filename        = "${path.module}/certs/ca.key"
  file_permission = "0600"
}
# TLS material shared by the mocked neuron monitor (server certificate and
# key) and the CloudWatch agent (CA certificate). The PEM strings are taken
# directly from the tls_* resources rather than from the files on disk.
resource "kubernetes_secret" "agent_cert" {
  # The secret lives in the amazon-cloudwatch namespace, so that namespace
  # must exist first; without this Terraform may create both in parallel.
  depends_on = [
    kubernetes_namespace.namespace
  ]

  metadata {
    name      = "amazon-cloudwatch-observability-agent-cert"
    namespace = "amazon-cloudwatch"
  }

  data = {
    "ca.crt"  = tls_self_signed_cert.ca_cert.cert_pem
    "tls.crt" = tls_locally_signed_cert.server_cert.cert_pem
    "tls.key" = tls_private_key.server_private_key.private_key_pem
  }
}
# Binds the namespaced neuron-monitor Role to the neuron monitor's service
# account inside the amazon-cloudwatch namespace.
resource "kubernetes_role_binding" "neuron_monitor_role_binding" {
  metadata {
    name      = "neuron-monitor-role-binding"
    namespace = "amazon-cloudwatch"
  }

  subject {
    kind      = "ServiceAccount"
    name      = "neuron-monitor-service-acct"
    namespace = "amazon-cloudwatch"
  }

  role_ref {
    api_group = "rbac.authorization.k8s.io"
    kind      = "Role"
    name      = "neuron-monitor-role"
  }

  # Names above are plain strings, so ordering is stated explicitly.
  depends_on = [
    kubernetes_role.neuron_monitor_role,
    kubernetes_service_account.neuron_monitor_service_account,
    kubernetes_namespace.namespace
  ]
}
# ClusterIP service in front of the neuron-monitor pods, exposing their
# metrics port 8000. Traffic is kept node-local via internal_traffic_policy.
resource "kubernetes_service" "neuron_monitor_service" {
  metadata {
    namespace = "amazon-cloudwatch"
    name      = "neuron-monitor-service"

    annotations = {
      "prometheus.io/scrape" = "true"
    }
    labels = {
      "k8s-app" = "neuron-monitor-service"
    }
  }

  spec {
    type                    = "ClusterIP"
    internal_traffic_policy = "Local"

    # Selects the pods created by the neuron_monitor daemonset.
    selector = {
      "k8s-app" = "neuron-monitor"
    }

    port {
      name        = "metrics"
      protocol    = "TCP"
      port        = 8000
      target_port = 8000
    }
  }

  depends_on = [
    kubernetes_daemonset.neuron_monitor,
    aws_eks_node_group.this,
    kubernetes_service_account.cwagentservice,
    kubernetes_namespace.namespace
  ]
}
##########################################
# Template Files
##########################################
locals {
  # Agent configuration: use the test directory's own config.json when it
  # exists, otherwise fall back to the shared default agent configuration.
  cwagent_config = (
    fileexists("../../../../${var.test_dir}/resources/config.json")
    ? "../../../../${var.test_dir}/resources/config.json"
    : "../default_resources/default_amazon_cloudwatch_agent.json"
  )

  # Apache configuration files shipped with the test resources.
  httpd_ssl_config = "../../../../${var.test_dir}/resources/httpd-ssl.conf"
  httpd_config     = "../../../../${var.test_dir}/resources/httpd.conf"
}
# Runs the Go integration test from the repository root once the node group,
# the agent daemonset and its RBAC objects are in place.
resource "null_resource" "validator" {
  provisioner "local-exec" {
    command = <<-EOT
      echo "Validating EKS metrics/logs for AWS Neuron"
      cd ../../../..
      go test -timeout 30m ${var.test_dir} -eksClusterName=${aws_eks_cluster.this.name} -computeType=EKS -v -eksDeploymentStrategy=DAEMON
    EOT
  }

  depends_on = [
    kubernetes_service_account.cwagentservice,
    kubernetes_cluster_role_binding.rolebinding,
    kubernetes_daemonset.service,
    aws_eks_node_group.this,
  ]
}
# Kubernetes provider wired to the EKS cluster created by this module.
provider "kubernetes" {
  host = aws_eks_cluster.this.endpoint
  # Bracket indexing replaces the legacy dot-index form
  # (certificate_authority.0.data); the value selected is the same.
  cluster_ca_certificate = base64decode(aws_eks_cluster.this.certificate_authority[0].data)
  token                  = data.aws_eks_cluster_auth.this.token

  # Credential plugin invocation: `aws eks get-token` for this cluster.
  exec {
    api_version = "client.authentication.k8s.io/v1beta1"
    command     = "aws"
    args        = ["eks", "get-token", "--cluster-name", aws_eks_cluster.this.name]
  }
}
"github.com/aws/amazon-cloudwatch-agent-test/test/awsneuron/resources" + "github.com/aws/amazon-cloudwatch-agent-test/test/metric" + "github.com/aws/amazon-cloudwatch-agent-test/test/status" + "github.com/aws/amazon-cloudwatch-agent-test/test/test_runner" +) + +const ( + awsNeuronMetricIndicator = "_neuron" +) + +var expectedDimsToMetrics = map[string][]string{ + "ClusterName": { + NodeNeuronCoreUtil, NodeNeuronCoreMemUsageConstants, NodeNeuronCoreMemUsageModel, NodeNeuronCoreMemUsageScratchpad, + NodeNeuronCoreMemUsageRuntime, NodeNeuronCoreMemUsageTensors, NodeNeuronCoreMemUsageTotal, NodeNeuronDeviceHwEccEvents, + NodeExecutionErrorsTotal, NodeNeuronDeviceRuntimeMemoryUsed, NodeNeuronExecutionLatency, + }, +} + +type AwsNeuronTestRunner struct { + test_runner.BaseTestRunner + testName string + env *environment.MetaData +} + +var _ test_runner.ITestRunner = (*AwsNeuronTestRunner)(nil) + +func (t *AwsNeuronTestRunner) Validate() status.TestGroupResult { + var testResults []status.TestResult + testResults = append(testResults, metric.ValidateMetrics(t.env, awsNeuronMetricIndicator, expectedDimsToMetrics)...) + testResults = append(testResults, metric.ValidateLogs(t.env)) + testResults = append(testResults, metric.ValidateLogsFrequency(t.env)) + return status.TestGroupResult{ + Name: t.GetTestName(), + TestResults: testResults, + } +} + +func (t *AwsNeuronTestRunner) GetTestName() string { + return t.testName +} + +func (t *AwsNeuronTestRunner) GetAgentConfigFileName() string { + return "" +} + +func (t *AwsNeuronTestRunner) GetAgentRunDuration() time.Duration { + return 25 * time.Minute +} + +func (t *AwsNeuronTestRunner) GetMeasuredMetrics() []string { + return nil +} diff --git a/test/awsneuron/neuron_test.go b/test/awsneuron/neuron_test.go new file mode 100644 index 000000000..6ccc4f599 --- /dev/null +++ b/test/awsneuron/neuron_test.go @@ -0,0 +1,77 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: MIT + +//go:build !windows + +package awsneuron + +import ( + "fmt" + "testing" + + "github.com/stretchr/testify/suite" + + "github.com/aws/amazon-cloudwatch-agent-test/environment" + "github.com/aws/amazon-cloudwatch-agent-test/environment/computetype" + "github.com/aws/amazon-cloudwatch-agent-test/test/metric/dimension" + "github.com/aws/amazon-cloudwatch-agent-test/test/status" + "github.com/aws/amazon-cloudwatch-agent-test/test/test_runner" +) + +type AwsNeuronTestSuite struct { + suite.Suite + test_runner.TestSuite +} + +func (suite *AwsNeuronTestSuite) SetupSuite() { + fmt.Println(">>>> Starting AWS Neuron Container Insights TestSuite") +} + +func (suite *AwsNeuronTestSuite) TearDownSuite() { + suite.Result.Print() + fmt.Println(">>>> Finished AWS Neuron Container Insights TestSuite") +} + +func init() { + environment.RegisterEnvironmentMetaDataFlags() +} + +var ( + eksTestRunners []*test_runner.EKSTestRunner +) + +func getEksTestRunners(env *environment.MetaData) []*test_runner.EKSTestRunner { + if eksTestRunners == nil { + factory := dimension.GetDimensionFactory(*env) + + eksTestRunners = []*test_runner.EKSTestRunner{ + { + Runner: &AwsNeuronTestRunner{test_runner.BaseTestRunner{DimensionFactory: factory}, "EKS_AWS_NEURON", env}, + Env: *env, + }, + } + } + return eksTestRunners +} + +func (suite *AwsNeuronTestSuite) TestAllInSuite() { + env := environment.GetEnvironmentMetaData() + switch env.ComputeType { + case computetype.EKS: + for _, testRunner := range getEksTestRunners(env) { + testRunner.Run(suite, env) + } + default: + return + } + + suite.Assert().Equal(status.SUCCESSFUL, suite.Result.GetStatus(), "AWS Neuron Container Test Suite Failed") +} + +func (suite *AwsNeuronTestSuite) AddToSuiteResult(r status.TestGroupResult) { + suite.Result.TestGroupResults = append(suite.Result.TestGroupResults, r) +} + +func TestAWSNeuronSuite(t *testing.T) { + suite.Run(t, new(AwsNeuronTestSuite)) +} diff --git 
a/test/awsneuron/resources/config.json b/test/awsneuron/resources/config.json new file mode 100644 index 000000000..6f37e43ed --- /dev/null +++ b/test/awsneuron/resources/config.json @@ -0,0 +1,16 @@ +{ + "agent": { + "metrics_collection_interval": 15, + "run_as_user": "root", + "debug": true, + "logfile": "" + }, + "logs": { + "metrics_collected": { + "kubernetes": { + "enhanced_container_insights": true + } + }, + "force_flush_interval": 5 + } +} \ No newline at end of file diff --git a/test/awsneuron/resources/httpd-ssl.conf b/test/awsneuron/resources/httpd-ssl.conf new file mode 100644 index 000000000..18c33f0bd --- /dev/null +++ b/test/awsneuron/resources/httpd-ssl.conf @@ -0,0 +1,43 @@ +Listen 8000 + +SSLCipherSuite HIGH:MEDIUM:!MD5:!RC4:!3DES +SSLProxyCipherSuite HIGH:MEDIUM:!MD5:!RC4:!3DES + +SSLHonorCipherOrder on + +SSLProtocol all -SSLv3 +SSLProxyProtocol all -SSLv3 + +SSLPassPhraseDialog builtin + +SSLSessionCache "shmcb:/usr/local/apache2/logs/ssl_scache(512000)" +SSLSessionCacheTimeout 300 + + + + +DocumentRoot "/usr/local/apache2/htdocs" +ServerName neuron-monitor-service.amazon-cloudwatch.svc:8000 +ServerAdmin you@example.com +ErrorLog /proc/self/fd/2 +TransferLog /proc/self/fd/1 + +SSLEngine on +SSLCertificateFile "/etc/amazon-cloudwatch-observability-neuron-cert/server.crt" +SSLCertificateKeyFile "/etc/amazon-cloudwatch-observability-neuron-cert/server.key" + + + SSLOptions +StdEnvVars + + + SSLOptions +StdEnvVars + + +BrowserMatch "MSIE [2-5]" \ + nokeepalive ssl-unclean-shutdown \ + downgrade-1.0 force-response-1.0 + +CustomLog /proc/self/fd/1 \ + "%t %h %%{SSL_PROTOCOL}x $%{SSL_CIPHER}x \"%r\" %b" + + \ No newline at end of file diff --git a/test/awsneuron/resources/httpd.conf b/test/awsneuron/resources/httpd.conf new file mode 100644 index 000000000..122b16b17 --- /dev/null +++ b/test/awsneuron/resources/httpd.conf @@ -0,0 +1,101 @@ + +ServerRoot "/usr/local/apache2" + +#Listen 8000 + +LoadModule mpm_event_module modules/mod_mpm_event.so 
+LoadModule authn_file_module modules/mod_authn_file.so +LoadModule authn_core_module modules/mod_authn_core.so +LoadModule authz_host_module modules/mod_authz_host.so +LoadModule authz_groupfile_module modules/mod_authz_groupfile.so +LoadModule authz_user_module modules/mod_authz_user.so +LoadModule authz_core_module modules/mod_authz_core.so +LoadModule access_compat_module modules/mod_access_compat.so +LoadModule auth_basic_module modules/mod_auth_basic.so +LoadModule socache_shmcb_module modules/mod_socache_shmcb.so +LoadModule reqtimeout_module modules/mod_reqtimeout.so +LoadModule filter_module modules/mod_filter.so +LoadModule mime_module modules/mod_mime.so +LoadModule log_config_module modules/mod_log_config.so +LoadModule env_module modules/mod_env.so +LoadModule headers_module modules/mod_headers.so +LoadModule setenvif_module modules/mod_setenvif.so +LoadModule version_module modules/mod_version.so +LoadModule ssl_module modules/mod_ssl.so +LoadModule unixd_module modules/mod_unixd.so +LoadModule status_module modules/mod_status.so +LoadModule autoindex_module modules/mod_autoindex.so +LoadModule dir_module modules/mod_dir.so +LoadModule alias_module modules/mod_alias.so + + +User www-data +Group www-data + + + + AllowOverride none + Require all denied + + +DocumentRoot "/usr/local/apache2/htdocs" + + Options Indexes FollowSymLinks + AllowOverride None + Require all granted + + + + DirectoryIndex index.html + + + + Require all denied + + +ErrorLog /proc/self/fd/2 + +LogLevel warn + + + LogFormat "%h %l %u %t \"%r\" %>s %b \"%%{Referer}i\" \"%%{User-Agent}i\"" combined + LogFormat "%h %l %u %t \"%r\" %>s %b" common + + + # You need to enable mod_logio.c to use %I and %O + LogFormat "%h %l %u %t \"%r\" %>s %b \"%%{Referer}i\" \"%%{User-Agent}i\" %I %O" combinedio + + + CustomLog /proc/self/fd/1 common + + + + ScriptAlias /cgi-bin/ "/usr/local/apache2/cgi-bin/" + + + + AllowOverride None + Options None + Require all granted + + + + RequestHeader unset 
// Container-scoped Neuron metric names (values carry the "container_" prefix).
// Presumably these are the metric names the Neuron tests look up; verify
// against the callers.
const (
	ContainerNeuronCoreUtil               = "container_neuroncore_utilization"
	ContainerNeuronCoreMemUsageConstants  = "container_neuroncore_memory_usage_constants"
	ContainerNeuronCoreMemUsageModel      = "container_neuroncore_memory_usage_model_code"
	ContainerNeuronCoreMemUsageScratchpad = "container_neuroncore_memory_usage_model_shared_scratchpad"
	ContainerNeuronCoreMemUsageRuntime    = "container_neuroncore_memory_usage_runtime_memory"
	ContainerNeuronCoreMemUsageTensors    = "container_neuroncore_memory_usage_tensors"
	ContainerNeuronCoreMemUsageTotal      = "container_neuroncore_memory_usage_total"
	ContainerNeuronDeviceHwEccEvents      = "container_neurondevice_hw_ecc_events_total"
)

// Pod-scoped Neuron metric names (values carry the "pod_" prefix).
const (
	PodNeuronCoreUtil               = "pod_neuroncore_utilization"
	PodNeuronCoreMemUsageConstants  = "pod_neuroncore_memory_usage_constants"
	PodNeuronCoreMemUsageModel      = "pod_neuroncore_memory_usage_model_code"
	PodNeuronCoreMemUsageScratchpad = "pod_neuroncore_memory_usage_model_shared_scratchpad"
	PodNeuronCoreMemUsageRuntime    = "pod_neuroncore_memory_usage_runtime_memory"
	PodNeuronCoreMemUsageTensors    = "pod_neuroncore_memory_usage_tensors"
	PodNeuronCoreMemUsageTotal      = "pod_neuroncore_memory_usage_total"
	PodNeuronDeviceHwEccEvents      = "pod_neurondevice_hw_ecc_events_total"
)

// Node-scoped Neuron metric names (values carry the "node_" prefix): the
// per-core and per-device names mirrored from the groups above, followed by
// the execution error, execution status, device memory and latency names that
// only exist at node scope in this list.
const (
	NodeNeuronCoreUtil                     = "node_neuroncore_utilization"
	NodeNeuronCoreMemUsageConstants        = "node_neuroncore_memory_usage_constants"
	NodeNeuronCoreMemUsageModel            = "node_neuroncore_memory_usage_model_code"
	NodeNeuronCoreMemUsageScratchpad       = "node_neuroncore_memory_usage_model_shared_scratchpad"
	NodeNeuronCoreMemUsageRuntime          = "node_neuroncore_memory_usage_runtime_memory"
	NodeNeuronCoreMemUsageTensors          = "node_neuroncore_memory_usage_tensors"
	NodeNeuronCoreMemUsageTotal            = "node_neuroncore_memory_usage_total"
	NodeNeuronDeviceHwEccEvents            = "node_neurondevice_hw_ecc_events_total"
	NodeExecutionErrorsTotal               = "node_neuron_execution_errors_total"
	NodeExecutionErrorsGeneric             = "node_neuron_execution_errors_generic"
	NodeExecutionErrorsNumerical           = "node_neuron_execution_errors_numerical"
	NodeExecutionErrorsTransient           = "node_neuron_execution_errors_transient"
	NodeExecutionErrorsModel               = "node_neuron_execution_errors_model"
	NodeExecutionErrorsRuntime             = "node_neuron_execution_errors_runtime"
	NodeExecutionErrorsHardware            = "node_neuron_execution_errors_hardware"
	NodeExecutionStatusCompleted           = "node_neuron_execution_status_completed"
	NodeExecutionStatusTimedOut            = "node_neuron_execution_status_timed_out"
	NodeExecutionStatusCompletedWithErr    = "node_neuron_execution_status_completed_with_err"
	NodeExecutionStatusCompletedWithNumErr = "node_neuron_execution_status_completed_with_num_err"
	NodeExecutionStatusIncorrectInput      = "node_neuron_execution_status_incorrect_input"
	NodeExecutionStatusFailedToQueue       = "node_neuron_execution_status_failed_to_queue"
	NodeNeuronDeviceRuntimeMemoryUsed      = "node_neurondevice_runtime_memory_used_bytes"
	NodeNeuronExecutionLatency             = "node_neuron_execution_latency"
)
testResult.Status = status.SUCCESSFUL return testResult } + +func ValidateLogsFrequency(env *environment.MetaData) status.TestResult { + + testResult := status.TestResult{ + Name: "emf-logs-frequency", + Status: status.FAILED, + } + + end := time.Now().Add(time.Duration(-2) * time.Minute).Truncate(time.Minute) + start := end.Add(time.Duration(-1) * time.Minute) + group := fmt.Sprintf("/aws/containerinsights/%s/performance", env.EKSClusterName) + + // need to get the instances used for the EKS cluster + eKSInstances, err := awsservice.GetEKSInstances(env.EKSClusterName) + if err != nil { + log.Println("failed to get EKS instances", err) + return testResult + } + + for _, instance := range eKSInstances { + stream := *instance.InstanceName + frequencyMap, err := awsservice.GetLogEventCountPerType(group, stream, &start, &end) + + for logType, expectedFrequency := range eks_resources.EksClusterFrequencyValidationMap { + log.Printf("logs with no logtype : %d", frequencyMap[awsservice.NoLogTypeFound]) + + actualFrequency, ok := frequencyMap[logType] + if !ok { + log.Printf("no log with the expected logtype found : %s, start time : %s", logType, start.GoString()) + return testResult + } + if actualFrequency != expectedFrequency { + log.Printf("log frequency validation failed for type: %s, expected: %d, actual: %d, start time: %s", logType, expectedFrequency, actualFrequency, start.GoString()) + return testResult + } + } + + if err != nil { + log.Printf("log validation (%s/%s) failed: %v, start time : %s", group, stream, err, start) + return testResult + } + } + + testResult.Status = status.SUCCESSFUL + return testResult +} diff --git a/test/metric_value_benchmark/eks_resources/test_schemas/container_neuroncore.json b/test/metric_value_benchmark/eks_resources/test_schemas/container_neuroncore.json new file mode 100644 index 000000000..c253fee52 --- /dev/null +++ b/test/metric_value_benchmark/eks_resources/test_schemas/container_neuroncore.json @@ -0,0 +1,50 @@ +{ + 
"$schema": "http://json-schema.org/draft-04/schema#", + "title": "structured log schema", + "description": "json schema for the cloudwatch agent k8s structured log", + "type": "object", + "properties": { + "CloudWatchMetrics": {}, + "ClusterName": {}, + "ContainerName": {}, + "FullPodName": {}, + "InstanceId": {}, + "InstanceType": {}, + "K8sPodName": {}, + "NeuronCore": {}, + "NeuronDevice": {}, + "NodeName": {}, + "PodName": {}, + "Service": {}, + "Timestamp": {}, + "Type": {}, + "Version": {}, + "availability_zone": {}, + "kubernetes": {}, + "region": {}, + "subnet_id": {}, + "container_neuroncore_memory_usage_constants": {}, + "container_neuroncore_memory_usage_model_code": {}, + "container_neuroncore_memory_usage_model_shared_scratchpad": {}, + "container_neuroncore_memory_usage_runtime_memory": {}, + "container_neuroncore_memory_usage_tensors": {}, + "container_neuroncore_memory_usage_total": {}, + "container_neuroncore_utilization": {} + }, + "required": [ + "ClusterName", + "ContainerName", + "FullPodName", + "InstanceId", + "InstanceType", + "NeuronCore", + "NeuronDevice", + "NodeName", + "PodName", + "Service", + "Timestamp", + "Type", + "Version", + "CloudWatchMetrics" + ] +} \ No newline at end of file diff --git a/test/metric_value_benchmark/eks_resources/test_schemas/node_neuron.json b/test/metric_value_benchmark/eks_resources/test_schemas/node_neuron.json new file mode 100644 index 000000000..b6c30b2da --- /dev/null +++ b/test/metric_value_benchmark/eks_resources/test_schemas/node_neuron.json @@ -0,0 +1,33 @@ +{ + "$schema": "http://json-schema.org/draft-04/schema#", + "title": "structured log schema", + "description": "json schema for the cloudwatch agent k8s structured log", + "type": "object", + "properties": { + "CloudWatchMetrics": {}, + "ClusterName": {}, + "InstanceId": {}, + "InstanceType": {}, + "NodeName": {}, + "Timestamp": {}, + "Type": {}, + "Version": {}, + "availability_zone": {}, + "kubernetes": {}, + "region": {}, + "subnet_id": {}, 
+ "node_neuron_execution_errors_total": {}, + "node_neuron_execution_latency": {}, + "node_neurondevice_runtime_memory_used_bytes": {} + }, + "required": [ + "ClusterName", + "InstanceId", + "InstanceType", + "NodeName", + "Timestamp", + "Type", + "Version", + "CloudWatchMetrics" + ] +} \ No newline at end of file diff --git a/test/metric_value_benchmark/eks_resources/test_schemas/node_neuroncore.json b/test/metric_value_benchmark/eks_resources/test_schemas/node_neuroncore.json new file mode 100644 index 000000000..3e9a04319 --- /dev/null +++ b/test/metric_value_benchmark/eks_resources/test_schemas/node_neuroncore.json @@ -0,0 +1,42 @@ +{ + "$schema": "http://json-schema.org/draft-04/schema#", + "title": "structured log schema", + "description": "json schema for the cloudwatch agent k8s structured log", + "type": "object", + "properties": { + "CloudWatchMetrics": {}, + "ClusterName": {}, + "InstanceId": {}, + "InstanceType": {}, + "Namespace": {}, + "NeuronCore": {}, + "NeuronDevice": {}, + "NodeName": {}, + "Timestamp": {}, + "Type": {}, + "Version": {}, + "availability_zone": {}, + "kubernetes": {}, + "region": {}, + "subnet_id": {}, + "node_neuroncore_memory_usage_constants": {}, + "node_neuroncore_memory_usage_model_code": {}, + "node_neuroncore_memory_usage_model_shared_scratchpad": {}, + "node_neuroncore_memory_usage_runtime_memory": {}, + "node_neuroncore_memory_usage_tensors": {}, + "node_neuroncore_memory_usage_total": {}, + "node_neuroncore_utilization": {} + }, + "required": [ + "ClusterName", + "InstanceId", + "InstanceType", + "NeuronCore", + "NeuronDevice", + "NodeName", + "Timestamp", + "Type", + "Version", + "CloudWatchMetrics" + ] +} \ No newline at end of file diff --git a/test/metric_value_benchmark/eks_resources/test_schemas/node_neurondevice.json b/test/metric_value_benchmark/eks_resources/test_schemas/node_neurondevice.json new file mode 100644 index 000000000..30b642531 --- /dev/null +++ 
b/test/metric_value_benchmark/eks_resources/test_schemas/node_neurondevice.json @@ -0,0 +1,34 @@ +{ + "$schema": "http://json-schema.org/draft-04/schema#", + "title": "structured log schema", + "description": "json schema for the cloudwatch agent k8s structured log", + "type": "object", + "properties": { + "CloudWatchMetrics": {}, + "ClusterName": {}, + "InstanceId": {}, + "InstanceType": {}, + "NeuronDevice": {}, + "NodeName": {}, + "Timestamp": {}, + "Type": {}, + "Version": {}, + "availability_zone": {}, + "kubernetes": {}, + "node_neurondevice_hw_ecc_events_mem_ecc_corrected": {}, + "region": {}, + "subnet_id": {}, + "node_neurondevice_hw_ecc_events_total": {} + }, + "required": [ + "ClusterName", + "InstanceId", + "InstanceType", + "NeuronDevice", + "NodeName", + "Timestamp", + "Type", + "Version", + "CloudWatchMetrics" + ] +} \ No newline at end of file diff --git a/test/metric_value_benchmark/eks_resources/test_schemas/pod_neuroncore.json b/test/metric_value_benchmark/eks_resources/test_schemas/pod_neuroncore.json new file mode 100644 index 000000000..9ba632799 --- /dev/null +++ b/test/metric_value_benchmark/eks_resources/test_schemas/pod_neuroncore.json @@ -0,0 +1,49 @@ +{ + "$schema": "http://json-schema.org/draft-04/schema#", + "title": "structured log schema", + "description": "json schema for the cloudwatch agent k8s structured log", + "type": "object", + "properties": { + "CloudWatchMetrics": {}, + "ClusterName": {}, + "FullPodName": {}, + "InstanceId": {}, + "InstanceType": {}, + "K8sPodName": {}, + "NeuronCore": {}, + "NeuronDevice": {}, + "NodeName": {}, + "PodName": {}, + "Service": {}, + "Timestamp": {}, + "Type": {}, + "Version": {}, + "availability_zone": {}, + "kubernetes": {}, + "region": {}, + "subnet_id": {}, + "pod_neuroncore_memory_usage_constants": {}, + "pod_neuroncore_memory_usage_model_code": {}, + "pod_neuroncore_memory_usage_model_shared_scratchpad": {}, + "pod_neuroncore_memory_usage_runtime_memory": {}, + 
"pod_neuroncore_memory_usage_tensors": {}, + "pod_neuroncore_memory_usage_total": {}, + "pod_neuroncore_utilization": {} + }, + "required": [ + "ClusterName", + "ContainerName", + "FullPodName", + "InstanceId", + "InstanceType", + "NeuronCore", + "NeuronDevice", + "NodeName", + "PodName", + "Service", + "Timestamp", + "Type", + "Version", + "CloudWatchMetrics" + ] +} \ No newline at end of file diff --git a/test/metric_value_benchmark/eks_resources/util.go b/test/metric_value_benchmark/eks_resources/util.go index 3ea59a1a0..bd737f85b 100644 --- a/test/metric_value_benchmark/eks_resources/util.go +++ b/test/metric_value_benchmark/eks_resources/util.go @@ -46,26 +46,47 @@ var ( eksNodeGpuSchema string //go:embed test_schemas/cluster_gpu.json eksClusterGpuSchema string + //go:embed test_schemas/container_neuroncore.json + eksContainerNeuronCoreSchema string + //go:embed test_schemas/pod_neuroncore.json + eksPodNeuronCoreSchema string + //go:embed test_schemas/node_neuroncore.json + eksNodeNeuronCoreSchema string + //go:embed test_schemas/node_neurondevice.json + eksNodeNeuronDeviceSchema string + //go:embed test_schemas/node_neuron.json + eksNodeNeuronSchema string EksClusterValidationMap = map[string]string{ - "Cluster": eksClusterSchema, - "ClusterDaemonSet": eksClusterDaemonsetSchema, - "ClusterDeployment": eksClusterDeploymentSchema, - "ClusterNamespace": eksClusterNamespaceSchema, - "ClusterService": eksClusterServiceSchema, - "Container": eksContainerSchema, - "ContainerFS": eksContainerFSSchema, - "ControlPlane": eksControlPlaneSchema, - "Node": eksNodeSchema, - "NodeDiskIO": eksNodeDiskIOSchema, - "NodeFS": eksNodeFSSchema, - "NodeNet": eksNodeNetSchema, - "Pod": eksPodSchema, - "PodNet": eksPodNetSchema, - "ContainerGPU": eksContainerGpuSchema, - "PodGPU": eksPodGpuSchema, - "NodeGPU": eksNodeGpuSchema, - "ClusterGPU": eksClusterGpuSchema, + "Cluster": eksClusterSchema, + "ClusterDaemonSet": eksClusterDaemonsetSchema, + "ClusterDeployment": 
eksClusterDeploymentSchema, + "ClusterNamespace": eksClusterNamespaceSchema, + "ClusterService": eksClusterServiceSchema, + "Container": eksContainerSchema, + "ContainerFS": eksContainerFSSchema, + "ControlPlane": eksControlPlaneSchema, + "Node": eksNodeSchema, + "NodeDiskIO": eksNodeDiskIOSchema, + "NodeFS": eksNodeFSSchema, + "NodeNet": eksNodeNetSchema, + "Pod": eksPodSchema, + "PodNet": eksPodNetSchema, + "ContainerGPU": eksContainerGpuSchema, + "PodGPU": eksPodGpuSchema, + "NodeGPU": eksNodeGpuSchema, + "ClusterGPU": eksClusterGpuSchema, + "ContainerAWSNeuronCore": eksContainerNeuronCoreSchema, + "PodAWSNeuronCore": eksPodNeuronCoreSchema, + "NodeAWSNeuronCore": eksNodeNeuronCoreSchema, + "NodeAWSNeuronDevice": eksNodeNeuronDeviceSchema, + "NodeAWSNeuron": eksNodeNeuronSchema, + } + + EksClusterFrequencyValidationMap = map[string]int{ + "NodeAWSNeuronCore": 32, + "NodeAWSNeuronDevice": 16, + "NodeAWSNeuron": 1, } ) diff --git a/util/awsservice/cloudwatchlogs.go b/util/awsservice/cloudwatchlogs.go index 10a9f830b..c801b7880 100644 --- a/util/awsservice/cloudwatchlogs.go +++ b/util/awsservice/cloudwatchlogs.go @@ -5,6 +5,7 @@ package awsservice import ( "context" + "encoding/json" "errors" "fmt" "log" @@ -20,6 +21,7 @@ import ( const ( logStreamRetry = 20 retryInterval = 10 * time.Second + NoLogTypeFound = "NoLogTypeFound" ) // catch ResourceNotFoundException when deleting the log group and log stream, as these @@ -320,3 +322,27 @@ func AssertNoDuplicateLogs() LogEventsValidator { return nil } } + +func GetLogEventCountPerType(logGroup, logStream string, since, until *time.Time) (map[string]int, error) { + var typeFrequency = make(map[string]int) + events, err := getLogsSince(logGroup, logStream, since, until) + + // if there is an error, return the empty map + if err != nil { + return typeFrequency, err + } + + typeFrequency[NoLogTypeFound] = 0 + for _, event := range events { + message := *event.Message + var eksClusterType EKSClusterType + innerErr := 
json.Unmarshal([]byte(message), &eksClusterType) + if innerErr != nil { + typeFrequency[NoLogTypeFound]++ + } + + typeFrequency[eksClusterType.Type]++ + } + + return typeFrequency, nil +}