Skip to content

Commit

Permalink
Merge pull request #1320 from bharathappali/gpu-support-pr-6
Browse files Browse the repository at this point in the history
Adds Accelerator recommendation generation logic
  • Loading branch information
dinogun authored Oct 8, 2024
2 parents 9014bbd + c6418e5 commit 8a8753e
Show file tree
Hide file tree
Showing 8 changed files with 220 additions and 9 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -738,6 +738,9 @@ public static class PercentileConstants {
public static final Integer TWENTYFIVE_PERCENTILE = 25;
public static final Integer SEVENTYFIVE_PERCENTILE = 75;
public static final Integer FIFTY_PERCENTILE = 50;
public static final Integer COST_ACCELERATOR_PERCENTILE = 60;
public static final Integer PERFORMANCE_ACCELERATOR_PERCENTILE = 98;

}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -698,6 +698,7 @@ private MappedRecommendationForModel generateRecommendationBasedOnModel(Timestam
// Get the Recommendation Items
RecommendationConfigItem recommendationCpuRequest = model.getCPURequestRecommendation(filteredResultsMap, notifications);
RecommendationConfigItem recommendationMemRequest = model.getMemoryRequestRecommendation(filteredResultsMap, notifications);
Map<AnalyzerConstants.RecommendationItem, RecommendationConfigItem> recommendationAcceleratorRequestMap = model.getAcceleratorRequestRecommendation(filteredResultsMap, notifications);

// Get the Recommendation Items
// Calling requests on limits as we are maintaining limits and requests as same
Expand Down Expand Up @@ -728,7 +729,8 @@ private MappedRecommendationForModel generateRecommendationBasedOnModel(Timestam
internalMapToPopulate,
numPods,
cpuThreshold,
memoryThreshold
memoryThreshold,
recommendationAcceleratorRequestMap
);
} else {
RecommendationNotification notification = new RecommendationNotification(
Expand Down Expand Up @@ -1077,7 +1079,8 @@ private MappedRecommendationForModel generateNamespaceRecommendationBasedOnModel
internalMapToPopulate,
numPodsInNamespace,
namespaceCpuThreshold,
namespaceMemoryThreshold
namespaceMemoryThreshold,
null
);
} else {
RecommendationNotification notification = new RecommendationNotification(
Expand All @@ -1100,13 +1103,17 @@ private MappedRecommendationForModel generateNamespaceRecommendationBasedOnModel
* @param numPods The number of pods to consider for the recommendation.
* @param cpuThreshold The CPU usage threshold for the recommendation.
* @param memoryThreshold The memory usage threshold for the recommendation.
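* @param recommendationAcceleratorRequestMap Accelerator recommendations keyed by recommendation item; may be {@code null}
*                                            or empty (e.g. for namespace recommendations), in which case nothing is added to the limits.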
* @return {@code true} if the internal map was successfully populated; {@code false} otherwise.
*/
private boolean populateRecommendation(Map.Entry<String, Terms> termEntry,
MappedRecommendationForModel recommendationModel,
ArrayList<RecommendationNotification> notifications,
HashMap<String, RecommendationConfigItem> internalMapToPopulate,
int numPods, double cpuThreshold, double memoryThreshold) {
int numPods,
double cpuThreshold,
double memoryThreshold,
Map<AnalyzerConstants.RecommendationItem, RecommendationConfigItem> recommendationAcceleratorRequestMap) {
// Check for cpu & memory Thresholds (Duplicate check if the caller is generateRecommendations)
String recommendationTerm = termEntry.getKey();
double hours = termEntry.getValue().getDays() * KruizeConstants.TimeConv.NO_OF_HOURS_PER_DAY * KruizeConstants.TimeConv.
Expand Down Expand Up @@ -1690,6 +1697,11 @@ private boolean populateRecommendation(Map.Entry<String, Terms> termEntry,
config.put(AnalyzerConstants.ResourceSetting.requests, requestsMap);
}

// Check if accelerator map is not empty and add to limits map
if (null != recommendationAcceleratorRequestMap && !recommendationAcceleratorRequestMap.isEmpty()) {
limitsMap.putAll(recommendationAcceleratorRequestMap);
}

// Set Limits Map
if (!limitsMap.isEmpty()) {
config.put(AnalyzerConstants.ResourceSetting.limits, limitsMap);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,21 @@
import com.autotune.analyzer.recommendations.RecommendationConfigItem;
import com.autotune.analyzer.recommendations.RecommendationConstants;
import com.autotune.analyzer.recommendations.RecommendationNotification;
import com.autotune.analyzer.recommendations.utils.RecommendationUtils;
import com.autotune.analyzer.utils.AnalyzerConstants;
import com.autotune.common.data.metrics.AcceleratorMetricResult;
import com.autotune.common.data.metrics.MetricAggregationInfoResults;
import com.autotune.common.data.metrics.MetricResults;
import com.autotune.common.data.result.IntervalResults;
import com.autotune.common.data.system.info.device.accelerator.metadata.AcceleratorMetaDataService;
import com.autotune.common.data.system.info.device.accelerator.metadata.AcceleratorProfile;
import com.autotune.common.utils.CommonUtils;
import com.autotune.utils.KruizeConstants;
import org.json.JSONArray;
import org.json.JSONObject;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import software.amazon.awssdk.services.cloudwatchlogs.endpoints.internal.Value;

import java.sql.Timestamp;
import java.util.*;
Expand All @@ -22,6 +27,8 @@

import static com.autotune.analyzer.recommendations.RecommendationConstants.RecommendationEngine.PercentileConstants.COST_CPU_PERCENTILE;
import static com.autotune.analyzer.recommendations.RecommendationConstants.RecommendationEngine.PercentileConstants.COST_MEMORY_PERCENTILE;
import static com.autotune.analyzer.recommendations.RecommendationConstants.RecommendationEngine.PercentileConstants.COST_ACCELERATOR_PERCENTILE;

import static com.autotune.analyzer.recommendations.RecommendationConstants.RecommendationValueConstants.*;

public class CostBasedRecommendationModel implements RecommendationModel {
Expand Down Expand Up @@ -505,6 +512,80 @@ public RecommendationConfigItem getMemoryRequestRecommendationForNamespace(Map<T
return recommendationConfigItem;
}

/**
 * Builds the accelerator (GPU) recommendation for the cost model.
 * <p>
 * Walks every interval in {@code filteredResultsMap}, collects the positive {@code max}
 * aggregation values of the {@code gpuCoreUsage} and {@code gpuMemoryUsage} metrics, takes the
 * {@code COST_ACCELERATOR_PERCENTILE} of each series, divides by 100 to obtain fractions and
 * delegates to {@link RecommendationUtils#getMapWithOptimalProfile} together with the first
 * Kruize-supported accelerator model name that was found.
 *
 * @param filteredResultsMap interval results keyed by timestamp
 * @param notifications      notification list; not used by this method
 * @return the map produced by {@code RecommendationUtils.getMapWithOptimalProfile}, or
 *         {@code null} when no interval carries an accelerator metric map
 */
@Override
public Map<AnalyzerConstants.RecommendationItem, RecommendationConfigItem> getAcceleratorRequestRecommendation(
        Map<Timestamp, IntervalResults> filteredResultsMap,
        ArrayList<RecommendationNotification> notifications
) {
    List<Double> acceleratorCoreMaxValues = new ArrayList<>();
    List<Double> acceleratorMemoryMaxValues = new ArrayList<>();

    boolean isGpuWorkload = false;
    String acceleratorModel = null;

    for (IntervalResults intervalResults : filteredResultsMap.values()) {
        Map<AnalyzerConstants.MetricName, AcceleratorMetricResult> acceleratorMetrics =
                intervalResults.getAcceleratorMetricResultHashMap();

        // Skip if accelerator map is null
        if (null == acceleratorMetrics) {
            continue;
        }

        isGpuWorkload = true;
        for (Map.Entry<AnalyzerConstants.MetricName, AcceleratorMetricResult> gpuEntry : acceleratorMetrics.entrySet()) {
            AcceleratorMetricResult gpuMetricResult = gpuEntry.getValue();

            // A missing result carries neither a model name nor metrics; skip it instead of failing
            if (null == gpuMetricResult) {
                continue;
            }

            // Set Accelerator name (only the first supported model is kept)
            // TODO: Need to handle separate processing in case of container supporting multiple accelerators
            // Device data is null-checked so an entry without it does not abort the whole recommendation
            if (null == acceleratorModel && null != gpuMetricResult.getAcceleratorDeviceData()) {
                String modelName = gpuMetricResult.getAcceleratorDeviceData().getModelName();
                if (null != modelName
                        && !modelName.isEmpty()
                        && CommonUtils.checkIfModelIsKruizeSupportedMIG(modelName)) {
                    // May still be null; in that case a later entry gets another chance
                    acceleratorModel = CommonUtils.getSupportedModelBasedOnModelName(modelName);
                }
            }

            MetricResults metricResults = gpuMetricResult.getMetricResults();

            // Skip if metric results is null
            if (null == metricResults || null == metricResults.getAggregationInfoResult()) {
                continue;
            }

            Double max = metricResults.getAggregationInfoResult().getMax();

            // Skip if max is null or zero or negative
            if (null == max || max <= 0.0) {
                continue;
            }

            // Only the two accelerator usage metrics contribute; everything else is ignored
            if (gpuEntry.getKey() == AnalyzerConstants.MetricName.gpuCoreUsage) {
                acceleratorCoreMaxValues.add(max);
            } else if (gpuEntry.getKey() == AnalyzerConstants.MetricName.gpuMemoryUsage) {
                acceleratorMemoryMaxValues.add(max);
            }
        }
    }

    if (!isGpuWorkload) {
        return null;
    }

    // NOTE(review): either list can be empty here if no positive max was recorded;
    // confirm CommonUtils.percentile tolerates an empty list.
    double corePercentile = CommonUtils.percentile(COST_ACCELERATOR_PERCENTILE, acceleratorCoreMaxValues);
    double memoryPercentile = CommonUtils.percentile(COST_ACCELERATOR_PERCENTILE, acceleratorMemoryMaxValues);

    // Usage values are scaled down by 100 to get fractions (presumably percentages - TODO confirm)
    double coreFraction = corePercentile / 100;
    double memoryFraction = memoryPercentile / 100;

    return RecommendationUtils.getMapWithOptimalProfile(acceleratorModel, coreFraction, memoryFraction);
}

public static JSONObject calculateNamespaceMemoryUsage(IntervalResults intervalResults) {
// create a JSON object which should be returned here having two values, Math.max and Collections.Min
JSONObject jsonObject = new JSONObject();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,10 @@
import com.autotune.analyzer.recommendations.RecommendationConfigItem;
import com.autotune.analyzer.recommendations.RecommendationConstants;
import com.autotune.analyzer.recommendations.RecommendationNotification;
import com.autotune.analyzer.recommendations.utils.RecommendationUtils;
import com.autotune.analyzer.services.UpdateRecommendations;
import com.autotune.analyzer.utils.AnalyzerConstants;
import com.autotune.common.data.metrics.AcceleratorMetricResult;
import com.autotune.common.data.metrics.MetricAggregationInfoResults;
import com.autotune.common.data.metrics.MetricResults;
import com.autotune.common.data.result.IntervalResults;
Expand All @@ -19,8 +21,8 @@
import java.util.*;
import java.util.stream.Collectors;

import static com.autotune.analyzer.recommendations.RecommendationConstants.RecommendationEngine.PercentileConstants.PERFORMANCE_CPU_PERCENTILE;
import static com.autotune.analyzer.recommendations.RecommendationConstants.RecommendationEngine.PercentileConstants.PERFORMANCE_MEMORY_PERCENTILE;
import static com.autotune.analyzer.recommendations.RecommendationConstants.RecommendationEngine.PercentileConstants.*;
import static com.autotune.analyzer.recommendations.RecommendationConstants.RecommendationEngine.PercentileConstants.PERFORMANCE_ACCELERATOR_PERCENTILE;
import static com.autotune.analyzer.recommendations.RecommendationConstants.RecommendationValueConstants.*;

public class PerformanceBasedRecommendationModel implements RecommendationModel {
Expand Down Expand Up @@ -372,6 +374,76 @@ public RecommendationConfigItem getMemoryRequestRecommendationForNamespace(Map<T
return recommendationConfigItem;
}

/**
 * Builds the accelerator (GPU) recommendation for the performance model.
 * <p>
 * Walks every interval in {@code filteredResultsMap}, collects the positive {@code max}
 * aggregation values of the {@code gpuCoreUsage} and {@code gpuMemoryUsage} metrics, takes the
 * {@code PERFORMANCE_ACCELERATOR_PERCENTILE} of each series, divides by 100 to obtain fractions
 * and delegates to {@link RecommendationUtils#getMapWithOptimalProfile} together with the first
 * Kruize-supported accelerator model name that was found.
 *
 * @param filteredResultsMap interval results keyed by timestamp
 * @param notifications      notification list; not used by this method
 * @return the map produced by {@code RecommendationUtils.getMapWithOptimalProfile}, or
 *         {@code null} when no interval carries an accelerator metric map
 */
@Override
public Map<AnalyzerConstants.RecommendationItem, RecommendationConfigItem> getAcceleratorRequestRecommendation(Map<Timestamp, IntervalResults> filteredResultsMap, ArrayList<RecommendationNotification> notifications) {
    List<Double> acceleratorCoreMaxValues = new ArrayList<>();
    List<Double> acceleratorMemoryMaxValues = new ArrayList<>();

    boolean isGpuWorkload = false;
    String acceleratorModel = null;

    for (IntervalResults intervalResults : filteredResultsMap.values()) {
        Map<AnalyzerConstants.MetricName, AcceleratorMetricResult> acceleratorMetrics =
                intervalResults.getAcceleratorMetricResultHashMap();

        // Skip if accelerator map is null
        if (null == acceleratorMetrics) {
            continue;
        }

        isGpuWorkload = true;
        for (Map.Entry<AnalyzerConstants.MetricName, AcceleratorMetricResult> gpuEntry : acceleratorMetrics.entrySet()) {
            AcceleratorMetricResult gpuMetricResult = gpuEntry.getValue();

            // A missing result carries neither a model name nor metrics; skip it instead of failing
            if (null == gpuMetricResult) {
                continue;
            }

            // Set Accelerator name (only the first supported model is kept)
            // Device data is null-checked so an entry without it does not abort the whole recommendation
            if (null == acceleratorModel && null != gpuMetricResult.getAcceleratorDeviceData()) {
                String modelName = gpuMetricResult.getAcceleratorDeviceData().getModelName();
                if (null != modelName
                        && !modelName.isEmpty()
                        && CommonUtils.checkIfModelIsKruizeSupportedMIG(modelName)) {
                    // May still be null; in that case a later entry gets another chance
                    acceleratorModel = CommonUtils.getSupportedModelBasedOnModelName(modelName);
                }
            }

            MetricResults metricResults = gpuMetricResult.getMetricResults();

            // Skip if metric results is null
            if (null == metricResults || null == metricResults.getAggregationInfoResult()) {
                continue;
            }

            Double max = metricResults.getAggregationInfoResult().getMax();

            // Skip if max is null or zero or negative
            if (null == max || max <= 0.0) {
                continue;
            }

            // Only the two accelerator usage metrics contribute; everything else is ignored
            if (gpuEntry.getKey() == AnalyzerConstants.MetricName.gpuCoreUsage) {
                acceleratorCoreMaxValues.add(max);
            } else if (gpuEntry.getKey() == AnalyzerConstants.MetricName.gpuMemoryUsage) {
                acceleratorMemoryMaxValues.add(max);
            }
        }
    }

    if (!isGpuWorkload) {
        return null;
    }

    // NOTE(review): either list can be empty here if no positive max was recorded;
    // confirm CommonUtils.percentile tolerates an empty list.
    double corePercentile = CommonUtils.percentile(PERFORMANCE_ACCELERATOR_PERCENTILE, acceleratorCoreMaxValues);
    double memoryPercentile = CommonUtils.percentile(PERFORMANCE_ACCELERATOR_PERCENTILE, acceleratorMemoryMaxValues);

    // Usage values are scaled down by 100 to get fractions (presumably percentages - TODO confirm)
    double coreFraction = corePercentile / 100;
    double memoryFraction = memoryPercentile / 100;

    return RecommendationUtils.getMapWithOptimalProfile(acceleratorModel, coreFraction, memoryFraction);
}

@Override
public String getModelName() {
return this.name;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import com.autotune.analyzer.recommendations.RecommendationConfigItem;
import com.autotune.analyzer.recommendations.RecommendationNotification;
import com.autotune.analyzer.utils.AnalyzerConstants;
import com.autotune.common.data.result.IntervalResults;

import java.sql.Timestamp;
Expand All @@ -17,6 +18,8 @@ public interface RecommendationModel {
// get namespace recommendations for Memory Request
RecommendationConfigItem getMemoryRequestRecommendationForNamespace(Map<Timestamp, IntervalResults> filteredResultsMap, ArrayList<RecommendationNotification> notifications);

// get container recommendations for Accelerator (GPU) requests, keyed by recommendation item
// NOTE: implementations may return null (e.g. when the workload has no accelerator metrics),
// so callers must null-check the result before using it
Map<AnalyzerConstants.RecommendationItem, RecommendationConfigItem> getAcceleratorRequestRecommendation(Map<Timestamp, IntervalResults> filteredResultsMap, ArrayList<RecommendationNotification> notifications);

public String getModelName();
void validate();

Expand Down
Loading

0 comments on commit 8a8753e

Please sign in to comment.