Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(bpf): use time window for bpf sampling to replace per call based sampling #1723

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 46 additions & 20 deletions bpf/kepler.bpf.h
Original file line number Diff line number Diff line change
Expand Up @@ -167,11 +167,30 @@ SEC(".rodata.config")
__attribute__((btf_decl_tag(
"Hardware Events Enabled"))) static volatile const int HW = 1;

// The sampling rate should be disabled by default because its impact on the
// measurements is unknown.
// Global parameters for tracking periods (in milli seconds)
SEC(".rodata.config")
__attribute__((
btf_decl_tag("Sample Rate"))) static volatile const int SAMPLE_RATE = 0;
__attribute__((btf_decl_tag(
"Active Time"))) static volatile const int ACTIVE_TIME = 20;

// Global parameters for non-tracking periods (in milli seconds)
SEC(".rodata.config")
__attribute__((btf_decl_tag("Idle Time"))) static volatile const int IDLE_TIME = 80;

// BPF map to track whether we are in the tracking period or not
struct {
__uint(type, BPF_MAP_TYPE_ARRAY);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

consider using a PERCPU_ARRAY since this would have the effect of making the time window per-cpu also, which may be desirable.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

changed to PERCPU_ARRAY

__type(key, u32);
__type(value, u32);
__uint(max_entries, 1);
} tracking_flag_map SEC(".maps");

// BPF map to store the timestamp when the tracking started
struct {
__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
__type(key, u32);
__type(value, u64);
__uint(max_entries, 1);
} start_time_map SEC(".maps");

int counter_sched_switch = 0;

Expand Down Expand Up @@ -317,24 +336,31 @@ static inline int do_kepler_sched_switch_trace(

cpu_id = bpf_get_smp_processor_id();

// Skip some samples to minimize overhead
if (SAMPLE_RATE > 0) {
if (counter_sched_switch > 0) {
// update hardware counters to be used when sample is taken
if (counter_sched_switch == 1) {
collect_metrics_and_reset_counters(
&buf, prev_pid, curr_ts, cpu_id);
// Add task on-cpu running start time
bpf_map_update_elem(
&pid_time_map, &next_pid, &curr_ts,
BPF_ANY);
// create new process metrics
register_new_process_if_not_exist(next_tgid);
}
counter_sched_switch--;
// Retrieve tracking flag and start time
u32 key = 0;
u32 *tracking_flag = bpf_map_lookup_elem(&tracking_flag_map, &key);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

given that map lookups are the most expensive part of the eBPF code it would be better to reduce them where possible. there's no reason to store tracking_flag in a map as far as I can tell since it's value doesn't need to persist between invocations.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

there is a thinking of using kepler userspace program to set the tracking flag. The actual mechanism is not quite clear yet. Will remove this map if that is a dead end.

u64 *start_time = bpf_map_lookup_elem(&start_time_map, &key);

if (tracking_flag && start_time) {
u64 elapsed_time = (curr_ts - *start_time) / 1000000ULL;

// Update the tracking flag based on elapsed time
if (*tracking_flag && elapsed_time >= ACTIVE_TIME) {
// Stop tracking
*tracking_flag = 0;
// Reset start time
*start_time = curr_ts;
} else if (!*tracking_flag && elapsed_time >= IDLE_TIME) {
// Start tracking
*tracking_flag = 1;
// Reset start time
*start_time = curr_ts;
}

// If we are not in the tracking period, return immediately
if (!*tracking_flag) {
return 0;
}
counter_sched_switch = SAMPLE_RATE;
}

collect_metrics_and_reset_counters(&buf, prev_pid, curr_ts, cpu_id);
Expand Down
8 changes: 7 additions & 1 deletion pkg/bpf/exporter.go
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,13 @@ func (e *exporter) attach() error {

// Set program global variables
err = specs.RewriteConstants(map[string]interface{}{
"SAMPLE_RATE": int32(config.GetBPFSampleRate()),
"ACTIVE_TIME": int32(config.GetBPFActiveSampleWindowMS()),
})
if err != nil {
return fmt.Errorf("error rewriting program constants: %v", err)
}
err = specs.RewriteConstants(map[string]interface{}{
"IDLE_TIME": int32(config.GetBPFIdleSampleWindowMS()),
})
if err != nil {
return fmt.Errorf("error rewriting program constants: %v", err)
Expand Down
6 changes: 6 additions & 0 deletions pkg/bpf/kepler_bpfeb.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Binary file modified pkg/bpf/kepler_bpfeb.o
Binary file not shown.
6 changes: 6 additions & 0 deletions pkg/bpf/kepler_bpfel.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Binary file modified pkg/bpf/kepler_bpfel.o
Binary file not shown.
3 changes: 2 additions & 1 deletion pkg/bpftest/bpf_suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -330,7 +330,8 @@ var _ = Describe("BPF Exporter", func() {

err = specs.RewriteConstants(map[string]interface{}{
"TEST": int32(1),
"SAMPLE_RATE": int32(1000),
"ACTIVE_TIME": int32(1000),
"IDLE_TIME": int32(0),
})
Expect(err).NotTo(HaveOccurred())

Expand Down
6 changes: 6 additions & 0 deletions pkg/bpftest/test_bpfeb.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Binary file modified pkg/bpftest/test_bpfeb.o
Binary file not shown.
6 changes: 6 additions & 0 deletions pkg/bpftest/test_bpfel.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Binary file modified pkg/bpftest/test_bpfel.o
Binary file not shown.
29 changes: 21 additions & 8 deletions pkg/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -59,11 +59,12 @@ type KeplerConfig struct {
MockACPIPowerPath string
MaxLookupRetry int
KubeConfig string
BPFSampleRate int
EstimatorModel string
EstimatorSelectFilter string
CPUArchOverride string
MachineSpecFilePath string
BPFActiveSampleWindowMS int
BPFIdleSampleWindowMS int
}
type MetricsConfig struct {
CoreUsageMetric string
Expand Down Expand Up @@ -154,10 +155,11 @@ func getKeplerConfig() KeplerConfig {
MockACPIPowerPath: getConfig("MOCK_ACPI_POWER_PATH", ""),
MaxLookupRetry: getIntConfig("MAX_LOOKUP_RETRY", defaultMaxLookupRetry),
KubeConfig: getConfig("KUBE_CONFIG", defaultKubeConfig),
BPFSampleRate: getIntConfig("EXPERIMENTAL_BPF_SAMPLE_RATE", defaultBPFSampleRate),
EstimatorModel: getConfig("ESTIMATOR_MODEL", defaultMetricValue),
EstimatorSelectFilter: getConfig("ESTIMATOR_SELECT_FILTER", defaultMetricValue), // no filter
CPUArchOverride: getConfig("CPU_ARCH_OVERRIDE", defaultCPUArchOverride),
BPFActiveSampleWindowMS: getIntConfig("EXPERIMENTAL_BPF_ACTIVE_SAMPLE_WINDOW_MS", 1000),
BPFIdleSampleWindowMS: getIntConfig("EXPERIMENTAL_BPF_IDLE_SAMPLE_WINDOW_MS", 0),
}
}

Expand Down Expand Up @@ -261,7 +263,8 @@ func logBoolConfigs() {
klog.V(5).Infof("EXPOSE_BPF_METRICS: %t", instance.Kepler.ExposeBPFMetrics)
klog.V(5).Infof("EXPOSE_COMPONENT_POWER: %t", instance.Kepler.ExposeComponentPower)
klog.V(5).Infof("EXPOSE_ESTIMATED_IDLE_POWER_METRICS: %t. This only impacts when the power is estimated using pre-prained models. Estimated idle power is meaningful only when Kepler is running on bare-metal or with a single virtual machine (VM) on the node.", instance.Kepler.ExposeIdlePowerMetrics)
klog.V(5).Infof("EXPERIMENTAL_BPF_SAMPLE_RATE: %d", instance.Kepler.BPFSampleRate)
klog.V(5).Infof("EXPERIMENTAL_BPF_ACTIVE_SAMPLE_WINDOW_MS: %d", instance.Kepler.BPFActiveSampleWindowMS)
klog.V(5).Infof("EXPERIMENTAL_BPF_IDLE_SAMPLE_WINDOW_MS: %d", instance.Kepler.BPFIdleSampleWindowMS)
}
}

Expand Down Expand Up @@ -395,6 +398,21 @@ func SetGPUUsageMetric(metric string) {
instance.Metrics.GPUUsageMetric = metric
}

func GetBPFActiveSampleWindowMS() int {
ensureConfigInitialized()
return instance.Kepler.BPFActiveSampleWindowMS
}

func GetBPFIdleSampleWindowMS() int {
ensureConfigInitialized()
return instance.Kepler.BPFIdleSampleWindowMS
}

func GetDCGMHostEngineEndpoint() string {
ensureConfigInitialized()
return instance.DCGMHostEngineEndpoint
}

func (c *Config) getUnixName() (unix.Utsname, error) {
var utsname unix.Utsname
err := unix.Uname(&utsname)
Expand Down Expand Up @@ -552,11 +570,6 @@ func ExposeIRQCounterMetrics() bool {
return instance.Kepler.ExposeIRQCounterMetrics
}

func GetBPFSampleRate() int {
ensureConfigInitialized()
return instance.Kepler.BPFSampleRate
}

func GetRedfishCredFilePath() string {
ensureConfigInitialized()
return instance.Redfish.CredFilePath
Expand Down
4 changes: 2 additions & 2 deletions pkg/sensors/accelerator/device/sources/dcgm.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ type GPUDcgm struct {
}

func init() {
if _, err := dcgm.Init(dcgm.Standalone, config.DCGMHostEngineEndpoint, isSocket); err != nil {
if _, err := dcgm.Init(dcgm.Standalone, config.GetDCGMHostEngineEndpoint(), isSocket); err != nil {
klog.Errorf("Error initializing dcgm: %v", err)
return
}
Expand Down Expand Up @@ -131,7 +131,7 @@ func (d *GPUDcgm) InitLib() (err error) {
err = fmt.Errorf("could not init dcgm: %v", r)
}
}()
cleanup, err := dcgm.Init(dcgm.Standalone, config.DCGMHostEngineEndpoint, isSocket)
cleanup, err := dcgm.Init(dcgm.Standalone, config.GetDCGMHostEngineEndpoint(), isSocket)
if err != nil {
klog.Infof("There is no DCGM daemon running in the host: %s", err)
// embedded mode is not recommended for production per https://github.com/NVIDIA/dcgm-exporter/issues/22#issuecomment-1321521995
Expand Down
Loading