Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add gpu info to health state [TAB-162] #364

Merged
merged 15 commits into from
Aug 21, 2023
89 changes: 0 additions & 89 deletions crates/tabby/src/serve/context.rs

This file was deleted.

45 changes: 40 additions & 5 deletions crates/tabby/src/serve/health.rs
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
use std::{env::consts::ARCH, sync::Arc};

use anyhow::{Ok, Result};
use axum::{extract::State, Json};
use nvml_wrapper::Nvml;
use serde::{Deserialize, Serialize};
use sysinfo::{CpuExt, System, SystemExt};
use utoipa::ToSchema;

use super::context::TabbyContext;

#[derive(Serialize, Deserialize, ToSchema, Clone, Debug)]
pub struct HealthState {
model: String,
Expand All @@ -19,10 +20,10 @@ pub struct HealthState {
}

impl HealthState {
pub fn new(args: &super::ServeArgs, context: &mut TabbyContext) -> Self {
let cpu_stats = context.cpu_stats_manager.get_stats();
pub fn new(args: &super::ServeArgs) -> Self {
let cpu_stats = get_cpu_stats();

let gpu_info_res = context.gpu_stats_manager.get_stats();
let gpu_info_res = get_gpu_stats();
let gpu_info = match gpu_info_res {
Ok(s) => s,
Err(_) => vec![],
Expand All @@ -41,6 +42,40 @@ impl HealthState {
}
}

/// Summary of the host CPUs as gathered by `get_cpu_stats`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CPUStat {
    /// Brand string of the first reported CPU, or `"unknown"` when none is reported.
    pub info: String,
    /// Number of CPUs reported by the system.
    pub count: usize,
}

fn get_cpu_stats() -> CPUStat {
let mut system = System::new_all();
let cpus = system.cpus();
let count = cpus.len();
let info = if count > 0 {
let cpu = cpus[0];
cpu.brand().to_string()
} else {
"unknown".to_string()
};

CPUStat { info, count }
vodkaslime marked this conversation as resolved.
Show resolved Hide resolved
}

fn get_gpu_stats() -> Result<Vec<String>> {
vodkaslime marked this conversation as resolved.
Show resolved Hide resolved
// In cases of MacOS or docker containers where --gpus are not specified,
// the Nvml::init() would return an error. In these scenarios, we
// assign gpu_info to be empty, indicating that the current runtime
// environment does not support cuda interface.
let nvml = Nvml::init()?;
let mut gpu_info = vec![];
let device_count = nvml.device_count()?;
for i in 0..device_count {
let name = nvml.device_by_index(i)?.name()?;
gpu_info.push(name);
}
Ok(gpu_info)
}

#[derive(Serialize, Deserialize, ToSchema, Clone, Debug)]
pub struct Version {
build_date: String,
Expand Down
17 changes: 7 additions & 10 deletions crates/tabby/src/serve/mod.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
mod completions;
mod context;
mod events;
mod health;

Expand All @@ -19,7 +18,7 @@ use tracing::info;
use utoipa::OpenApi;
use utoipa_swagger_ui::SwaggerUi;

use self::{context::TabbyContext, health::HealthState};
use self::health::HealthState;
use crate::fatal;

#[derive(OpenApi)]
Expand Down Expand Up @@ -114,7 +113,6 @@ pub struct ServeArgs {

pub async fn main(config: &Config, args: &ServeArgs) {
valid_args(args);
let mut context = TabbyContext::new();

// Ensure model exists.
tabby_download::download_model(&args.model, true)
Expand All @@ -129,26 +127,25 @@ pub async fn main(config: &Config, args: &ServeArgs) {

let app = Router::new()
.merge(SwaggerUi::new("/swagger-ui").url("/api-docs/openapi.json", ApiDoc::openapi()))
.nest("/v1", api_router(args, config, &mut context))
.nest("/v1", api_router(args, config))
.fallback(fallback());

let address = SocketAddr::from((Ipv4Addr::UNSPECIFIED, args.port));
info!("Listening at {}", address);

start_heartbeat(args, &mut context);
start_heartbeat(args);
Server::bind(&address)
.serve(app.into_make_service())
.await
.unwrap_or_else(|err| fatal!("Error happens during serving: {}", err))
}

fn api_router(args: &ServeArgs, config: &Config, context: &mut TabbyContext) -> Router {
fn api_router(args: &ServeArgs, config: &Config) -> Router {
Router::new()
.route("/events", routing::post(events::log_event))
.route(
"/health",
routing::post(health::health)
.with_state(Arc::new(health::HealthState::new(args, context))),
routing::post(health::health).with_state(Arc::new(health::HealthState::new(args))),
)
.route(
"/completions",
Expand Down Expand Up @@ -181,8 +178,8 @@ fn valid_args(args: &ServeArgs) {
}
}

fn start_heartbeat(args: &ServeArgs, context: &mut TabbyContext) {
let state = HealthState::new(args, context);
fn start_heartbeat(args: &ServeArgs) {
let state = HealthState::new(args);
tokio::spawn(async move {
loop {
usage::capture("ServeHealth", &state).await;
Expand Down
Loading