Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: mgr/devicehealth: added life_expectancy_response() #61

Open
wants to merge 10 commits into
base: wip-devicehealth
Choose a base branch
from
5 changes: 0 additions & 5 deletions src/common/blkdev.cc
Original file line number Diff line number Diff line change
Expand Up @@ -384,12 +384,10 @@ std::string get_device_id(const std::string& devname)

udev = udev_new();
if (!udev) {
//derr << "failed to run udev_new(), when calling for device " << devname << dendl;
return {};
}
dev = udev_device_new_from_subsystem_sysname(udev, "block", devname.c_str());
if (!dev) {
//derr << "failed to run udev_device_new_from_subsystem_sysname() for " << devname << dendl;
udev_unref(udev);
return {};
}
Expand All @@ -405,15 +403,13 @@ std::string get_device_id(const std::string& devname)
udev_unref(udev);

if (!device_id.empty()) {
//dout << devname << " serial number: " << data << dendl;
std::replace(device_id.begin(), device_id.end(), ' ', '_');
return device_id;
}

// udev_device_get_property_value() either failed or succeeded but
// returned nothing, so fall back to reading the model and serial from
// files. note that the 'vendor' file rarely contains the actual vendor;
// it's usually 'ATA'.
//derr << "udev could not retrieve serial number of " << devname << dendl;
std::string model, serial;
model = get_block_device_string_property_wrap(devname, "device/model");
serial = get_block_device_string_property_wrap(devname, "device/serial");
Expand All @@ -434,7 +430,6 @@ std::string get_block_device_string_property_wrap(const std::string &devname,
std::string prop_val;
int ret = get_block_device_string_property(devname.c_str(), property.c_str(), buff, sizeof(buff));
if (ret < 0) {
//derr << "Could not retrieve content of " << property << " file of " << devname << dendl;
return {};
}
prop_val = buff;
Expand Down
2 changes: 1 addition & 1 deletion src/common/options.cc
Original file line number Diff line number Diff line change
Expand Up @@ -4749,7 +4749,7 @@ std::vector<Option> get_global_options() {
.set_description("Filesystem path to manager modules."),

Option("mgr_initial_modules", Option::TYPE_STR, Option::LEVEL_BASIC)
.set_default("restful status balancer iostat")
.set_default("restful status balancer iostat devicehealth")
.set_flag(Option::FLAG_NO_MON_UPDATE)
.set_flag(Option::FLAG_CLUSTER_CREATE)
.add_service("mon")
Expand Down
16 changes: 12 additions & 4 deletions src/mgr/ActivePyModules.cc
Original file line number Diff line number Diff line change
Expand Up @@ -281,10 +281,10 @@ PyObject *ActivePyModules::get_python(const std::string &what)
return f.get();
} else if (what == "pg_dump") {
PyFormatter f;
cluster_state.with_pgmap(
[&f](const PGMap &pg_map) {
pg_map.dump(&f);
}
cluster_state.with_pgmap(
[&f](const PGMap &pg_map) {
pg_map.dump(&f);
}
);
return f.get();
} else if (what == "devices") {
Expand All @@ -295,6 +295,14 @@ PyObject *ActivePyModules::get_python(const std::string &what)
});
f.close_section();
return f.get();
} else if (what.size() > 7 &&
what.substr(0, 7) == "device ") {
string devid = what.substr(7);
PyFormatter f;
daemon_state.with_device(devid, [&f] (const DeviceState& dev) {
f.dump_object("device", dev);
});
return f.get();
} else if (what == "io_rate") {
PyFormatter f;
cluster_state.with_pgmap(
Expand Down
38 changes: 28 additions & 10 deletions src/osd/OSD.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2243,7 +2243,9 @@ will start to track new ops received afterwards.";
}
f->close_section();
} else if (admin_command == "smart") {
probe_smart(ss);
string devid;
cmd_getval(cct, cmdmap, "devid", devid);
probe_smart(devid, ss);
} else if (admin_command == "list_devices") {
set<string> devnames;
store->get_devices(&devnames);
Expand Down Expand Up @@ -6171,7 +6173,9 @@ void OSD::do_command(Connection *con, ceph_tid_t tid, vector<string>& cmd, buffe
}

else if (prefix == "smart") {
probe_smart(ds);
string devid;
cmd_getval(cct, cmdmap, "devid", devid);
probe_smart(devid, ds);
}

else {
Expand All @@ -6192,14 +6196,15 @@ void OSD::do_command(Connection *con, ceph_tid_t tid, vector<string>& cmd, buffe
}
}

void OSD::probe_smart(ostream& ss)
void OSD::probe_smart(const string& only_devid, ostream& ss)
{
set<string> devnames;
store->get_devices(&devnames);
uint64_t smart_timeout = cct->_conf->get_val<uint64_t>("osd_smart_report_timeout");
std::string result;
uint64_t smart_timeout = cct->_conf->get_val<uint64_t>(
"osd_smart_report_timeout");

json_spirit::mObject json_map; // == typedef std::map<std::string, mValue> mObject;
// == typedef std::map<std::string, mValue> mObject;
json_spirit::mObject json_map;
json_spirit::mValue smart_json;

for (auto dev : devnames) {
Expand All @@ -6208,16 +6213,28 @@ void OSD::probe_smart(ostream& ss)
continue;
}

string devid = get_device_id(dev);
if (devid.size() == 0) {
dout(10) << __func__ << " no unique id for dev " << dev << ", skipping"
<< dendl;
continue;
}
if (only_devid.size() && devid != only_devid) {
continue;
}

std::string result;
if (probe_smart_device(("/dev/" + dev).c_str(), smart_timeout, &result)) {
dout(10) << "probe_smart_device failed for /dev/" << dev << dendl;
continue;
//continue;
result = "{\"error\": \"smartctl failed\", \"dev\": \"" + dev + "\"}";
}

// TODO: change to read_or_throw?
if (!json_spirit::read(result, smart_json)) {
derr << "smartctl JSON output of /dev/" + dev + " is invalid" << dendl;
} else { //json is valid, assigning
json_map[dev] = smart_json;
json_map[devid] = smart_json;
}
// no need to result.clear() or clear smart_json
}
Expand All @@ -6227,8 +6244,9 @@ void OSD::probe_smart(ostream& ss)
int OSD::probe_smart_device(const char *device, int timeout, std::string *result)
{
// when using --json, smartctl will report its errors in JSON format to stdout
SubProcessTimed smartctl("sudo", SubProcess::CLOSE, SubProcess::PIPE, SubProcess::CLOSE,
timeout);
SubProcessTimed smartctl(
"sudo", SubProcess::CLOSE, SubProcess::PIPE, SubProcess::CLOSE,
timeout);
smartctl.add_cmd_args(
"smartctl",
"-a",
Expand Down
2 changes: 1 addition & 1 deletion src/osd/OSD.h
Original file line number Diff line number Diff line change
Expand Up @@ -2208,7 +2208,7 @@ class OSD : public Dispatcher,

float get_osd_recovery_sleep();

void probe_smart(ostream& ss);
void probe_smart(const string& devid, ostream& ss);
int probe_smart_device(const char *device, int timeout, std::string *result);

public:
Expand Down
2 changes: 2 additions & 0 deletions src/pybind/mgr/devicehealth/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@

from .module import Module
Loading