Skip to content

Commit

Permalink
Added Stopped state
Browse files Browse the repository at this point in the history
  • Loading branch information
SammyVimes committed Dec 19, 2024
1 parent 842c65b commit c7f6e33
Show file tree
Hide file tree
Showing 23 changed files with 111 additions and 12 deletions.
5 changes: 3 additions & 2 deletions ydb/apps/dstool/lib/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import ydb.apps.dstool.lib.dstool_cmd_pdisk_remove_by_serial as pdisk_remove_by_serial
import ydb.apps.dstool.lib.dstool_cmd_pdisk_set as pdisk_set
import ydb.apps.dstool.lib.dstool_cmd_pdisk_list as pdisk_list
import ydb.apps.dstool.lib.dstool_cmd_pdisk_stop as pdisk_stop

import ydb.apps.dstool.lib.dstool_cmd_vdisk_evict as vdisk_evict
import ydb.apps.dstool.lib.dstool_cmd_vdisk_list as vdisk_list
Expand Down Expand Up @@ -48,14 +49,14 @@
pool_list, pool_create_virtual,
group_check, group_decommit, group_show_blob_info, group_show_storage_efficiency, group_show_usage_by_tablets,
group_state, group_take_snapshot, group_add, group_list, group_virtual_create, group_virtual_cancel,
pdisk_add_by_serial, pdisk_remove_by_serial, pdisk_set, pdisk_list,
pdisk_add_by_serial, pdisk_remove_by_serial, pdisk_set, pdisk_list, pdisk_stop,
vdisk_evict, vdisk_list, vdisk_set_read_only, vdisk_remove_donor, vdisk_wipe,
device_list,
]

default_structure = [
('device', ['list']),
('pdisk', ['add-by-serial', 'remove-by-serial', 'set', 'list']),
('pdisk', ['add-by-serial', 'remove-by-serial', 'set', 'list', 'stop']),
('vdisk', ['evict', 'list', 'set-read-only', 'remove-donor', 'wipe']),
('group', ['add', 'check', 'decommit', ('show', ['blob-info', 'storage-efficiency', 'usage-by-tablets']), 'state', 'take-snapshot', 'list', ('virtual', ['create', 'cancel'])]),
('pool', ['list', ('create', ['virtual'])]),
Expand Down
54 changes: 54 additions & 0 deletions ydb/apps/dstool/lib/dstool_cmd_pdisk_stop.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
import ydb.apps.dstool.lib.common as common
import sys

# One-line summary of what this command does.
# NOTE(review): presumably consumed by the dstool command registry as help text - confirm.
description = 'Stop PDisk'

def add_options(p):
    """Attach this command's options to ``p``.

    Every option group comes from a helper in ``common``; the helpers are
    invoked in a fixed order, each receiving ``p`` as its only argument.
    NOTE(review): ``p`` is presumably an argparse (sub)parser - confirm.
    """
    option_registrars = (
        common.add_pdisk_select_options,
        common.add_ignore_degraded_group_check_option,
        common.add_ignore_failure_model_group_check_option,
        common.add_ignore_vslot_quotas_option,
        common.add_basic_format_options,
    )
    for register in option_registrars:
        register(p)


def create_request(args, pdisk):
    """Build a BSC request holding a single StopPDisk command.

    Args:
        args: Parsed command-line arguments, forwarded to
            ``common.create_bsc_request``.
        pdisk: Indexable PDisk identifier; element 0 is used as the node id
            and element 1 as the PDisk id.

    Returns:
        The request object with one StopPDisk command appended.
    """
    bsc_request = common.create_bsc_request(args)
    stop_cmd = bsc_request.Command.add().StopPDisk

    # The target is addressed by host node id plus the PDisk id on that node.
    stop_cmd.HostKey.NodeId = pdisk[0]
    stop_cmd.PDiskId = pdisk[1]

    return bsc_request


def perform_request(request):
    """Pass ``request`` to ``common.invoke_bsc_request`` and return its result."""
    response = common.invoke_bsc_request(request)
    return response


def is_successful_response(response):
    """Return the verdict of ``common.is_successful_bsc_response`` for ``response``."""
    verdict = common.is_successful_bsc_response(response)
    return verdict


def do(args):
    """Stop the single PDisk selected by the command-line arguments.

    The selection must resolve to exactly one PDisk. A StopPDisk request is
    then built, sent, and the outcome is reported via ``common.print_status``.

    Args:
        args: Parsed command-line arguments; ``args.dry_run`` must be falsy.

    Raises:
        AssertionError: If ``--dry-run`` was requested (not supported here).
        SystemExit: With code 1 if the selection does not resolve to exactly
            one PDisk, or if the request is not reported as successful.
    """
    # Raised explicitly rather than via `assert`: assert statements are
    # stripped under `python -O`, which would let a --dry-run invocation
    # actually stop the disk. Checked first so a rejected invocation does
    # no work at all.
    if args.dry_run:
        raise AssertionError('--dry-run is not supported for this command')

    base_config = common.fetch_base_config()
    pdisks = common.get_selected_pdisks(args, base_config)

    selected_count = len(pdisks)
    if selected_count == 0:
        # Distinguish "nothing matched" from "too many matched" so the user
        # is not told to narrow a selection that is already empty.
        common.print_status(args, success=False, error_reason='No PDisk matches the selection')
        sys.exit(1)
    if selected_count != 1:
        common.print_status(args, success=False, error_reason='Only stop one PDisk at a time')
        sys.exit(1)

    success = True
    error_reason = ''

    # Exactly one element is present, so take it without copying the container.
    request = create_request(args, next(iter(pdisks)))
    response = perform_request(request)
    if not is_successful_response(response):
        success = False
        error_reason = 'Request has failed: \n{0}\n{1}\n'.format(request, response)

    common.print_status(args, success, error_reason)
    if not success:
        sys.exit(1)
1 change: 1 addition & 0 deletions ydb/apps/dstool/lib/ya.make
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ PY_SRCS(
dstool_cmd_pdisk_list.py
dstool_cmd_pdisk_remove_by_serial.py
dstool_cmd_pdisk_set.py
dstool_cmd_pdisk_stop.py

dstool_cmd_vdisk_evict.py
dstool_cmd_vdisk_list.py
Expand Down
3 changes: 3 additions & 0 deletions ydb/core/blobstorage/pdisk/blobstorage_pdisk_actor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -901,6 +901,9 @@ class TPDiskActor : public TActorBootstrapped<TPDiskActor> {
break;
case TEvYardControl::PDiskStop:
PDisk->Stop();
*PDisk->Mon.PDiskState = NKikimrBlobStorage::TPDiskState::Stopped;
*PDisk->Mon.PDiskBriefState = TPDiskMon::TPDisk::Stopped;
*PDisk->Mon.PDiskDetailedState = TPDiskMon::TPDisk::StoppedByYardControl;
InitError("Received TEvYardControl::PDiskStop");
Send(ev->Sender, new NPDisk::TEvYardControlResult(NKikimrProto::OK, evControl.Cookie, {}));
break;
Expand Down
1 change: 1 addition & 0 deletions ydb/core/blobstorage/pdisk/blobstorage_pdisk_impl_http.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ void TPDisk::RenderState(IOutputStream &str, THttpInfo &httpInfo) {
TABLED() {YELLOW_TEXT(str, briefStateStr);}
break;
case TPDiskMon::TPDisk::Error:
case TPDiskMon::TPDisk::Stopped:
TABLED() {RED_TEXT(str, stateStr);}
TABLED() {RED_TEXT(str, briefStateStr);}
break;
Expand Down
4 changes: 4 additions & 0 deletions ydb/core/blobstorage/pdisk/blobstorage_pdisk_mon.h
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ struct TPDiskMon {
Booting,
OK,
Error,
Stopped
};

enum EDetailedState {
Expand Down Expand Up @@ -101,6 +102,7 @@ struct TPDiskMon {
ErrorDeviceSerialMismatch,
ErrorFake,
BootingReencryptingFormat,
StoppedByYardControl,
};

static TString StateToStr(i64 val) {
Expand All @@ -112,6 +114,7 @@ struct TPDiskMon {
case Booting: return "Booting";
case OK: return "OK";
case Error: return "Error";
case Stopped: return "Stopped";
default: return "Unknown";
}
}
Expand Down Expand Up @@ -143,6 +146,7 @@ struct TPDiskMon {
case ErrorDeviceSerialMismatch: return "ErrorDeviceSerialMismatch";
case ErrorFake: return "ErrorFake";
case BootingReencryptingFormat: return "BootingReencryptingFormat";
case StoppedByYardControl: return "StoppedByYardControl";
default: return "Unknown";
}
}
Expand Down
1 change: 1 addition & 0 deletions ydb/core/blobstorage/pdisk/blobstorage_pdisk_ut.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ Y_UNIT_TEST_SUITE(TPDiskTest) {
UNIT_ASSERT(NKikimrBlobStorage::TPDiskState::OpenFileError == 11);
UNIT_ASSERT(NKikimrBlobStorage::TPDiskState::ChunkQuotaError == 12);
UNIT_ASSERT(NKikimrBlobStorage::TPDiskState::DeviceIoError == 13);
UNIT_ASSERT(NKikimrBlobStorage::TPDiskState::Stopped == 14);
}

Y_UNIT_TEST(TestPDiskActorErrorState) {
Expand Down
6 changes: 3 additions & 3 deletions ydb/core/blobstorage/ut_blobstorage/stop_pdisk.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,9 @@ Y_UNIT_TEST_SUITE(BSCStopPDisk) {
NKikimrBlobStorage::TConfigRequest request;

NKikimrBlobStorage::TStopPDisk* cmd = request.AddCommand()->MutableStopPDisk();
auto pdiskId = cmd->MutableTargetPDiskId();
pdiskId->SetNodeId(targetNodeId);
pdiskId->SetPDiskId(targetPDiskId);
auto* hostKey = cmd->MutableHostKey();
hostKey->SetNodeId(targetNodeId);
cmd->SetPDiskId(targetPDiskId);

auto response = env.Invoke(request);

Expand Down
3 changes: 2 additions & 1 deletion ydb/core/cms/config.h
Original file line number Diff line number Diff line change
Expand Up @@ -124,10 +124,11 @@ struct TCmsSentinelConfig {
stateLimits[NKikimrBlobStorage::TPDiskState::OpenFileError] = 60;
stateLimits[NKikimrBlobStorage::TPDiskState::ChunkQuotaError] = 60;
stateLimits[NKikimrBlobStorage::TPDiskState::DeviceIoError] = 60;
stateLimits[NKikimrBlobStorage::TPDiskState::Stopped] = 60;

stateLimits[NKikimrBlobStorage::TPDiskState::Reserved14] = 0;
stateLimits[NKikimrBlobStorage::TPDiskState::Reserved15] = 0;
stateLimits[NKikimrBlobStorage::TPDiskState::Reserved16] = 0;
stateLimits[NKikimrBlobStorage::TPDiskState::Reserved17] = 0;
// node online, pdisk missing
stateLimits[NKikimrBlobStorage::TPDiskState::Missing] = 60;
// node timeout
Expand Down
4 changes: 3 additions & 1 deletion ydb/core/cms/sentinel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -70,9 +70,10 @@ bool IsGoodState(EPDiskState state) {
case NKikimrBlobStorage::TPDiskState::OpenFileError:
case NKikimrBlobStorage::TPDiskState::ChunkQuotaError:
case NKikimrBlobStorage::TPDiskState::DeviceIoError:
case NKikimrBlobStorage::TPDiskState::Reserved14:
case NKikimrBlobStorage::TPDiskState::Stopped:
case NKikimrBlobStorage::TPDiskState::Reserved15:
case NKikimrBlobStorage::TPDiskState::Reserved16:
case NKikimrBlobStorage::TPDiskState::Reserved17:
case NKikimrBlobStorage::TPDiskState::Missing:
case NKikimrBlobStorage::TPDiskState::Timeout:
case NKikimrBlobStorage::TPDiskState::NodeDisconnected:
Expand Down Expand Up @@ -588,6 +589,7 @@ class TStateUpdater: public TUpdaterBase<TEvSentinel::TEvStateUpdated, TStateUpd
case NKikimrBlobStorage::TPDiskState::OpenFileError:
case NKikimrBlobStorage::TPDiskState::ChunkQuotaError:
case NKikimrBlobStorage::TPDiskState::DeviceIoError:
case NKikimrBlobStorage::TPDiskState::Stopped:
return state;
default:
LOG_C("Unknown pdisk state: " << (ui32)state);
Expand Down
1 change: 1 addition & 0 deletions ydb/core/cms/sentinel_ut_helpers.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ static constexpr NCms::EPDiskState ErrorStates[] = {
NKikimrBlobStorage::TPDiskState::OpenFileError,
NKikimrBlobStorage::TPDiskState::ChunkQuotaError,
NKikimrBlobStorage::TPDiskState::DeviceIoError,
NKikimrBlobStorage::TPDiskState::Stopped,
};

constexpr NCms::EPDiskState FaultyStates[] = {
Expand Down
1 change: 1 addition & 0 deletions ydb/core/cms/ui/sentinel_state.js
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ var TPDiskState = [
"OpenFileError",
"ChunkQuotaError",
"DeviceIoError",
"Stopped",
];

TPDiskState[252] = "Missing";
Expand Down
3 changes: 2 additions & 1 deletion ydb/core/health_check/health_check.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2192,14 +2192,15 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
case NKikimrBlobStorage::TPDiskState::Missing:
case NKikimrBlobStorage::TPDiskState::Timeout:
case NKikimrBlobStorage::TPDiskState::NodeDisconnected:
case NKikimrBlobStorage::TPDiskState::Stopped:
case NKikimrBlobStorage::TPDiskState::Unknown:
context.ReportStatus(Ydb::Monitoring::StatusFlag::RED,
TStringBuilder() << "PDisk state is " << NKikimrBlobStorage::TPDiskState::E_Name(pDiskInfo.GetState()),
ETags::PDiskState);
break;
case NKikimrBlobStorage::TPDiskState::Reserved14:
case NKikimrBlobStorage::TPDiskState::Reserved15:
case NKikimrBlobStorage::TPDiskState::Reserved16:
case NKikimrBlobStorage::TPDiskState::Reserved17:
context.ReportStatus(Ydb::Monitoring::StatusFlag::RED, "Unknown PDisk state");
break;
}
Expand Down
20 changes: 18 additions & 2 deletions ydb/core/mind/bscontroller/cmds_box.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -281,9 +281,25 @@ namespace NKikimr::NBsController {
}

void TBlobStorageController::TConfigState::ExecuteStep(const NKikimrBlobStorage::TStopPDisk& cmd, TStatus& /*status*/) {
auto targetPDiskId = cmd.GetTargetPDiskId();
const auto& host = NormalizeHostKey(cmd.GetHostKey());

TPDiskId pdiskId(targetPDiskId.GetNodeId(), targetPDiskId.GetPDiskId());
TPDiskId pdiskId;
if (cmd.GetPDiskId()) {
if (cmd.GetPath()) {
throw TExError() << "TUpdateDriveStatus.Path and PDiskId are mutually exclusive";
}
pdiskId = TPDiskId(host.GetNodeId(), cmd.GetPDiskId());
if (!PDisks.Find(pdiskId) || PDisksToRemove.count(pdiskId)) {
throw TExPDiskNotFound(host, cmd.GetPDiskId(), TString());
}
} else {
const std::optional<TPDiskId> found = FindPDiskByLocation(host.GetNodeId(), cmd.GetPath());
if (found && !PDisksToRemove.count(*found)) {
pdiskId = *found;
} else {
throw TExPDiskNotFound(host, 0, cmd.GetPath());
}
}

TPDiskInfo *pdisk = PDisks.FindForUpdate(pdiskId);

Expand Down
4 changes: 3 additions & 1 deletion ydb/core/protos/blobstorage_config.proto
Original file line number Diff line number Diff line change
Expand Up @@ -410,7 +410,9 @@ message TSetPDiskReadOnly {
}

message TStopPDisk {
NKikimrBlobStorage.TPDiskId TargetPDiskId = 1;
THostKey HostKey = 1; // host on which we are looking for the drive
string Path = 2; // absolute path to the device as enlisted in PDisk configuration
uint32 PDiskId = 3; // may be set instead of path to identify PDisk
}

message TSetScrubPeriodicity {
Expand Down
3 changes: 2 additions & 1 deletion ydb/core/protos/blobstorage_disk.proto
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,10 @@ message TPDiskState {
OpenFileError = 11;
ChunkQuotaError = 12;
DeviceIoError = 13;
Reserved14 = 14;
Stopped = 14;
Reserved15 = 15;
Reserved16 = 16;
Reserved17 = 17;

Missing = 252;
Timeout = 253;
Expand Down
1 change: 1 addition & 0 deletions ydb/core/tablet/node_whiteboard.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -770,6 +770,7 @@ class TNodeWhiteboardService : public TActorBootstrapped<TNodeWhiteboardService>
pDiskFlag = std::max(pDiskFlag, NKikimrWhiteboard::EFlag::Red);
break;
case NKikimrBlobStorage::TPDiskState::OpenFileError:
case NKikimrBlobStorage::TPDiskState::Stopped:
pDiskFlag = std::max(pDiskFlag, NKikimrWhiteboard::EFlag::Yellow);
++yellowFlags;
break;
Expand Down
1 change: 1 addition & 0 deletions ydb/core/viewer/content/v2/pdisk.js
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,7 @@ PDisk.prototype.updatePDiskInfo = function(update) {
case 'OpenFileError':
case 'ChunkQuotaError':
case 'DeviceIoError':
case 'Stopped':
pDisk.css('background-color', red);
this.color = red;
break;
Expand Down
1 change: 1 addition & 0 deletions ydb/core/viewer/content/viewer.js
Original file line number Diff line number Diff line change
Expand Up @@ -932,6 +932,7 @@ function onPDiskInfo(pDisksInfo) {
case 'InitialCommonLogParseError':
case 'CommonLoggerInitError':
case 'OpenFileError':
case 'Stopped':
pDiskBlock.style.backgroundColor = red;
pDisk.Color = red;
state = pDisk.State;
Expand Down
1 change: 1 addition & 0 deletions ydb/core/viewer/viewer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -820,6 +820,7 @@ NKikimrViewer::EFlag GetPDiskStateFlag(const NKikimrWhiteboard::TPDiskStateInfo&
case NKikimrBlobStorage::TPDiskState::InitialCommonLogParseError:
case NKikimrBlobStorage::TPDiskState::CommonLoggerInitError:
case NKikimrBlobStorage::TPDiskState::OpenFileError:
case NKikimrBlobStorage::TPDiskState::Stopped:
flag = NKikimrViewer::EFlag::Red;
break;
default:
Expand Down
1 change: 1 addition & 0 deletions ydb/docs/en/core/maintenance/manual/selfheal.md
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ You can use the following settings:
| **Missing** | The node responds, but this PDisk is missing from its list. Transition to `FAULTY`. |
| **Timeout** | The node didn't respond within the specified timeout. Transition to `FAULTY`. |
| **NodeDisconnected** | The node has disconnected. Transition to `FAULTY`. |
| **Stopped** | PDisk was stopped. Transition to `FAULTY`. |
| **Unknown** | Unexpected response, for example, `TEvUndelivered` to the state request. Transition to `FAULTY`. |

## Working with donor disks {#disks}
Expand Down
1 change: 1 addition & 0 deletions ydb/docs/ru/core/maintenance/manual/selfheal.md
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ ydb-dstool -e <bs_endpoint> cluster set --disable-self-heal
| **Missing** | Нода отвечает, но в её списке нет данного PDisk. Переход в `FAULTY`. |
| **Timeout** | Нода не ответила за отведенный таймаут. Переход в `FAULTY`. |
| **NodeDisconnected** | Отключение ноды. Переход в `FAULTY`. |
| **Stopped** | PDisk остановлен. Переход в `FAULTY`. |
| **Unknown** | Неожиданный ответ, например, ответ `TEvUndelivered` на запрос состояния. Переход в `FAULTY`. |

## Работа с дисками-донорами {#disks}
Expand Down
3 changes: 3 additions & 0 deletions ydb/tests/library/harness/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,9 @@ class PDiskState(enum.Enum):
CommonLoggerInitError = _pdisk_state(9, is_valid_state=False)
Normal = _pdisk_state(10, is_valid_state=True)
OpenFileError = _pdisk_state(11, is_valid_state=False)
ChunkQuotaError = _pdisk_state(12, is_valid_state=False)
DeviceIoError = _pdisk_state(13, is_valid_state=False)
Stopped = _pdisk_state(14, is_valid_state=False)

def __init__(self, id_, is_valid_state):
self.__id = id_
Expand Down

0 comments on commit c7f6e33

Please sign in to comment.