Skip to content

Commit

Permalink
pool: skip internal repository check if external command provided
Browse files Browse the repository at this point in the history
Motivation:
dCache provides an possibility to specify a command that be used by pool
to perform repository check. Nonetheless, if command is specified, the
internal check still performed, which might be not optimal, for example,
then disk is set to read-only.

Modification:
Update CheckHealthTast to by-pass internal check if command is
specified. Cover various configurations with test cases.

Result:
an external case can fully cover repository testing if specified.

Acked-by: Dmitry Litvintsev
Target: master
Require-book: yes
Require-notes: yes
  • Loading branch information
kofemann committed Oct 2, 2024
1 parent 4d7e0de commit f745948
Show file tree
Hide file tree
Showing 3 changed files with 254 additions and 33 deletions.
16 changes: 14 additions & 2 deletions docs/TheBook/src/main/markdown/cookbook-pool.md
Original file line number Diff line number Diff line change
Expand Up @@ -551,6 +551,18 @@ The `jtm` periodically scans specified queue to discover expired movers. If requ

pool command.

## Pool health check

[admin interface]: #intouch-admin
[???]: #in-install-layout
The pool health check is a mechanism to monitor the health of a pool's repository. By default, the health check is performed
periodically by creating and deleting an empty file in the pool's repository. For more sophisticated health checks, the pool
can execute a script that is specified by `pool.check-health-command`. The return value of the script is used to determine
the health of the pool. If the exit code is 0 (zero), the pool is assumed to be okay. If the exit code is 1 (one), the
pool is marked read-only. Any other exit code, including failure to execute the script, will disable the pool.

For example, the default test can be implemented as:

```
pool.check-health-command=/bin/sh -c "touch ${pool.path}/test_file; rm ${pool.path}/test_file"
```

> NOTE: if external command is specified, then internal check is disabled.
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,18 @@

import static org.dcache.util.Exceptions.messageOrClassName;

import java.io.BufferedReader;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.nio.charset.StandardCharsets;
import java.time.Instant;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import com.google.common.annotations.VisibleForTesting;
import dmg.cells.nucleus.CellInfoProvider;
import org.apache.log4j.NDC;
import org.dcache.alarms.AlarmMarkerFactory;
import org.dcache.alarms.PredefinedAlarm;
Expand All @@ -20,7 +25,7 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

class CheckHealthTask implements Runnable {
class CheckHealthTask implements Runnable, CellInfoProvider {

private static final Logger LOGGER = LoggerFactory.getLogger(CheckHealthTask.class);
public static final int GRACE_PERIOD_ON_FREE = 60_000;
Expand All @@ -43,6 +48,11 @@ class CheckHealthTask implements Runnable {
*/
private String[] _commands = {};

/**
* Last time the health check was performed.
*/
private volatile Instant _lastCheck = null;

public void setRepository(ReplicaRepository repository) {
_repository = repository;
}
Expand All @@ -59,6 +69,10 @@ public void setCommand(String s) {
_commands = new Scanner(s).scan();
}

private boolean hasExternalCheck() {
return _commands.length > 0;
}

@Override
public void run() {
switch (_repository.getState()) {
Expand All @@ -69,13 +83,19 @@ public void run() {
case CLOSED:
break;
case OPEN:
if (_replicaStore.isOk() == FileStoreState.FAILED) {
_repository.fail(FaultAction.DISABLED, "I/O test failed");
}else if (_replicaStore.isOk() == FileStoreState.READ_ONLY){
LOGGER.error(
"Read-only file system");

_repository.fail(FaultAction.READONLY, "I/O test failed, READ_ONLY Error");
_lastCheck = Instant.now();
var state = hasExternalCheck() ? checkHealthCommand() : _replicaStore.isOk();
switch (state) {
case OK:
break;
case FAILED:
_repository.fail(FaultAction.DISABLED, "I/O test failed");
break;
case READ_ONLY:
LOGGER.error("Read-only file system");
_repository.fail(FaultAction.READONLY, "I/O test failed, READ_ONLY Error");
break;
}

if (!checkSpaceAccounting()) {
Expand All @@ -86,47 +106,42 @@ public void run() {

adjustFreeSpace();
}

checkHealthCommand();
}

private void checkHealthCommand() {
if (_commands.length > 0) {
@VisibleForTesting
FileStoreState checkHealthCommand() {

NDC.push("health-check");
try {
ProcessBuilder builder = new ProcessBuilder(_commands);
builder.redirectErrorStream(true);
Process process = builder.start();
try {
StringBuilder output = new StringBuilder();
String output;
try (InputStream in = process.getInputStream()) {
BufferedReader reader = new BufferedReader(new InputStreamReader(in));
String line = reader.readLine();
while (line != null) {
output.append(line).append('\n');
line = reader.readLine();
}
output = new String(in.readAllBytes(), StandardCharsets.UTF_8);
}
int code = process.waitFor();
switch (code) {
case 0:
return switch (code) {
case 0 -> {
if (output.length() > 0) {
LOGGER.debug("{}", output);
}
break;
case 1:
_repository.fail(FaultAction.READONLY,
"Health check command failed with exit code 1");
yield FileStoreState.OK;
}
case 1 -> {
if (output.length() > 0) {
LOGGER.warn("{}", output);
}
default:
_repository.fail(FaultAction.DISABLED,
"Health check command failed with exit code " + code);
yield FileStoreState.READ_ONLY;
}
default -> {
if (output.length() > 0) {
LOGGER.warn("{}", output);
}
}
yield FileStoreState.FAILED;
}
};
} catch (InterruptedException e) {
LOGGER.debug("Health check command was interrupted");
process.destroy();
Expand All @@ -137,7 +152,9 @@ private void checkHealthCommand() {
} finally {
NDC.pop();
}
}

// If we reach here, then the health check failed.
return FileStoreState.FAILED;
}

private boolean checkSpaceAccounting() {
Expand Down Expand Up @@ -389,4 +406,9 @@ private void scanEscapedCharacter(StringBuilder word) {
}
}

@Override
public void getInfo(PrintWriter pw) {
pw.println(" Check command : " + (hasExternalCheck() ? Arrays.toString(_commands) : " -- internal --"));
pw.println(" Last check at : " + (_lastCheck == null ? "N/A" : _lastCheck));
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,19 @@

import static org.junit.Assert.assertArrayEquals;
import static org.junit.Assert.assertEquals;
import static org.mockito.ArgumentMatchers.any;
import static org.mockito.Mockito.never;
import static org.mockito.Mockito.spy;
import static org.mockito.Mockito.verify;
import static org.mockito.Mockito.when;

import org.dcache.pool.FaultAction;
import org.dcache.pool.repository.Account;
import org.dcache.pool.repository.FileStoreState;
import org.dcache.pool.repository.ReplicaStore;
import org.junit.Test;
import org.mockito.ArgumentCaptor;
import org.mockito.Mockito;

public class CheckHealthTaskTest {

Expand Down Expand Up @@ -81,4 +92,180 @@ public void testEscapedBackslash() {
String[] command = new CheckHealthTask.Scanner("bar\\\\bar").scan();
assertArrayEquals(new String[]{"bar\\bar"}, command);
}

@Test
public void testNoRepositoryCheckOnCommand() {

var repository = Mockito.mock(ReplicaRepository.class);
var replicaStore = Mockito.mock(ReplicaStore.class);
var account = new Account();
when(repository.getState()).thenReturn(ReplicaRepository.State.OPEN);

var check = spy(new CheckHealthTask());
check.setAccount(account);
check.setReplicaStore(replicaStore);
check.setRepository(repository);
check.setCommand("/bin/true");

check.run();

verify(replicaStore, never()).isOk();
verify(check).checkHealthCommand();
}

@Test
public void testRepositoryCheckNoCommand() {

var repository = Mockito.mock(ReplicaRepository.class);
var replicaStore = Mockito.mock(ReplicaStore.class);
var account = new Account();
when(repository.getState()).thenReturn(ReplicaRepository.State.OPEN);
when(replicaStore.isOk()).thenReturn(FileStoreState.OK);

var check = spy(new CheckHealthTask());
check.setAccount(account);
check.setReplicaStore(replicaStore);
check.setRepository(repository);

check.run();

verify(replicaStore).isOk();
verify(check, never()).checkHealthCommand();
}


@Test
public void testSetReadOnlyOnCommandExit_1() {

var repository = Mockito.mock(ReplicaRepository.class);
var replicaStore = Mockito.mock(ReplicaStore.class);
var account = new Account();

when(repository.getState()).thenReturn(ReplicaRepository.State.OPEN);
var storeState = ArgumentCaptor.forClass(FaultAction.class);


var check = new CheckHealthTask();
check.setAccount(account);
check.setReplicaStore(replicaStore);
check.setRepository(repository);
check.setCommand("/bin/sh -c 'exit 1'");

check.run();

verify(repository).fail(storeState.capture(), any());
assertEquals(FaultAction.READONLY, storeState.getValue());
}

@Test
public void testDontFailOnCommandExit_0() {

var repository = Mockito.mock(ReplicaRepository.class);
var replicaStore = Mockito.mock(ReplicaStore.class);
var account = new Account();

when(repository.getState()).thenReturn(ReplicaRepository.State.OPEN);

var check = new CheckHealthTask();
check.setAccount(account);
check.setReplicaStore(replicaStore);
check.setRepository(repository);
check.setCommand("/bin/sh -c 'exit 0'");

check.run();

verify(repository, never()).fail(any(), any());
}

@Test
public void testDisableOnCommandExit_any() {

var repository = Mockito.mock(ReplicaRepository.class);
var replicaStore = Mockito.mock(ReplicaStore.class);
var account = new Account();

when(repository.getState()).thenReturn(ReplicaRepository.State.OPEN);
var storeState = ArgumentCaptor.forClass(FaultAction.class);


var check = new CheckHealthTask();
check.setAccount(account);
check.setReplicaStore(replicaStore);
check.setRepository(repository);
check.setCommand("/bin/sh -c 'exit 2'");

check.run();

verify(repository).fail(storeState.capture(), any());
assertEquals(FaultAction.DISABLED, storeState.getValue());
}



@Test
public void testSetReadonlyOnCheckRO() {

var repository = Mockito.mock(ReplicaRepository.class);
var replicaStore = Mockito.mock(ReplicaStore.class);
var account = new Account();
when(repository.getState()).thenReturn(ReplicaRepository.State.OPEN);
when(replicaStore.isOk()).thenReturn(FileStoreState.READ_ONLY);

var storeState = ArgumentCaptor.forClass(FaultAction.class);

var check = spy(new CheckHealthTask());
check.setAccount(account);
check.setReplicaStore(replicaStore);
check.setRepository(repository);

check.run();

verify(repository).fail(storeState.capture(), any());
assertEquals(FaultAction.READONLY, storeState.getValue());
}

@Test
public void testSetDisableOnFailed() {

var repository = Mockito.mock(ReplicaRepository.class);
var replicaStore = Mockito.mock(ReplicaStore.class);
var account = new Account();
when(repository.getState()).thenReturn(ReplicaRepository.State.OPEN);
when(replicaStore.isOk()).thenReturn(FileStoreState.FAILED);

var storeState = ArgumentCaptor.forClass(FaultAction.class);

var check = spy(new CheckHealthTask());
check.setAccount(account);
check.setReplicaStore(replicaStore);
check.setRepository(repository);

check.run();

verify(repository).fail(storeState.capture(), any());
assertEquals(FaultAction.DISABLED, storeState.getValue());
}

@Test
public void testSetDisableOnTestFailure() {

var repository = Mockito.mock(ReplicaRepository.class);
var replicaStore = Mockito.mock(ReplicaStore.class);
var account = new Account();
when(repository.getState()).thenReturn(ReplicaRepository.State.OPEN);

var storeState = ArgumentCaptor.forClass(FaultAction.class);

var check = spy(new CheckHealthTask());
check.setAccount(account);
check.setReplicaStore(replicaStore);
check.setRepository(repository);
check.setCommand("/non/existing/command");

check.run();

verify(repository).fail(storeState.capture(), any());
assertEquals(FaultAction.DISABLED, storeState.getValue());
}

}

0 comments on commit f745948

Please sign in to comment.