Skip to content

Commit

Permalink
HDDS-12010. Block ozone repair if service is running (apache#7758)
Browse files Browse the repository at this point in the history
  • Loading branch information
adoroszlai authored Jan 30, 2025
1 parent fc89ba6 commit 34792ed
Show file tree
Hide file tree
Showing 5 changed files with 82 additions and 40 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
*/
package org.apache.hadoop.ozone.repair;

import jakarta.annotation.Nullable;
import org.apache.hadoop.hdds.cli.AbstractSubcommand;
import picocli.CommandLine;

Expand Down Expand Up @@ -45,35 +46,57 @@ public abstract class RepairTool extends AbstractSubcommand implements Callable<
/** Hook method for subclasses for performing actual repair task. */
protected abstract void execute() throws Exception;

/**
 * Which Ozone component should be verified to be offline.
 *
 * <p>Subclasses override this to name the component they repair. The default
 * returns {@code null}, which {@code isServiceStateOK()} treats as an online
 * tool: no running-service check is performed.
 *
 * @return the component to check, or {@code null} if no check is needed
 */
@Nullable
protected Component serviceToBeOffline() {
return null;
}

/**
 * {@link Callable} entry point shared by all repair tools.
 *
 * <p>Unless this is a dry run, {@code confirmUser()} is invoked first. The
 * repair itself ({@link #execute()}) then runs exactly once, and only if
 * {@code isServiceStateOK()} allows it; when the check fails the method
 * returns without executing.
 *
 * @return always {@code null}
 * @throws Exception whatever {@code confirmUser()} or {@link #execute()} throws
 */
@Override
public final Void call() throws Exception {
  if (!dryRun) {
    confirmUser();
  }
  // execute() must be reachable only through the service-state guard;
  // an unconditional call here would bypass the offline check and run the
  // repair a second time.
  if (isServiceStateOK()) {
    execute();
  }
  return null;
}

protected boolean checkIfServiceIsRunning(String serviceName) {
String runningEnvVar = String.format("OZONE_%s_RUNNING", serviceName);
String pidEnvVar = String.format("OZONE_%s_PID", serviceName);
String isServiceRunning = System.getenv(runningEnvVar);
String servicePid = System.getenv(pidEnvVar);
if ("true".equals(isServiceRunning)) {
if (!force) {
error("Error: %s is currently running on this host with PID %s. " +
"Stop the service before running the repair tool.", serviceName, servicePid);
return true;
} else {
info("Warning: --force flag used. Proceeding despite %s being detected as running with PID %s.",
serviceName, servicePid);
}
} else {
info("No running %s service detected. Proceeding with repair.", serviceName);
/**
 * Decides whether the repair may proceed given the state of the component
 * named by {@link #serviceToBeOffline()}.
 *
 * @return {@code true} if no component is declared, the component is not
 *     detected as running, or it is running but {@code force} is set;
 *     {@code false} (after reporting an error) otherwise
 */
private boolean isServiceStateOK() {
  final Component target = serviceToBeOffline();

  // No component declared: online tool, nothing to verify.
  if (target == null) {
    return true;
  }

  if (isServiceRunning(target)) {
    final String pid = getServicePid(target);
    if (!force) {
      error("Error: %s is currently running on this host with PID %s. " +
          "Stop the service before running the repair tool.", target, pid);
      return false;
    }
    // The user explicitly asked to continue against a running service.
    info("Warning: --force flag used. Proceeding despite %s being detected as running with PID %s.",
        target, pid);
    return true;
  }

  info("No running %s service detected. Proceeding with repair.", target);
  return true;
}

/**
 * Reads the environment variable {@code OZONE_<component>_PID} for the given
 * component.
 *
 * @param service component whose string form is inserted into the variable name
 * @return the variable's value, or {@code null} if it is not set
 */
private static String getServicePid(Component service) {
  final String pidVariable = "OZONE_" + service + "_PID";
  return System.getenv(pidVariable);
}

/**
 * Tells whether the environment marks the given component as running.
 *
 * @param service component whose string form is inserted into the variable name
 * @return {@code true} only if {@code OZONE_<component>_RUNNING} is set to
 *     exactly {@code "true"} (case-sensitive); {@code false} if unset or any
 *     other value
 */
private static boolean isServiceRunning(Component service) {
  final String runningFlag = System.getenv("OZONE_" + service + "_RUNNING");
  return runningFlag != null && runningFlag.equals("true");
}

/**
 * Exposes the {@code dryRun} flag to subclasses.
 *
 * @return the current value of {@code dryRun}; when it is {@code true},
 *     {@code call()} skips {@code confirmUser()}
 */
protected boolean isDryRun() {
return dryRun;
}
Expand Down Expand Up @@ -117,4 +140,11 @@ private String getConsoleReadLineWithFormat(String currentUser) {
.nextLine()
.trim();
}

/**
 * Ozone component for offline tools.
 *
 * <p>The constant's string form is substituted into the
 * {@code OZONE_%s_RUNNING} and {@code OZONE_%s_PID} environment variable
 * names by the service-state helpers in this class, so renaming a constant
 * changes which variables are consulted.
 */
protected enum Component {
DATANODE,
OM,
SCM,
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
*/
package org.apache.hadoop.ozone.repair;

import jakarta.annotation.Nonnull;
import org.apache.hadoop.hdds.cli.HddsVersionProvider;
import org.apache.hadoop.hdds.scm.metadata.SCMDBDefinition;
import org.apache.hadoop.hdds.utils.IOUtils;
Expand Down Expand Up @@ -69,16 +70,12 @@ public class TransactionInfoRepair extends RepairTool {

@Override
public void execute() throws Exception {
final Component component = getComponent();
if (checkIfServiceIsRunning(component.name())) {
return;
}
List<ColumnFamilyHandle> cfHandleList = new ArrayList<>();
List<ColumnFamilyDescriptor> cfDescList = RocksDBUtils.getColumnFamilyDescriptors(
dbPath);
String columnFamilyName = getColumnFamily().getName();

try (ManagedRocksDB db = ManagedRocksDB.open(dbPath, cfDescList, cfHandleList)) {
String columnFamilyName = component.columnFamilyDefinition.getName();
ColumnFamilyHandle transactionInfoCfh = RocksDBUtils.getColumnFamilyHandle(columnFamilyName, cfHandleList);
if (transactionInfoCfh == null) {
throw new IllegalArgumentException(columnFamilyName +
Expand Down Expand Up @@ -111,7 +108,9 @@ public void execute() throws Exception {
}
}

private Component getComponent() {
@Override
@Nonnull
protected Component serviceToBeOffline() {
final String parent = spec().parent().name();
switch (parent) {
case "om":
Expand All @@ -123,14 +122,15 @@ private Component getComponent() {
}
}

private enum Component {
OM(OMDBDefinition.TRANSACTION_INFO_TABLE),
SCM(SCMDBDefinition.TRANSACTIONINFO);

private final DBColumnFamilyDefinition<String, TransactionInfo> columnFamilyDefinition;

Component(DBColumnFamilyDefinition<String, TransactionInfo> columnFamilyDefinition) {
this.columnFamilyDefinition = columnFamilyDefinition;
/**
 * Selects the transaction-info column family definition for the component
 * reported by {@link #serviceToBeOffline()}.
 *
 * @return {@code OMDBDefinition.TRANSACTION_INFO_TABLE} for OM,
 *     {@code SCMDBDefinition.TRANSACTIONINFO} for SCM
 * @throws IllegalStateException for any other component
 */
private DBColumnFamilyDefinition<String, TransactionInfo> getColumnFamily() {
  final Component target = serviceToBeOffline();
  final DBColumnFamilyDefinition<String, TransactionInfo> definition;
  switch (target) {
  case OM:
    definition = OMDBDefinition.TRANSACTION_INFO_TABLE;
    break;
  case SCM:
    definition = SCMDBDefinition.TRANSACTIONINFO;
    break;
  default:
    throw new IllegalStateException("This tool does not support component: " + target);
  }
  return definition;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
*/
package org.apache.hadoop.ozone.repair.om;

import jakarta.annotation.Nonnull;
import org.apache.commons.io.FileUtils;
import org.apache.hadoop.hdds.conf.ConfigurationSource;
import org.apache.hadoop.hdds.conf.OzoneConfiguration;
Expand Down Expand Up @@ -97,11 +98,14 @@ public class FSORepairTool extends RepairTool {
description = "Verbose output. Show all intermediate steps.")
private boolean verbose;

/**
 * Declares OM as the component this tool expects to be offline; the
 * service-state check in {@code RepairTool} uses this value.
 *
 * @return always {@code Component.OM}
 */
@Nonnull
@Override
protected Component serviceToBeOffline() {
return Component.OM;
}

@Override
public void execute() throws Exception {
if (checkIfServiceIsRunning("OM")) {
return;
}
try {
Impl repairTool = new Impl();
repairTool.run();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

package org.apache.hadoop.ozone.repair.om;

import jakarta.annotation.Nonnull;
import org.apache.hadoop.hdds.utils.IOUtils;
import org.apache.hadoop.hdds.utils.db.StringCodec;
import org.apache.hadoop.hdds.utils.db.managed.ManagedRocksDB;
Expand Down Expand Up @@ -76,11 +77,14 @@ public class SnapshotChainRepair extends RepairTool {
description = "Path previous snapshotId to set for the given snapshot")
private UUID pathPreviousSnapshotId;

/**
 * Declares OM as the component this tool expects to be offline; the
 * service-state check in {@code RepairTool} uses this value.
 *
 * @return always {@code Component.OM}
 */
@Nonnull
@Override
protected Component serviceToBeOffline() {
return Component.OM;
}

@Override
public void execute() throws Exception {
if (checkIfServiceIsRunning("OM")) {
return;
}
List<ColumnFamilyHandle> cfHandleList = new ArrayList<>();
List<ColumnFamilyDescriptor> cfDescList = RocksDBUtils.getColumnFamilyDescriptors(dbPath);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
*/
package org.apache.hadoop.ozone.repair.scm.cert;

import jakarta.annotation.Nonnull;
import org.apache.hadoop.hdds.conf.OzoneConfiguration;
import org.apache.hadoop.hdds.security.SecurityConfig;
import org.apache.hadoop.hdds.security.x509.certificate.authority.CAType;
Expand Down Expand Up @@ -71,11 +72,14 @@ public class RecoverSCMCertificate extends RepairTool {
description = "SCM DB Path")
private String dbPath;

/**
 * Declares SCM as the component this tool expects to be offline; the
 * service-state check in {@code RepairTool} uses this value.
 *
 * @return always {@code Component.SCM}
 */
@Nonnull
@Override
protected Component serviceToBeOffline() {
return Component.SCM;
}

@Override
public void execute() throws Exception {
if (checkIfServiceIsRunning("SCM")) {
return;
}
dbPath = removeTrailingSlashIfNeeded(dbPath);
String tableName = VALID_SCM_CERTS.getName();
DBDefinition dbDefinition =
Expand Down

0 comments on commit 34792ed

Please sign in to comment.