Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add deadlock healthcheck #3483

Merged
merged 6 commits into from
Oct 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 26 additions & 14 deletions core/src/main/java/org/mapfish/print/config/WorkingDirectories.java
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import javax.servlet.ServletContext;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.FilenameUtils;
import org.mapfish.print.metrics.UnhealthyCountersHealthCheck;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
Expand All @@ -27,14 +28,17 @@ public class WorkingDirectories {

@Autowired private ServletContext servletContext;

@Autowired private UnhealthyCountersHealthCheck unhealthyCountersHealthCheck;

public final File getWorking() {
return this.working;
}

/**
* Defines what is the root directory used to store every temporary files.
*
* <p>The given path can contain the pattern "{WEBAPP}" and it will replaced by the webapp name.
* <p>The given path can contain the pattern "{WEBAPP}" and it will be replaced by the webapp
* name.
*
* @param working The path
*/
Expand Down Expand Up @@ -110,7 +114,7 @@ public final void removeDirectory(final File directory) {

private void createIfMissing(final File directory, final String name) {
if (!directory.exists() && !directory.mkdirs()) {
if (!directory.exists()) { // Maybe somebody else created it in the mean time
if (!directory.exists()) { // Maybe somebody else created it in the meantime
throw new AssertionError(
"Unable to create working directory: '"
+ directory
Expand All @@ -135,18 +139,8 @@ public final File getBuildFileFor(
final String extension,
final Logger logger) {
final String configurationAbsolutePath = configuration.getDirectory().getPath();
final int prefixToConfiguration = configurationAbsolutePath.length() + 1;
final String parentDir = jasperFileXml.getAbsoluteFile().getParent();
final String relativePathToFile;
if (configurationAbsolutePath.equals(parentDir)) {
relativePathToFile = FilenameUtils.getBaseName(jasperFileXml.getName());
} else {
final String relativePathToContainingDirectory = parentDir.substring(prefixToConfiguration);
relativePathToFile =
relativePathToContainingDirectory
+ File.separator
+ FilenameUtils.getBaseName(jasperFileXml.getName());
}
final String relativePathToFile =
determineRelativePath(jasperFileXml, configurationAbsolutePath);

final File buildFile =
new File(getJasperCompilation(configuration), relativePathToFile + extension);
Expand All @@ -159,6 +153,22 @@ public final File getBuildFileFor(
return buildFile;
}

private String determineRelativePath(final File jasperXmlFile, final String configAbsolutePath) {
final int prefixToConfiguration = configAbsolutePath.length() + 1;
final String parentDir = jasperXmlFile.getAbsoluteFile().getParent();
final String relativePathToFile;
if (configAbsolutePath.equals(parentDir)) {
relativePathToFile = FilenameUtils.getBaseName(jasperXmlFile.getName());
} else {
final String relativePathToContainingDirectory = parentDir.substring(prefixToConfiguration);
relativePathToFile =
relativePathToContainingDirectory
+ File.separator
+ FilenameUtils.getBaseName(jasperXmlFile.getName());
}
return relativePathToFile;
}

/**
* Get the working directory for the configuration.
*
Expand Down Expand Up @@ -203,6 +213,8 @@ public void run() {
removeOldFiles(new File(System.getProperty("java.io.tmpdir")), "+~JF", this.maxAgeTaskDir);
} catch (Exception e) {
LOGGER.error("error running file clean-up task", e);
unhealthyCountersHealthCheck.recordUnhealthyProblem(
getClass().getSimpleName(), "errorRunningFileCleanUpTask");
}
}

Expand Down
Original file line number Diff line number Diff line change
@@ -1,15 +1,19 @@
package org.mapfish.print.metrics;

import com.codahale.metrics.health.jvm.ThreadDeadlockHealthCheck;
import javax.annotation.PostConstruct;
import org.springframework.beans.factory.annotation.Autowired;

public class HealthCheckingRegistry extends com.codahale.metrics.health.HealthCheckRegistry {
@Autowired private JobQueueHealthCheck jobQueueHealthCheck;
@Autowired private UnhealthyCountersHealthCheck unhealthyCountersHealthCheck;
private final ThreadDeadlockHealthCheck threadDeadlockHealthCheck =
new ThreadDeadlockHealthCheck();

@PostConstruct
public void registerHealthCheck() {
register("jobQueueStatus", jobQueueHealthCheck);
register("unhealthyCountersStatus", unhealthyCountersHealthCheck);
register("threadDeadlock", threadDeadlockHealthCheck);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -46,4 +46,17 @@ protected HealthCheck.Result check() throws Exception {
public void recordUnhealthyCounter(final String counterName) {
unhealthyCounters.add(counterName);
}

/**
* Records an unhealthy problem by incrementing a counter for the given class and description.
* This method helps in tracking issues which might lead to the server being marked unhealthy.
*
* @param className the name of the class where the problem occurred
* @param description a brief description of the problem
*/
public void recordUnhealthyProblem(final String className, final String description) {
String counterName = MetricRegistry.name(className, description);
metricRegistry.counter(counterName).inc();
recordUnhealthyCounter(counterName);
}
}
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
package org.mapfish.print.servlet.job;

import com.codahale.metrics.MetricRegistry;
import javax.annotation.Nonnull;
import org.hibernate.HibernateException;
import org.hibernate.Session;
Expand Down Expand Up @@ -85,11 +84,8 @@ protected void doInTransactionWithoutResult(@Nonnull final TransactionStatus sta
}
});
} catch (HibernateException ex) {
String name =
MetricRegistry.name(
getClass().getSimpleName(), "insertRecordInHibernateFailedThenSkiped");
metricRegistry.counter(name).inc();
unhealthyCountersHealthCheck.recordUnhealthyCounter(name);
unhealthyCountersHealthCheck.recordUnhealthyProblem(
getClass().getSimpleName(), "insertRecordInHibernateFailedThenSkipped");
LOGGER.error("Cannot save accounting information", ex);
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
import org.mapfish.print.ExceptionUtils;
import org.mapfish.print.PrintException;
import org.mapfish.print.config.WorkingDirectories;
import org.mapfish.print.metrics.UnhealthyCountersHealthCheck;
import org.mapfish.print.servlet.job.JobManager;
import org.mapfish.print.servlet.job.JobQueue;
import org.mapfish.print.servlet.job.NoSuchReferenceException;
Expand Down Expand Up @@ -73,7 +74,7 @@ public class ThreadPoolJobManager implements JobManager {
private int maxNumberOfWaitingJobs = DEFAULT_MAX_WAITING_JOBS;

/** The amount of time to let a thread wait before being shutdown. */
private long maxIdleTime = DEFAULT_THREAD_IDLE_TIME;
private final long maxIdleTime = DEFAULT_THREAD_IDLE_TIME;

/** A print job is canceled, if it is not completed after this amount of time (in seconds). */
private long timeout = DEFAULT_TIMEOUT_IN_SECONDS;
Expand Down Expand Up @@ -118,6 +119,8 @@ public class ThreadPoolJobManager implements JobManager {

@Autowired private MetricRegistry metricRegistry;

@Autowired private UnhealthyCountersHealthCheck unhealthyCountersHealthCheck;

private boolean requestedToStop = false;
private Date lastExecutedJobTimestamp;

Expand Down Expand Up @@ -211,7 +214,7 @@ public final void init() {
}
return 0;
});
/* The ThreadPoolExecutor uses a unbounded queue (though we are enforcing a limit in `submit()`).
/* The ThreadPoolExecutor uses an unbounded queue (though we are enforcing a limit in `submit()`).
* Because of that, the executor creates only `corePoolSize` threads. But to use all threads,
* we set both `corePoolSize` and `maximumPoolSize` to `maxNumberOfRunningPrintJobs`. As a
* consequence, the `maxIdleTime` will be ignored, idle threads will not be terminated.
Expand Down Expand Up @@ -239,11 +242,15 @@ protected void beforeExecute(final Thread t, final Runnable runnable) {
ThreadPoolJobManager.this.jobQueue.start(printJob.getEntry().getReferenceId());
} catch (RuntimeException e) {
LOGGER.error("failed to mark job as running", e);
unhealthyCountersHealthCheck.recordUnhealthyProblem(
ThreadPoolJobManager.class.getSimpleName(), "failedToMarkJobAsRunning");
} catch (NoSuchReferenceException e) {
LOGGER.error(
"tried to mark non-existing job as 'running': {}",
printJob.getEntry().getReferenceId(),
e);
unhealthyCountersHealthCheck.recordUnhealthyProblem(
ThreadPoolJobManager.class.getSimpleName(), "couldNotMarkNonExistingJob");
}
}
}
Expand Down Expand Up @@ -573,7 +580,7 @@ public Callable<V> getCallable() {
}

/**
* This timer task changes the status of finished jobs in the registry. Also it stops jobs that
* This timer task changes the status of finished jobs in the registry. Also, it stops jobs that
* have been running for too long (timeout).
*/
@VisibleForTesting
Expand Down Expand Up @@ -628,8 +635,13 @@ public void run() {
}
} catch (javax.persistence.PessimisticLockException e) {
// Ignore error on pessimistic locking
unhealthyCountersHealthCheck.recordUnhealthyProblem(
getClass().getSimpleName(), "ignoredPessimisticLockIssue");
LOGGER.warn("Ignored Pessimistic Lock Issue", e);
} catch (RuntimeException t) {
LOGGER.error("Error while polling/updating registry", t);
unhealthyCountersHealthCheck.recordUnhealthyProblem(
getClass().getSimpleName(), "pollingRegistryIssue");
}
}
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,39 +1,48 @@
package org.mapfish.print.servlet.job.impl.hibernate;

import com.codahale.metrics.MetricRegistry;
import com.codahale.metrics.Timer;
import java.util.List;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
import javax.annotation.PostConstruct;
import javax.annotation.PreDestroy;
import org.mapfish.print.metrics.UnhealthyCountersHealthCheck;
import org.mapfish.print.servlet.job.JobQueue;
import org.mapfish.print.servlet.job.NoSuchReferenceException;
import org.mapfish.print.servlet.job.PrintJobEntry;
import org.mapfish.print.servlet.job.PrintJobResult;
import org.mapfish.print.servlet.job.PrintJobStatus;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.transaction.PlatformTransactionManager;
import org.springframework.transaction.annotation.Transactional;
import org.springframework.transaction.support.TransactionTemplate;

/** Db Job Manager. */
/** Transfers request to the Job dao. */
@Transactional
public class HibernateJobQueue implements JobQueue {
private static final int DEFAULT_TIME_TO_KEEP_AFTER_ACCESS = 30; /* minutes */

private static final long DEFAULT_CLEAN_UP_INTERVAL = 300; /* seconds */
private static final TimeUnit CLEAN_UP_INTERVAL_TIME_UNIT = TimeUnit.SECONDS;

private static final Logger LOGGER = LoggerFactory.getLogger(HibernateJobQueue.class);

@Autowired private PrintJobDao dao;

@Autowired private PlatformTransactionManager txManager;

@Autowired private MetricRegistry metricRegistry;

@Autowired private UnhealthyCountersHealthCheck unhealthyCountersHealthCheck;

private ScheduledExecutorService cleanUpTimer;

/** The interval at which old records are deleted (in seconds). */
private long cleanupInterval = DEFAULT_CLEAN_UP_INTERVAL;
private final long cleanupInterval = DEFAULT_CLEAN_UP_INTERVAL;

/** The minimum time to keep records after last access. */
private int timeToKeepAfterAccessInMinutes = DEFAULT_TIME_TO_KEEP_AFTER_ACCESS;
Expand Down Expand Up @@ -198,7 +207,7 @@ public final void init() {
return thread;
});
this.cleanUpTimer.scheduleAtFixedRate(
this::cleanup, this.cleanupInterval, this.cleanupInterval, TimeUnit.SECONDS);
this::cleanup, cleanupInterval, cleanupInterval, CLEAN_UP_INTERVAL_TIME_UNIT);
}

/** Called by spring when application context is being destroyed. */
Expand All @@ -209,11 +218,28 @@ public final void shutdown() {

private void cleanup() {
TransactionTemplate tmpl = new TransactionTemplate(this.txManager);
final int nbDeleted =
tmpl.execute(
status ->
HibernateJobQueue.this.dao.deleteOld(
System.currentTimeMillis() - getTimeToKeepAfterAccessInMillis()));
metricRegistry.counter(getClass().getName() + ".deleted_old").inc(nbDeleted);
String name = MetricRegistry.name(getClass().getSimpleName(), "deletedOldPrintJobs");
try (Timer.Context deletion = metricRegistry.timer(name).time()) {
final Integer nbDeleted =
tmpl.execute(
status -> {
try {
return HibernateJobQueue.this.dao.deleteOld(
System.currentTimeMillis() - getTimeToKeepAfterAccessInMillis());
} catch (Exception e) {
unhealthyCountersHealthCheck.recordUnhealthyProblem(name, "issue");
return null;
}
});
if (deletion.stop() > CLEAN_UP_INTERVAL_TIME_UNIT.toNanos(cleanupInterval)) {
unhealthyCountersHealthCheck.recordUnhealthyProblem(name, "tooManyJobsToDeleteInInterval");
LOGGER.warn("Deleting Old Print Jobs took longer than the scheduled interval.");
}
if (nbDeleted != null && nbDeleted > 0) {
// This counter counts the number of deleted old job prints by this server instance.
// But it is not the grand total of deleted jobs in the database.
metricRegistry.counter(MetricRegistry.name(name, "totalCount")).inc(nbDeleted);
}
}
}
}
Loading