From 0d0778228dcf3b4003097de61ad2c66f7080cd95 Mon Sep 17 00:00:00 2001
From: John Gates
Date: Thu, 1 Aug 2024 16:34:35 -0700
Subject: [PATCH 01/22] Removed QueryRequest and XrdSsiMocks.

---
 src/ccontrol/UserQuerySelect.cc |  25 +-
 src/czar/Czar.cc                |   5 -
 src/qdisp/CMakeLists.txt        |   2 -
 src/qdisp/Executive.cc          |  75 +-----
 src/qdisp/Executive.h           |  30 +--
 src/qdisp/JobBase.h             |   3 +-
 src/qdisp/JobQuery.cc           |  63 +----
 src/qdisp/JobQuery.h            |  16 --
 src/qdisp/QueryRequest.cc       | 433 --------------------------------
 src/qdisp/QueryRequest.h        | 165 ------------
 src/qdisp/UberJob.cc            |   2 -
 src/qdisp/UberJob.h             |   4 -
 src/qdisp/XrdSsiMocks.cc        | 312 -----------------------
 src/qdisp/XrdSsiMocks.h         |  72 ------
 src/qdisp/testQDisp.cc          |  36 +--
 src/wdb/QueryRunner.cc          |   2 +
 src/xrdsvc/ChannelStream.h      |   2 +-
 17 files changed, 46 insertions(+), 1201 deletions(-)
 delete mode 100644 src/qdisp/QueryRequest.cc
 delete mode 100644 src/qdisp/QueryRequest.h
 delete mode 100644 src/qdisp/XrdSsiMocks.cc
 delete mode 100644 src/qdisp/XrdSsiMocks.h

diff --git a/src/ccontrol/UserQuerySelect.cc b/src/ccontrol/UserQuerySelect.cc
index f96a293cc..7ca22b295 100644
--- a/src/ccontrol/UserQuerySelect.cc
+++ b/src/ccontrol/UserQuerySelect.cc
@@ -306,16 +306,6 @@ void UserQuerySelect::submit() {
                 std::make_shared(_infileMerger, chunkResultName), taskMsgFactory, cs, chunkResultName);
         auto job = _executive->add(jobDesc);
-
-        if (!uberJobsEnabled) {
-            // references in captures cause races
-            auto funcBuildJob = [this, job{move(job)}](util::CmdData*) {
-                QSERV_LOGCONTEXT_QUERY(_qMetaQueryId);
-                _executive->runJobQuery(job);
-            };
-            auto cmd = std::make_shared(funcBuildJob);
-            _executive->queueJobStart(cmd);
-        }
         ++sequence;
     }
 
@@ -358,12 +348,16 @@ void UserQuerySelect::buildAndSendUberJobs() {
     // Only one thread should be generating UberJobs for this user query at any given time.
     lock_guard fcLock(_buildUberJobMtx);
-    bool const clearFlag = false;
-    _executive->setFlagFailedUberJob(clearFlag);
     LOGS(_log, LOG_LVL_DEBUG,
          "UserQuerySelect::" << __func__ << " totalJobs=" << _executive->getTotalJobs());
     vector uberJobs;
 
+    qdisp::Executive::ChunkIdJobMapType unassignedChunksInQuery = _executive->unassignedChunksInQuery();
+    if (unassignedChunksInQuery.empty()) {
+        LOGS(_log, LOG_LVL_TRACE, funcN << " no unassigned Jobs");
+        return;
+    }
+
     auto czarPtr = czar::Czar::getCzar();
     auto czFamilyMap = czarPtr->getCzarFamilyMap();
     auto czChunkMap = czFamilyMap->getChunkMap(_queryDbName);
@@ -378,9 +372,9 @@ void UserQuerySelect::buildAndSendUberJobs() {
     auto const [chunkMapPtr, workerChunkMapPtr] = czChunkMap->getMaps();
 
     // Make a map of all jobs in the executive.
-    // TODO:UJ Maybe a check should be made that all datbases are in the same family?
+    // TODO:UJ Maybe a check should be made that all databases are in the same family?
+
-    qdisp::Executive::ChunkIdJobMapType unassignedChunksInQuery = _executive->unassignedChunksInQuery();
 
     // keep cycling through workers until no more chunks to place.
     // - create a map of UberJobs key=, val=>
@@ -468,9 +462,6 @@ void UserQuerySelect::buildAndSendUberJobs() {
         }
         errStr += " they will be retried later.";
         LOGS(_log, LOG_LVL_ERROR, errStr);
-        // There are likely to be unassigned jobs, so set a flag to try to make
-        // new uber jobs for these jobs.
-        _executive->setFlagFailedUberJob(true);
     }
 
     // Add worker contact info to UberJobs.
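The comments in the hunk above describe the new dispatch flow: keep cycling through workers
until every unassigned chunk's job has been placed in some worker's UberJob. A minimal,
self-contained sketch of that loop follows; the helper names (pickWorkerForChunk, the
Job/UberJob stubs, and the maxJobsPerUberJob fill policy) are illustrative assumptions, not
the actual Qserv API.

#include <map>
#include <memory>
#include <string>
#include <vector>

struct Job {};                                 // stand-in for qdisp::JobQuery
using JobPtr = std::shared_ptr<Job>;
struct UberJob {                               // stand-in for qdisp::UberJob
    std::vector<JobPtr> jobs;
};
using UberJobPtr = std::shared_ptr<UberJob>;

// Hypothetical placement policy: which worker should serve a given chunk.
std::string pickWorkerForChunk(int chunkId) { return "worker-" + std::to_string(chunkId % 3); }

// key=<worker id>, val=<UberJobs for that worker>, as the comment above describes.
std::map<std::string, std::vector<UberJobPtr>> buildUberJobs(std::map<int, JobPtr> const& unassigned,
                                                             size_t maxJobsPerUberJob = 1000) {
    std::map<std::string, std::vector<UberJobPtr>> byWorker;
    for (auto const& [chunkId, job] : unassigned) {
        auto& ujVect = byWorker[pickWorkerForChunk(chunkId)];
        // Open a new UberJob for this worker when none exists yet or the last one is full.
        if (ujVect.empty() || ujVect.back()->jobs.size() >= maxJobsPerUberJob) {
            ujVect.push_back(std::make_shared<UberJob>());
        }
        ujVect.back()->jobs.push_back(job);    // this chunk's job is now assigned
    }
    return byWorker;
}

With the map keyed by worker, contact information can then be attached to each UberJob before
it is queued, which is the step the trailing context line above refers to.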
diff --git a/src/czar/Czar.cc b/src/czar/Czar.cc index bc73e2eca..0c949c2c8 100644 --- a/src/czar/Czar.cc +++ b/src/czar/Czar.cc @@ -67,14 +67,11 @@ #include "util/IterableFormatter.h" #include "util/String.h" #include "xrdreq/QueryManagementAction.h" -#include "XrdSsi/XrdSsiProvider.hh" using namespace lsst::qserv; using namespace nlohmann; using namespace std; -extern XrdSsiProvider* XrdSsiProviderClient; - namespace { LOG_LOGGER _log = LOG_GET("lsst.qserv.czar.Czar"); @@ -208,10 +205,8 @@ Czar::Czar(string const& configFilePath, string const& czarName) int xrootdCBThreadsInit = _czarConfig->getXrootdCBThreadsInit(); LOGS(_log, LOG_LVL_INFO, "config xrootdCBThreadsMax=" << xrootdCBThreadsMax); LOGS(_log, LOG_LVL_INFO, "config xrootdCBThreadsInit=" << xrootdCBThreadsInit); - XrdSsiProviderClient->SetCBThreads(xrootdCBThreadsMax, xrootdCBThreadsInit); int const xrootdSpread = _czarConfig->getXrootdSpread(); LOGS(_log, LOG_LVL_INFO, "config xrootdSpread=" << xrootdSpread); - XrdSsiProviderClient->SetSpread(xrootdSpread); _queryDistributionTestVer = _czarConfig->getQueryDistributionTestVer(); LOGS(_log, LOG_LVL_INFO, "Creating czar instance with name " << czarName); diff --git a/src/qdisp/CMakeLists.txt b/src/qdisp/CMakeLists.txt index e0aa44667..38daf54c1 100644 --- a/src/qdisp/CMakeLists.txt +++ b/src/qdisp/CMakeLists.txt @@ -9,9 +9,7 @@ target_sources(qdisp PRIVATE JobDescription.cc JobQuery.cc QdispPool.cc - QueryRequest.cc UberJob.cc - XrdSsiMocks.cc ) target_include_directories(qdisp PRIVATE diff --git a/src/qdisp/Executive.cc b/src/qdisp/Executive.cc index 75fd0914b..0b66764d4 100644 --- a/src/qdisp/Executive.cc +++ b/src/qdisp/Executive.cc @@ -48,10 +48,6 @@ // Third-party headers #include "boost/format.hpp" -#include "XrdSsi/XrdSsiErrInfo.hh" -#include "XrdSsi/XrdSsiProvider.hh" -#include "XrdSsi/XrdSsiResource.hh" -#include "XrdSsi/XrdSsiService.hh" // LSST headers #include "lsst/log/Log.h" @@ -62,13 +58,12 @@ #include "ccontrol/msgCode.h" #include "ccontrol/TmpTableName.h" #include "ccontrol/UserQuerySelect.h" +#include "czar/Czar.h" #include "global/LogContext.h" #include "global/ResourceUnit.h" #include "qdisp/CzarStats.h" #include "qdisp/JobQuery.h" -#include "qdisp/QueryRequest.h" #include "qdisp/ResponseHandler.h" -#include "qdisp/XrdSsiMocks.h" #include "query/QueryContext.h" #include "qproc/QuerySession.h" #include "qmeta/Exceptions.h" @@ -82,20 +77,10 @@ using namespace std; -extern XrdSsiProvider* XrdSsiProviderClient; - namespace { LOG_LOGGER _log = LOG_GET("lsst.qserv.qdisp.Executive"); -string getErrorText(XrdSsiErrInfo& e) { - ostringstream os; - int errCode; - os << "XrdSsiError " << e.Get(errCode); - os << " Code=" << errCode; - return os.str(); -} - } // anonymous namespace namespace lsst::qserv::qdisp { @@ -112,7 +97,7 @@ Executive::Executive(ExecutiveConfig const& c, shared_ptr c _qMeta(qStatus), _querySession(querySession) { _secondsBetweenQMetaUpdates = chrono::seconds(_config.secondsBetweenChunkUpdates); - _setup(); + //&&&_setup(); _setupLimit(); qdisp::CzarStats::get()->addQuery(); } @@ -125,8 +110,6 @@ Executive::~Executive() { if (czar::Czar::getCzar()->getExecutiveFromMap(getId()) != nullptr) { LOGS(_log, LOG_LVL_ERROR, cName(__func__) + " pointer in map should be invalid QID=" << getId()); } - // Real XrdSsiService objects are unowned, but mocks are allocated in _setup. 
- delete dynamic_cast(_xrdSsiService); if (_asyncTimer != nullptr) { _asyncTimer->cancel(); qdisp::CzarStats::get()->untrackQueryProgress(_id); @@ -244,13 +227,6 @@ JobQuery::Ptr Executive::add(JobDescription::Ptr const& jobDesc) { return jobQuery; } -void Executive::runJobQuery(JobQuery::Ptr const& jobQuery) { - bool started = jobQuery->runJob(); - if (!started && isLimitRowComplete()) { - markCompleted(jobQuery->getJobId(), false); - } -} - void Executive::queueJobStart(PriorityCommand::Ptr const& cmd) { _jobStartCmdList.push_back(cmd); if (_scanInteractive) { @@ -299,35 +275,6 @@ void Executive::waitForAllJobsToStart() { LOGS(_log, LOG_LVL_INFO, "waitForAllJobsToStart done"); } -// If the executive has not been cancelled, then we simply start the query. -// @return true if query was actually started (i.e. we were not cancelled) -// // TODO:UJ delete this function -bool Executive::startQuery(shared_ptr const& jobQuery) { - lock_guard lock(_cancelled.getMutex()); - - // If this has been cancelled, then return false. - if (_cancelled) return false; - - // Construct a temporary resource object to pass to ProcessRequest(). - // Interactive Queries should have an Affinity of XrdSsiResource::None or Weak while - // Scans should have an affinity of Strong - XrdSsiResource::Affinity affinity = (_scanInteractive) ? XrdSsiResource::Weak : XrdSsiResource::Strong; - XrdSsiResource jobResource(jobQuery->getDescription()->resource().path(), "", jobQuery->getIdStr(), "", 0, - affinity); - - // Now construct the actual query request and tie it to the jobQuery. The - // shared pointer is used by QueryRequest to keep itself alive, sloppy design. - // Note that JobQuery calls StartQuery that then calls JobQuery, yech! - // - QueryRequest::Ptr qr = QueryRequest::create(jobQuery); - jobQuery->setQueryRequest(qr); - - // Start the query. The rest is magically done in the background. - // - getXrdSsiService()->ProcessRequest(*(qr.get()), jobResource); - return true; -} - Executive::ChunkIdJobMapType Executive::unassignedChunksInQuery() { lock_guard lck(_chunkToJobMapMtx); @@ -577,26 +524,12 @@ string Executive::getProgressDesc() const { return msg_progress; } +/* &&& void Executive::_setup() { - XrdSsiErrInfo eInfo; _empty.store(true); _requestCount = 0; - // If unit testing, load the mock service. - if (_config.serviceUrl.compare(_config.getMockStr()) == 0) { - _xrdSsiService = new XrdSsiServiceMock(this); - } else { - static XrdSsiService* xrdSsiServiceStatic = - XrdSsiProviderClient->GetService(eInfo, _config.serviceUrl); - _xrdSsiService = xrdSsiServiceStatic; - } - if (!_xrdSsiService) { - LOGS(_log, LOG_LVL_DEBUG, - _id << " Error obtaining XrdSsiService in Executive: " - "serviceUrl=" - << _config.serviceUrl << " " << getErrorText(eInfo)); - } - assert(_xrdSsiService); } +*/ /** Add (jobId,r) entry to _requesters map if not here yet * else leave _requesters untouched. 
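The startQuery() method deleted above guarded dispatch with the cancellation flag's own mutex,
so a concurrent squash() could not slip in between the test of _cancelled and the call to
ProcessRequest(). A minimal sketch of that check-then-act idiom, with Flag standing in for
util::Flag and dispatch() a hypothetical callee:

#include <mutex>

// Stand-in for util::Flag: a value paired with its own mutex so a caller can make
// "test the flag, then act" atomic with respect to concurrent setters.
class Flag {
public:
    std::mutex& getMutex() { return _mtx; }
    void set(bool v) {
        std::lock_guard<std::mutex> lk(_mtx);
        _val = v;
    }
    operator bool() const { return _val; }

private:
    std::mutex _mtx;
    bool _val = false;
};

bool startIfNotCancelled(Flag& cancelled, void (*dispatch)()) {
    std::lock_guard<std::mutex> lock(cancelled.getMutex());
    if (cancelled) return false;  // already squashed: do not start the request
    dispatch();                   // cancel() cannot interleave while the mutex is held
    return true;
}

The same patch also drops the runtime _setup() in favor of in-class initializers (note
std::atomic _requestCount{0} in the Executive.h hunk below), so no XrdSsi service lookup
remains on this code path.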
diff --git a/src/qdisp/Executive.h b/src/qdisp/Executive.h index 1d95e5a9c..33ff15de4 100644 --- a/src/qdisp/Executive.h +++ b/src/qdisp/Executive.h @@ -52,10 +52,10 @@ #include "util/ThreadPool.h" // TODO:UJ replace with better enable/disable feature, or just use only UberJobs -#define uberJobsEnabled 1 +#define uberJobsEnabled 1 // &&& delete -// Forward declarations -class XrdSsiService; +//&&& // Forward declarations +//&&&class XrdSsiService; namespace lsst::qserv { @@ -132,9 +132,6 @@ class Executive : public std::enable_shared_from_this { /// Add an item with a reference number std::shared_ptr add(JobDescription::Ptr const& s); - /// TODO:UJ - to be deleted - void runJobQuery(std::shared_ptr const& jobQuery); - // Queue `uberJob` to be run using the QDispPool. void runUberJob(std::shared_ptr const& uberJob); @@ -179,12 +176,10 @@ class Executive : public std::enable_shared_from_this { /// @return true if cancelled bool getCancelled() { return _cancelled; } - XrdSsiService* getXrdSsiService() { return _xrdSsiService; } + //&&&XrdSsiService* getXrdSsiService() { return _xrdSsiService; } std::shared_ptr getQdispPool() { return _qdispPool; } - bool startQuery(std::shared_ptr const& jobQuery); // TODO:UJ delete - /// Add 'rowCount' to the total number of rows in the result table. void addResultRows(int64_t rowCount); @@ -214,13 +209,6 @@ class Executive : public std::enable_shared_from_this { int getTotalJobs() { return _totalJobs; } - /// Set `_failedUberJob` to `val`; Setting this to true is a flag - /// that indicates to the Czar::_monitor that this Executive - /// probably has unassigned jobs that need to be placed in - /// new UberJobs. This `val` should only be set false by - /// Czar::_monitor(). - void setFlagFailedUberJob(bool val) { _failedUberJob = val; } - /// Add an error code and message that may be displayed to the user. void addMultiError(int errorCode, std::string const& errorMsg, int errState); @@ -245,7 +233,7 @@ class Executive : public std::enable_shared_from_this { SharedResources::Ptr const& sharedResources, std::shared_ptr const& qStatus, std::shared_ptr const& querySession); - void _setup(); + //&&&void _setup(); void _setupLimit(); bool _track(int refNum, std::shared_ptr const& r); @@ -273,9 +261,11 @@ class Executive : public std::enable_shared_from_this { std::atomic _empty{true}; std::shared_ptr _messageStore; ///< MessageStore for logging + /* &&& /// RPC interface, static to avoid getting every time a user query starts and separate /// from _xrdSsiService to avoid conflicts with XrdSsiServiceMock. XrdSsiService* _xrdSsiService; ///< RPC interface + */ JobMap _jobMap; ///< Contains information about all jobs. JobMap _incompleteJobs; ///< Map of incomplete jobs. /// How many jobs are used in this query. 1 avoids possible 0 of 0 jobs completed race condition. @@ -288,7 +278,7 @@ class Executive : public std::enable_shared_from_this { /** Execution errors */ util::MultiError _multiError; - std::atomic _requestCount; ///< Count of submitted jobs + std::atomic _requestCount{0}; ///< Count of submitted jobs util::Flag _cancelled{false}; ///< Has execution been cancelled. // Mutexes @@ -347,10 +337,6 @@ class Executive : public std::enable_shared_from_this { /// Weak pointer to the UserQuerySelect object for this query. std::weak_ptr _userQuerySelect; - /// If this is true, there are probably jobs that need to - /// be reassigned to new UberJobs. - std::atomic _failedUberJob{false}; - /// Flag that is set to true when ready to create and run UberJobs. 
std::atomic _readyToExecute{false}; }; diff --git a/src/qdisp/JobBase.h b/src/qdisp/JobBase.h index e5df5fc2a..d793e3699 100644 --- a/src/qdisp/JobBase.h +++ b/src/qdisp/JobBase.h @@ -45,6 +45,7 @@ class QueryRequest; /// for this base class as it won't be possible to send a JobQuery to a worker without /// putting it in an UberJob first. The UberJob is a wrapper that stores worker contact /// info. +// &&& delete this class as JobQuery and UberJob should no longer have much in common class JobBase : public std::enable_shared_from_this { public: using Ptr = std::shared_ptr; @@ -64,7 +65,7 @@ class JobBase : public std::enable_shared_from_this { virtual bool getScanInteractive() const = 0; virtual bool isQueryCancelled() = 0; virtual void callMarkCompleteFunc(bool success) = 0; - virtual void setQueryRequest(std::shared_ptr const& qr) = 0; + //&&&virtual void setQueryRequest(std::shared_ptr const& qr) = 0; virtual std::shared_ptr getExecutive() = 0; virtual std::ostream& dumpOS(std::ostream& os) const; diff --git a/src/qdisp/JobQuery.cc b/src/qdisp/JobQuery.cc index 9b99f4d9d..5d6c6193a 100644 --- a/src/qdisp/JobQuery.cc +++ b/src/qdisp/JobQuery.cc @@ -34,7 +34,6 @@ // Qserv headers #include "global/LogContext.h" #include "qdisp/Executive.h" -#include "qdisp/QueryRequest.h" namespace { LOG_LOGGER _log = LOG_GET("lsst.qserv.qdisp.JobQuery"); @@ -63,64 +62,6 @@ JobQuery::~JobQuery() { LOGS(_log, LOG_LVL_WARN, "~JobQuery QID=" << _idStr); } -/** Attempt to run the job on a worker. - * @return - false if it can not setup the job or the maximum number of attempts has been reached. - */ -bool JobQuery::runJob() { // TODO:UJ delete - QSERV_LOGCONTEXT_QUERY_JOB(getQueryId(), getJobId()); - LOGS(_log, LOG_LVL_DEBUG, " runJob " << *this); - auto executive = _executive.lock(); - if (executive == nullptr) { - LOGS(_log, LOG_LVL_ERROR, "runJob failed executive==nullptr"); - - return false; - } - bool superfluous = executive->isLimitRowComplete(); - bool cancelled = executive->getCancelled(); - bool handlerReset = _jobDescription->respHandler()->reset(); - if (!(cancelled || superfluous) && handlerReset) { - auto criticalErr = [this, &executive](string const& msg) { - LOGS(_log, LOG_LVL_ERROR, msg << " " << _jobDescription << " Canceling user query!"); - executive->squash(); // This should kill all jobs in this user query. - }; - - LOGS(_log, LOG_LVL_DEBUG, "runJob checking attempt=" << _jobDescription->getAttemptCount()); - lock_guard lock(_rmutex); - if (_jobDescription->getAttemptCount() < executive->getMaxAttempts()) { - bool okCount = _jobDescription->incrAttemptCountScrubResults(); - if (!okCount) { - criticalErr("hit structural max of retries"); - return false; - } - if (!_jobDescription->verifyPayload()) { - criticalErr("bad payload"); - return false; - } - } else { - LOGS(_log, LOG_LVL_DEBUG, "runJob max retries"); - criticalErr("hit maximum number of retries"); - return false; - } - - // At this point we are all set to actually run the query. We create a - // a shared pointer to this object to prevent it from escaping while we - // are trying to start this whole process. We also make sure we record - // whether or not we are in SSI as cancellation handling differs. - // - LOGS(_log, LOG_LVL_TRACE, "runJob calls StartQuery()"); - JobQuery::Ptr jq(dynamic_pointer_cast(shared_from_this())); - _inSsi = true; - if (executive->startQuery(jq)) { - _jobStatus->updateInfo(_idStr, qmeta::JobStatus::REQUEST, "EXEC"); - return true; - } - _inSsi = false; - } - LOGS(_log, (superfluous ? 
LOG_LVL_DEBUG : LOG_LVL_WARN), - "runJob failed. cancelled=" << cancelled << " reset=" << handlerReset); - return false; -} - /// Cancel response handling. Return true if this is the first time cancel has been called. bool JobQuery::cancel(bool superfluous) { QSERV_LOGCONTEXT_QUERY_JOB(getQueryId(), getJobId()); @@ -130,6 +71,7 @@ bool JobQuery::cancel(bool superfluous) { // If _inSsi is true then this query request has been passed to SSI and // _queryRequestPtr cannot be a nullptr. Cancellation is complicated. bool cancelled = false; + /* &&& if (_inSsi) { LOGS(_log, LOG_LVL_DEBUG, "cancel QueryRequest in progress"); if (_queryRequestPtr->cancel()) { @@ -139,9 +81,10 @@ bool JobQuery::cancel(bool superfluous) { LOGS(_log, LOG_LVL_DEBUG, "QueryRequest could not cancel"); } } + */ if (!cancelled) { ostringstream os; - os << _idStr << " cancel QueryRequest=" << _queryRequestPtr; + os << _idStr << " cancel"; LOGS(_log, LOG_LVL_DEBUG, os.str()); if (!superfluous) { getDescription()->respHandler()->errorFlush(os.str(), -1); diff --git a/src/qdisp/JobQuery.h b/src/qdisp/JobQuery.h index a11b628d4..7ce262875 100644 --- a/src/qdisp/JobQuery.h +++ b/src/qdisp/JobQuery.h @@ -62,9 +62,6 @@ class JobQuery : public JobBase { virtual ~JobQuery(); - /// Run this job. - bool runJob(); - QueryId getQueryId() const override { return _qid; } JobId getJobId() const override { return _jobDescription->id(); } std::string const& getPayload() const override; @@ -75,15 +72,6 @@ class JobQuery : public JobBase { qmeta::JobStatus::Ptr getStatus() override { return _jobStatus; } - void setQueryRequest(std::shared_ptr const& qr) { - std::lock_guard lock(_rmutex); - _queryRequestPtr = qr; - } - std::shared_ptr getQueryRequest() { - std::lock_guard lock(_rmutex); - return _queryRequestPtr; - } - void callMarkCompleteFunc(bool success) override; bool cancel(bool superfluous = false); @@ -158,10 +146,6 @@ class JobQuery : public JobBase { ///< _queryRequestPtr, _uberJobId, ///< and _inSsi - // SSI items - std::shared_ptr _queryRequestPtr; - bool _inSsi{false}; - // Cancellation std::atomic _cancelled{false}; ///< Lock to make sure cancel() is only called once. diff --git a/src/qdisp/QueryRequest.cc b/src/qdisp/QueryRequest.cc deleted file mode 100644 index 185065ea0..000000000 --- a/src/qdisp/QueryRequest.cc +++ /dev/null @@ -1,433 +0,0 @@ -// -*- LSST-C++ -*- -/* - * LSST Data Management System - * Copyright 2014-2016 AURA/LSST. - * - * This product includes software developed by the - * LSST Project (http://www.lsst.org/). - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the LSST License Statement and - * the GNU General Public License along with this program. If not, - * see . - */ - -/** - * @file - * - * @brief QueryRequest. XrdSsiRequest impl for czar query dispatch - * - * @author Daniel L. 
Wang, SLAC - */ - -// Class header -#include "qdisp/QdispPool.h" -#include "qdisp/QueryRequest.h" - -// System headers -#include -#include - -// LSST headers -#include "lsst/log/Log.h" - -// Qserv headers -#include "czar/Czar.h" -#include "qdisp/CzarStats.h" -#include "qdisp/UberJob.h" -#include "global/LogContext.h" -#include "proto/worker.pb.h" -#include "qmeta/JobStatus.h" -#include "qdisp/ResponseHandler.h" -#include "util/Bug.h" -#include "util/common.h" -#include "util/InstanceCount.h" -#include "util/Timer.h" - -using namespace std; - -namespace { -LOG_LOGGER _log = LOG_GET("lsst.qserv.qdisp.QueryRequest"); -} - -namespace lsst::qserv::qdisp { - -QueryRequest::QueryRequest(JobBase::Ptr const& job) - : _job(job), - _qid(job->getQueryId()), - _jobid(job->getJobId()), - _jobIdStr(job->getIdStr()), - _qdispPool(_job->getQdispPool()) { - QSERV_LOGCONTEXT_QUERY_JOB(_qid, _jobid); - LOGS(_log, LOG_LVL_TRACE, "New QueryRequest"); -} - -QueryRequest::~QueryRequest() { - QSERV_LOGCONTEXT_QUERY_JOB(_qid, _jobid); - LOGS(_log, LOG_LVL_TRACE, __func__); - if (!_finishedCalled) { - LOGS(_log, LOG_LVL_WARN, __func__ << " cleaning up calling Finished"); - bool ok = Finished(); - if (!ok) { - LOGS(_log, LOG_LVL_ERROR, __func__ << " Finished NOT ok"); - } - } -} - -// content of request data -char* QueryRequest::GetRequest(int& requestLength) { - QSERV_LOGCONTEXT_QUERY_JOB(_qid, _jobid); - lock_guard lock(_finishStatusMutex); - auto jq = _job; - if (_finishStatus != ACTIVE || jq == nullptr) { - LOGS(_log, LOG_LVL_DEBUG, __func__ << " called after job finished (cancelled?)"); - requestLength = 0; - return const_cast(""); - } - requestLength = jq->getPayload().size(); - LOGS(_log, LOG_LVL_DEBUG, "Requesting, payload size: " << requestLength); - // Andy promises that his code won't corrupt it. - return const_cast(jq->getPayload().data()); -} - -// Must not throw exceptions: calling thread cannot trap them. -// Callback function for XrdSsiRequest. 
-// -bool QueryRequest::ProcessResponse(XrdSsiErrInfo const& eInfo, XrdSsiRespInfo const& rInfo) { - QSERV_LOGCONTEXT_QUERY_JOB(_qid, _jobid); - LOGS(_log, LOG_LVL_DEBUG, "workerName=" << GetEndPoint() << " " << __func__); - string errorDesc = _jobIdStr + " "; - if (isQueryCancelled()) { - LOGS(_log, LOG_LVL_WARN, __func__ << " job already cancelled"); - cancel(); // calls _errorFinish() - return true; - } - - // Make a copy of the _jobQuery shared_ptr in case _jobQuery gets reset by a call to cancel() - auto jq = _job; - { - lock_guard lock(_finishStatusMutex); - if ((_finishStatus != ACTIVE) || (jq == nullptr)) { - LOGS(_log, LOG_LVL_WARN, __func__ << " called after job finished (cancelled?)"); - return true; - } - } - if (eInfo.hasError()) { - ostringstream os; - os << _jobIdStr << __func__ << " request failed " << getSsiErr(eInfo, nullptr) << " " - << GetEndPoint(); - jq->getRespHandler()->errorFlush(os.str(), -1); - jq->getStatus()->updateInfo(_jobIdStr, qmeta::JobStatus::RESPONSE_ERROR, "SSI"); - _errorFinish(); - return true; - } - - string responseTypeName; // for error reporting - switch (rInfo.rType) { - case XrdSsiRespInfo::isNone: - responseTypeName = "isNone"; - break; - case XrdSsiRespInfo::isData: - if (string(rInfo.buff, rInfo.blen) == "MockResponse") { - jq->getStatus()->updateInfo(_jobIdStr, qmeta::JobStatus::COMPLETE, "MOCK"); - _finish(); - return true; - } else if (rInfo.blen == 0) { - // Metadata-only responses for the file-based protocol should not have any data - jq->getStatus()->updateInfo(_jobIdStr, qmeta::JobStatus::RESPONSE_READY, "SSI"); - return _importResultFile(jq); - } - responseTypeName = "isData"; - break; - case XrdSsiRespInfo::isError: - jq->getStatus()->updateInfo(_jobIdStr, qmeta::JobStatus::RESPONSE_ERROR, "SSI", rInfo.eNum, - string(rInfo.eMsg)); - return _importError(string(rInfo.eMsg), rInfo.eNum); - case XrdSsiRespInfo::isFile: - responseTypeName = "isFile"; - break; - case XrdSsiRespInfo::isStream: - responseTypeName = "isStream"; - break; - default: - responseTypeName = ""; - } - return _importError("Unexpected XrdSsiRespInfo.rType == " + responseTypeName, -1); -} - -/// Retrieve and process a result file using the file-based protocol -/// Uses a copy of JobQuery::Ptr instead of _jobQuery as a call to cancel() would reset _jobQuery. -bool QueryRequest::_importResultFile(JobBase::Ptr const& job) { - // It's possible jq and _jobQuery differ, so need to use jq. 
- if (job->isQueryCancelled()) { - LOGS(_log, LOG_LVL_WARN, "QueryRequest::_processData job was cancelled."); - _errorFinish(true); - return false; - } - auto jq = std::dynamic_pointer_cast(job); - if (jq == nullptr) { - throw util::Bug(ERR_LOC, string(__func__) + " unexpected pointer type for job"); - } - auto executive = jq->getExecutive(); - if (executive == nullptr || executive->getCancelled() || executive->isLimitRowComplete()) { - if (executive == nullptr || executive->getCancelled()) { - LOGS(_log, LOG_LVL_WARN, "QueryRequest::_processData job was cancelled."); - } else { - int dataIgnored = (executive->incrDataIgnoredCount()); - if ((dataIgnored - 1) % 1000 == 0) { - LOGS(_log, LOG_LVL_INFO, - "QueryRequest::_processData ignoring, enough rows already " << "dataIgnored=" - << dataIgnored); - } - } - _errorFinish(true); - return false; - } - - int messageSize = 0; - const char* message = GetMetadata(messageSize); - - LOGS(_log, LOG_LVL_DEBUG, __func__ << " _jobIdStr=" << _jobIdStr << ", messageSize=" << messageSize); - - proto::ResponseSummary responseSummary; - if (!(responseSummary.ParseFromArray(message, messageSize) && responseSummary.IsInitialized())) { - string const err = "failed to parse the response summary, messageSize=" + to_string(messageSize); - LOGS(_log, LOG_LVL_ERROR, __func__ << " " << err); - throw util::Bug(ERR_LOC, err); - } - uint32_t resultRows = 0; - if (!jq->getDescription()->respHandler()->flush(responseSummary, resultRows)) { - LOGS(_log, LOG_LVL_ERROR, __func__ << " not flushOk"); - _flushError(jq); - return false; - } - _totalRows += resultRows; - - // At this point all data for this job have been read, there's no point in - // having XrdSsi wait for anything. - jq->getStatus()->updateInfo(_jobIdStr, qmeta::JobStatus::COMPLETE, "COMPLETE"); - _finish(); - - // If the query meets the limit row complete complete criteria, it will start - // squashing superfluous results so the answer can be returned quickly. - executive->addResultRows(_totalRows); - executive->checkLimitRowComplete(); - - return true; -} - -/// Process an incoming error. -bool QueryRequest::_importError(string const& msg, int code) { - auto jq = _job; - { - lock_guard lock(_finishStatusMutex); - if (_finishStatus != ACTIVE || jq == nullptr) { - LOGS(_log, LOG_LVL_WARN, - "QueryRequest::_importError code=" << code << " msg=" << msg << " not passed"); - return false; - } - jq->getRespHandler()->errorFlush(msg, code); - } - _errorFinish(); - return true; -} - -void QueryRequest::ProcessResponseData(XrdSsiErrInfo const& eInfo, char* buff, int blen, bool last) { - string const err = "the method has no use in this implementation of Qserv"; - LOGS(_log, LOG_LVL_ERROR, __func__ << " " << err); - throw util::Bug(ERR_LOC, err); -} - -void QueryRequest::_flushError(JobBase::Ptr const& jq) { - ResponseHandler::Error err = jq->getRespHandler()->getError(); - jq->getStatus()->updateInfo(_jobIdStr, qmeta::JobStatus::MERGE_ERROR, "MERGE", err.getCode(), - err.getMsg(), MSG_ERROR); - _errorFinish(true); -} - -/// @return true if QueryRequest cancelled successfully. -bool QueryRequest::cancel() { - LOGS(_log, LOG_LVL_DEBUG, "QueryRequest::cancel"); - { - lock_guard lock(_finishStatusMutex); - if (_cancelled) { - LOGS(_log, LOG_LVL_DEBUG, "QueryRequest::cancel already cancelled, ignoring"); - return false; // Don't do anything if already cancelled. - } - _cancelled = true; - _retried = true; // Prevent retries. - // Only call the following if the job is NOT already done. 
- if (_finishStatus == ACTIVE) { - auto jq = _job; - if (jq != nullptr) jq->getStatus()->updateInfo(_jobIdStr, qmeta::JobStatus::CANCEL, "CANCEL"); - } - } - return _errorFinish(true); // return true if errorFinish cancelled -} - -/// @return true if this object's JobQuery, or its Executive has been cancelled. -/// It takes time for the Executive to flag all jobs as being cancelled -bool QueryRequest::isQueryCancelled() { - auto jq = _job; - if (jq == nullptr) { - // Need to check if _jobQuery is null due to cancellation. - return isQueryRequestCancelled(); - } - return jq->isQueryCancelled(); -} - -/// @return true if QueryRequest::cancel() has been called. -/// QueryRequest::isQueryCancelled() is a much better indicator of user query cancellation. -bool QueryRequest::isQueryRequestCancelled() { - lock_guard lock(_finishStatusMutex); - return _cancelled; -} - -/// Cleanup pointers so this class can be deleted. -/// This should only be called by _finish or _errorFinish. -void QueryRequest::cleanup() { - LOGS(_log, LOG_LVL_TRACE, "QueryRequest::cleanup()"); - { - lock_guard lock(_finishStatusMutex); - if (_finishStatus == ACTIVE) { - LOGS(_log, LOG_LVL_ERROR, "QueryRequest::cleanup called before _finish or _errorFinish"); - return; - } - } - - // These need to be outside the mutex lock, or you could delete - // _finishStatusMutex before it is unlocked. - // This should reset _jobquery and _keepAlive without risk of either being deleted - // before being reset. - shared_ptr jq(move(_job)); - shared_ptr keep(move(_keepAlive)); -} - -/// Finalize under error conditions and retry or report completion -/// THIS FUNCTION WILL RESULT IN THIS OBJECT BEING DESTROYED, UNLESS there is -/// a local shared pointer for this QueryRequest and/or its owner JobQuery. -/// See QueryRequest::cleanup() -/// @return true if this QueryRequest object had the authority to make changes. -// TODO:UJ Delete QueryRequest class, including this function. -bool QueryRequest::_errorFinish(bool shouldCancel) { - LOGS(_log, LOG_LVL_DEBUG, "_errorFinish() shouldCancel=" << shouldCancel); - - auto jbase = _job; - JobQuery::Ptr jq = dynamic_pointer_cast(jbase); - if (jq == nullptr) { - // TODO:UJ The QueryRequest class will be deleted, so this doen't matter. - UberJob::Ptr uberJob = dynamic_pointer_cast(jbase); - if (uberJob != nullptr) { - throw util::Bug(ERR_LOC, " for _errorFinish to work correctly with UberJob"); - // UberJobs breakup into their JobQueries when they fail and run the jobs directly. - } - return false; - } - - // Normal JobQuery error handling. - { - // Running _errorFinish more than once could cause errors. - lock_guard lock(_finishStatusMutex); - if (_finishStatus != ACTIVE || jq == nullptr) { - // Either _finish or _errorFinish has already been called. - LOGS_DEBUG("_errorFinish() job no longer ACTIVE, ignoring " - << " _finishStatus=" << _finishStatus << " ACTIVE=" << ACTIVE << " jq=" << jq); - return false; - } - _finishStatus = ERROR; - } - - // Make the calls outside of the mutex lock. - LOGS(_log, LOG_LVL_DEBUG, "calling Finished(shouldCancel=" << shouldCancel << ")"); - bool ok = Finished(shouldCancel); - _finishedCalled = true; - if (!ok) { - LOGS(_log, LOG_LVL_ERROR, "QueryRequest::_errorFinish !ok "); - } else { - LOGS(_log, LOG_LVL_DEBUG, "QueryRequest::_errorFinish ok"); - } - - if (!_retried.exchange(true) && !shouldCancel) { - // There's a slight race condition here. _jobQuery::runJob() creates a - // new QueryRequest object which will replace this one in _jobQuery. 
- // The replacement could show up before this one's cleanup() is called, - // so this will keep this alive until cleanup() is done. - LOGS(_log, LOG_LVL_DEBUG, "QueryRequest::_errorFinish retrying"); - _keepAlive = jq->getQueryRequest(); // shared pointer to this - if (!jq->runJob()) { - // Retry failed, nothing left to try. - LOGS(_log, LOG_LVL_DEBUG, "errorFinish retry failed"); - _callMarkComplete(false); - } - } else { - _callMarkComplete(false); - } - cleanup(); // Reset smart pointers so this object can be deleted. - return true; -} - -/// Finalize under success conditions and report completion. -/// THIS FUNCTION WILL RESULT IN THIS OBJECT BEING DESTROYED, UNLESS there is -/// a local shared pointer for this QueryRequest and/or its owner JobQuery. -/// See QueryRequest::cleanup() -void QueryRequest::_finish() { - LOGS(_log, LOG_LVL_TRACE, "QueryRequest::_finish"); - { - // Running _finish more than once would cause errors. - lock_guard lock(_finishStatusMutex); - if (_finishStatus != ACTIVE) { - // Either _finish or _errorFinish has already been called. - LOGS(_log, LOG_LVL_WARN, "QueryRequest::_finish called when not ACTIVE, ignoring"); - return; - } - _finishStatus = FINISHED; - } - - bool ok = Finished(); - _finishedCalled = true; - if (!ok) { - LOGS(_log, LOG_LVL_ERROR, "QueryRequest::finish Finished() !ok "); - } else { - LOGS(_log, LOG_LVL_DEBUG, "QueryRequest::finish Finished() ok."); - } - _callMarkComplete(true); - cleanup(); -} - -void QueryRequest::_callMarkComplete(bool success) { - if (!_calledMarkComplete.exchange(true)) { - auto jq = _job; - if (jq != nullptr) { - jq->callMarkCompleteFunc(success); - } - } -} - -ostream& operator<<(ostream& os, QueryRequest const& qr) { - os << "QueryRequest " << qr._jobIdStr; - return os; -} - -/// @return The error text and code that SSI set. -/// if eCode != nullptr, it is set to the error code set by SSI. -string QueryRequest::getSsiErr(XrdSsiErrInfo const& eInfo, int* eCode) { - int errNum; - string errText = eInfo.Get(errNum); - if (eCode != nullptr) { - *eCode = errNum; - } - ostringstream os; - os << "SSI_Error(" << errNum << ":" << errText << ")"; - return os.str(); -} - -} // namespace lsst::qserv::qdisp diff --git a/src/qdisp/QueryRequest.h b/src/qdisp/QueryRequest.h deleted file mode 100644 index 1327b4673..000000000 --- a/src/qdisp/QueryRequest.h +++ /dev/null @@ -1,165 +0,0 @@ -// -*- LSST-C++ -*- -/* - * LSST Data Management System - * Copyright 2014-2015 LSST Corporation. - * - * This product includes software developed by the - * LSST Project (http://www.lsst.org/). - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the LSST License Statement and - * the GNU General Public License along with this program. If not, - * see . 
- */ -#ifndef LSST_QSERV_QDISP_QUERYREQUEST_H -#define LSST_QSERV_QDISP_QUERYREQUEST_H - -// System headers -#include -#include -#include -#include -#include -#include - -// Third-party headers -#include "XrdSsi/XrdSsiRequest.hh" - -// Local headers -#include "czar/Czar.h" -#include "qdisp/JobQuery.h" -#include "qdisp/QdispPool.h" - -namespace lsst::qserv::qdisp { - -/// Bad response received from SSI API -class BadResponseError : public std::exception { -public: - BadResponseError(std::string const& s_) : std::exception(), s("BadResponseError:" + s_) {} - virtual ~BadResponseError() throw() {} - virtual const char* what() const throw() { return s.c_str(); } - std::string s; -}; - -/// Error in QueryRequest -class RequestError : public std::exception { -public: - RequestError(std::string const& s_) : std::exception(), s("QueryRequest error:" + s_) {} - virtual ~RequestError() throw() {} - virtual const char* what() const throw() { return s.c_str(); } - std::string s; -}; - -/// A client implementation of an XrdSsiRequest that adapts qserv's executing -/// queries to the XrdSsi API. -/// -/// Memory allocation notes: -/// In the XrdSsi API, raw pointers are passed around for XrdSsiRequest objects, -/// and care needs to be taken to avoid deleting the request objects before -/// Finished() is called. Typically, an XrdSsiRequest subclass is allocated with -/// operator new, and passed into XrdSsi. At certain points in the transaction, -/// XrdSsi will call methods in the request object or hand back the request -/// object pointer. XrdSsi ceases interest in the object once the -/// XrdSsiRequest::Finished() completes. Generally, this would mean the -/// QueryRequest should clean itself up after calling Finished(). This requires -/// special care, because there is a cancellation function in the wild that may -/// call into QueryRequest after Finished() has been called. The cancellation -/// code is -/// designed to allow the client requester (elsewhere in qserv) to request -/// cancellation without knowledge of XrdSsi, so the QueryRequest registers a -/// cancellation function with its client that maintains a pointer to the -/// QueryRequest. After Finished(), the cancellation function must be prevented -/// from accessing the QueryRequest instance. -// TODO:UJ delete this class -class QueryRequest : public XrdSsiRequest, public std::enable_shared_from_this { -public: - typedef std::shared_ptr Ptr; - - static Ptr create(std::shared_ptr const& jobBase) { - Ptr newQueryRequest(new QueryRequest(jobBase)); - return newQueryRequest; - } - - virtual ~QueryRequest(); - - /// Called by SSI to get the request payload - /// @return content of request data - char* GetRequest(int& requestLength) override; - - /// Called by SSI to release the allocated request payload. As we don't - /// own the buffer, so we can't release it. Therefore, we accept the - /// default implementation that does nothing. - /// void RelRequestBuffer() override; - - /// Called by SSI when a response is ready - /// precondition: rInfo.rType != isNone - bool ProcessResponse(XrdSsiErrInfo const& eInfo, XrdSsiRespInfo const& rInfo) override; - - /// Called by SSI when new data is available. 
- void ProcessResponseData(XrdSsiErrInfo const& eInfo, char* buff, int blen, bool last) override; - - bool cancel(); - bool isQueryCancelled(); - bool isQueryRequestCancelled(); - void doNotRetry() { _retried.store(true); } - std::string getSsiErr(XrdSsiErrInfo const& eInfo, int* eCode); - void cleanup(); ///< Must be called when this object is no longer needed. - - friend std::ostream& operator<<(std::ostream& os, QueryRequest const& r); - -private: - // Private constructor to safeguard enable_shared_from_this construction. - QueryRequest(JobBase::Ptr const& job); - - /// Inform the Executive that this query completed, and call MarkCompleteFunc only once. - /// This should only be called from _finish() or _errorFinish. - void _callMarkComplete(bool success); - bool _importResultFile(JobBase::Ptr const& jq); - bool _importError(std::string const& msg, int code); - bool _errorFinish(bool stopTrying = false); - void _finish(); - void _flushError(JobBase::Ptr const& jq); - - /// Job information. Not using a weak_ptr as Executive could drop its JobBase::Ptr before we're done with - /// it. A call to cancel() could reset _job early, so copy or protect _job with _finishStatusMutex as - /// needed. If (_finishStatus == ACTIVE) _job should be good. - std::shared_ptr _job; - - std::atomic _retried{false}; ///< Protect against multiple retries of _jobQuery from a - /// single QueryRequest. - std::atomic _calledMarkComplete{false}; ///< Protect against multiple calls to MarkCompleteFunc - /// from a single QueryRequest. - - std::mutex _finishStatusMutex; ///< used to protect _cancelled, _finishStatus, and _jobQuery. - enum FinishStatus { ACTIVE, FINISHED, ERROR } _finishStatus{ACTIVE}; // _finishStatusMutex - bool _cancelled{false}; ///< true if cancelled, protected by _finishStatusMutex. - - std::shared_ptr _keepAlive; ///< Used to keep this object alive during race condition. - QueryId _qid = 0; // for logging - JobId _jobid = -1; // for logging - std::string _jobIdStr{QueryIdHelper::makeIdStr(0, 0, true)}; ///< for debugging only. - - std::atomic _finishedCalled{false}; - - QdispPool::Ptr _qdispPool; - - int64_t _totalRows = 0; ///< number of rows in query added to the result table. - - std::atomic _rowsIgnored{0}; ///< Limit log messages about rows being ignored. - std::atomic _respCount{0}; ///< number of responses created -}; - -std::ostream& operator<<(std::ostream& os, QueryRequest const& r); - -} // namespace lsst::qserv::qdisp - -#endif // LSST_QSERV_QDISP_QUERYREQUEST_H diff --git a/src/qdisp/UberJob.cc b/src/qdisp/UberJob.cc index 16665a235..9afdb4f75 100644 --- a/src/qdisp/UberJob.cc +++ b/src/qdisp/UberJob.cc @@ -202,8 +202,6 @@ void UberJob::_unassignJobs() { cName(__func__) << " job=" << jid << " attempts=" << job->getAttemptCount()); } _jobs.clear(); - bool const setFlag = true; - exec->setFlagFailedUberJob(setFlag); } bool UberJob::isQueryCancelled() { diff --git a/src/qdisp/UberJob.h b/src/qdisp/UberJob.h index bfd8cb877..04c59c807 100644 --- a/src/qdisp/UberJob.h +++ b/src/qdisp/UberJob.h @@ -80,10 +80,6 @@ class UberJob : public JobBase { void callMarkCompleteFunc(bool success) override; ///< call markComplete for all jobs in this UberJob. std::shared_ptr getExecutive() override { return _executive.lock(); } - void setQueryRequest(std::shared_ptr const& qr) override { - ; // Do nothing as QueryRequest is only needed for xrootd. TODO:UJ delete function. 
- } - /// Return false if not ok to set the status to newState, otherwise set the state for /// this UberJob and all jobs it contains to newState. /// This is used both to set status and prevent the system from repeating operations diff --git a/src/qdisp/XrdSsiMocks.cc b/src/qdisp/XrdSsiMocks.cc deleted file mode 100644 index bbfb24361..000000000 --- a/src/qdisp/XrdSsiMocks.cc +++ /dev/null @@ -1,312 +0,0 @@ -// -*- LSST-C++ -*- -/* - * LSST Data Management System - * Copyright 2015-2016 AURA/LSST. - * - * This product includes software developed by the - * LSST Project (http://www.lsst.org/). - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the LSST License Statement and - * the GNU General Public License along with this program. If not, - * see . - * - * @author John Gates, SLAC - */ - -// System headers -#include -#include -#include -#include -#include -#include -#include -#include - -// Third party headers -#include "XrdSsi/XrdSsiErrInfo.hh" -#include "XrdSsi/XrdSsiResponder.hh" -#include "XrdSsi/XrdSsiStream.hh" - -// LSST headers -#include "lsst/log/Log.h" -#include "proto/worker.pb.h" -#include "util/threadSafe.h" - -// Qserv headers -#include "qdisp/Executive.h" -#include "qdisp/QueryRequest.h" -#include "qdisp/XrdSsiMocks.h" - -using namespace std; - -namespace { - -LOG_LOGGER _log = LOG_GET("lsst.qserv.qdisp.XrdSsiMock"); - -lsst::qserv::util::FlagNotify _go(true); - -std::atomic canCount(0); -std::atomic finCount(0); -std::atomic reqCount(0); -std::atomic totCount(0); - -bool _aOK = true; - -enum RespType { RESP_BADREQ, RESP_DATA, RESP_ERROR, RESP_ERRNR, RESP_STREAM, RESP_STRERR }; - -class Agent : public XrdSsiResponder, public XrdSsiStream { -public: - void Finished(XrdSsiRequest& rqstR, XrdSsiRespInfo const& rInfo, bool cancel) override { - const char* how = (cancel ? 
" cancelled" : ""); - LOGS(_log, LOG_LVL_DEBUG, "Finished: " << _rNum << " rName=" << _rName << how); - _rrMutex.lock(); - UnBindRequest(); - if (cancel) canCount++; - finCount++; - _isFIN = true; - if (_active) { - _rrMutex.unlock(); - } else { - _rrMutex.unlock(); - delete this; - } - } - - void Reply(RespType rType) { - _go.wait(true); - - // We may have been cancelled before being able to reply - // - if (_isCancelled(true)) return; // we are locked now - - // Do requested reply - // - switch (rType) { - case RESP_DATA: - _ReplyData(); - break; - case RESP_ERRNR: - _reqP->doNotRetry(); - // Fallthrough - case RESP_ERROR: - _ReplyError(); - break; - case RESP_STRERR: - _noData = true; - _reqP->doNotRetry(); // Kill retries on stream errors - _ReplyStream(); - break; - default: - _reqP->doNotRetry(); - _ReplyError("Bad mock request!", 13); - break; - } - _isCancelled(false); - } - - bool SetBuff(XrdSsiErrInfo& eRef, char* buff, int blen) override { - // We may have been cancelled while waiting - // - if (_isCancelled(true)) return false; - std::thread(&Agent::_StrmResp, this, &eRef, buff, blen).detach(); - _rrMutex.unlock(); - return true; - } - - Agent(lsst::qserv::qdisp::QueryRequest* rP, std::string const& rname, int rnum) - : XrdSsiStream(XrdSsiStream::isPassive), - _reqP(rP), - _rName(rname), - _rNum(rnum), - _noData(true), - _isFIN(false), - _active(true) { - // Initialize a null message we will return as a response - // - _responseSummary = - google::protobuf::Arena::CreateMessage(_arena.get()); - lsst::qserv::proto::ResponseSummary* responseSummary = _responseSummary; - responseSummary->set_wname("localhost"); - std::string str; - responseSummary->SerializeToString(&str); - _msgBuf = str; - _bOff = 0; - _bLen = _msgBuf.size(); - } - - ~Agent() {} - -private: - bool _isCancelled(bool activate) { - if (activate) _rrMutex.lock(); - if (_isFIN) { - _rrMutex.unlock(); - delete this; - return true; - } - _active = activate; - if (!activate) _rrMutex.unlock(); - return false; - } - - void _ReplyData() { - _rspBuf = "MockResponse"; - SetResponse(_rspBuf.data(), _rspBuf.size()); - } - - void _ReplyError(const char* eMsg = "Mock Request Ignored!", int eNum = 17) { - SetErrResponse(eMsg, eNum); - } - - void _ReplyStream() { - auto stat = _setMetaData(_msgBuf.size()); - if (stat != Status::wasPosted) { - LOGS(_log, LOG_LVL_ERROR, "Agent::_ReplyStream _setMetadata failed " << stat); - } - SetResponse(this); - } - - void _StrmResp(XrdSsiErrInfo* eP, char* buff, int blen) { - std::cerr << "Stream: cleint asks for " << blen << " bytes, have " << _bLen << '\n' << std::flush; - bool last; - - // Check for cancellation while we were waiting - // - if (_isCancelled(true)) return; - - // Either reply with an error or actual data - // - if (_noData) { - blen = -17; - last = true; - eP->Set("Mock stream error!", 17); - } else { - if (_bLen <= blen) { - memcpy(buff, _msgBuf.data() + _bOff, _bLen); - blen = _bLen; - _bLen = 0; - last = true; - } else { - memcpy(buff, _msgBuf.data() + _bOff, blen); - _bOff += blen; - _bLen -= blen; - last = false; - } - } - _reqP->ProcessResponseData(*eP, buff, blen, last); - _isCancelled(false); - } - - Status _setMetaData(size_t sz) { - string str; - _responseSummary->SerializeToString(&str); - _metadata = str; - return SetMetadata(_metadata.data(), _metadata.size()); - } - - std::recursive_mutex _rrMutex; - lsst::qserv::qdisp::QueryRequest* _reqP; - std::string _rName; - std::string _rspBuf; - std::string _msgBuf; - int _bOff; - int _bLen; - int _rNum; - bool _noData; 
- bool _isFIN; - bool _active; - std::string _metadata; - lsst::qserv::proto::ResponseSummary* _responseSummary; - std::unique_ptr _arena{make_unique()}; -}; -} // namespace - -namespace lsst::qserv::qdisp { - -std::string XrdSsiServiceMock::_myRName; - -int XrdSsiServiceMock::getCount() { return totCount; } - -int XrdSsiServiceMock::getCanCount() { return canCount; } - -int XrdSsiServiceMock::getFinCount() { return finCount; } - -int XrdSsiServiceMock::getReqCount() { return reqCount; } - -bool XrdSsiServiceMock::isAOK() { return _aOK; } - -void XrdSsiServiceMock::Reset() { - canCount = 0; - finCount = 0; - reqCount = 0; -} - -void XrdSsiServiceMock::setGo(bool go) { _go.exchangeNotify(go); } - -void XrdSsiServiceMock::ProcessRequest(XrdSsiRequest& reqRef, XrdSsiResource& resRef) { - static struct { - const char* cmd; - RespType rType; - } reqTab[] = {{"respdata", RESP_DATA}, {"resperror", RESP_ERROR}, {"resperrnr", RESP_ERRNR}, - {"respstream", RESP_STREAM}, {"respstrerr", RESP_STRERR}, {0, RESP_BADREQ}}; - - int reqNum = totCount++; - - // Check if we should verify the resource name - // - if (_myRName.size() && _myRName != resRef.rName) { - LOGS_DEBUG("Expected rname " << _myRName << " got " << resRef.rName << " from req #" << reqNum); - _aOK = false; - } - - // Get the query request object for this request and process it. - QueryRequest* r = dynamic_cast(&reqRef); - if (r) { - Agent* aP = new Agent(r, resRef.rName, reqNum); - RespType doResp; - aP->BindRequest(reqRef); - - // Get the request data and setup to handle request. Make sure the - // request string is null terminated (it should be). - // - std::string reqStr; - int reqLen; - const char* reqData = r->GetRequest(reqLen); - if (reqData != nullptr) reqStr.assign(reqData, reqLen); - reqData = reqStr.c_str(); - - // Convert request to response type - // - int i = 0; - while (reqTab[i].cmd && strcmp(reqTab[i].cmd, reqData)) i++; - if (reqTab[i].cmd) { - doResp = reqTab[i].rType; - } else { - LOGS_DEBUG("Unknown request '" << reqData << "' from req #" << reqNum); - _aOK = false; - doResp = RESP_BADREQ; - } - - // Release the request buffer (typically a no-op) - // - if (reqLen != 0) r->ReleaseRequestBuffer(); - - // Schedule a response - // - reqCount++; - std::thread(&Agent::Reply, aP, doResp).detach(); - } -} - -} // namespace lsst::qserv::qdisp diff --git a/src/qdisp/XrdSsiMocks.h b/src/qdisp/XrdSsiMocks.h deleted file mode 100644 index 61cad5b73..000000000 --- a/src/qdisp/XrdSsiMocks.h +++ /dev/null @@ -1,72 +0,0 @@ -// -*- LSST-C++ -*- -/* - * LSST Data Management System - * Copyright 2015-2016 LSST Corporation. - * - * This product includes software developed by the - * LSST Project (http://www.lsst.org/). - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the LSST License Statement and - * the GNU General Public License along with this program. If not, - * see . 
- * - * @author: John Gates, SLAC (heavily modified by Andrew Hanushevsky, SLAC) - */ - -#ifndef LSST_QSERV_QDISP_XRDSSIMOCKS_H -#define LSST_QSERV_QDISP_XRDSSIMOCKS_H - -// External headers -#include "XrdSsi/XrdSsiRequest.hh" -#include "XrdSsi/XrdSsiResource.hh" -#include "XrdSsi/XrdSsiService.hh" - -// Local headers - -namespace lsst::qserv::qdisp { - -class Executive; - -/** A simplified version of XrdSsiService for testing qserv. - */ -class XrdSsiServiceMock : public XrdSsiService { -public: - void ProcessRequest(XrdSsiRequest &reqRef, XrdSsiResource &resRef) override; - - XrdSsiServiceMock(Executive *executive) {}; - - virtual ~XrdSsiServiceMock() {} - - static int getCount(); - - static int getCanCount(); - - static int getFinCount(); - - static int getReqCount(); - - static bool isAOK(); - - static void Reset(); - - static void setGo(bool go); - - static void setRName(std::string const &rname) { _myRName = rname; } - -private: - static std::string _myRName; -}; - -} // namespace lsst::qserv::qdisp - -#endif diff --git a/src/qdisp/testQDisp.cc b/src/qdisp/testQDisp.cc index 74483ab39..4d539fa93 100644 --- a/src/qdisp/testQDisp.cc +++ b/src/qdisp/testQDisp.cc @@ -40,9 +40,7 @@ #include "global/ResourceUnit.h" #include "qdisp/Executive.h" #include "qdisp/JobQuery.h" -#include "qdisp/QueryRequest.h" #include "qdisp/SharedResources.h" -#include "qdisp/XrdSsiMocks.h" #include "qmeta/MessageStore.h" #include "qproc/ChunkQuerySpec.h" #include "qproc/TaskMsgFactory.h" @@ -169,7 +167,7 @@ class SetupTest { SetupTest(const char* request) { qrMsg = request; - qdisp::XrdSsiServiceMock::Reset(); + //&&& qdisp::XrdSsiServiceMock::Reset(); str = qdisp::ExecutiveConfig::getMockStr(); conf = std::make_shared(str, 0); // No updating of QMeta. ms = std::make_shared(); @@ -200,7 +198,7 @@ BOOST_AUTO_TEST_CASE(Executive) { int jobs = 0; _log.setLevel(LOG_LVL_DEBUG); // Ugly but boost test suite forces this std::thread timeoutT(&timeoutFunc, std::ref(done), millisInt); - qdisp::XrdSsiServiceMock::setRName("/chk/Mock/1234"); + //&&& qdisp::XrdSsiServiceMock::setRName("/chk/Mock/1234"); // Test single instance { @@ -234,17 +232,19 @@ BOOST_AUTO_TEST_CASE(Executive) { LOGS_DEBUG("Executive detect non-empty job queue test"); SetupTest tEnv("respdata"); SequentialInt sequence(0); - qdisp::XrdSsiServiceMock::setGo(false); + //&&&qdisp::XrdSsiServiceMock::setGo(false); executiveTest(tEnv.ex, sequence, chunkId, tEnv.qrMsg, 5); jobs += 5; + /* &&& while (qdisp::XrdSsiServiceMock::getCount() < jobs) { LOGS_DEBUG("waiting for _count(" << qdisp::XrdSsiServiceMock::getCount() << ") == jobs(" << jobs << ")"); usleep(10000); } + */ BOOST_CHECK(tEnv.ex->getEmpty() == false); - qdisp::XrdSsiServiceMock::setGo(true); + //&&&qdisp::XrdSsiServiceMock::setGo(true); LOGS_DEBUG("ex->joining()"); tEnv.ex->join(); LOGS_DEBUG("ex->join() joined"); @@ -280,8 +280,8 @@ BOOST_AUTO_TEST_CASE(QueryRequest) { tEnv.jqTest = executiveTest(tEnv.ex, sequence, chunkId, tEnv.qrMsg, 1); tEnv.ex->join(); BOOST_CHECK(tEnv.jqTest->getStatus()->getInfo().state == qmeta::JobStatus::RESULT_ERROR); - BOOST_CHECK(qdisp::XrdSsiServiceMock::getFinCount() > 1); // Retried, eh? - BOOST_CHECK(qdisp::XrdSsiServiceMock::getFinCount() == qdisp::XrdSsiServiceMock::getReqCount()); + //&&&BOOST_CHECK(qdisp::XrdSsiServiceMock::getFinCount() > 1); // Retried, eh? 
+ //&&&BOOST_CHECK(qdisp::XrdSsiServiceMock::getFinCount() == qdisp::XrdSsiServiceMock::getReqCount()); } { @@ -293,7 +293,7 @@ BOOST_AUTO_TEST_CASE(QueryRequest) { tEnv.jqTest = executiveTest(tEnv.ex, sequence, chunkId, tEnv.qrMsg, 1); tEnv.ex->join(); BOOST_CHECK(tEnv.jqTest->getStatus()->getInfo().state == qmeta::JobStatus::RESULT_ERROR); - BOOST_CHECK(qdisp::XrdSsiServiceMock::getFinCount() == 1); + //&&&BOOST_CHECK(qdisp::XrdSsiServiceMock::getFinCount() == 1); } { @@ -306,7 +306,7 @@ BOOST_AUTO_TEST_CASE(QueryRequest) { tEnv.ex->join(); LOGS_DEBUG("tEnv.jqTest->...state = " << tEnv.jqTest->getStatus()->getInfo().state); BOOST_CHECK(tEnv.jqTest->getStatus()->getInfo().state == qmeta::JobStatus::RESULT_ERROR); - BOOST_CHECK(qdisp::XrdSsiServiceMock::getFinCount() == 1); // No retries! + //&&&BOOST_CHECK(qdisp::XrdSsiServiceMock::getFinCount() == 1); // No retries! } // We wish we could do the stream response with no results test but the @@ -314,7 +314,7 @@ BOOST_AUTO_TEST_CASE(QueryRequest) { // So, we've commented this out but the framework exists modulo the needed // responses (see XrdSsiMocks::Agent). So, this gets punted into the // integration test (too bad). - /* + /* &&& check if this is possible { LOGS_DEBUG("QueryRequest stream with no results test"); SetupTest tEnv("respstream"); @@ -335,42 +335,42 @@ BOOST_AUTO_TEST_CASE(ExecutiveCancel) { { LOGS_DEBUG("ExecutiveCancel: squash it test"); SetupTest tEnv("respdata"); - qdisp::XrdSsiServiceMock::setGo(false); // Can't let jobs run or they are untracked before squash + //&&&qdisp::XrdSsiServiceMock::setGo(false); // Can't let jobs run or they are untracked before squash SequentialInt sequence(0); tEnv.jqTest = executiveTest(tEnv.ex, sequence, chunkId, tEnv.qrMsg, 1); tEnv.ex->squash(); - qdisp::XrdSsiServiceMock::setGo(true); + //&&&qdisp::XrdSsiServiceMock::setGo(true); usleep(250000); // Give mock threads a quarter second to complete. tEnv.ex->join(); BOOST_CHECK(tEnv.jqTest->isQueryCancelled() == true); // Note that the query might not have actually called ProcessRequest() // but if it did, then it must have called Finished() with cancel. // - BOOST_CHECK(qdisp::XrdSsiServiceMock::getCanCount() == qdisp::XrdSsiServiceMock::getReqCount()); + //&&&BOOST_CHECK(qdisp::XrdSsiServiceMock::getCanCount() == qdisp::XrdSsiServiceMock::getReqCount()); } // Test that multiple JobQueries are cancelled. { LOGS_DEBUG("ExecutiveCancel: squash 20 test"); SetupTest tEnv("respdata"); - qdisp::XrdSsiServiceMock::setGo(false); // Can't let jobs run or they are untracked before squash + //&&&qdisp::XrdSsiServiceMock::setGo(false); // Can't let jobs run or they are untracked before squash SequentialInt sequence(0); executiveTest(tEnv.ex, sequence, chunkId, tEnv.qrMsg, 20); tEnv.ex->squash(); tEnv.ex->squash(); // check that squashing twice doesn't cause issues. - qdisp::XrdSsiServiceMock::setGo(true); + //&&&qdisp::XrdSsiServiceMock::setGo(true); usleep(250000); // Give mock threads a quarter second to complete. tEnv.ex->join(); // Note that the cancel count might not be 20 as some queries will cancel // themselves before they get around to issuing ProcessRequest(). // - BOOST_CHECK(qdisp::XrdSsiServiceMock::getCanCount() == qdisp::XrdSsiServiceMock::getReqCount()); + //&&&BOOST_CHECK(qdisp::XrdSsiServiceMock::getCanCount() == qdisp::XrdSsiServiceMock::getReqCount()); } } BOOST_AUTO_TEST_CASE(ServiceMock) { // Verify that our service object did not see anything unusual. 
- BOOST_CHECK(qdisp::XrdSsiServiceMock::isAOK()); + //&&&BOOST_CHECK(qdisp::XrdSsiServiceMock::isAOK()); } BOOST_AUTO_TEST_SUITE_END() diff --git a/src/wdb/QueryRunner.cc b/src/wdb/QueryRunner.cc index a4a7557ab..0e73f664d 100644 --- a/src/wdb/QueryRunner.cc +++ b/src/wdb/QueryRunner.cc @@ -281,8 +281,10 @@ bool QueryRunner::_dispatchChannel() { if (taskSched != nullptr) { taskSched->histTimeOfRunningTasks->addEntry(primeT.getElapsed()); LOGS(_log, LOG_LVL_DEBUG, "QR " << taskSched->histTimeOfRunningTasks->getString("run")); + LOGS(_log, LOG_LVL_WARN, "&&&DASH QR " << taskSched->histTimeOfRunningTasks->getString("run")); } else { LOGS(_log, LOG_LVL_ERROR, "QR runtaskSched == nullptr"); + LOGS(_log, LOG_LVL_ERROR, "&&&DASH QR runtaskSched == nullptr"); } double runTimeSeconds = primeT.getElapsed(); double subchunkRunTimeSeconds = subChunkT.getElapsed(); diff --git a/src/xrdsvc/ChannelStream.h b/src/xrdsvc/ChannelStream.h index ee2de6005..61c8777e7 100644 --- a/src/xrdsvc/ChannelStream.h +++ b/src/xrdsvc/ChannelStream.h @@ -40,7 +40,7 @@ namespace lsst::qserv::xrdsvc { /// ChannelStream is an implementation of an XrdSsiStream that accepts /// SendChannel streamed data. -class ChannelStream : public XrdSsiStream { +class ChannelStream : public XrdSsiStream { // &&& delete public: ChannelStream(); virtual ~ChannelStream(); From 85461dcf3cb25f2438a3b97af558d26e42916171 Mon Sep 17 00:00:00 2001 From: John Gates Date: Fri, 2 Aug 2024 16:00:21 -0700 Subject: [PATCH 02/22] Removed unnecessary code. --- src/ccontrol/UserQuerySelect.cc | 10 -- src/proto/CMakeLists.txt | 10 -- src/proto/FakeProtocolFixture.h | 92 ---------------- src/proto/ProtoImporter.h | 63 ----------- src/proto/ScanTableInfo.h | 15 +-- src/proto/testProtocol.cc | 183 -------------------------------- src/proto/worker.proto | 49 +-------- src/qdisp/Executive.cc | 8 -- src/qdisp/Executive.h | 11 -- src/qdisp/JobBase.h | 3 +- src/qdisp/JobDescription.cc | 32 +----- src/qdisp/JobDescription.h | 15 +-- src/qdisp/JobQuery.cc | 16 +-- src/qdisp/JobQuery.h | 2 +- src/qdisp/UberJob.cc | 1 - src/qdisp/UberJob.h | 4 +- src/qdisp/testQDisp.cc | 2 + src/qproc/TaskMsgFactory.cc | 160 ++-------------------------- src/qproc/TaskMsgFactory.h | 20 +--- src/rproc/InfileMerger.cc | 1 - src/rproc/testProtoRowBuffer.cc | 1 - src/wbase/MsgProcessor.h | 8 +- src/wbase/Task.cc | 154 --------------------------- src/wbase/Task.h | 18 +--- src/wbase/UberJobData.cc | 10 +- src/wcontrol/Foreman.h | 4 +- src/wdb/CMakeLists.txt | 2 - src/wdb/ChunkResource.cc | 22 +--- src/wdb/QuerySql.cc | 133 ----------------------- src/wdb/QuerySql.h | 72 ------------- src/wdb/QuerySql_Batch.h | 82 -------------- src/wdb/testQueryRunner.cc | 161 ++++++++++++++++++++++++++++ src/wdb/testQuerySql.cc | 2 + src/wsched/testSchedulers.cc | 30 +++++- src/xrdsvc/SsiRequest.cc | 80 ++------------ src/xrdsvc/SsiRequest.h | 2 +- 36 files changed, 240 insertions(+), 1238 deletions(-) delete mode 100644 src/proto/FakeProtocolFixture.h delete mode 100644 src/proto/ProtoImporter.h delete mode 100644 src/proto/testProtocol.cc delete mode 100644 src/wdb/QuerySql.cc delete mode 100644 src/wdb/QuerySql.h delete mode 100644 src/wdb/QuerySql_Batch.h diff --git a/src/ccontrol/UserQuerySelect.cc b/src/ccontrol/UserQuerySelect.cc index 7ca22b295..0fca556f7 100644 --- a/src/ccontrol/UserQuerySelect.cc +++ b/src/ccontrol/UserQuerySelect.cc @@ -86,7 +86,6 @@ #include "global/constants.h" #include "global/LogContext.h" #include "proto/worker.pb.h" -#include "proto/ProtoImporter.h" #include 
"qdisp/Executive.h" #include "qdisp/JobQuery.h" #include "qmeta/MessageStore.h" @@ -120,15 +119,6 @@ using namespace std; namespace lsst::qserv { -/// A class that can be used to parameterize a ProtoImporter for -/// debugging purposes -class ProtoPrinter { -public: - ProtoPrinter() {} - virtual void operator()(std::shared_ptr m) { std::cout << "Got taskmsg ok"; } - virtual ~ProtoPrinter() {} -}; - //////////////////////////////////////////////////////////////////////// // UserQuerySelect implementation namespace ccontrol { diff --git a/src/proto/CMakeLists.txt b/src/proto/CMakeLists.txt index c9c7a10e5..925976832 100644 --- a/src/proto/CMakeLists.txt +++ b/src/proto/CMakeLists.txt @@ -19,13 +19,3 @@ target_link_libraries(proto PUBLIC protobuf ) -add_executable(testProtocol testProtocol.cc) - -target_link_libraries(testProtocol - proto - crypto - Boost::unit_test_framework -) - -add_test(NAME testProtocol COMMAND testProtocol) - diff --git a/src/proto/FakeProtocolFixture.h b/src/proto/FakeProtocolFixture.h deleted file mode 100644 index e4c232eda..000000000 --- a/src/proto/FakeProtocolFixture.h +++ /dev/null @@ -1,92 +0,0 @@ -// -*- LSST-C++ -*- -/* - * LSST Data Management System - * Copyright 2015 LSST Corporation. - * - * This product includes software developed by the - * LSST Project (http://www.lsst.org/). - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the LSST License Statement and - * the GNU General Public License along with this program. If not, - * see . - */ - -#ifndef LSST_QSERV_PROTO_FAKEPROTOCOLFIXTURE_H -#define LSST_QSERV_PROTO_FAKEPROTOCOLFIXTURE_H - -// System headers -#include -#include - -namespace lsst::qserv::proto { - -/// FakeProtocolFixture is a utility class containing code for making fake -/// versions of the protobufs messages used in Qserv. Its intent was -/// only to be used for test code. -class FakeProtocolFixture { -public: - FakeProtocolFixture() : _counter(0) {} - - TaskMsg* makeTaskMsg() { - TaskMsg* t(new TaskMsg()); - t->set_chunkid(20 + _counter); - t->set_db("elephant"); - t->set_jobid(0); - t->set_queryid(49); - t->set_scaninteractive(true); - - auto sTbl = t->add_scantable(); - sTbl->set_db("orange"); - sTbl->set_table("cart"); - sTbl->set_lockinmemory(false); - sTbl->set_scanrating(1); - - sTbl = t->add_scantable(); - sTbl->set_db("plum"); - sTbl->set_table("bike"); - sTbl->set_lockinmemory(false); - sTbl->set_scanrating(1); - - for (int i = 0; i < 3; ++i) { - TaskMsg::Fragment* f = t->add_fragment(); - f->add_query("Hello, this is a query."); - addSubChunk(*f, 100 + i); - f->set_resulttable("r_341"); - } - ++_counter; - return t; - } - - void addSubChunk(TaskMsg_Fragment& f, int scId) { - TaskMsg_Subchunk* s; - if (!f.has_subchunks()) { - TaskMsg_Subchunk subc; - // f.add_scgroup(); // How do I add optional objects? 
- subc.set_database("subdatabase_default"); - proto::TaskMsg_Subchunk_DbTbl* dbTbl = subc.add_dbtbl(); - dbTbl->set_db("subdatabase"); - dbTbl->set_tbl("subtable"); - f.mutable_subchunks()->CopyFrom(subc); - s = f.mutable_subchunks(); - } - s = f.mutable_subchunks(); - s->add_id(scId); - } - -private: - int _counter; -}; - -} // namespace lsst::qserv::proto - -#endif // #define LSST_QSERV_PROTO_FAKEPROTOCOLFIXTURE_H diff --git a/src/proto/ProtoImporter.h b/src/proto/ProtoImporter.h deleted file mode 100644 index 4173d7cfe..000000000 --- a/src/proto/ProtoImporter.h +++ /dev/null @@ -1,63 +0,0 @@ -// -*- LSST-C++ -*- -/* - * LSST Data Management System - * Copyright 2015-2017 LSST Corporation. - * - * This product includes software developed by the - * LSST Project (http://www.lsst.org/). - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the LSST License Statement and - * the GNU General Public License along with this program. If not, - * see . - */ - -#ifndef LSST_QSERV_PROTO_PROTOIMPORTER_H -#define LSST_QSERV_PROTO_PROTOIMPORTER_H - -// System headers -#include -#include - -namespace lsst::qserv::proto { - -/// ProtoImporter -/// Minimal-copy import of an arbitrary proto msg from a raw buffer. -/// Example: -/// struct TaskMsgAcceptor : public ProtoImporter { -/// virtual void operator()(std::shared_ptr m) { ...} -/// }; -/// ProtoImporter p(std::shared_ptr()); -/// p(data,size); // calls operator() defined above. -template -class ProtoImporter { -public: - ProtoImporter() {} - - bool messageAcceptable(std::string const& msg) { - Msg m; - return setMsgFrom(m, msg.data(), msg.size()); - } - - static bool setMsgFrom(Msg& m, char const* buf, int bufLen) { - // For dev/debugging: accepts a partially-formed message - // bool ok = m.ParsePartialFromArray(buf, bufLen); - - // Accept only complete, compliant messages. - bool ok = m.ParseFromArray(buf, bufLen); - return ok && m.IsInitialized(); - } -}; - -} // namespace lsst::qserv::proto - -#endif // #define LSST_QSERV_PROTO_PROTOIMPORTER_H diff --git a/src/proto/ScanTableInfo.h b/src/proto/ScanTableInfo.h index f2dacec61..bb362c51d 100644 --- a/src/proto/ScanTableInfo.h +++ b/src/proto/ScanTableInfo.h @@ -35,29 +35,16 @@ namespace lsst::qserv::proto { /// Structure to store shared scan information for a single table. /// -struct ScanTableInfo { +struct ScanTableInfo { // &&& check if still useful using ListOf = std::vector; ScanTableInfo() = default; ScanTableInfo(std::string const& db_, std::string const& table_) : db(db_), table(table_) {} ScanTableInfo(std::string const& db_, std::string const& table_, bool lockInMemory_, int scanRating_) : db{db_}, table{table_}, lockInMemory{lockInMemory_}, scanRating{scanRating_} {} - ScanTableInfo(TaskMsg_ScanTable const& scanTbl) - : db{scanTbl.db()}, - table{scanTbl.table()}, - lockInMemory{scanTbl.lockinmemory()}, - scanRating{scanTbl.scanrating()} {} ScanTableInfo(ScanTableInfo const&) = default; - /// Copy contents of this object into a TaskMsg_ScanTable object. 
- void copyToScanTable(TaskMsg_ScanTable* msgScanTbl) const { - msgScanTbl->set_db(db); - msgScanTbl->set_table(table); - msgScanTbl->set_lockinmemory(lockInMemory); - msgScanTbl->set_scanrating(scanRating); - } - int compare(ScanTableInfo const& rhs) const; std::string db; diff --git a/src/proto/testProtocol.cc b/src/proto/testProtocol.cc deleted file mode 100644 index 175eeeb98..000000000 --- a/src/proto/testProtocol.cc +++ /dev/null @@ -1,183 +0,0 @@ -// -*- LSST-C++ -*- -/* - * LSST Data Management System - * Copyright 2011-2016 AURA/LSST. - * - * This product includes software developed by the - * LSST Project (http://www.lsst.org/). - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the LSST License Statement and - * the GNU General Public License along with this program. If not, - * see . - */ - -// System headers -#include -#include -#include -#include -#include - -// Third-party headers -#include -#include - -// LSST headers -#include "lsst/log/Log.h" - -// Qserv headers -#include "proto/ScanTableInfo.h" -#include "proto/worker.pb.h" - -#include "proto/FakeProtocolFixture.h" - -// Boost unit test header -#define BOOST_TEST_MODULE Protocol_1 -#include - -namespace test = boost::test_tools; -namespace gio = google::protobuf::io; - -using namespace lsst::qserv; - -struct ProtocolFixture : public lsst::qserv::proto::FakeProtocolFixture { - ProtocolFixture(void) : FakeProtocolFixture(), counter(0) {} - ~ProtocolFixture(void) {} - - bool compareTaskMsgs(lsst::qserv::proto::TaskMsg& t1, lsst::qserv::proto::TaskMsg& t2) { - bool nonFragEq = (t1.chunkid() == t2.chunkid()) && (t1.db() == t2.db()); - bool sTablesEq = t1.scantable_size() == t2.scantable_size(); - for (int i = 0; i < t1.scantable_size(); ++i) { - auto const& sTbl1 = t1.scantable(i); - auto const& sTbl2 = t2.scantable(i); - bool eq = (sTbl1.db().compare(sTbl2.db()) == 0 && sTbl1.table() == sTbl2.table() && - sTbl1.lockinmemory() == sTbl2.lockinmemory() && - sTbl1.scanrating() == sTbl2.scanrating()); - sTablesEq = sTablesEq && eq; - } - - bool fEqual = (t1.fragment_size() == t2.fragment_size()); - for (int i = 0; i < t1.fragment_size(); ++i) { - fEqual = fEqual && compareFragment(t1.fragment(i), t2.fragment(i)); - } - return nonFragEq && fEqual && sTablesEq; - } - - bool compareSubchunk(lsst::qserv::proto::TaskMsg_Subchunk const& s1, - lsst::qserv::proto::TaskMsg_Subchunk const& s2) { - if (s1.database() != s2.database()) { - return false; - } - if (s1.dbtbl_size() != s2.dbtbl_size()) { - return false; - } - for (int i = 0; i < s1.dbtbl_size(); ++i) { - if (s1.dbtbl(i).db() != s2.dbtbl(i).db() && s1.dbtbl(i).tbl() != s2.dbtbl(i).tbl()) return false; - } - if (s1.id_size() != s2.id_size()) { - return false; - } - for (int i = 0; i < s1.id_size(); ++i) { - if (s1.id(i) != s2.id(i)) return false; - } - return true; - } - - bool compareFragment(lsst::qserv::proto::TaskMsg_Fragment const& f1, - lsst::qserv::proto::TaskMsg_Fragment const& f2) { - bool qEqual = true; - if (f1.query_size() == f2.query_size()) { - for (int i = 0; i < 
f1.query_size(); ++i) { - if (f1.query(i) != f2.query(i)) return false; - } - } else { - return false; - } - bool sEqual = true; - if (f1.has_subchunks()) { - if (f2.has_subchunks()) { - sEqual = sEqual && compareSubchunk(f1.subchunks(), f2.subchunks()); - } else { - sEqual = false; - } - } else if (f2.has_subchunks()) { - sEqual = false; - } - return qEqual && sEqual; - } - - int counter; -}; - -BOOST_FIXTURE_TEST_SUITE(ProtocolTestSuite, ProtocolFixture) - -BOOST_AUTO_TEST_CASE(TaskMsgMsgSanity) { - GOOGLE_PROTOBUF_VERIFY_VERSION; - std::stringstream ss; - std::unique_ptr t1(makeTaskMsg()); - BOOST_CHECK(t1.get()); - t1->SerializeToOstream(&ss); - - std::string blah = ss.str(); - std::stringstream ss2(blah); - std::unique_ptr t2(new lsst::qserv::proto::TaskMsg()); - BOOST_CHECK(t1.get()); - t2->ParseFromIstream(&ss2); - BOOST_CHECK(compareTaskMsgs(*t1, *t2)); -} - -BOOST_AUTO_TEST_CASE(ScanTableInfo) { - lsst::qserv::proto::ScanTableInfo stiA{"dba", "fruit", false, 1}; - lsst::qserv::proto::ScanTableInfo stiB{"dba", "fruit", true, 1}; - BOOST_CHECK(stiA.compare(stiB) < 0); - BOOST_CHECK(stiB.compare(stiA) > 0); - BOOST_CHECK(stiA.compare(stiA) == 0); - BOOST_CHECK(stiB.compare(stiB) == 0); - - lsst::qserv::proto::ScanTableInfo stiC{"dba", "fruit", true, 1}; - lsst::qserv::proto::ScanTableInfo stiD{"dba", "fruit", true, 2}; - BOOST_CHECK(stiC.compare(stiD) < 0); - BOOST_CHECK(stiD.compare(stiC) > 0); - BOOST_CHECK(stiC.compare(stiC) == 0); - BOOST_CHECK(stiD.compare(stiD) == 0); - - lsst::qserv::proto::ScanTableInfo stiE{"dba", "fruit", true, 2}; - lsst::qserv::proto::ScanTableInfo stiF{"dbb", "fruit", true, 2}; - BOOST_CHECK(stiE.compare(stiF) < 0); - BOOST_CHECK(stiF.compare(stiE) > 0); - BOOST_CHECK(stiE.compare(stiE) == 0); - BOOST_CHECK(stiF.compare(stiF) == 0); - - lsst::qserv::proto::ScanTableInfo stiG{"dbb", "fruit", true, 2}; - lsst::qserv::proto::ScanTableInfo stiH{"dbb", "veggie", true, 2}; - BOOST_CHECK(stiG.compare(stiH) < 0); - BOOST_CHECK(stiH.compare(stiG) > 0); - BOOST_CHECK(stiG.compare(stiG) == 0); - BOOST_CHECK(stiH.compare(stiH) == 0); - - lsst::qserv::proto::ScanTableInfo::ListOf list = {stiE, stiH, stiC, stiD, stiB, stiA, stiG, stiF}; - lsst::qserv::proto::ScanInfo scanInfo; - scanInfo.infoTables = list; - scanInfo.sortTablesSlowestFirst(); - int j = 0; - BOOST_CHECK(scanInfo.infoTables[j++].compare(stiH) == 0); - BOOST_CHECK(scanInfo.infoTables[j++].compare(stiG) == 0); - BOOST_CHECK(scanInfo.infoTables[j++].compare(stiF) == 0); - BOOST_CHECK(scanInfo.infoTables[j++].compare(stiE) == 0); - BOOST_CHECK(scanInfo.infoTables[j++].compare(stiD) == 0); - BOOST_CHECK(scanInfo.infoTables[j++].compare(stiC) == 0); - BOOST_CHECK(scanInfo.infoTables[j++].compare(stiB) == 0); - BOOST_CHECK(scanInfo.infoTables[j++].compare(stiA) == 0); -} - -BOOST_AUTO_TEST_SUITE_END() diff --git a/src/proto/worker.proto b/src/proto/worker.proto index 76d607997..4ef2ae4e7 100644 --- a/src/proto/worker.proto +++ b/src/proto/worker.proto @@ -29,53 +29,6 @@ option cc_enable_arenas = true; package lsst.qserv.proto; -// TODO:UJ delete when xrootd removed. ResonseSummary will need to be kept. -// Query message sent to worker -// One of these Task objects should be sent. -message TaskMsg { - // Future: might have multiple db/chunk dependencies. 
- optional string db = 2; - optional int32 chunkid = 3; - // repeated string scantables = 4; // obsolete - optional string user = 6; - optional int32 scanpriority = 8; - message Subchunk { - optional string database = 1; // database (unused) - repeated DbTbl dbtbl = 2; // subchunked tables - repeated int32 id = 3; // subchunk ids - message DbTbl { - required string db = 1; - required string tbl = 2; - } - } - message Fragment { - // A query fragment without "CREATE or INSERT". - // Worker should synthesize. - repeated string query = 1; - optional string resulttable = 3; - optional Subchunk subchunks = 4; // Only needed with subchunk-ed queries - - // Each fragment may only write results to one table, - // but multiple fragments may write to the same table, - // in which case the table contains a concatenation of the - // contributing fragments' rows. - } - repeated Fragment fragment = 5; - message ScanTable { - required string db = 1; - required string table = 2; - required bool lockInMemory = 3; - required int32 scanRating = 4; - } - repeated ScanTable scantable = 9; - optional uint64 queryid = 10; - optional int32 jobid = 11; - optional bool scaninteractive = 12; - optional int32 attemptcount = 13; - optional uint32 czarid = 14; - optional int32 maxtablesize_mb = 15 [default = 0]; -} - // The file-based result delivery protocol has two kinds of messages. // // 1. The summary message sent back to Czar over the XROOTD/SSI protocol: @@ -125,6 +78,7 @@ message ResponseData { // of the message. //////////////////////////////////////////////////////////////// +// &&& try to eliminate this // The completion status to be sent back with responses to the query management requests. message WorkerCommandStatus { enum Code { @@ -135,6 +89,7 @@ message WorkerCommandStatus { optional string error = 2 [default = ""]; // Optional error message (depends on the code) } +// &&& try to eliminate this message QueryManagement { enum Operation { CANCEL_AFTER_RESTART = 1; // Cancel older queries before the specified query (excluding that one). diff --git a/src/qdisp/Executive.cc b/src/qdisp/Executive.cc index 0b66764d4..125d1987a 100644 --- a/src/qdisp/Executive.cc +++ b/src/qdisp/Executive.cc @@ -97,7 +97,6 @@ Executive::Executive(ExecutiveConfig const& c, shared_ptr c _qMeta(qStatus), _querySession(querySession) { _secondsBetweenQMetaUpdates = chrono::seconds(_config.secondsBetweenChunkUpdates); - //&&&_setup(); _setupLimit(); qdisp::CzarStats::get()->addQuery(); } @@ -524,13 +523,6 @@ string Executive::getProgressDesc() const { return msg_progress; } -/* &&& -void Executive::_setup() { - _empty.store(true); - _requestCount = 0; -} -*/ - /** Add (jobId,r) entry to _requesters map if not here yet * else leave _requesters untouched. * diff --git a/src/qdisp/Executive.h b/src/qdisp/Executive.h index 33ff15de4..5c4beba84 100644 --- a/src/qdisp/Executive.h +++ b/src/qdisp/Executive.h @@ -54,9 +54,6 @@ // TODO:UJ replace with better enable/disable feature, or just use only UberJobs #define uberJobsEnabled 1 // &&& delete -//&&& // Forward declarations -//&&&class XrdSsiService; - namespace lsst::qserv { namespace ccontrol { @@ -176,8 +173,6 @@ class Executive : public std::enable_shared_from_this { /// @return true if cancelled bool getCancelled() { return _cancelled; } - //&&&XrdSsiService* getXrdSsiService() { return _xrdSsiService; } - std::shared_ptr getQdispPool() { return _qdispPool; } /// Add 'rowCount' to the total number of rows in the result table. 
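For reference, the row tally described in the comment above is just a thread-safe running total that many jobs update concurrently; a minimal sketch of the idea (illustrative member names only, not the actual Executive code):

    // Illustrative sketch only: jobs report result rows from many threads,
    // so the running total is kept in an atomic counter.
    std::atomic<std::int64_t> totalResultRows{0};
    void addResultRows(std::int64_t rowCount) { totalResultRows += rowCount; }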
@@ -233,7 +228,6 @@ class Executive : public std::enable_shared_from_this { SharedResources::Ptr const& sharedResources, std::shared_ptr const& qStatus, std::shared_ptr const& querySession); - //&&&void _setup(); void _setupLimit(); bool _track(int refNum, std::shared_ptr const& r); @@ -261,11 +255,6 @@ class Executive : public std::enable_shared_from_this { std::atomic _empty{true}; std::shared_ptr _messageStore; ///< MessageStore for logging - /* &&& - /// RPC interface, static to avoid getting every time a user query starts and separate - /// from _xrdSsiService to avoid conflicts with XrdSsiServiceMock. - XrdSsiService* _xrdSsiService; ///< RPC interface - */ JobMap _jobMap; ///< Contains information about all jobs. JobMap _incompleteJobs; ///< Map of incomplete jobs. /// How many jobs are used in this query. 1 avoids possible 0 of 0 jobs completed race condition. diff --git a/src/qdisp/JobBase.h b/src/qdisp/JobBase.h index d793e3699..1a4239457 100644 --- a/src/qdisp/JobBase.h +++ b/src/qdisp/JobBase.h @@ -59,13 +59,12 @@ class JobBase : public std::enable_shared_from_this { virtual UberJobId getJobId() const = 0; virtual std::string const& getIdStr() const = 0; virtual std::shared_ptr getQdispPool() = 0; - virtual std::string const& getPayload() const = 0; ///< const& in return type is essential for xrootd + //&&& virtual std::string const& getPayload() const = 0; ///< const& in return type is essential for xrootd virtual std::shared_ptr getRespHandler() = 0; virtual std::shared_ptr getStatus() = 0; virtual bool getScanInteractive() const = 0; virtual bool isQueryCancelled() = 0; virtual void callMarkCompleteFunc(bool success) = 0; - //&&&virtual void setQueryRequest(std::shared_ptr const& qr) = 0; virtual std::shared_ptr getExecutive() = 0; virtual std::ostream& dumpOS(std::ostream& os) const; diff --git a/src/qdisp/JobDescription.cc b/src/qdisp/JobDescription.cc index 50c05c39e..ad8d3e62b 100644 --- a/src/qdisp/JobDescription.cc +++ b/src/qdisp/JobDescription.cc @@ -33,7 +33,6 @@ #include "lsst/log/Log.h" // Qserv headers -#include "proto/ProtoImporter.h" #include "proto/worker.pb.h" #include "util/Bug.h" #include "qdisp/Executive.h" @@ -65,19 +64,6 @@ JobDescription::JobDescription(qmeta::CzarId czarId, QueryId qId, JobId jobId, R _chunkResultName(chunkResultName), _mock(mock) {} -bool JobDescription::incrAttemptCountScrubResults() { // TODO:UJ delete - if (_attemptCount >= 0) { - _respHandler->prepScrubResults(_jobId, _attemptCount); // Registers the job-attempt as invalid - } - ++_attemptCount; - if (_attemptCount > MAX_JOB_ATTEMPTS) { - LOGS(_log, LOG_LVL_ERROR, "attemptCount greater than maximum number of retries " << _attemptCount); - return false; - } - buildPayload(); - return true; -} - bool JobDescription::incrAttemptCountScrubResultsJson(std::shared_ptr const& exec, bool increase) { if (increase) { ++_attemptCount; @@ -111,28 +97,12 @@ bool JobDescription::incrAttemptCountScrubResultsJson(std::shared_ptr return true; } -void JobDescription::buildPayload() { - ostringstream os; - _taskMsgFactory->serializeMsg(*_chunkQuerySpec, _chunkResultName, _queryId, _jobId, _attemptCount, - _czarId, os); - _payloads[_attemptCount] = os.str(); -} - -bool JobDescription::verifyPayload() const { // TODO:UJ delete - proto::ProtoImporter pi; - if (!_mock && !pi.messageAcceptable(_payloads.at(_attemptCount))) { - LOGS(_log, LOG_LVL_DEBUG, _qIdStr << " Error serializing TaskMsg."); - return false; - } - return true; -} - bool JobDescription::getScanInteractive() const { return 
_chunkQuerySpec->scanInteractive; } int JobDescription::getScanRating() const { return _chunkQuerySpec->scanInfo.scanRating; } ostream& operator<<(ostream& os, JobDescription const& jd) { - os << "job(id=" << jd._jobId << " payloads.size=" << jd._payloads.size() << " ru=" << jd._resource.path() + os << "job(id=" << jd._jobId << " ru=" << jd._resource.path() << " attemptCount=" << jd._attemptCount << ")"; return os; } diff --git a/src/qdisp/JobDescription.h b/src/qdisp/JobDescription.h index 8c61f9d65..fccc36d85 100644 --- a/src/qdisp/JobDescription.h +++ b/src/qdisp/JobDescription.h @@ -77,28 +77,21 @@ class JobDescription { JobDescription(JobDescription const&) = delete; JobDescription& operator=(JobDescription const&) = delete; - void buildPayload(); ///< Must be run after construction to avoid problems with unit tests. JobId id() const { return _jobId; } ResourceUnit const& resource() const { return _resource; } - std::string const& payload() { return _payloads[_attemptCount]; } std::shared_ptr respHandler() { return _respHandler; } int getAttemptCount() const { return _attemptCount; } bool getScanInteractive() const; int getScanRating() const; - /// @returns true when _attemptCount is incremented correctly and the payload is built. - /// If the starting value of _attemptCount was greater than or equal to zero, that - /// attempt is scrubbed from the result table. - bool incrAttemptCountScrubResults(); // TODO:UJ - to be deleted /// Increase the attempt count by 1 and return false if that puts it over the limit. /// TODO:UJ scrubbing results unneeded with uj. This should be renamed. bool incrAttemptCountScrubResultsJson(std::shared_ptr const& exec, bool increase); - bool verifyPayload() const; ///< @return true if the payload is acceptable to protobufs. std::shared_ptr getJsForWorker() { return _jsForWorker; } - void resetJsForWorker() { _jsForWorker.reset(); } // TODO:UJ may need mutex for _jsForWorker + void resetJsForWorker() { _jsForWorker.reset(); } // TODO:UJ may need mutex for _jsForWorker //&&& friend std::ostream& operator<<(std::ostream& os, JobDescription const& jd); @@ -116,12 +109,6 @@ class JobDescription { int _attemptCount{-1}; ///< Start at -1 so that first attempt will be 0, see incrAttemptCount(). ResourceUnit _resource; ///< path, e.g. /q/LSST/23125 - /// _payloads - encoded requests, one per attempt. No guarantee that xrootd is done - /// with the payload buffer, so hang onto all of them until the query is finished. - /// Also, using a map so the strings wont be moved. - /// The xrootd callback function QueryRequest::GetRequest should - /// return something other than a char*. - std::map _payloads; std::shared_ptr _respHandler; // probably MergingHandler std::shared_ptr _taskMsgFactory; std::shared_ptr _chunkQuerySpec; diff --git a/src/qdisp/JobQuery.cc b/src/qdisp/JobQuery.cc index 5d6c6193a..7245e60a2 100644 --- a/src/qdisp/JobQuery.cc +++ b/src/qdisp/JobQuery.cc @@ -63,7 +63,7 @@ JobQuery::~JobQuery() { } /// Cancel response handling. Return true if this is the first time cancel has been called. -bool JobQuery::cancel(bool superfluous) { +bool JobQuery::cancel(bool superfluous) { /// &&& This can probably be simplified more QSERV_LOGCONTEXT_QUERY_JOB(getQueryId(), getJobId()); LOGS(_log, LOG_LVL_DEBUG, "JobQuery::cancel()"); if (_cancelled.exchange(true) == false) { @@ -71,17 +71,7 @@ bool JobQuery::cancel(bool superfluous) { // If _inSsi is true then this query request has been passed to SSI and // _queryRequestPtr cannot be a nullptr. 
Cancellation is complicated. bool cancelled = false; - /* &&& - if (_inSsi) { - LOGS(_log, LOG_LVL_DEBUG, "cancel QueryRequest in progress"); - if (_queryRequestPtr->cancel()) { - LOGS(_log, LOG_LVL_DEBUG, "cancelled by QueryRequest"); - cancelled = true; - } else { - LOGS(_log, LOG_LVL_DEBUG, "QueryRequest could not cancel"); - } - } - */ + if (!cancelled) { ostringstream os; os << _idStr << " cancel"; @@ -155,7 +145,7 @@ int JobQuery::getAttemptCount() const { return _jobDescription->getAttemptCount(); } -string const& JobQuery::getPayload() const { return _jobDescription->payload(); } +//&&&string const& JobQuery::getPayload() const { return _jobDescription->payload(); } void JobQuery::callMarkCompleteFunc(bool success) { _markCompleteFunc->operator()(success); } diff --git a/src/qdisp/JobQuery.h b/src/qdisp/JobQuery.h index 7ce262875..23fa4fc86 100644 --- a/src/qdisp/JobQuery.h +++ b/src/qdisp/JobQuery.h @@ -64,7 +64,7 @@ class JobQuery : public JobBase { QueryId getQueryId() const override { return _qid; } JobId getJobId() const override { return _jobDescription->id(); } - std::string const& getPayload() const override; + //&&&std::string const& getPayload() const override; std::string const& getIdStr() const override { return _idStr; } std::shared_ptr getRespHandler() override { return _jobDescription->respHandler(); } bool getScanInteractive() const override { return _jobDescription->getScanInteractive(); } diff --git a/src/qdisp/UberJob.cc b/src/qdisp/UberJob.cc index 9afdb4f75..9440380a5 100644 --- a/src/qdisp/UberJob.cc +++ b/src/qdisp/UberJob.cc @@ -35,7 +35,6 @@ #include "global/LogContext.h" #include "http/Client.h" #include "http/MetaModule.h" -#include "proto/ProtoImporter.h" #include "proto/worker.pb.h" #include "qdisp/JobQuery.h" #include "qmeta/JobStatus.h" diff --git a/src/qdisp/UberJob.h b/src/qdisp/UberJob.h index 04c59c807..9a5ed13ab 100644 --- a/src/qdisp/UberJob.h +++ b/src/qdisp/UberJob.h @@ -35,8 +35,6 @@ namespace lsst::qserv::qdisp { class JobQuery; -class QueryRequest; - /// This class contains a number of jobs that need to go to the same worker /// from a single user query, and contact information for the worker. It also holds /// some information common to all jobs. @@ -70,7 +68,7 @@ class UberJob : public JobBase { } // TODO:UJ change name when JobBase no longer needed. std::string const& getIdStr() const override { return _idStr; } std::shared_ptr getQdispPool() override { return _qdispPool; } // TODO:UJ relocate to JobBase - std::string const& getPayload() const override { return _payload; } // TODO:UJ delete when possible.
std::shared_ptr getRespHandler() override { return _respHandler; } std::shared_ptr getStatus() override { return _jobStatus; diff --git a/src/qdisp/testQDisp.cc b/src/qdisp/testQDisp.cc index 4d539fa93..b750af776 100644 --- a/src/qdisp/testQDisp.cc +++ b/src/qdisp/testQDisp.cc @@ -64,10 +64,12 @@ namespace lsst::qserv::qproc { class MockTaskMsgFactory : public TaskMsgFactory { public: MockTaskMsgFactory(std::string const& mockPayload_) : TaskMsgFactory(), mockPayload(mockPayload_) {} + /* &&& void serializeMsg(ChunkQuerySpec const& s, std::string const& chunkResultName, QueryId queryId, int jobId, int attemptCount, qmeta::CzarId czarId, std::ostream& os) override { os << mockPayload; } + */ std::shared_ptr makeMsgJson(ChunkQuerySpec const& s, std::string const& chunkResultName, QueryId queryId, int jobId, int attemptCount, diff --git a/src/qproc/TaskMsgFactory.cc b/src/qproc/TaskMsgFactory.cc index 8a2d7434d..bf2018669 100644 --- a/src/qproc/TaskMsgFactory.cc +++ b/src/qproc/TaskMsgFactory.cc @@ -56,164 +56,16 @@ using namespace std; namespace lsst::qserv::qproc { -// TODO:UJ - Probaly just delete this -bool TaskMsgFactory::fillTaskMsg(proto::TaskMsg* taskMsg, ChunkQuerySpec const& chunkQuerySpec, - std::string const& chunkResultName, QueryId queryId, int jobId, - int attemptCount, qmeta::CzarId czarId) { - std::string resultTable("Asdfasfd"); - if (!chunkResultName.empty()) { - resultTable = chunkResultName; - } - // shared - taskMsg->set_db(chunkQuerySpec.db); - taskMsg->set_queryid(queryId); - taskMsg->set_jobid(jobId); - taskMsg->set_attemptcount(attemptCount); - taskMsg->set_czarid(czarId); - - // scanTables (for shared scans) - // check if more than 1 db in scanInfo - std::string db; - for (auto const& sTbl : chunkQuerySpec.scanInfo.infoTables) { - if (db.empty()) { - db = sTbl.db; - } - } - - for (auto const& sTbl : chunkQuerySpec.scanInfo.infoTables) { - lsst::qserv::proto::TaskMsg_ScanTable* msgScanTbl = taskMsg->add_scantable(); - sTbl.copyToScanTable(msgScanTbl); - } - - taskMsg->set_scanpriority(chunkQuerySpec.scanInfo.scanRating); - taskMsg->set_scaninteractive(chunkQuerySpec.scanInteractive); - - // per-chunk - taskMsg->set_chunkid(chunkQuerySpec.chunkId); - // per-fragment - // TODO refactor to simplify - if (chunkQuerySpec.nextFragment.get()) { - ChunkQuerySpec const* sPtr = &chunkQuerySpec; - while (sPtr) { - LOGS(_log, LOG_LVL_TRACE, "nextFragment"); - for (unsigned int t = 0; t < (sPtr->queries).size(); t++) { - LOGS(_log, LOG_LVL_TRACE, (sPtr->queries).at(t)); - } - // Linked fragments will not have valid subChunkTables vectors, - // So, we reuse the root fragment's vector. 
- _addFragment(*taskMsg, resultTable, chunkQuerySpec.subChunkTables, sPtr->subChunkIds, - sPtr->queries); - sPtr = sPtr->nextFragment.get(); - } - } else { - LOGS(_log, LOG_LVL_TRACE, "no nextFragment"); - for (unsigned int t = 0; t < (chunkQuerySpec.queries).size(); t++) { - LOGS(_log, LOG_LVL_TRACE, (chunkQuerySpec.queries).at(t)); - } - _addFragment(*taskMsg, resultTable, chunkQuerySpec.subChunkTables, chunkQuerySpec.subChunkIds, - chunkQuerySpec.queries); - } - return true; -} - -std::shared_ptr TaskMsgFactory::_makeMsg(ChunkQuerySpec const& chunkQuerySpec, - std::string const& chunkResultName, QueryId queryId, - int jobId, int attemptCount, qmeta::CzarId czarId) { - std::string resultTable("Asdfasfd"); - if (!chunkResultName.empty()) { - resultTable = chunkResultName; - } - auto taskMsg = std::make_shared(); - // shared - taskMsg->set_db(chunkQuerySpec.db); - taskMsg->set_queryid(queryId); - taskMsg->set_jobid(jobId); - taskMsg->set_attemptcount(attemptCount); - taskMsg->set_czarid(czarId); - // scanTables (for shared scans) - // check if more than 1 db in scanInfo - std::string db; - for (auto const& sTbl : chunkQuerySpec.scanInfo.infoTables) { - if (db.empty()) { - db = sTbl.db; - } - } - - for (auto const& sTbl : chunkQuerySpec.scanInfo.infoTables) { - lsst::qserv::proto::TaskMsg_ScanTable* msgScanTbl = taskMsg->add_scantable(); - sTbl.copyToScanTable(msgScanTbl); - } - - taskMsg->set_scanpriority(chunkQuerySpec.scanInfo.scanRating); - taskMsg->set_scaninteractive(chunkQuerySpec.scanInteractive); - taskMsg->set_maxtablesize_mb(cconfig::CzarConfig::instance()->getMaxTableSizeMB()); - - // per-chunk - taskMsg->set_chunkid(chunkQuerySpec.chunkId); - // per-fragment - // TODO refactor to simplify - if (chunkQuerySpec.nextFragment.get()) { - ChunkQuerySpec const* sPtr = &chunkQuerySpec; - while (sPtr) { - LOGS(_log, LOG_LVL_TRACE, "nextFragment"); - for (unsigned int t = 0; t < (sPtr->queries).size(); t++) { - LOGS(_log, LOG_LVL_TRACE, (sPtr->queries).at(t)); - } - // Linked fragments will not have valid subChunkTables vectors, - // So, we reuse the root fragment's vector. - _addFragment(*taskMsg, resultTable, chunkQuerySpec.subChunkTables, sPtr->subChunkIds, - sPtr->queries); - sPtr = sPtr->nextFragment.get(); - } - } else { - LOGS(_log, LOG_LVL_TRACE, "no nextFragment"); - for (unsigned int t = 0; t < (chunkQuerySpec.queries).size(); t++) { - LOGS(_log, LOG_LVL_TRACE, (chunkQuerySpec.queries).at(t)); - } - _addFragment(*taskMsg, resultTable, chunkQuerySpec.subChunkTables, chunkQuerySpec.subChunkIds, - chunkQuerySpec.queries); - } - return taskMsg; -} - -void TaskMsgFactory::_addFragment(proto::TaskMsg& taskMsg, std::string const& resultName, - DbTableSet const& subChunkTables, std::vector const& subChunkIds, - std::vector const& queries) { - proto::TaskMsg::Fragment* frag = taskMsg.add_fragment(); - frag->set_resulttable(resultName); - - for (auto& qry : queries) { - frag->add_query(qry); - } - - proto::TaskMsg_Subchunk sc; - - // Add the db+table pairs to the subchunk. - for (auto& tbl : subChunkTables) { - proto::TaskMsg_Subchunk_DbTbl* dbTbl = sc.add_dbtbl(); - dbTbl->set_db(tbl.db); - dbTbl->set_tbl(tbl.table); - LOGS(_log, LOG_LVL_TRACE, "added dbtbl=" << tbl.db << "." 
<< tbl.table); - } - - for (auto& subChunkId : subChunkIds) { - sc.add_id(subChunkId); - } - - frag->mutable_subchunks()->CopyFrom(sc); -} - -void TaskMsgFactory::serializeMsg(ChunkQuerySpec const& s, std::string const& chunkResultName, - QueryId queryId, int jobId, int attemptCount, qmeta::CzarId czarId, - std::ostream& os) { - std::shared_ptr m = _makeMsg(s, chunkResultName, queryId, jobId, attemptCount, czarId); - m->SerializeToOstream(&os); -} - std::shared_ptr TaskMsgFactory::makeMsgJson(ChunkQuerySpec const& chunkQuerySpec, std::string const& chunkResultName, QueryId queryId, int jobId, int attemptCount, qmeta::CzarId czarId) { + // TODO:UJ DM-45384 &&& remove duplicate elements from the json message + // TODO:UJ &&& see: JobDescription::incrAttemptCountScrubResultsJson + // TODO:UJ &&& see: wbase::UberJobData::create + // TODO:UJ &&& see: Task::createTasksForChunk + // TODO:UJ &&& see: wdb/testQueryRunner.cc + // TODO:UJ &&& see: wsched/testSchedulers.cc std::string resultTable("Asdfasfd"); if (!chunkResultName.empty()) { resultTable = chunkResultName; diff --git a/src/qproc/TaskMsgFactory.h b/src/qproc/TaskMsgFactory.h index d770d2c5c..1e3bfd3be 100644 --- a/src/qproc/TaskMsgFactory.h +++ b/src/qproc/TaskMsgFactory.h @@ -50,6 +50,7 @@ class ChunkQuerySpec; /// TaskMsgFactory is a factory for TaskMsg (protobuf) objects. /// All member variables must be thread safe. +/// &&& fix doc class TaskMsgFactory { public: using Ptr = std::shared_ptr; @@ -57,31 +58,12 @@ class TaskMsgFactory { TaskMsgFactory() = default; virtual ~TaskMsgFactory() {} - /// Construct a TaskMsg and serialize it to a stream - virtual void serializeMsg(ChunkQuerySpec const& s, std::string const& chunkResultName, QueryId queryId, - int jobId, int attemptCount, qmeta::CzarId czarId, std::ostream& os); - - /// Use the provided information to fill in taskMsg. - /// @return true if successful. - bool fillTaskMsg(proto::TaskMsg* taskMsg, ChunkQuerySpec const& s, std::string const& chunkResultName, - QueryId queryId, int jobId, int attemptCount, qmeta::CzarId czarId); - /// Make and return the json message for a single Job. virtual std::shared_ptr makeMsgJson(ChunkQuerySpec const& s, std::string const& chunkResultName, QueryId queryId, int jobId, int attemptCount, qmeta::CzarId czarId); private: - // TODO:UJ delete when possible - std::shared_ptr _makeMsg(ChunkQuerySpec const& s, std::string const& chunkResultName, - QueryId queryId, int jobId, int attemptCount, - qmeta::CzarId czarId); - - // TODO:UJ delete when possible - void _addFragment(proto::TaskMsg& taskMsg, std::string const& resultName, - DbTableSet const& subChunkTables, std::vector const& subChunkIds, - std::vector const& queries); - /// Make a json message for a single fragment. 
void _addFragmentJson(nlohmann::json& jsFragments, std::string const& resultName, DbTableSet const& subChunkTables, std::vector const& subChunkIds, diff --git a/src/rproc/InfileMerger.cc b/src/rproc/InfileMerger.cc index 11cb77cdd..4d32d3ad9 100644 --- a/src/rproc/InfileMerger.cc +++ b/src/rproc/InfileMerger.cc @@ -56,7 +56,6 @@ // Qserv headers #include "cconfig/CzarConfig.h" #include "global/intTypes.h" -#include "proto/ProtoImporter.h" #include "proto/worker.pb.h" #include "qdisp/CzarStats.h" #include "qdisp/Executive.h" diff --git a/src/rproc/testProtoRowBuffer.cc b/src/rproc/testProtoRowBuffer.cc index 9ed815da1..99a97ec1a 100644 --- a/src/rproc/testProtoRowBuffer.cc +++ b/src/rproc/testProtoRowBuffer.cc @@ -26,7 +26,6 @@ // Qserv headers #include "proto/worker.pb.h" -#include "proto/FakeProtocolFixture.h" // Boost unit test header #define BOOST_TEST_MODULE ProtoRowBuffer_1 #include diff --git a/src/wbase/MsgProcessor.h b/src/wbase/MsgProcessor.h index 8458dc3f4..8b48de7ec 100644 --- a/src/wbase/MsgProcessor.h +++ b/src/wbase/MsgProcessor.h @@ -42,21 +42,21 @@ class WorkerCommand; namespace lsst::qserv::wbase { /// MsgProcessor implementations handle incoming Task objects. -struct MsgProcessor { +struct MsgProcessor { // &&& delete file if possible virtual ~MsgProcessor() {} /// Process a group of query processing tasks. - virtual void processTasks(std::vector> const& tasks) = 0; + virtual void processTasks(std::vector> const& tasks) = 0; // &&& delete /// Process a management command - virtual void processCommand(std::shared_ptr const& command) = 0; + virtual void processCommand(std::shared_ptr const& command) = 0; // &&& can this be deleted /** * Retrieve the status of queries being processed by the worker. * @param taskSelector Task selection criteria. * @return a JSON representation of the object's status for the monitoring */ - virtual nlohmann::json statusToJson(wbase::TaskSelector const& taskSelector) = 0; + virtual nlohmann::json statusToJson(wbase::TaskSelector const& taskSelector) = 0; // &&& can this be deleted }; } // namespace lsst::qserv::wbase diff --git a/src/wbase/Task.cc b/src/wbase/Task.cc index 0448a6af7..cc7c1668f 100644 --- a/src/wbase/Task.cc +++ b/src/wbase/Task.cc @@ -71,16 +71,6 @@ namespace { LOG_LOGGER _log = LOG_GET("lsst.qserv.wbase.Task"); -string buildResultFilePath(shared_ptr const& taskMsg, - string const& resultsDirname) { - if (resultsDirname.empty()) return resultsDirname; - fs::path path(resultsDirname); - path /= to_string(taskMsg->czarid()) + "-" + to_string(taskMsg->queryid()) + "-" + - to_string(taskMsg->jobid()) + "-" + to_string(taskMsg->chunkid()) + "-" + - to_string(taskMsg->attemptcount()) + ".proto"; - return path.string(); -} - string buildUjResultFilePath(lsst::qserv::wbase::UberJobData::Ptr const& ujData, string const& resultsDirname) { if (resultsDirname.empty()) return resultsDirname; @@ -126,103 +116,6 @@ TaskScheduler::TaskScheduler() { atomic taskSequence{0}; ///< Unique identifier source for Task. -/// When the constructor is called, there is not enough information -/// available to define the action to take when this task is run, so -/// Command::setFunc() is used set the action later. This is why -/// the util::CommandThreadPool is not called here.
-Task::Task(TaskMsgPtr const& t, int fragmentNumber, shared_ptr const& userQueryInfo, - size_t templateId, int subchunkId, shared_ptr const& sc, - uint16_t resultsHttpPort) - : _userQueryInfo(userQueryInfo), - _sendChannel(sc), - _tSeq(++taskSequence), - _qId(t->queryid()), - _templateId(templateId), - _hasChunkId(t->has_chunkid()), - _chunkId(t->has_chunkid() ? t->chunkid() : -1), - _subchunkId(subchunkId), - _jId(t->jobid()), - _attemptCount(t->attemptcount()), - _queryFragmentNum(fragmentNumber), - _fragmentHasSubchunks(t->fragment(fragmentNumber).has_subchunks()), - _db(t->has_db() ? t->db() : ""), - _czarId(t->has_czarid() ? t->czarid() : -1) { - // These attributes will be passed back to Czar in the Protobuf response - // to advice which result delivery channel to use. - auto const workerConfig = wconfig::WorkerConfig::instance(); - auto const resultDeliveryProtocol = workerConfig->resultDeliveryProtocol(); - _resultFilePath = ::buildResultFilePath(t, workerConfig->resultsDirname()); - auto const fqdn = util::get_current_host_fqdn(); - if (resultDeliveryProtocol == wconfig::ConfigValResultDeliveryProtocol::XROOT) { - // NOTE: one extra '/' after the [:] spec is required to make - // a "valid" XROOTD url. - _resultFileXrootUrl = "xroot://" + fqdn + ":" + to_string(workerConfig->resultsXrootdPort()) + "/" + - _resultFilePath; - } else if (resultDeliveryProtocol == wconfig::ConfigValResultDeliveryProtocol::HTTP) { - _resultFileHttpUrl = "http://" + fqdn + ":" + to_string(resultsHttpPort) + _resultFilePath; - } else { - throw runtime_error("wbase::Task::Task: unsupported results delivery protocol: " + - wconfig::ConfigValResultDeliveryProtocol::toString(resultDeliveryProtocol)); - } - if (t->has_user()) { - user = t->user(); - } else { - user = defaultUser; - } - - // Determine which major tables this task will use. - int const size = t->scantable_size(); - for (int j = 0; j < size; ++j) { - _scanInfo.infoTables.push_back(proto::ScanTableInfo(t->scantable(j))); - } - _scanInfo.scanRating = t->scanpriority(); - _scanInfo.sortTablesSlowestFirst(); - _scanInteractive = t->scaninteractive(); - _maxTableSize = t->maxtablesize_mb() * ::MB_SIZE_BYTES; - - // Create sets and vectors for 'aquiring' subchunk temporary tables. - proto::TaskMsg_Fragment const& fragment(t->fragment(_queryFragmentNum)); - DbTableSet dbTbls_; - IntVector subchunksVect_; - if (!_fragmentHasSubchunks) { - /// FUTURE: Why acquire anything if there are no subchunks in the fragment? - /// This branch never seems to happen, but this needs to be proven beyond any doubt. - LOGS(_log, LOG_LVL_WARN, "Task::Task not _fragmentHasSubchunks"); - for (auto const& scanTbl : t->scantable()) { - dbTbls_.emplace(scanTbl.db(), scanTbl.table()); - LOGS(_log, LOG_LVL_INFO, - "Task::Task scanTbl.db()=" << scanTbl.db() << " scanTbl.table()=" << scanTbl.table()); - } - LOGS(_log, LOG_LVL_INFO, - "fragment a db=" << _db << ":" << _chunkId << " dbTbls=" << util::printable(dbTbls_)); - } else { - proto::TaskMsg_Subchunk const& sc = fragment.subchunks(); - for (int j = 0; j < sc.dbtbl_size(); j++) { - /// Different subchunk fragments can require different tables. - /// FUTURE: It may save space to store these in UserQueryInfo as it seems - /// database and table names are consistent across chunks. 
- dbTbls_.emplace(sc.dbtbl(j).db(), sc.dbtbl(j).tbl()); - LOGS(_log, LOG_LVL_TRACE, - "Task::Task subchunk j=" << j << " sc.dbtbl(j).db()=" << sc.dbtbl(j).db() - << " sc.dbtbl(j).tbl()=" << sc.dbtbl(j).tbl()); - } - IntVector sVect(sc.id().begin(), sc.id().end()); - subchunksVect_ = sVect; - if (sc.has_database()) { - _db = sc.database(); - } else { - _db = t->db(); - } - LOGS(_log, LOG_LVL_DEBUG, - "fragment b db=" << _db << ":" << _chunkId << " dbTableSet" << util::printable(dbTbls_) - << " subChunks=" << util::printable(subchunksVect_)); - } - _dbTblsAndSubchunks = make_unique(dbTbls_, subchunksVect_); - if (_sendChannel == nullptr) { - throw util::Bug(ERR_LOC, "Task::Task _sendChannel==null " + getIdStr()); - } -} - /// When the constructor is called, there is not enough information /// available to define the action to take when this task is run, so /// Command::setFunc() is used to set the action later. This is why @@ -305,53 +198,6 @@ Task::~Task() { } } -vector Task::createTasks(shared_ptr const& taskMsg, - shared_ptr const& sendChannel, - shared_ptr const& chunkResourceMgr, - mysql::MySqlConfig const& mySqlConfig, - shared_ptr const& sqlConnMgr, - shared_ptr const& queriesAndChunks, - uint16_t resultsHttpPort) { - QueryId qId = taskMsg->queryid(); - QSERV_LOGCONTEXT_QUERY_JOB(qId, taskMsg->jobid()); - vector vect; - - UserQueryInfo::Ptr userQueryInfo = UserQueryInfo::uqMapInsert(qId); - - /// Make one task for each fragment. - int fragmentCount = taskMsg->fragment_size(); - if (fragmentCount < 1) { - throw util::Bug(ERR_LOC, "Task::createTasks No fragments to execute in TaskMsg"); - } - - string const chunkIdStr = to_string(taskMsg->chunkid()); - for (int fragNum = 0; fragNum < fragmentCount; ++fragNum) { - proto::TaskMsg_Fragment const& fragment = taskMsg->fragment(fragNum); - for (string queryStr : fragment.query()) { - size_t templateId = userQueryInfo->addTemplate(queryStr); - if (fragment.has_subchunks() && not fragment.subchunks().id().empty()) { - for (auto subchunkId : fragment.subchunks().id()) { - auto task = make_shared(taskMsg, fragNum, userQueryInfo, templateId, - subchunkId, sendChannel, resultsHttpPort); - vect.push_back(task); - } - } else { - int subchunkId = -1; // there are no subchunks. - auto task = make_shared(taskMsg, fragNum, userQueryInfo, templateId, subchunkId, - sendChannel, resultsHttpPort); - vect.push_back(task); - } - } - } - for (auto task : vect) { - // newQueryRunner sets the `_taskQueryRunner` pointer in `task`. - task->setTaskQueryRunner(wdb::QueryRunner::newQueryRunner(task, chunkResourceMgr, mySqlConfig, - sqlConnMgr, queriesAndChunks)); - } - sendChannel->setTaskCount(vect.size()); - - return vect; -} std::vector Task::createTasksForChunk( std::shared_ptr const& ujData, nlohmann::json const& jsJobs, diff --git a/src/wbase/Task.h b/src/wbase/Task.h index f88238ef2..460a31c06 100644 --- a/src/wbase/Task.h +++ b/src/wbase/Task.h @@ -51,10 +51,7 @@ namespace lsst::qserv::mysql { class MySqlConfig; } -namespace lsst::qserv::proto { -class TaskMsg; -class TaskMsg_Fragment; -} // namespace lsst::qserv::proto + namespace lsst::qserv::wbase { class FileChannelShared; } @@ -144,7 +141,6 @@ class Task : public util::CommandForThreadPool { public: static std::string const defaultUser; using Ptr = std::shared_ptr; - using TaskMsgPtr = std::shared_ptr; /// Class to store constant sets and vectors.
class DbTblsAndSubchunks { @@ -171,9 +167,6 @@ class Task : public util::CommandForThreadPool { bool operator()(Ptr const& x, Ptr const& y); }; - Task(TaskMsgPtr const& t, int fragmentNumber, std::shared_ptr const& userQueryInfo, - size_t templateId, int subchunkId, std::shared_ptr const& sc, - uint16_t resultsHttpPort = 8080); // TODO:UJ too many parameters. // - fragmentNumber seems pointless // - hasSubchunks seems redundant. @@ -191,15 +184,6 @@ class Task : public util::CommandForThreadPool { Task(const Task&) = delete; virtual ~Task(); - /// Read 'taskMsg' to generate a vector of one or more task objects all using the same 'sendChannel' - static std::vector createTasks(std::shared_ptr const& taskMsg, - std::shared_ptr const& sendChannel, - std::shared_ptr const& chunkResourceMgr, - mysql::MySqlConfig const& mySqlConfig, - std::shared_ptr const& sqlConnMgr, - std::shared_ptr const& queriesAndChunks, - uint16_t resultsHttpPort = 8080); - /// Read json to generate a vector of one or more task for a chunk. static std::vector createTasksForChunk( std::shared_ptr const& ujData, nlohmann::json const& jsJobs, diff --git a/src/wbase/UberJobData.cc b/src/wbase/UberJobData.cc index d969b80b7..64538fc6c 100644 --- a/src/wbase/UberJobData.cc +++ b/src/wbase/UberJobData.cc @@ -81,8 +81,16 @@ void UberJobData::responseFileReady(string const& httpFileUrl, uint64_t rowCount funcN << " httpFileUrl=" << httpFileUrl << " rows=" << rowCount << " fSize=" << fileSize << " headerCount=" << headerCount); + string workerIdStr; + if (_foreman != nullptr) { + workerIdStr = _foreman->chunkInventory()->id(); + } else { + workerIdStr = "dummyWorkerIdStr"; + LOGS(_log, LOG_LVL_INFO, funcN << " _foreman was null, which should only happen in unit tests"); + } + json request = {{"version", http::MetaModule::version}, - {"workerid", _foreman->chunkInventory()->id()}, + {"workerid", workerIdStr}, {"auth_key", _authKey}, {"czar", _czarName}, {"czarid", _czarId}, diff --git a/src/wcontrol/Foreman.h b/src/wcontrol/Foreman.h index 17fd0f14f..6fe5ca439 100644 --- a/src/wcontrol/Foreman.h +++ b/src/wcontrol/Foreman.h @@ -128,11 +128,11 @@ class Foreman : public wbase::MsgProcessor { /// Process a group of query processing tasks. 
/// @see MsgProcessor::processTasks() - void processTasks(std::vector> const& tasks) override; + void processTasks(std::vector> const& tasks) override; // &&& delete /// Implement the corresponding method of the base class /// @see MsgProcessor::processCommand() - void processCommand(std::shared_ptr const& command) override; + void processCommand(std::shared_ptr const& command) override; // &&& delete /// Implement the corresponding method of the base class /// @see MsgProcessor::statusToJson() diff --git a/src/wdb/CMakeLists.txt b/src/wdb/CMakeLists.txt index 552dda93e..6120078f8 100644 --- a/src/wdb/CMakeLists.txt +++ b/src/wdb/CMakeLists.txt @@ -4,7 +4,6 @@ add_dependencies(wdb proto) target_sources(wdb PRIVATE ChunkResource.cc QueryRunner.cc - QuerySql.cc SQLBackend.cc ) @@ -36,7 +35,6 @@ ENDFUNCTION() wdb_tests( testChunkResource testQueryRunner - testQuerySql ) set_tests_properties(testQueryRunner PROPERTIES WILL_FAIL 1) diff --git a/src/wdb/ChunkResource.cc b/src/wdb/ChunkResource.cc index b131552e2..a6eb90ef4 100644 --- a/src/wdb/ChunkResource.cc +++ b/src/wdb/ChunkResource.cc @@ -48,32 +48,12 @@ #include "util/Bug.h" #include "util/IterableFormatter.h" #include "wbase/Base.h" -#include "wdb/QuerySql.h" +//&&&#include "wdb/QuerySql.h" namespace { LOG_LOGGER _log = LOG_GET("lsst.qserv.wdb.ChunkResource"); -template -class ScScriptBuilder { -public: - ScScriptBuilder(lsst::qserv::wdb::QuerySql& qSql_, std::string const& db, std::string const& table, - std::string const& scColumn, int chunkId) - : qSql(qSql_) { - buildT = (boost::format(lsst::qserv::wbase::CREATE_SUBCHUNK_SCRIPT) % db % table % scColumn % - chunkId % "%1%") - .str(); - cleanT = (boost::format(lsst::qserv::wbase::CLEANUP_SUBCHUNK_SCRIPT) % db % table % chunkId % "%1%") - .str(); - } - void operator()(T const& subc) { - qSql.buildList.push_back((boost::format(buildT) % subc).str()); - qSql.cleanupList.push_back((boost::format(cleanT) % subc).str()); - } - std::string buildT; - std::string cleanT; - lsst::qserv::wdb::QuerySql& qSql; -}; } // anonymous namespace namespace lsst::qserv::wdb { diff --git a/src/wdb/QuerySql.cc b/src/wdb/QuerySql.cc deleted file mode 100644 index 7dd127957..000000000 --- a/src/wdb/QuerySql.cc +++ /dev/null @@ -1,133 +0,0 @@ -// -*- LSST-C++ -*- -/* - * LSST Data Management System - * Copyright 2012-2015 AURA/LSST. - * - * This product includes software developed by the - * LSST Project (http://www.lsst.org/). - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the LSST License Statement and - * the GNU General Public License along with this program. If not, - * see . - */ - -/** - * @file - * - * @brief QuerySql is a bundle of SQL statements that represent an accepted - * query's generated SQL. - * - * FIXME: Unfinished infrastructure for passing subchunk table name to worker. - * - * @author Daniel L. 
Wang, SLAC - */ - -// Class header -#include "wdb/QuerySql.h" - -// System headers -#include - -// LSST headers -#include "lsst/log/Log.h" - -// Qserv headers -#include "global/constants.h" -#include "global/DbTable.h" -#include "proto/worker.pb.h" -#include "wbase/Base.h" - -namespace { - -LOG_LOGGER _log = LOG_GET("lsst.qserv.wdb.QuerySql"); - -template -class ScScriptBuilder { -public: - ScScriptBuilder(lsst::qserv::wdb::QuerySql& qSql_, std::string const& db, std::string const& table, - std::string const& scColumn, int chunkId) - : qSql(qSql_) { - buildT = (boost::format(lsst::qserv::wbase::CREATE_SUBCHUNK_SCRIPT) % db % table % scColumn % - chunkId % "%1%") - .str(); - cleanT = (boost::format(lsst::qserv::wbase::CLEANUP_SUBCHUNK_SCRIPT) % db % table % chunkId % "%1%") - .str(); - } - void operator()(T const& subc) { - qSql.buildList.push_back((boost::format(buildT) % subc).str()); - qSql.cleanupList.push_back((boost::format(cleanT) % subc).str()); - } - std::string buildT; - std::string cleanT; - lsst::qserv::wdb::QuerySql& qSql; -}; -} // anonymous namespace - -namespace lsst::qserv::wdb { - -//////////////////////////////////////////////////////////////////////// -// QuerySql ostream friend -//////////////////////////////////////////////////////////////////////// -std::ostream& operator<<(std::ostream& os, QuerySql const& q) { - os << "QuerySql(bu="; - std::copy(q.buildList.begin(), q.buildList.end(), std::ostream_iterator(os, ",")); - os << "; ex="; - std::copy(q.executeList.begin(), q.executeList.end(), std::ostream_iterator(os, ",")); - os << "; cl="; - std::copy(q.cleanupList.begin(), q.cleanupList.end(), std::ostream_iterator(os, ",")); - os << ")"; - return os; -} - -//////////////////////////////////////////////////////////////////////// -// QuerySql constructor -//////////////////////////////////////////////////////////////////////// -QuerySql::QuerySql(std::string const& db, int chunkId, proto::TaskMsg_Fragment const& f, bool needCreate, - std::string const& defaultResultTable) { - std::string resultTable; - if (f.has_resulttable()) { - resultTable = f.resulttable(); - } else { - resultTable = defaultResultTable; - } - assert(!resultTable.empty()); - - // Create executable statement. - // Obsolete when results marshalling is implemented - std::stringstream ss; - for (int i = 0; i < f.query_size(); ++i) { - if (needCreate) { - ss << "CREATE TABLE " + resultTable + " "; - needCreate = false; - } else { - ss << "INSERT INTO " + resultTable + " "; - } - ss << f.query(i); - executeList.push_back(ss.str()); - ss.str(""); - } - - if (f.has_subchunks()) { - proto::TaskMsg_Subchunk const& sc = f.subchunks(); - for (int i = 0; i < sc.dbtbl_size(); ++i) { - DbTable dbTable(sc.dbtbl(i).db(), sc.dbtbl(i).tbl()); - LOGS(_log, LOG_LVL_DEBUG, "Building subchunks for table=" << dbTable << " chunkId=" << chunkId); - ScScriptBuilder scb(*this, dbTable.db, dbTable.table, SUB_CHUNK_COLUMN, chunkId); - for (int i = 0; i < sc.id_size(); ++i) { - scb(sc.id(i)); - } - } - } -} - -} // namespace lsst::qserv::wdb diff --git a/src/wdb/QuerySql.h b/src/wdb/QuerySql.h deleted file mode 100644 index cfc2e48bf..000000000 --- a/src/wdb/QuerySql.h +++ /dev/null @@ -1,72 +0,0 @@ -// -*- LSST-C++ -*- -/* - * LSST Data Management System - * Copyright 2013-2015 LSST Corporation. - * - * This product includes software developed by the - * LSST Project (http://www.lsst.org/). 
- * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the LSST License Statement and - * the GNU General Public License along with this program. If not, - * see . - */ - -#ifndef LSST_QSERV_WDB_QUERYSQL_H -#define LSST_QSERV_WDB_QUERYSQL_H -/** - * @file - * - * @brief QuerySql is a bundle of SQL statements that represent an accepted - * query's generated SQL. - * - * @author Daniel L. Wang, SLAC - */ - -// System headers -#include -#include -#include -#include - -// Forward declarations -namespace lsst::qserv { -namespace proto { -class TaskMsg_Fragment; -} -namespace wdb { -class Task; -} -} // namespace lsst::qserv - -namespace lsst::qserv::wdb { - -class QuerySql { -public: - typedef std::shared_ptr Ptr; - typedef std::deque StringDeque; - typedef lsst::qserv::proto::TaskMsg_Fragment Fragment; - - QuerySql() {} - QuerySql(std::string const& db, int chunkId, proto::TaskMsg_Fragment const& f, bool needCreate, - std::string const& defaultResultTable); - - StringDeque buildList; - StringDeque executeList; // Consider using SqlFragmenter to break this up into fragments. - StringDeque cleanupList; - struct Batch; - friend std::ostream& operator<<(std::ostream& os, QuerySql const& q); -}; - -} // namespace lsst::qserv::wdb - -#endif // LSST_QSERV_WDB_QUERYSQL_H diff --git a/src/wdb/QuerySql_Batch.h b/src/wdb/QuerySql_Batch.h deleted file mode 100644 index bec278350..000000000 --- a/src/wdb/QuerySql_Batch.h +++ /dev/null @@ -1,82 +0,0 @@ -// -*- LSST-C++ -*- -/* - * LSST Data Management System - * Copyright 2013-2014 LSST Corporation. - * - * This product includes software developed by the - * LSST Project (http://www.lsst.org/). - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the LSST License Statement and - * the GNU General Public License along with this program. If not, - * see . - */ - -#ifndef LSST_QSERV_WDB_QUERYSQL_BATCH_H -#define LSST_QSERV_WDB_QUERYSQL_BATCH_H -/** - * @file - * - * @brief QuerySql::Batch is the actual bundling portion of a QuerySql object. - * - * @author Daniel L. Wang, SLAC - */ - -// System headers -#include -#include - -// Local headers -#include "wdb/QuerySql.h" - -namespace lsst::qserv::wdb { - -struct QuerySql::Batch { - // Default to 10 SQL statements at a time. 
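For reference, the batching rule the Batch helper below implements can be shown in a minimal standalone sketch (plain std types only, not the original class): clip a trailing semicolon from each statement, then emit up to batchSize statements at a time, joined with ";\n".

    // Minimal standalone sketch of the deleted Batch semantics.
    #include <deque>
    #include <iostream>
    #include <sstream>
    #include <string>

    std::string nextBatch(std::deque<std::string>& stmts, std::size_t batchSize = 10) {
        std::ostringstream os;
        for (std::size_t n = 0; n < batchSize && !stmts.empty(); ++n) {
            std::string s = stmts.front();
            stmts.pop_front();
            if (!s.empty() && s.back() == ';') s.pop_back();  // clip trailing semicolon added during batching
            os << s << ";\n";
        }
        return os.str();
    }

    int main() {
        std::deque<std::string> stmts = {"CREATE TABLE r (i INT);", "INSERT INTO r VALUES (1)"};
        std::cout << nextBatch(stmts);  // both statements fit in one batch of 10
    }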
- // Idea: Could add statements according to some cost metric(a - // simple one) or to a certain overall query string length - Batch(std::string const& name_, QuerySql::StringDeque const& sequence_, int batchSize_ = 10) - : name(name_), batchSize(batchSize_), pos(0) { - for (QuerySql::StringDeque::const_iterator i = sequence_.begin(); i != sequence_.end(); ++i) { - std::string::const_iterator last = i->begin() + (i->length() - 1); - if (';' == *last) { // Clip trailing semicolon which - // is added during batching. - sequence.push_back(std::string(i->begin(), last)); - } else { - sequence.push_back(*i); - } - } - } - bool isDone() const { return sequence.empty() || (static_cast(pos) >= sequence.size()); } - std::string current() const { - std::ostringstream os; - QuerySql::StringDeque::const_iterator begin; - assert((unsigned)pos < sequence.size()); // caller should have checked isDone() - begin = sequence.begin() + pos; - if (sequence.size() < static_cast(pos + batchSize)) { - std::copy(begin, sequence.end(), std::ostream_iterator(os, ";\n")); - } else { - std::copy(begin, begin + batchSize, std::ostream_iterator(os, ";\n")); - } - return os.str(); - } - void next() { pos += batchSize; } - - std::string name; - QuerySql::StringDeque sequence; - QuerySql::StringDeque::size_type batchSize; - QuerySql::StringDeque::size_type pos; -}; - -} // namespace lsst::qserv::wdb - -#endif // LSST_QSERV_WDB_QUERYSQL_BATCH_H diff --git a/src/wdb/testQueryRunner.cc b/src/wdb/testQueryRunner.cc index 319d4252b..276beaace 100644 --- a/src/wdb/testQueryRunner.cc +++ b/src/wdb/testQueryRunner.cc @@ -29,10 +29,13 @@ // Qserv headers #include "mysql/MySqlConfig.h" +#include "proto/ScanTableInfo.h" #include "proto/worker.pb.h" #include "wbase/FileChannelShared.h" #include "wbase/Task.h" +#include "wbase/UberJobData.h" #include "wconfig/WorkerConfig.h" +#include "wcontrol/Foreman.h" #include "wcontrol/SqlConnMgr.h" #include "wdb/ChunkResource.h" #include "wdb/QueryRunner.h" @@ -51,9 +54,11 @@ namespace util = lsst::qserv::util; using lsst::qserv::mysql::MySqlConfig; using lsst::qserv::mysql::MySqlConnection; +/* &&& using lsst::qserv::proto::TaskMsg; using lsst::qserv::proto::TaskMsg_Fragment; using lsst::qserv::proto::TaskMsg_Subchunk; +*/ using lsst::qserv::wbase::FileChannelShared; using lsst::qserv::wbase::SendChannel; @@ -67,6 +72,7 @@ using lsst::qserv::wdb::QueryRunner; using lsst::qserv::wpublish::QueriesAndChunks; struct Fixture { + /* &&& shared_ptr newTaskMsg() { shared_ptr t = make_shared(); t->set_chunkid(3240); // hardcoded @@ -80,6 +86,107 @@ struct Fixture { f->add_query("SELECT AVG(yFlux_PS) from LSST.Object_3240"); return t; } + */ + + struct MsgInfo { + string const db = "LSST"; + string const table = "Object"; + string const qry = "SELECT AVG(yFlux_PS) from LSST.Object_3240"; + int const chunkId = 3240; + int const czarId = 5; + string const czarName = "cz5"; + string const czarHostName = "cz5host"; + int const czarPort = 3437; + string const targWorkerId = "a_worker"; + // &&& make mock foreman instead of nullptr? 
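The "&&& make mock foreman" note above suggests a do-nothing stand-in rather than a nullptr collaborator. A generic sketch of that null-object pattern follows; all names here are hypothetical, and since the real wcontrol::Foreman is a concrete class, applying this would first require extracting a small interface.

    // Hypothetical null-object stand-in for a test collaborator.
    #include <memory>

    struct TaskRunner {  // stands in for the collaborator's (extracted) interface
        virtual ~TaskRunner() = default;
        virtual void runTask() = 0;
    };

    struct NopTaskRunner : TaskRunner {
        void runTask() override {}  // deliberately does nothing in unit tests
    };

    std::shared_ptr<TaskRunner> makeTestRunner() { return std::make_shared<NopTaskRunner>(); }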
+ std::shared_ptr foreman; + int const queryId = 23; + int const jobId = 1; + int const uberJobId = 1; + int const attemptCount = 1; + int const scanRating = 1; + bool const scanInteractive = false; + int const maxTableSize = 5000; + bool const lockInMemory = false; + string const resultName = "resName"; + string const authKey = "noAuthKey"; + }; + + shared_ptr newTaskJson(MsgInfo const& mInfo) { + // Derived from TaskMsgFactory::makeMsgJson + + auto jsJobMsgPtr = std::shared_ptr( + new nlohmann::json({{"czarId", mInfo.czarId}, + {"queryId", mInfo.queryId}, + {"jobId", mInfo.jobId}, + {"attemptCount", mInfo.attemptCount}, + {"querySpecDb", mInfo.db}, + {"scanPriority", mInfo.scanRating}, + {"scanInteractive", mInfo.scanInteractive}, + {"maxTableSize", mInfo.maxTableSize}, + {"chunkScanTables", nlohmann::json::array()}, + {"chunkId", mInfo.chunkId}, + {"queryFragments", nlohmann::json::array()}})); + + auto& jsJobMsg = *jsJobMsgPtr; + + auto& chunkScanTables = jsJobMsg["chunkScanTables"]; + /* &&& + for (auto const& sTbl : chunkQuerySpec.scanInfo.infoTables) { + nlohmann::json cst = {{"db", sTbl.db}, + {"table", sTbl.table}, + {"lockInMemory", sTbl.lockInMemory}, + {"tblScanRating", sTbl.scanRating}}; + chunkScanTables.push_back(move(cst)); + } + */ + nlohmann::json cst = {{"db", mInfo.db}, + {"table", mInfo.table}, + {"lockInMemory", mInfo.lockInMemory}, + {"tblScanRating", mInfo.scanRating}}; + chunkScanTables.push_back(move(cst)); + + + auto& jsFragments = jsJobMsg["queryFragments"]; + /* &&& + if (chunkQuerySpec.nextFragment.get()) { + ChunkQuerySpec const* sPtr = &chunkQuerySpec; + while (sPtr) { + LOGS(_log, LOG_LVL_TRACE, "nextFragment"); + for (unsigned int t = 0; t < (sPtr->queries).size(); t++) { + LOGS(_log, LOG_LVL_DEBUG, __func__ << " q=" << (sPtr->queries).at(t)); + } + for (auto const& sbi : sPtr->subChunkIds) { + LOGS(_log, LOG_LVL_DEBUG, __func__ << " sbi=" << sbi); + } + // Linked fragments will not have valid subChunkTables vectors, + // So, we reuse the root fragment's vector. 
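The live code further below (after this commented-out excerpt) assembles one entry of the "queryFragments" array. In isolation, using only nlohmann::json, that layout is:

    // Self-contained illustration of the fragment layout newTaskJson() produces:
    // one "queryFragments" entry carrying the result table name, the query list,
    // and (here empty) subchunk arrays.
    #include <iostream>
    #include "nlohmann/json.hpp"

    int main() {
        nlohmann::json jsFrag = {{"resultTable", "resName"},
                                 {"queries", nlohmann::json::array()},
                                 {"subchunkTables", nlohmann::json::array()},
                                 {"subchunkIds", nlohmann::json::array()}};
        jsFrag["queries"].push_back({{"subQuery", "SELECT AVG(yFlux_PS) from LSST.Object_3240"}});
        std::cout << jsFrag.dump(2) << std::endl;
    }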
+ _addFragmentJson(jsFragments, resultTable, chunkQuerySpec.subChunkTables, sPtr->subChunkIds, + sPtr->queries); + sPtr = sPtr->nextFragment.get(); + } + } else { + LOGS(_log, LOG_LVL_TRACE, "no nextFragment"); + for (unsigned int t = 0; t < (chunkQuerySpec.queries).size(); t++) { + LOGS(_log, LOG_LVL_TRACE, (chunkQuerySpec.queries).at(t)); + } + _addFragmentJson(jsFragments, resultTable, chunkQuerySpec.subChunkTables, chunkQuerySpec.subChunkIds, + chunkQuerySpec.queries); + } + */ + nlohmann::json jsFrag = {{"resultTable", mInfo.resultName}, + {"queries", nlohmann::json::array()}, + {"subchunkTables", nlohmann::json::array()}, + {"subchunkIds", nlohmann::json::array()}}; + + auto& jsQueries = jsFrag["queries"]; + nlohmann::json jsQry = {{"subQuery", mInfo.qry}}; + jsQueries.push_back(move(jsQry)); + + jsFragments.push_back(move(jsFrag)); + + return jsJobMsgPtr; + } MySqlConfig newMySqlConfig() { string user = "qsmaster"; @@ -103,6 +210,7 @@ struct Fixture { BOOST_FIXTURE_TEST_SUITE(Basic, Fixture) BOOST_AUTO_TEST_CASE(Simple) { + /* &&& WorkerConfig::create(); shared_ptr msg(newTaskMsg()); shared_ptr sendC(SendChannel::newNopChannel()); @@ -111,13 +219,40 @@ BOOST_AUTO_TEST_CASE(Simple) { shared_ptr crm = ChunkResourceMgr::newMgr(backend); SqlConnMgr::Ptr sqlConnMgr = make_shared(20, 15); auto const queries = queriesAndChunks(); + //&&& auto taskVect = Task::createTasks(msg, sc, crm, newMySqlConfig(), sqlConnMgr, queries); auto taskVect = Task::createTasks(msg, sc, crm, newMySqlConfig(), sqlConnMgr, queries); Task::Ptr task = taskVect[0]; QueryRunner::Ptr a(QueryRunner::newQueryRunner(task, crm, newMySqlConfig(), sqlConnMgr, queries)); BOOST_CHECK(a->runQuery()); + */ + + WorkerConfig::create(); + MsgInfo mInfo; + auto msgJson = newTaskJson(mInfo); + shared_ptr sendC(SendChannel::newNopChannel()); + auto sc = FileChannelShared::create(sendC, mInfo.czarId); + FakeBackend::Ptr backend = make_shared(); + shared_ptr crm = ChunkResourceMgr::newMgr(backend); + SqlConnMgr::Ptr sqlConnMgr = make_shared(20, 15); + auto const queries = queriesAndChunks(); + auto ujData = lsst::qserv::wbase::UberJobData::create(mInfo.uberJobId, mInfo.czarName, mInfo.czarId, + mInfo.czarHostName, mInfo.czarPort, mInfo.queryId, mInfo.targWorkerId, mInfo.foreman, mInfo.authKey); + lsst::qserv::proto::ScanInfo scanInfo; + scanInfo.scanRating = mInfo.scanRating; + scanInfo.infoTables.emplace_back(mInfo.db, mInfo.table, mInfo.lockInMemory, mInfo.scanRating); + vector taskVect = Task::createTasksForChunk( + ujData, *msgJson, sc, scanInfo, + mInfo.scanInteractive, mInfo.maxTableSize, + crm, + newMySqlConfig(), sqlConnMgr, + queries); + Task::Ptr task = taskVect[0]; + QueryRunner::Ptr a(QueryRunner::newQueryRunner(task, crm, newMySqlConfig(), sqlConnMgr, queries)); + BOOST_CHECK(a->runQuery()); } BOOST_AUTO_TEST_CASE(Output) { + /* &&& WorkerConfig::create(); string out; shared_ptr msg(newTaskMsg()); @@ -131,6 +266,32 @@ BOOST_AUTO_TEST_CASE(Output) { Task::Ptr task = taskVect[0]; QueryRunner::Ptr a(QueryRunner::newQueryRunner(task, crm, newMySqlConfig(), sqlConnMgr, queries)); BOOST_CHECK(a->runQuery()); + */ + WorkerConfig::create(); + string out; + MsgInfo mInfo; + auto msgJson = newTaskJson(mInfo); + shared_ptr sendC(SendChannel::newStringChannel(out)); + auto sc = FileChannelShared::create(sendC, mInfo.czarId); + FakeBackend::Ptr backend = make_shared(); + shared_ptr crm = ChunkResourceMgr::newMgr(backend); + SqlConnMgr::Ptr sqlConnMgr = make_shared(20, 15); + auto const queries = queriesAndChunks(); + auto ujData = 
lsst::qserv::wbase::UberJobData::create(mInfo.uberJobId, mInfo.czarName, mInfo.czarId, + mInfo.czarHostName, mInfo.czarPort, mInfo.queryId, mInfo.targWorkerId, mInfo.foreman, mInfo.authKey); + lsst::qserv::proto::ScanInfo scanInfo; + scanInfo.scanRating = mInfo.scanRating; + scanInfo.infoTables.emplace_back(mInfo.db, mInfo.table, mInfo.lockInMemory, mInfo.scanRating); + vector taskVect = Task::createTasksForChunk( + ujData, *msgJson, sc, scanInfo, + mInfo.scanInteractive, mInfo.maxTableSize, + crm, + newMySqlConfig(), sqlConnMgr, + queries); + Task::Ptr task = taskVect[0]; + QueryRunner::Ptr a(QueryRunner::newQueryRunner(task, crm, newMySqlConfig(), sqlConnMgr, queries)); + BOOST_CHECK(a->runQuery()); + } BOOST_AUTO_TEST_SUITE_END() diff --git a/src/wdb/testQuerySql.cc b/src/wdb/testQuerySql.cc index 5d7cd4607..f28d733d6 100644 --- a/src/wdb/testQuerySql.cc +++ b/src/wdb/testQuerySql.cc @@ -39,6 +39,8 @@ namespace test = boost::test_tools; +//&&& delete file + using lsst::qserv::proto::TaskMsg_Fragment; using lsst::qserv::proto::TaskMsg_Subchunk; using lsst::qserv::wdb::QuerySql; diff --git a/src/wsched/testSchedulers.cc b/src/wsched/testSchedulers.cc index 13e40a0f5..2b3c4df5b 100644 --- a/src/wsched/testSchedulers.cc +++ b/src/wsched/testSchedulers.cc @@ -84,6 +84,7 @@ auto workerCfg = lsst::qserv::wconfig::WorkerConfig::create(); std::vector locSendSharedPtrs; +/* &&& Task::Ptr makeTask(std::shared_ptr tm, shared_ptr const& queries) { WorkerConfig::create(); auto sendC = std::make_shared(); @@ -94,6 +95,7 @@ Task::Ptr makeTask(std::shared_ptr tm, shared_ptr con task->setSafeToMoveRunning(true); // Can't wait for MemMan in unit tests. return task; } +*/ struct SchedulerFixture { typedef std::shared_ptr TaskMsgPtr; @@ -101,6 +103,7 @@ struct SchedulerFixture { SchedulerFixture(void) { counter = 20; } ~SchedulerFixture(void) {} + /* &&& Instead of using messages, make a Task::createUnitTest() function void addSomeFragments(TaskMsgPtr const& t, int numberOfFragments) { for (int i = 0; i < numberOfFragments; ++i) { TaskMsg::Fragment* f = t->add_fragment(); @@ -110,6 +113,7 @@ struct SchedulerFixture { } } + TaskMsgPtr newTaskMsg(int seq, lsst::qserv::QueryId qId, int jobId) { TaskMsgPtr t = std::make_shared(); t->set_queryid(qId); @@ -117,7 +121,7 @@ struct SchedulerFixture { t->set_chunkid(seq); t->set_czarid(1); t->set_db("elephant"); - addSomeFragments(t, 3); + //&&&addSomeFragments(t, 3); t->set_scaninteractive(false); t->set_attemptcount(0); ++counter; @@ -133,7 +137,7 @@ struct SchedulerFixture { t->set_db("moose"); t->set_scaninteractive(false); t->set_attemptcount(0); - addSomeFragments(t, 1); + //&&&addSomeFragments(t, 1); ++counter; return t; } @@ -156,6 +160,8 @@ struct SchedulerFixture { gs.queCmd(t); return t; } + */ + int counter; }; @@ -216,6 +222,7 @@ struct SchedFixture { // TODO: DM-33302 replace this test case BOOST_AUTO_TEST_CASE(Grouping) { +#if 0 // &&& fix and re-enable SchedFixture f(60.0, 1); // Values to keep QueriesAndChunk from triggering. LOGS(_log, LOG_LVL_DEBUG, "Test_case grouping"); @@ -296,9 +303,11 @@ BOOST_AUTO_TEST_CASE(Grouping) { BOOST_CHECK(gs.getInFlight() == 10); BOOST_CHECK(gs.ready() == false); BOOST_CHECK(gs.empty() == true); +#endif // &&& fix and re-enable } BOOST_AUTO_TEST_CASE(GroupMaxThread) { +#if 0 // &&& fix and re-enable // Test that maxThreads is meaningful. 
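The disabled test's core invariant can be captured in a self-contained sketch: a scheduler hands out at most maxThreads commands at a time, and commandFinish() frees a slot. This toy stand-in is not the real GroupScheduler; names and types are illustrative only.

    #include <cassert>
    #include <memory>
    #include <queue>

    struct MiniScheduler {
        explicit MiniScheduler(int maxThreads) : _maxThreads(maxThreads) {}
        void queCmd(std::shared_ptr<int> const& cmd) { _queue.push(cmd); }
        std::shared_ptr<int> getCmd() {
            if (_inFlight >= _maxThreads || _queue.empty()) return nullptr;
            auto cmd = _queue.front();
            _queue.pop();
            ++_inFlight;
            return cmd;
        }
        void commandFinish() { --_inFlight; }
        int _maxThreads;
        int _inFlight = 0;
        std::queue<std::shared_ptr<int>> _queue;
    };

    int main() {
        MiniScheduler gs(1);
        gs.queCmd(std::make_shared<int>(1));
        gs.queCmd(std::make_shared<int>(2));
        assert(gs.getCmd() != nullptr);
        assert(gs.getCmd() == nullptr);  // at maxThreads, nothing more is handed out
        gs.commandFinish();
        assert(gs.getCmd() != nullptr);  // slot freed, next command available
    }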
LOGS(_log, LOG_LVL_WARN, "Test_case GroupMaxThread"); auto queries = QueriesAndChunks::setupGlobal(chrono::seconds(1), chrono::seconds(300), maxBootedC, @@ -329,9 +338,11 @@ BOOST_AUTO_TEST_CASE(GroupMaxThread) { auto aa4 = gs.getCmd(false); BOOST_CHECK(a4.get() == aa4.get()); BOOST_CHECK(gs.ready() == false); +#endif // &&& fix and re-enable } BOOST_AUTO_TEST_CASE(ScanScheduleTest) { +#if 0 // &&& fix and re-enable LOGS(_log, LOG_LVL_DEBUG, "Test_case ScanScheduleTest"); auto queries = QueriesAndChunks::setupGlobal(chrono::seconds(1), chrono::seconds(300), maxBootedC, maxDarkTasksC, resetForTestingC); @@ -392,9 +403,11 @@ BOOST_AUTO_TEST_CASE(ScanScheduleTest) { sched.commandFinish(tsk1); BOOST_CHECK(sched.getInFlight() == 0); BOOST_CHECK(sched.ready() == false); +#endif // &&& fix and re-enable } BOOST_AUTO_TEST_CASE(BlendScheduleTest) { +#if 0 // &&& fix and re-enable LOGS(_log, LOG_LVL_DEBUG, "Test_case BlendScheduleTest"); // Test that space is appropriately reserved for each scheduler as Tasks are started and finished. // In this case, memMan->lock(..) always returns true (really HandleType::ISEMPTY). @@ -593,9 +606,11 @@ BOOST_AUTO_TEST_CASE(BlendScheduleTest) { BOOST_CHECK(f.blend->calcAvailableTheads() == 5); BOOST_CHECK(f.blend->getInFlight() == 0); LOGS(_log, LOG_LVL_DEBUG, "BlendScheduleTest-1 done"); +#endif // &&& fix and re-enable } BOOST_AUTO_TEST_CASE(BlendScheduleThreadLimitingTest) { +#if 0 // &&& fix and re-enable LOGS(_log, LOG_LVL_DEBUG, "Test_case BlendScheduleThreadLimitingTest"); SchedFixture f(60.0, 1); // Values to keep QueriesAndChunk from triggering. // Test that only 6 threads can be started on a single ScanScheduler @@ -663,9 +678,11 @@ BOOST_AUTO_TEST_CASE(BlendScheduleThreadLimitingTest) { BOOST_CHECK(f.blend->getInFlight() == 0); BOOST_CHECK(f.blend->ready() == false); LOGS(_log, LOG_LVL_DEBUG, "BlendScheduleTest-2 done"); +#endif // &&& fix and re-enable } BOOST_AUTO_TEST_CASE(BlendScheduleQueryRemovalTest) { +#if 0 // &&& fix and re-enable // Test that space is appropriately reserved for each scheduler as Tasks are started and finished. // In this case, memMan->lock(..) always returns true (really HandleType::ISEMPTY). // ChunkIds matter as they control the order Tasks come off individual schedulers. @@ -723,9 +740,11 @@ BOOST_AUTO_TEST_CASE(BlendScheduleQueryRemovalTest) { auto schedForA = std::dynamic_pointer_cast(taskFromA->getTaskScheduler()); LOGS(_log, LOG_LVL_DEBUG, "taskFromA=" << taskFromA->getIdStr() << " sched=" << schedForA->getName()); BOOST_CHECK(schedForA == f.scanSlow); +#endif // &&& fix and re-enable } BOOST_AUTO_TEST_CASE(BlendScheduleQueryBootTaskTest) { +#if 0 // &&& fix and re-enable // Test if a task is removed if it takes takes too long. // Give the user query 0.1 seconds to run and run it for a second, it should get removed. 
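The constant on the next line encodes that time budget: 0.1 s x (1 min / 60 s) = 1/600 min. A tiny check makes the conversion explicit:

    #include <cassert>
    #include <cmath>

    int main() {
        double const tenthOfSecInMinutes = 1.0 / 600.0;  // 0.1 seconds expressed in minutes
        assert(std::fabs(tenthOfSecInMinutes - 0.1 / 60.0) < 1e-12);
    }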
double tenthOfSecInMinutes = 1.0 / 600.0; // task @@ -807,9 +826,11 @@ BOOST_AUTO_TEST_CASE(BlendScheduleQueryBootTaskTest) { LOGS(_log, LOG_LVL_INFO, "BlendScheduleQueryBootTaskTest waiting for pool to finish."); pool->shutdownPool(); LOGS(_log, LOG_LVL_INFO, "BlendScheduleQueryBootTaskTest done"); +#endif // &&& fix and re-enable } BOOST_AUTO_TEST_CASE(SlowTableHeapTest) { +#if 0 // &&& fix and re-enable LOGS(_log, LOG_LVL_DEBUG, "Test_case SlowTableHeapTest start"); auto queries = QueriesAndChunks::setupGlobal(chrono::seconds(1), chrono::seconds(300), maxBootedC, maxDarkTasksC, resetForTestingC); @@ -842,9 +863,11 @@ BOOST_AUTO_TEST_CASE(SlowTableHeapTest) { BOOST_CHECK(heap.pop().get() == a4.get()); BOOST_CHECK(heap.empty() == true); LOGS(_log, LOG_LVL_DEBUG, "SlowTableHeapTest done"); +#endif // &&& fix and re-enable } BOOST_AUTO_TEST_CASE(ChunkTasksTest) { +#if 0 // &&& fix and re-enable LOGS(_log, LOG_LVL_DEBUG, "Test_case ChunkTasksTest start"); auto queries = QueriesAndChunks::setupGlobal(chrono::seconds(1), chrono::seconds(300), maxBootedC, maxDarkTasksC, resetForTestingC); @@ -915,9 +938,11 @@ BOOST_AUTO_TEST_CASE(ChunkTasksTest) { chunkTasks.taskComplete(a4); BOOST_CHECK(chunkTasks.readyToAdvance() == true); LOGS(_log, LOG_LVL_DEBUG, "ChunkTasksTest done"); +#endif // &&& fix and re-enable } BOOST_AUTO_TEST_CASE(ChunkTasksQueueTest) { +#if 0 // &&& fix and re-enable LOGS(_log, LOG_LVL_DEBUG, "Test_case ChunkTasksQueueTest start"); auto queries = QueriesAndChunks::setupGlobal(chrono::seconds(1), chrono::seconds(300), maxBootedC, maxDarkTasksC, resetForTestingC); @@ -1033,6 +1058,7 @@ BOOST_AUTO_TEST_CASE(ChunkTasksQueueTest) { BOOST_CHECK(ctl.ready(true) == false); BOOST_CHECK(ctl.getActiveChunkId() == -1); LOGS(_log, LOG_LVL_DEBUG, "ChunkTasksQueueTest done"); +#endif // &&& fix and re-enable } BOOST_AUTO_TEST_SUITE_END() diff --git a/src/xrdsvc/SsiRequest.cc b/src/xrdsvc/SsiRequest.cc index ec295cfd1..1b4ca9aeb 100644 --- a/src/xrdsvc/SsiRequest.cc +++ b/src/xrdsvc/SsiRequest.cc @@ -93,13 +93,6 @@ void SsiRequest::execute(XrdSsiRequest& req) { util::Timer t; LOGS(_log, LOG_LVL_DEBUG, "Execute request, resource=" << _resourceName); - char* reqData = nullptr; - int reqSize; - t.start(); - reqData = req.GetRequest(reqSize); - t.stop(); - LOGS(_log, LOG_LVL_DEBUG, "GetRequest took " << t.getElapsed() << " seconds"); - // We bind this object to the request now. This allows us to respond at any // time (much simpler). Though the manual forgot to say that all pending // events will be reflected on a different thread the moment we bind the @@ -122,71 +115,20 @@ void SsiRequest::execute(XrdSsiRequest& req) { // Process the request switch (ru.unitType()) { - case ResourceUnit::DBCHUNK: { + case ResourceUnit::DBCHUNK: { // &&& delete // Increment the counter of the database/chunk resources in use - _foreman->resourceMonitor()->increment(_resourceName); - - // reqData has the entire request, so we can unpack it without waiting for - // more data. 
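The decode performed by the removed lines below follows the standard protobuf pattern: ParseFromArray() consumes the raw buffer and IsInitialized() confirms all required fields arrived. Generically, for any generated message type M, that pattern is:

    // Generic protobuf decode-and-validate pattern, as used by the removed code.
    #include <memory>

    template <typename M>
    std::shared_ptr<M> decodeMessage(char const* data, int size) {
        auto msg = std::make_shared<M>();
        if (!msg->ParseFromArray(data, size) || !msg->IsInitialized()) {
            return nullptr;  // caller reports the error and releases the buffer
        }
        return msg;
    }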
- LOGS(_log, LOG_LVL_DEBUG, "Decoding TaskMsg of size " << reqSize); - auto taskMsg = std::make_shared(); - if (!taskMsg->ParseFromArray(reqData, reqSize) || !taskMsg->IsInitialized()) { - reportError("Failed to decode TaskMsg on resource db=" + ru.db() + - " chunkId=" + std::to_string(ru.chunk())); - return; - } - - QSERV_LOGCONTEXT_QUERY_JOB(taskMsg->queryid(), taskMsg->jobid()); - - if (!taskMsg->has_db() || !taskMsg->has_chunkid() || (ru.db() != taskMsg->db()) || - (ru.chunk() != taskMsg->chunkid())) { - reportError("Mismatched db/chunk in TaskMsg on resource db=" + ru.db() + - " chunkId=" + std::to_string(ru.chunk())); - return; - } - - if (not(taskMsg->has_queryid() && taskMsg->has_jobid() && taskMsg->has_scaninteractive() && - taskMsg->has_attemptcount() && taskMsg->has_czarid())) { - reportError(std::string("taskMsg missing required field ") + - " queryid:" + std::to_string(taskMsg->has_queryid()) + - " jobid:" + std::to_string(taskMsg->has_jobid()) + - " scaninteractive:" + std::to_string(taskMsg->has_scaninteractive()) + - " attemptcount:" + std::to_string(taskMsg->has_attemptcount()) + - " czarid:" + std::to_string(taskMsg->has_czarid())); - return; - } - switch (wconfig::WorkerConfig::instance()->resultDeliveryProtocol()) { - case wconfig::ConfigValResultDeliveryProtocol::XROOT: - case wconfig::ConfigValResultDeliveryProtocol::HTTP: - _channelShared = wbase::FileChannelShared::create(sendChannel, taskMsg->czarid(), - _foreman->chunkInventory()->id()); - break; - default: - throw std::runtime_error("SsiRequest::" + std::string(__func__) + - " unsupported result delivery protocol"); - } - auto const tasks = wbase::Task::createTasks(taskMsg, _channelShared, _foreman->chunkResourceMgr(), - _foreman->mySqlConfig(), _foreman->sqlConnMgr(), - _foreman->queriesAndChunks(), _foreman->httpPort()); - for (auto const& task : tasks) { - _tasks.push_back(task); - } + _foreman->resourceMonitor()->increment(_resourceName); // &&& TODO:UJ make sure this is implemented elsewhere. - // Now that the request is decoded (successfully or not), release the - // xrootd request buffer. To avoid data races, this must happen before - // the task is handed off to another thread for processing, as there is a - // reference to this SsiRequest inside the reply channel for the task, - // and after the call to BindRequest. - ReleaseRequestBuffer(); - t.start(); - _foreman->processTasks(tasks); // Queues tasks to be run later. - t.stop(); - LOGS(_log, LOG_LVL_DEBUG, - "Enqueued TaskMsg for " << ru << " in " << t.getElapsed() << " seconds"); + reportError("&&& DBCHUNK requests are no longer available resource db=" + ru.db() + + " chunkId=" + std::to_string(ru.chunk())); break; } - case ResourceUnit::QUERY: { + case ResourceUnit::QUERY: { // &&& delete LOGS(_log, LOG_LVL_DEBUG, "Parsing request details for resource=" << _resourceName); + + reportError("&&& QUERY requests are no longer available"); + + /* &&& proto::QueryManagement request; try { // reqData has the entire request, so we can unpack it without waiting for @@ -236,16 +178,16 @@ void SsiRequest::execute(XrdSsiRequest& req) { // Send back the empty response since no info is expected by a caller // for this type of requests beyond the usual error notifications (if any). 
this->reply((char const*)0, 0); + */ break; } default: reportError("Unexpected unit type '" + std::to_string(ru.unitType()) + "', resource name: " + _resourceName); - break; } - // Note that upon exit the _finMutex will be unlocked allowing Finished() // to actually do something once everything is actually setup. + } /// Called by SSI to free resources. diff --git a/src/xrdsvc/SsiRequest.h b/src/xrdsvc/SsiRequest.h index 3583a0cef..5850d18bf 100644 --- a/src/xrdsvc/SsiRequest.h +++ b/src/xrdsvc/SsiRequest.h @@ -60,7 +60,7 @@ class StreamBuffer; /// qserv worker services. The SSI interface encourages such an approach, and /// object lifetimes are explicitly stated in the documentation which we /// adhere to using BindRequest() and UnBindRequest() responder methods. -class SsiRequest : public XrdSsiResponder, public std::enable_shared_from_this { +class SsiRequest : public XrdSsiResponder, public std::enable_shared_from_this { // &&& delete if possible public: // Smart pointer definitions From 5bb2b081da8aad47d9ce45d9606211eec1387e2f Mon Sep 17 00:00:00 2001 From: John Gates Date: Tue, 6 Aug 2024 08:24:17 -0700 Subject: [PATCH 03/22] Added ActiveWorker. --- src/ccontrol/UserQuerySelect.cc | 1 + src/czar/ActiveWorker.cc | 51 ++++++++++++ src/czar/ActiveWorker.h | 126 +++++++++++++++++++++++++++++ src/czar/CMakeLists.txt | 1 + src/czar/CzarRegistry.cc | 10 +-- src/czar/CzarRegistry.h | 3 + src/qdisp/JobQuery.h | 1 - src/qdisp/UberJob.h | 4 +- src/xrdreq/QueryManagementAction.h | 1 + 9 files changed, 187 insertions(+), 11 deletions(-) create mode 100644 src/czar/ActiveWorker.cc create mode 100644 src/czar/ActiveWorker.h diff --git a/src/ccontrol/UserQuerySelect.cc b/src/ccontrol/UserQuerySelect.cc index 0fca556f7..d7d4fb5a0 100644 --- a/src/ccontrol/UserQuerySelect.cc +++ b/src/ccontrol/UserQuerySelect.cc @@ -528,6 +528,7 @@ QueryState UserQuerySelect::join() { auto const czarConfig = cconfig::CzarConfig::instance(); if (czarConfig->notifyWorkersOnQueryFinish()) { try { + // &&& do this another way, also see executive::squash xrdreq::QueryManagementAction::notifyAllWorkers(czarConfig->getXrootdFrontendUrl(), operation, _qMetaCzarId, _qMetaQueryId); } catch (std::exception const& ex) { diff --git a/src/czar/ActiveWorker.cc b/src/czar/ActiveWorker.cc new file mode 100644 index 000000000..48b4752b4 --- /dev/null +++ b/src/czar/ActiveWorker.cc @@ -0,0 +1,51 @@ +/* + * LSST Data Management System + * + * This product includes software developed by the + * LSST Project (http://www.lsst.org/). + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the LSST License Statement and + * the GNU General Public License along with this program. If not, + * see . 
+ */ + +// Class header +#include "czar/ActiveWorker.h" + +// System headers +#include + +// Third party headers + +// Qserv headers +#include "util/common.h" + +// LSST headers +#include "lsst/log/Log.h" + +using namespace std; + +namespace { +LOG_LOGGER _log = LOG_GET("lsst.qserv.czar.ActiveWorker"); +} // namespace + +namespace lsst::qserv::czar { + +string WorkerContactInfo::dump() const { + stringstream os; + os << "workerContactInfo{" + << "id=" << wId << " host=" << wHost << " mgHost=" << wManagementHost << " port=" << wPort << "}"; + return os.str(); +} + +} // namespace lsst::qserv::czar diff --git a/src/czar/ActiveWorker.h b/src/czar/ActiveWorker.h new file mode 100644 index 000000000..8a2a6da6e --- /dev/null +++ b/src/czar/ActiveWorker.h @@ -0,0 +1,126 @@ +/* + * LSST Data Management System + * + * This product includes software developed by the + * LSST Project (http://www.lsst.org/). + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the LSST License Statement and + * the GNU General Public License along with this program. If not, + * see . + */ +#ifndef LSST_QSERV_CZAR_ACTIVEWORKER_H +#define LSST_QSERV_CZAR_ACTIVEWORKER_H + +// System headers +#include +#include +#include +#include +#include + +// qserv headers +#include "global/clock_defs.h" +#include "global/intTypes.h" + +// This header declarations +namespace lsst::qserv::czar { + +class WorkerContactInfo { +public: + using Ptr = std::shared_ptr; + + WorkerContactInfo(std::string const& wId_, std::string const& wHost_, + std::string const& wManagementHost_, int wPort_, TIMEPOINT updateTime_) + : wId(wId_), + wHost(wHost_), + wManagementHost(wManagementHost_), + wPort(wPort_) { + touchedChanged(updateTime_, false); + } + std::string const wId; ///< key + std::string const wHost; ///< "host-addr" entry. + std::string const wManagementHost; ///< "management-host-name" entry. + int const wPort; ///< "management-port" entry. + + + /// Return true if all members, aside from updateTime, are equal. + bool sameContactInfo(WorkerContactInfo const& other) const { + return (wId == other.wId && wHost == other.wHost && wManagementHost == other.wManagementHost && + wPort == other.wPort); + } + + /// To be called when the worker list was updated and there was a change. + void touchedChanged(TIMEPOINT updateTime, bool missing) { + _missing = missing; + touchedNoChange(updateTime); + } + + /// To be called when the worker list was updated and there was no change. + void touchedNoChange(TIMEPOINT updateTime = CLOCK::now()) { + if (!_missing) { + _lastTouch = updateTime; + } + } + + double timeSinceTouchSeconds() { + double secs = std::chrono::duration_cast(CLOCK::now() - _lastTouch).count(); + return secs; + } + + std::string dump() const; + +private: + TIMEPOINT _lastTouch; ///< Last time this worker believed to be active. + bool _missing = false; ///< True if the worker was missing after the last change. +}; + +/// &&& doc - maintain list of done/cancelled queries for an active worker, and send that +/// list to the worker. 
Once the worker has accepted the list, remove all +/// of those queryId's from the list. +class ActiveWorker { +public: + using Ptr = std::shared_ptr; + + ActiveWorker() = delete; + ActiveWorker(ActiveWorker const&) = delete; + ActiveWorker& operator=(ActiveWorker const&) = delete; + + static Ptr create(WorkerContactInfo::Ptr const& wInfo) { + return Ptr(new ActiveWorker(wInfo)); + } + + ~ActiveWorker() = default; + +private: + ActiveWorker(WorkerContactInfo::Ptr const& wInfo) : _wInfo(wInfo) {} + + std::set _qIdDoneKeepFiles; ///< &&& doc - limit reached + std::set _qIdDoneDeleteFiles; ///< &&& doc -cancelled/finished + + WorkerContactInfo::Ptr const _wInfo; +}; + +/// &&& doc +class ActiveWorkerMap { +public: + ActiveWorkerMap() = default; + + void updateMap(std::string const& wId); + +private: + std::map _awMap; +}; + +} // namespace lsst::qserv::czar + +#endif // LSST_QSERV_CZAR_ACTIVEWORKER_H diff --git a/src/czar/CMakeLists.txt b/src/czar/CMakeLists.txt index fbca091b9..3d9d32695 100644 --- a/src/czar/CMakeLists.txt +++ b/src/czar/CMakeLists.txt @@ -2,6 +2,7 @@ add_library(czar OBJECT) add_dependencies(czar proto) target_sources(czar PRIVATE + ActiveWorker.cc ChttpModule.cc Czar.cc CzarChunkMap.cc diff --git a/src/czar/CzarRegistry.cc b/src/czar/CzarRegistry.cc index f5abfcaba..8f2e2529c 100644 --- a/src/czar/CzarRegistry.cc +++ b/src/czar/CzarRegistry.cc @@ -149,7 +149,8 @@ CzarRegistry::WorkerContactMapPtr CzarRegistry::_buildMapFromJson(nlohmann::json int wPort = jsQserv.at("management-port").get(); uint64_t updateTimeInt = jsQserv.at("update-time-ms").get(); TIMEPOINT updateTime = TIMEPOINT(chrono::milliseconds(updateTimeInt)); - auto wInfo = make_shared(key, wHost, wManagementHost, wPort, updateTime); + //&&&auto wInfo = make_shared(key, wHost, wManagementHost, wPort, updateTime); + auto wInfo = make_shared(key, wHost, wManagementHost, wPort); LOGS(_log, LOG_LVL_DEBUG, __func__ << " wHost=" << wHost << " wPort=" << wPort << " updateTime=" << updateTimeInt); auto iter = wMap->find(key); @@ -188,11 +189,4 @@ bool CzarRegistry::_compareMap(WorkerContactMap const& other) const { return true; } -string CzarRegistry::WorkerContactInfo::dump() const { - stringstream os; - os << "workerContactInfo{" - << "id=" << wId << " host=" << wHost << " mgHost=" << wManagementHost << " port=" << wPort << "}"; - return os.str(); -} - } // namespace lsst::qserv::czar diff --git a/src/czar/CzarRegistry.h b/src/czar/CzarRegistry.h index 27d20979c..10e4af85d 100644 --- a/src/czar/CzarRegistry.h +++ b/src/czar/CzarRegistry.h @@ -34,6 +34,7 @@ #include "nlohmann/json.hpp" // Qserv headers +#include "czar/ActiveWorker.h" #include "global/clock_defs.h" namespace lsst::qserv::cconfig { @@ -66,6 +67,7 @@ class CzarRegistry { ~CzarRegistry(); + /* &&& struct WorkerContactInfo { using Ptr = std::shared_ptr; @@ -89,6 +91,7 @@ class CzarRegistry { } std::string dump() const; }; + */ using WorkerContactMap = std::unordered_map; using WorkerContactMapPtr = std::shared_ptr; diff --git a/src/qdisp/JobQuery.h b/src/qdisp/JobQuery.h index 23fa4fc86..802cc44fc 100644 --- a/src/qdisp/JobQuery.h +++ b/src/qdisp/JobQuery.h @@ -64,7 +64,6 @@ class JobQuery : public JobBase { QueryId getQueryId() const override { return _qid; } JobId getJobId() const override { return _jobDescription->id(); } - //&&&std::string const& getPayload() const override; std::string const& getIdStr() const override { return _idStr; } std::shared_ptr getRespHandler() override { return _jobDescription->respHandler(); } bool getScanInteractive() const 
override { return _jobDescription->getScanInteractive(); } diff --git a/src/qdisp/UberJob.h b/src/qdisp/UberJob.h index 9a5ed13ab..66e631be1 100644 --- a/src/qdisp/UberJob.h +++ b/src/qdisp/UberJob.h @@ -95,7 +95,7 @@ class UberJob : public JobBase { /// Set the worker information needed to send messages to the worker believed to /// be responsible for the chunks handled in this UberJob. - void setWorkerContactInfo(czar::CzarRegistry::WorkerContactInfo::Ptr const& wContactInfo) { + void setWorkerContactInfo(czar::WorkerContactInfo::Ptr const& wContactInfo) { // Change to ActiveWorker &&& ??? _wContactInfo = wContactInfo; } @@ -159,7 +159,7 @@ class UberJob : public JobBase { czar::CzarChunkMap::WorkerChunksData::Ptr _workerData; // TODO:UJ this may not be needed // Contact information for the target worker. - czar::CzarRegistry::WorkerContactInfo::Ptr _wContactInfo; + czar::WorkerContactInfo::Ptr _wContactInfo; // Change to ActiveWorker &&& ??? }; } // namespace lsst::qserv::qdisp diff --git a/src/xrdreq/QueryManagementAction.h b/src/xrdreq/QueryManagementAction.h index f1779cae5..ec5ff9158 100644 --- a/src/xrdreq/QueryManagementAction.h +++ b/src/xrdreq/QueryManagementAction.h @@ -39,6 +39,7 @@ namespace lsst::qserv::xrdreq { * Class QueryManagementAction is an interface for managing query completion/cancellation * at all Qserv workers that are connected as "publishers" to the XROOTD redirector. */ +// &&& need to get the same functionality using json messages, and not in xrdreq. class QueryManagementAction : public std::enable_shared_from_this { public: /// The reponse type represents errors reported by the workers, where worker From 74c3a573e173e2c909fc48388f6ecb6e9d5336de Mon Sep 17 00:00:00 2001 From: John Gates Date: Fri, 30 Aug 2024 10:22:52 -0700 Subject: [PATCH 04/22] Added unit test for query status message. 
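The new test (src/http/testStatusData.cc in the diffstat below) exercises serialization and parsing of the worker status message. A round-trip in that spirit, using only nlohmann::json and the field names this patch introduces in ActiveWorker::updateStateAndSendMessages() (the real test drives http::WorkerQueryStatusData instead), might look like:

    #include <cassert>
    #include <string>
    #include "nlohmann/json.hpp"

    int main() {
        nlohmann::json msg = {{"version", 1},  // http::MetaModule::version in the real code
                              {"worker", "worker-1"},
                              {"qiddonekeepfiles", nlohmann::json::array()},
                              {"qiddonedeletefiles", nlohmann::json::array()},
                              {"qiddeaduberjobs", nlohmann::json::array()}};
        msg["qiddonedeletefiles"].push_back(23);  // a finished/cancelled QueryId

        // Serialize, parse back, and verify the fields survive the round trip.
        auto parsed = nlohmann::json::parse(msg.dump());
        assert(parsed.at("worker").get<std::string>() == "worker-1");
        assert(parsed.at("qiddonedeletefiles").size() == 1);
    }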
--- src/czar/ActiveWorker.cc | 316 ++++++++++++++++++++++++++- src/czar/ActiveWorker.h | 188 +++++++++++++--- src/czar/Czar.cc | 14 +- src/czar/Czar.h | 8 +- src/czar/CzarChunkMap.cc | 3 + src/czar/CzarRegistry.cc | 31 ++- src/czar/CzarRegistry.h | 49 ++--- src/global/intTypes.h | 1 + src/http/CMakeLists.txt | 2 + src/http/WorkerQueryStatusData.cc | 331 +++++++++++++++++++++++++++++ src/http/WorkerQueryStatusData.h | 234 ++++++++++++++++++++ src/http/testStatusData.cc | 140 ++++++++++++ src/qdisp/UberJob.h | 6 +- src/qmeta/types.h | 2 +- src/xrdsvc/HttpSvc.cc | 6 + src/xrdsvc/HttpWorkerCzarModule.cc | 15 ++ src/xrdsvc/HttpWorkerCzarModule.h | 6 + 17 files changed, 1267 insertions(+), 85 deletions(-) create mode 100644 src/http/WorkerQueryStatusData.cc create mode 100644 src/http/WorkerQueryStatusData.h create mode 100644 src/http/testStatusData.cc diff --git a/src/czar/ActiveWorker.cc b/src/czar/ActiveWorker.cc index 48b4752b4..78b7d04f0 100644 --- a/src/czar/ActiveWorker.cc +++ b/src/czar/ActiveWorker.cc @@ -25,15 +25,17 @@ // System headers #include -// Third party headers - // Qserv headers +#include "cconfig/CzarConfig.h" +#include "http/Client.h" +#include "http/MetaModule.h" #include "util/common.h" // LSST headers #include "lsst/log/Log.h" using namespace std; +using namespace nlohmann; namespace { LOG_LOGGER _log = LOG_GET("lsst.qserv.czar.ActiveWorker"); @@ -41,11 +43,321 @@ LOG_LOGGER _log = LOG_GET("lsst.qserv.czar.ActiveWorker"); namespace lsst::qserv::czar { +/* &&& string WorkerContactInfo::dump() const { stringstream os; os << "workerContactInfo{" << "id=" << wId << " host=" << wHost << " mgHost=" << wManagementHost << " port=" << wPort << "}"; return os.str(); } +*/ + +string ActiveWorker::getStateStr(State st) { + switch (st) { + case ALIVE: return string("ALIVE"); + case QUESTIONABLE: return string("QUESTIONABLE"); + case DEAD: return string("DEAD"); + } + return string("unknown"); +} + +bool ActiveWorker::compareContactInfo(http::WorkerContactInfo const& wcInfo) const { + lock_guard lg(_aMtx); + return _wqsData->_wInfo->isSameContactInfo(wcInfo); +} + +void ActiveWorker::setWorkerContactInfo(http::WorkerContactInfo::Ptr const& wcInfo) { + LOGS(_log, LOG_LVL_WARN, cName(__func__) << " new info=" << wcInfo->dump()); + lock_guard lg(_aMtx); + _wqsData->_wInfo = wcInfo; +} + +void ActiveWorker::_changeStateTo(State newState, double secsSinceUpdate, string const& note) { + auto lLvl = (newState == DEAD) ? LOG_LVL_ERROR : LOG_LVL_INFO; + LOGS(_log, lLvl, note << " oldState=" << getStateStr(_state) << " newState=" << getStateStr(newState) << " secsSince=" << secsSinceUpdate); + _state = newState; +} + +void ActiveWorker::updateStateAndSendMessages(double timeoutAliveSecs, double timeoutDeadSecs, double maxLifetime) { + // &&& function too long + lock_guard lg(_aMtx); + double secsSinceUpdate = _wqsData->_wInfo->timeSinceRegUpdateSeconds(); + // Update the last time the registry contacted this worker. + switch (_state) { + case ALIVE: { + if (secsSinceUpdate > timeoutAliveSecs) { + _changeStateTo(QUESTIONABLE, secsSinceUpdate, cName(__func__)); + // Anything that should be done here? + } + break; + } + case QUESTIONABLE: { + if (secsSinceUpdate < timeoutAliveSecs) { + _changeStateTo(ALIVE, secsSinceUpdate, cName(__func__)); + } + if (secsSinceUpdate > timeoutDeadSecs) { + _changeStateTo(DEAD, secsSinceUpdate, cName(__func__)); + // &&& TODO:UJ all uberjobs for this worker need to die. 
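Taken together, the transitions in this switch reduce to a small pure function of the seconds since the registry last heard from the worker. A standalone sketch, mirroring the enum and the two timeouts (thresholds in main are examples only):

    #include <cassert>

    enum State { ALIVE = 0, QUESTIONABLE, DEAD };

    State nextState(State cur, double secsSinceUpdate, double timeoutAliveSecs, double timeoutDeadSecs) {
        switch (cur) {
            case ALIVE:
                return (secsSinceUpdate > timeoutAliveSecs) ? QUESTIONABLE : ALIVE;
            case QUESTIONABLE:
                if (secsSinceUpdate < timeoutAliveSecs) return ALIVE;
                return (secsSinceUpdate > timeoutDeadSecs) ? DEAD : QUESTIONABLE;
            case DEAD:
                // A worker only leaves DEAD if the registry hears from it again.
                return (secsSinceUpdate < timeoutAliveSecs) ? ALIVE : DEAD;
        }
        return cur;
    }

    int main() {
        assert(nextState(ALIVE, 400.0, 300.0, 600.0) == QUESTIONABLE);
        assert(nextState(QUESTIONABLE, 700.0, 300.0, 600.0) == DEAD);
        assert(nextState(DEAD, 10.0, 300.0, 600.0) == ALIVE);
    }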
+ } + break; + } + case DEAD: { + LOGS(_log, LOG_LVL_ERROR, "&&& NEED CODE"); + if (secsSinceUpdate < timeoutAliveSecs) { + _changeStateTo(ALIVE, secsSinceUpdate, cName(__func__)); + } else { + // Don't waste time on this worker until the registry has heard from it. + return; + } + break; + } + + } + + // Check how many messages are currently being sent to the worker, if at the limit, return + if (_wqsData->_qIdDoneKeepFiles.empty() && _wqsData->_qIdDoneDeleteFiles.empty() && _wqsData->_qIdDeadUberJobs.empty()) { + return; + } + int tCount = _conThreadCount; + if (tCount > _maxConThreadCount) { + LOGS(_log, LOG_LVL_DEBUG, cName(__func__) << " not sending message since at max threads " << tCount); + return; + } + + // Go through the _qIdDoneKeepFiles, _qIdDoneDeleteFiles, and _qIdDeadUberJobs lists to build a + // message to send to the worker. +#if 0 // &&& + auto now = CLOCK::now(); + auto const czarConfig = cconfig::CzarConfig::instance(); + + shared_ptr jsWorkerReqPtr = make_shared(); + json& jsWorkerR = *jsWorkerReqPtr; + jsWorkerR["version"] = http::MetaModule::version; + jsWorkerR["instance_id"] = czarConfig->replicationInstanceId(); + jsWorkerR["auth_key"] = czarConfig->replicationAuthKey(); + jsWorkerR["worker"] = _wInfo->wId; + jsWorkerR["qiddonekeepfiles"] = json::array(); + jsWorkerR["qiddonedeletefiles"] = json::array(); + jsWorkerR["qiddeaduberjobs"] = json::array(); + jsWorkerR["czar"] = json::object(); + auto& jsWCzar = jsWorkerR["czar"]; + jsWCzar["name"] = czarConfig->name(); + jsWCzar["id"]= czarConfig->id(); + jsWCzar["management-port"] = czarConfig->replicationHttpPort(); + jsWCzar["management-host-name"] = util::get_current_host_fqdn(); + + + { + auto& jsDoneKeep = jsWorkerR["qiddonekeepfiles"]; + auto iterDoneKeep = _qIdDoneKeepFiles.begin(); + while (iterDoneKeep != _qIdDoneKeepFiles.end()) { + auto qId = iterDoneKeep->first; + jsDoneKeep.push_back(qId); + auto tmStamp = iterDoneKeep->second; + double ageSecs = std::chrono::duration(now - tmStamp).count(); + if (ageSecs > maxLifetime) { + iterDoneKeep = _qIdDoneKeepFiles.erase(iterDoneKeep); + } else { + ++iterDoneKeep; + } + } + } + { + auto& jsDoneDelete = jsWorkerR["qiddonedeletefiles"]; + auto iterDoneDelete = _qIdDoneDeleteFiles.begin(); + while (iterDoneDelete != _qIdDoneDeleteFiles.end()) { + auto qId = iterDoneDelete->first; + jsDoneDelete.push_back(qId); + auto tmStamp = iterDoneDelete->second; + double ageSecs = std::chrono::duration(now - tmStamp).count(); + if (ageSecs > maxLifetime) { + iterDoneDelete = _qIdDoneDeleteFiles.erase(iterDoneDelete); + } else { + ++iterDoneDelete; + } + } + } + { + auto& jsDeadUj = jsWorkerR["qiddeaduberjobs"]; + auto iterDeadUjQid = _qIdDeadUberJobs.begin(); + while (iterDeadUjQid != _qIdDeadUberJobs.end()) { + TIMEPOINT oldestTm; // default is zero + auto qId = iterDeadUjQid->first; + auto& ujIdMap = iterDeadUjQid->second; + + json jsQidUj = {{"qid", qId}, {"ujids", json::array()}}; + auto& jsUjIds = jsQidUj["ujids"]; + + auto iterUjId = ujIdMap.begin(); + bool addedUjId = false; + while (iterUjId != ujIdMap.end()) { + UberJobId ujId = iterUjId->first; + auto tmStamp = iterUjId->second; + if (tmStamp > oldestTm) { + oldestTm = tmStamp; + } + + jsUjIds.push_back(ujId); + addedUjId = true; + double ageSecs = std::chrono::duration(now - tmStamp).count(); + if (ageSecs > maxLifetime) { + iterUjId = ujIdMap.erase(iterUjId); + } else { + ++iterUjId; + } + } + + if (addedUjId) { + jsDeadUj.push_back(jsQidUj); + } + + if (ujIdMap.empty() + || std::chrono::duration(now - 
oldestTm).count() > maxLifetime) { + iterDeadUjQid = _qIdDeadUberJobs.erase(iterDeadUjQid); + } else { + ++iterDeadUjQid; + } + } + } +#endif // &&& + + auto jsWorkerReqPtr = _wqsData->serializeJson(timeoutAliveSecs, timeoutDeadSecs, maxLifetime); + + // Start a thread to send the message. (Maybe these should go on the qdisppool? &&&) + // put this in a different function and start the thread.&&&; + _sendStatusMsg(jsWorkerReqPtr); +} + +#if 0 // &&& +bool ActiveWorker::_parse(nlohmann::json const& jsWorkerReq) { + auto const czarConfig = cconfig::CzarConfig::instance(); + + http::RequestBodyJSON rbWReq(jsWorkerReq); + if (jsWorkerReq["version"] != http::MetaModule::version) { + LOGS(_log, LOG_LVL_ERROR, cName(__func__) << " bad version"); + return false; + } + + + http::RequestBodyJSON rbCzar(rbWReq.required("czar")); + auto czarName = rbCzar.required("name"); + auto czarId = rbCzar.required("id"); + auto czarPort = rbCzar.required("management-port"); + auto czarHostName = rbCzar.required("management-host-name"); + /* &&& + jsWorkerReq["instance_id"] != czarConfig->replicationInstanceId(); + jsWorkerReq["auth_key"] != czarConfig->replicationAuthKey(); + jsWorkerReq["worker"] != _wInfo->wId; + auto& jsWCzar = jsWorkerReq["czar"]; + jsWCzar["name"] != czarConfig->name(); + jsWCzar["id"] != czarConfig->id(); + jsWCzar["management-port"] != czarConfig->replicationHttpPort(); + jsWCzar["management-host-name"] != util::get_current_host_fqdn(); + */ + + + auto& jsQIdDoneKeepFiles = jsWorkerReq["qiddonekeepfiles"]; + for (auto const& qidKeep : jsQIdDoneKeepFiles) { + + } + + auto& jsQIdDoneDeleteFiles = jsWorkerReq["qiddonedeletefiles"]; + + auto& jsQIdDeadUberJobs = jsWorkerReq["qiddeaduberjobs"]; + +} +#endif // &&& + +void ActiveWorker::_sendStatusMsg(std::shared_ptr const& jsWorkerReqPtr) { + + auto& jsWorkerReq = *jsWorkerReqPtr; + auto const method = http::Method::POST; + auto const& wInf = _wqsData->_wInfo; + string const url = "http://" + wInf->wHost + ":" + to_string(wInf->wPort) + "/querystatus"; + vector const headers = {"Content-Type: application/json"}; + auto const& czarConfig = cconfig::CzarConfig::instance(); + + + LOGS(_log, LOG_LVL_DEBUG, cName(__func__) << " REQ " << jsWorkerReq); + string const requestContext = "Czar: '" + http::method2string(method) + "' stat request to '" + url + "'"; + LOGS(_log, LOG_LVL_TRACE, + cName(__func__) << " czarPost url=" << url << " request=" << jsWorkerReq.dump() + << " headers=" << headers[0]); + http::Client client(method, url, jsWorkerReq.dump(), headers); + bool transmitSuccess = false; + string exceptionWhat; + try { + json const response = client.readAsJson(); + if (0 != response.at("success").get()) { + transmitSuccess = true; + } else { + LOGS(_log, LOG_LVL_WARN, cName(__func__) << " response success=0"); + } + } catch (exception const& ex) { + LOGS(_log, LOG_LVL_WARN, requestContext + " failed, ex: " + ex.what()); + exceptionWhat = ex.what(); + } + if (!transmitSuccess) { + LOGS(_log, LOG_LVL_ERROR, cName(__func__) << " transmit failure"); + } else { + // parse the return statement and remove the indicated entries from the list + //HERE &&&; + } +} + + +string ActiveWorker::dump() const { + lock_guard lg(_aMtx); + return _dump(); +} + +string ActiveWorker::_dump() const { + stringstream os; + os << "ActiveWorker " << (_wqsData->dump()); + return os.str(); +} + + +void ActiveWorkerMap::updateMap(http::WorkerContactInfo::WCMap const& wcMap, http::CzarContactInfo::Ptr const& czInfo, std::string const& replicationInstanceId, std::string 
const& replicationAuthKey) { + // Go through wcMap, update existing entries in _awMap, create new entries for those that don't exist, + lock_guard awLg(_awMapMtx); + for (auto const& [wcKey, wcVal] : wcMap) { + auto iter = _awMap.find(wcKey); + if (iter == _awMap.end()) { + auto newAW = ActiveWorker::create(wcVal, czInfo, replicationInstanceId, replicationAuthKey); + _awMap[wcKey] = newAW; + } else { + auto aWorker = iter->second; + if (!aWorker->compareContactInfo(*wcVal)) { + // This should not happen, but try to handle it gracefully if it does. + LOGS(_log, LOG_LVL_WARN, cName(__func__) << " worker contact info changed for " << wcKey << " new=" << wcVal->dump() << " old=" << aWorker->dump()); + aWorker->setWorkerContactInfo(wcVal); + } + } + } +} + +/* &&& +void ActiveWorkerMap::pruneMap() { + lock_guard awLg(_awMapMtx); + for (auto iter = _awMap.begin(); iter != _awMap.end();) { + auto aWorker = iter->second; + if (aWorker->getWInfo()->timeSinceTouchSeconds() > _maxDeadTimeSeconds) { + iter = _awMap.erase(iter); + } else { + ++iter; + } + } +} +*/ + +void ActiveWorkerMap::sendActiveWorkersMessages() { + // Send messages to each active worker as needed + lock_guard lck(_awMapMtx); + for(auto&& [wName, awPtr] : _awMap) { + awPtr->updateStateAndSendMessages(_timeoutAliveSecs, _timeoutDeadSecs, _maxLifetime); + } +} + } // namespace lsst::qserv::czar diff --git a/src/czar/ActiveWorker.h b/src/czar/ActiveWorker.h index 8a2a6da6e..0db7a0d76 100644 --- a/src/czar/ActiveWorker.h +++ b/src/czar/ActiveWorker.h @@ -28,24 +28,37 @@ #include #include +// Third party headers +#include "nlohmann/json.hpp" + // qserv headers -#include "global/clock_defs.h" -#include "global/intTypes.h" +// &&& #include "global/clock_defs.h" +// &&& #include "global/intTypes.h" +#include "http/WorkerQueryStatusData.h" + // This header declarations namespace lsst::qserv::czar { + +/* &&& +/// &&& doc This class just contains the worker id and network communication +/// information, but it may be desirable to store connections to the +/// worker here as well. class WorkerContactInfo { public: using Ptr = std::shared_ptr; + using WCMap = std::unordered_map; + using WCMapPtr = std::shared_ptr; + WorkerContactInfo(std::string const& wId_, std::string const& wHost_, std::string const& wManagementHost_, int wPort_, TIMEPOINT updateTime_) : wId(wId_), wHost(wHost_), wManagementHost(wManagementHost_), wPort(wPort_) { - touchedChanged(updateTime_, false); + regUpdateTime(updateTime_); } std::string const wId; ///< key std::string const wHost; ///< "host-addr" entry. @@ -54,71 +67,188 @@ class WorkerContactInfo { /// Return true if all members, aside from updateTime, are equal. - bool sameContactInfo(WorkerContactInfo const& other) const { + bool isSameContactInfo(WorkerContactInfo const& other) const { return (wId == other.wId && wHost == other.wHost && wManagementHost == other.wManagementHost && wPort == other.wPort); } - /// To be called when the worker list was updated and there was a change. - void touchedChanged(TIMEPOINT updateTime, bool missing) { - _missing = missing; - touchedNoChange(updateTime); + void regUpdateTime(TIMEPOINT updateTime) { + std::lock_guard lg(_rMtx); + _regUpdate = updateTime; } - /// To be called when the worker list was updated and there was no change. 
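regUpdateTime() above and timeSinceRegUpdateSeconds() just below (both retained in http::WorkerContactInfo after this move) share one pattern: a timestamp guarded by a mutex, with its age read back as fractional seconds. A standalone version, assuming a std::chrono clock stands in for CLOCK/TIMEPOINT:

    #include <cassert>
    #include <chrono>
    #include <mutex>

    class GuardedStamp {
    public:
        void update(std::chrono::system_clock::time_point t = std::chrono::system_clock::now()) {
            std::lock_guard<std::mutex> lg(_mtx);
            _last = t;
        }
        double secondsSinceUpdate() const {
            std::lock_guard<std::mutex> lg(_mtx);
            return std::chrono::duration<double>(std::chrono::system_clock::now() - _last).count();
        }

    private:
        std::chrono::system_clock::time_point _last = std::chrono::system_clock::now();
        mutable std::mutex _mtx;  // readers lock too, hence mutable
    };

    int main() {
        GuardedStamp s;
        s.update();
        assert(s.secondsSinceUpdate() >= 0.0);
    }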
-    void touchedNoChange(TIMEPOINT updateTime = CLOCK::now()) {
-        if (!_missing) {
-            _lastTouch = updateTime;
-        }
+    double timeSinceRegUpdateSeconds() const {
+        std::lock_guard<std::mutex> lg(_rMtx);
+        double secs = std::chrono::duration<double>(CLOCK::now() - _regUpdate).count();
+        return secs;
     }

-    double timeSinceTouchSeconds() {
-        double secs = std::chrono::duration_cast<std::chrono::seconds>(CLOCK::now() - _lastTouch).count();
-        return secs;
+    TIMEPOINT getRegUpdate() const {
+        std::lock_guard<std::mutex> lg(_rMtx);
+        return _regUpdate;
     }

     std::string dump() const;

 private:
-    TIMEPOINT _lastTouch;   ///< Last time this worker believed to be active.
-    bool _missing = false;  ///< True if the worker was missing after the last change.
+    /// Last time the registry heard from this worker. The ActiveWorker class
+    /// will use this to determine the worker's state.
+    /// &&& Store in seconds since epoch to make atomic?
+    TIMEPOINT _regUpdate;
+
+    mutable std::mutex _rMtx;  ///< protects _regUpdate
 };
+*/

-/// &&& doc - maintain list of done/cancelled queries for an active worker, and send that
-///           list to the worker. Once the worker has accepted the list, remove all
-///           of those queryId's from the list.
-class ActiveWorker {
+/// &&& doc - maintain a list of done/cancelled queries for an active worker, and send
+///           that list to the worker. Once the worker has accepted the list, remove
+///           all of those queryId's from the list.
+///         - maintain a list of killed UberJobs. If an UberJob is killed, nothing
+///           will ever look for its files, so they should be deleted, and the
+///           worker should avoid working on Tasks for that UberJob.
+///           The only UberJob deaths that need to be sent to a worker are those where
+///           the czar killed the UberJob because the worker died/vanished, and
+///           the only time this would be sent is when a worker came back from
+///           the dead.
+///           The reason this only applies to died/vanished workers is that all
+///           other workers know their UberJobs are dead because the worker killed
+///           them. If the worker isn't told, it will continue working on
+///           the UberJob until it finishes, and then find out the UberJob was killed
+///           when it tries to return results to the czar (the worker should delete
+///           the files for said UberJob at that point).
+///           So this should be very rare, it only results in extra load, and it is
+///           therefore a low priority.
+///
+/// If a worker goes missing from the registry, it is considered DEAD and will be
+/// removed after a period of time.
+/// If a worker hasn't been heard from in (timeout period), it is considered QUESTIONABLE.
+/// When switching to QUESTIONABLE, a message will be sent to the worker asking
+/// for an update.
+/// If a QUESTIONABLE worker hasn't been heard from in (timeout period), its state is changed
+/// to LOST_CONTACT and a message is sent to the worker asking for an update.
+/// If a LOST_CONTACT worker hasn't been heard from in (timeout period), it becomes DEAD.
+///
+/// When a worker becomes DEAD (this should probably all happen in _monitor):
+///   - Affected UberJobs are killed.
+///   - Maps are remade without the dead workers.
+///   - UberJobs are built to handle unassigned jobs.
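The bookkeeping described above, remembering finished/cancelled queries with a timestamp and reporting them until they expire, matches the pruning loops in updateStateAndSendMessages(). A standalone sketch, with QueryIdT standing in for lsst::qserv::QueryId:

    #include <chrono>
    #include <cstdint>
    #include <map>
    #include <vector>

    using QueryIdT = std::uint64_t;  // stands in for lsst::qserv::QueryId
    using TimeT = std::chrono::system_clock::time_point;

    // Return the ids still worth sending to the worker; entries older than
    // maxLifetimeSecs are reported one last time and then forgotten.
    std::vector<QueryIdT> collectAndExpire(std::map<QueryIdT, TimeT>& doneQueries, double maxLifetimeSecs) {
        std::vector<QueryIdT> toSend;
        auto const now = std::chrono::system_clock::now();
        auto iter = doneQueries.begin();
        while (iter != doneQueries.end()) {
            toSend.push_back(iter->first);
            double ageSecs = std::chrono::duration<double>(now - iter->second).count();
            if (ageSecs > maxLifetimeSecs) {
                iter = doneQueries.erase(iter);  // too old, stop tracking it
            } else {
                ++iter;
            }
        }
        return toSend;
    }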
+/// +class ActiveWorker : public std::enable_shared_from_this { public: using Ptr = std::shared_ptr; + enum State { + ALIVE = 0, + QUESTIONABLE, + DEAD + }; + ActiveWorker() = delete; ActiveWorker(ActiveWorker const&) = delete; ActiveWorker& operator=(ActiveWorker const&) = delete; - static Ptr create(WorkerContactInfo::Ptr const& wInfo) { - return Ptr(new ActiveWorker(wInfo)); + std::string cName(const char* fName) { + return std::string("ActiveWorker::") + fName + " " + ((_wqsData == nullptr) ? "?" : _wqsData->dump()); + } + + static std::string getStateStr(State st); + + static Ptr create(http::WorkerContactInfo::Ptr const& wInfo, http::CzarContactInfo::Ptr const& czInfo, + std::string const& replicationInstanceId, std::string const& replicationAuthKey) { + return Ptr(new ActiveWorker(wInfo, czInfo, replicationInstanceId, replicationAuthKey)); + } + + http::WorkerContactInfo::Ptr getWInfo() const { + if (_wqsData == nullptr) return nullptr; + return _wqsData->_wInfo; } ~ActiveWorker() = default; + /// &&& doc + bool compareContactInfo(http::WorkerContactInfo const& wcInfo) const; + + void setWorkerContactInfo(http::WorkerContactInfo::Ptr const& wcInfo); + + /// &&& doc + void updateStateAndSendMessages(double timeoutAliveSecs, double timeoutDeadSecs, double maxLifetime); + + std::string dump() const; + private: - ActiveWorker(WorkerContactInfo::Ptr const& wInfo) : _wInfo(wInfo) {} + ///&&&ActiveWorker(WorkerContactInfo::Ptr const& wInfo) : _wInfo(wInfo) {} + ActiveWorker(http::WorkerContactInfo::Ptr const& wInfo, http::CzarContactInfo::Ptr const& czInfo, + std::string const& replicationInstanceId, std::string const& replicationAuthKey) + : _wqsData(http::WorkerQueryStatusData::create(wInfo, czInfo, replicationInstanceId, replicationAuthKey)) {} + + /// &&& doc + /// _aMtx must be held before calling. + void _changeStateTo(State newState, double secsSinceUpdate, std::string const& note); + + /// &&& doc + void _sendStatusMsg(std::shared_ptr const& jsWorkerReqPtr); + + /// &&& doc + /// _aMtx must be held before calling. + std::string _dump() const; + + /* &&& + std::map _qIdDoneKeepFiles; ///< &&& doc - limit reached + std::map _qIdDoneDeleteFiles; ///< &&& doc -cancelled/finished + std::map> _qIdDeadUberJobs; ///< &&& doc + + /// &&& TODO:UJ Worth the effort to inform worker of killed UberJobs? + //std::map> _killedUberJobs; + + WorkerContactInfo::Ptr _wInfo; ///< &&& doc + */ + http::WorkerQueryStatusData::Ptr _wqsData; ///< &&& doc - std::set _qIdDoneKeepFiles; ///< &&& doc - limit reached - std::set _qIdDoneDeleteFiles; ///< &&& doc -cancelled/finished + State _state{QUESTIONABLE}; ///< current state of this worker. - WorkerContactInfo::Ptr const _wInfo; + mutable std::mutex _aMtx; ///< protects _wInfo, _state, _qIdDoneKeepFiles, _qIdDoneDeleteFiles + + /// The number of communication threads currently in use by this class instance. + std::atomic _conThreadCount{0}; + int _maxConThreadCount{2}; + + /// &&& doc + /// @throws std::invalid_argument + bool _parse(nlohmann::json const& jsWorkerReq); // &&& delete after basic testing }; /// &&& doc +/// Maintain a list of all workers, indicating which are considered active. Communication +/// problems with workers could cause interesting race conditions, so workers will remain +/// on the list for a very long time after they have disappeared in the off chance they +/// come back from the dead. 
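ActiveWorkerMap::updateMap() (defined in ActiveWorker.cc above) is a create-or-compare pass over the registry's snapshot: unknown workers get a new entry, known workers only have their contact info replaced when the registry reports something different. In isolation, with a toy ContactInfo type rather than the real http::WorkerContactInfo:

    #include <map>
    #include <memory>
    #include <string>

    struct ContactInfo {
        std::string host;
        int port = 0;
        bool operator==(ContactInfo const& o) const { return host == o.host && port == o.port; }
    };

    void updateMap(std::map<std::string, std::shared_ptr<ContactInfo>>& awMap,
                   std::map<std::string, ContactInfo> const& registryMap) {
        for (auto const& [wId, info] : registryMap) {
            auto iter = awMap.find(wId);
            if (iter == awMap.end()) {
                awMap[wId] = std::make_shared<ContactInfo>(info);  // new worker
            } else if (!(*iter->second == info)) {
                *iter->second = info;  // contact info changed; unusual, but handled
            }
        }
    }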
class ActiveWorkerMap { public: ActiveWorkerMap() = default; + ActiveWorkerMap(ActiveWorkerMap const&) = delete; + ActiveWorkerMap operator=(ActiveWorkerMap const&) = delete; + ~ActiveWorkerMap() = default; + + std::string cName(const char* fName) { + return std::string("ActiveWorkerMap::") + fName + " "; + } - void updateMap(std::string const& wId); + /// &&& doc + void updateMap(http::WorkerContactInfo::WCMap const& wcMap, http::CzarContactInfo::Ptr const& czInfo, std::string const& replicationInstanceId, std::string const& replicationAuthKey); + + //&&&void pruneMap(); /// &&& may not be needed ??? + + // &&& doc + void sendActiveWorkersMessages(); private: std::map _awMap; + std::mutex _awMapMtx; ///< protects _awMap; + + //&&&double const _maxDeadTimeSeconds = 60.0 * 15.0; ///< &&& set from config. + double _timeoutAliveSecs = 60.0 * 5.0; ///< &&& set from config. 5min + double _timeoutDeadSecs = 60.0 * 10.0; ///< &&& set from config. 10min + double _maxLifetime = 60.0 * 60.0; ///< &&& set from config. 1hr }; } // namespace lsst::qserv::czar diff --git a/src/czar/Czar.cc b/src/czar/Czar.cc index 0c949c2c8..75bae4107 100644 --- a/src/czar/Czar.cc +++ b/src/czar/Czar.cc @@ -43,6 +43,7 @@ #include "ccontrol/UserQueryResources.h" #include "ccontrol/UserQuerySelect.h" #include "ccontrol/UserQueryType.h" +#include "czar/ActiveWorker.h" #include "czar/CzarChunkMap.h" #include "czar/CzarErrors.h" #include "czar/HttpSvc.h" @@ -96,11 +97,14 @@ void Czar::_monitor() { /// Check database for changes in worker chunk assignments and aliveness _czarFamilyMap->read(); - // TODO:UJ DM-45470 If there were changes in `_czarFamilyMap`, + // old TODO:UJ DM-45470 If there were changes in `_czarFamilyMap`, // see if any workers went down. If any did, `_unassign` all // Jobs in UberJobs for the downed workers. The `_unassigned` // Jobs should get reassigned in the next section `assignJobsToUberJobs`. + // &&& Send appropriate messages to all ActiveWorkers + _czarRegistry->sendActiveWorkersMessages(); + /// Create new UberJobs (if possible) for all jobs that are /// unassigned for any reason. map> execMap; @@ -139,6 +143,11 @@ void Czar::_monitor() { // the czar about a cancelled user query, or the executive for that // query cannot be found, the worker should cancel all Tasks associated // with that queryId. + // &&& Go through the ActiveWorkerMap. Each ActiveWorker instance has a list of QueryIds + // that have not yet been acknowledged by the worker, so send a message to each worker + // with that list. + + } } @@ -148,7 +157,8 @@ Czar::Czar(string const& configFilePath, string const& czarName) _czarConfig(cconfig::CzarConfig::create(configFilePath, czarName)), _idCounter(), _uqFactory(), - _clientToQuery() { + _clientToQuery(), + _activeWorkerMap(new ActiveWorkerMap()){ // set id counter to milliseconds since the epoch, mod 1 year. struct timeval tv; gettimeofday(&tv, nullptr); diff --git a/src/czar/Czar.h b/src/czar/Czar.h index 9a39eacce..bf4131a2b 100644 --- a/src/czar/Czar.h +++ b/src/czar/Czar.h @@ -52,6 +52,7 @@ class CzarConfig; } // namespace lsst::qserv::cconfig namespace lsst::qserv::czar { +class ActiveWorkerMap; class HttpSvc; } // namespace lsst::qserv::czar @@ -215,8 +216,11 @@ class Czar { /// Set to false on system shutdown to stop _monitorThrd. std::atomic _monitorLoop{true}; - std::chrono::milliseconds _monitorSleepTime{ - 15000}; ///< Wait time between checks. TODO:UJ set from config + + /// Wait time between checks. 
TODO:UJ set from config
+    std::chrono::milliseconds _monitorSleepTime{15000};
+
+    std::unique_ptr<ActiveWorkerMap> _activeWorkerMap;
 };
 
 }  // namespace lsst::qserv::czar
diff --git a/src/czar/CzarChunkMap.cc b/src/czar/CzarChunkMap.cc
index 166c6414b..7116aa1cc 100644
--- a/src/czar/CzarChunkMap.cc
+++ b/src/czar/CzarChunkMap.cc
@@ -300,6 +300,9 @@ bool CzarFamilyMap::_read() {
         return false;
     }
 
+    // &&& TODO:UJ Before makeNewMaps(), get a list of workers considered to be alive by
+    // czar::_activeWorkerMap, give that list to makeNewMaps(), and don't add workers to the
+    // maps that aren't on the list. &&& !!!
+
     // Make the new maps.
     shared_ptr<FamilyMapType> familyMapPtr = makeNewMaps(qChunkMap);
 
diff --git a/src/czar/CzarRegistry.cc b/src/czar/CzarRegistry.cc
index 8f2e2529c..0824d0ad8 100644
--- a/src/czar/CzarRegistry.cc
+++ b/src/czar/CzarRegistry.cc
@@ -103,6 +103,9 @@ void CzarRegistry::_registryUpdateLoop() {
 void CzarRegistry::_registryWorkerInfoLoop() {
     // Get worker information from the registry
 
+    string const replicationInstanceId = _czarConfig->replicationInstanceId();
+    string const replicationAuthKey = _czarConfig->replicationAuthKey();
+
     vector<string> const headers;
     auto const method = http::Method::GET;
     string const url = "http://" + _czarConfig->replicationRegistryHost() + ":" +
@@ -119,13 +122,16 @@ void CzarRegistry::_registryWorkerInfoLoop() {
             LOGS(_log, LOG_LVL_ERROR, requestContext + " was denied, error: '" + error + "'.");
             // TODO: Is there a better thing to do than just log this here?
         } else {
-            WorkerContactMapPtr wMap = _buildMapFromJson(response);
+            http::WorkerContactInfo::WCMapPtr wMap = _buildMapFromJson(response);
             // Compare the new map to the existing map and replace if different.
             {
+                auto czInfo = http::CzarContactInfo::create(_czarConfig->name(), _czarConfig->id(),
+                                                            _czarConfig->replicationHttpPort(),
+                                                            util::get_current_host_fqdn());
                 lock_guard<mutex> lck(_mapMtx);
-                if (wMap != nullptr && !_compareMap(*wMap)) {
+                if (wMap != nullptr && !_compareMapContactInfo(*wMap)) {
                     _contactMap = wMap;
-                    _latestUpdate = CLOCK::now();
+                    _latestMapUpdate = CLOCK::now();
+                    _activeWorkerMap.updateMap(*_contactMap, czInfo, replicationInstanceId,
+                                               replicationAuthKey);
+                }
             }
         }
@@ -137,10 +143,10 @@
     }
 }
 
-CzarRegistry::WorkerContactMapPtr CzarRegistry::_buildMapFromJson(nlohmann::json const& response) {
+http::WorkerContactInfo::WCMapPtr CzarRegistry::_buildMapFromJson(nlohmann::json const& response) {
     auto const& jsServices = response.at("services");
     auto const& jsWorkers = jsServices.at("workers");
-    auto wMap = WorkerContactMapPtr(new WorkerContactMap());
+    auto wMap = http::WorkerContactInfo::WCMapPtr(new http::WorkerContactInfo::WCMap());
     for (auto const& [key, value] : jsWorkers.items()) {
         auto const& jsQserv = value.at("qserv");
         LOGS(_log, LOG_LVL_DEBUG, __func__ << " key=" << key << " jsQ=" << jsQserv);
@@ -149,14 +155,13 @@
         int wPort = jsQserv.at("management-port").get<int>();
         uint64_t updateTimeInt = jsQserv.at("update-time-ms").get<uint64_t>();
         TIMEPOINT updateTime = TIMEPOINT(chrono::milliseconds(updateTimeInt));
-        //&&&auto wInfo = make_shared<WorkerContactInfo>(key, wHost, wManagementHost, wPort, updateTime);
-        auto wInfo = make_shared<WorkerContactInfo>(key, wHost, wManagementHost, wPort);
+        auto wInfo = make_shared<http::WorkerContactInfo>(key, wHost, wManagementHost, wPort, updateTime);
         LOGS(_log, LOG_LVL_DEBUG,
              __func__ << " wHost=" << wHost << " wPort=" << wPort << " updateTime=" << updateTimeInt);
         auto iter = wMap->find(key);
         if (iter != wMap->end()) {
             LOGS(_log,
LOG_LVL_ERROR, __func__ << " duplicate key " << key << " in " << response); - if (!wInfo->sameContactInfo(*(iter->second))) { + if (!wInfo->isSameContactInfo(*(iter->second))) { LOGS(_log, LOG_LVL_ERROR, __func__ << " incongruent key " << key << " in " << response); return nullptr; } @@ -168,7 +173,7 @@ CzarRegistry::WorkerContactMapPtr CzarRegistry::_buildMapFromJson(nlohmann::json return wMap; } -bool CzarRegistry::_compareMap(WorkerContactMap const& other) const { +bool CzarRegistry::_compareMapContactInfo(http::WorkerContactInfo::WCMap const& other) const { if (_contactMap == nullptr) { // If _contactMap is null, it needs to be replaced. return false; @@ -181,7 +186,7 @@ bool CzarRegistry::_compareMap(WorkerContactMap const& other) const { if (iter == other.end()) { return false; } else { - if (!(iter->second->sameContactInfo(*wInfo))) { + if (!(iter->second->isSameContactInfo(*wInfo))) { return false; } } @@ -189,4 +194,10 @@ bool CzarRegistry::_compareMap(WorkerContactMap const& other) const { return true; } +void CzarRegistry::sendActiveWorkersMessages() { + // Send messages to each active worker as needed + lock_guard lck(_mapMtx); + _activeWorkerMap.sendActiveWorkersMessages(); +} + } // namespace lsst::qserv::czar diff --git a/src/czar/CzarRegistry.h b/src/czar/CzarRegistry.h index 10e4af85d..e1e52a6e1 100644 --- a/src/czar/CzarRegistry.h +++ b/src/czar/CzarRegistry.h @@ -67,42 +67,16 @@ class CzarRegistry { ~CzarRegistry(); - /* &&& - struct WorkerContactInfo { - using Ptr = std::shared_ptr; - - WorkerContactInfo(std::string const& wId_, std::string const& wHost_, - std::string const& wManagementHost_, int wPort_, TIMEPOINT updateTime_) - : wId(wId_), - wHost(wHost_), - wManagementHost(wManagementHost_), - wPort(wPort_), - updateTime(updateTime_) {} - std::string const wId; ///< key - std::string const wHost; ///< "host-addr" entry. - std::string const wManagementHost; ///< "management-host-name" entry. - int const wPort; ///< "management-port" entry. - TIMEPOINT const updateTime; ///< "update-time-ms" entry. - - /// Return true if all members, aside from updateTime, are equal. - bool sameContactInfo(WorkerContactInfo const& other) const { - return (wId == other.wId && wHost == other.wHost && wManagementHost == other.wManagementHost && - wPort == other.wPort); - } - std::string dump() const; - }; - */ - - using WorkerContactMap = std::unordered_map; - using WorkerContactMapPtr = std::shared_ptr; - /// Return _contactMap, the object that the returned pointer points to is /// constant and no attempts should be made to change it. - WorkerContactMapPtr getWorkerContactMap() { + http::WorkerContactInfo::WCMapPtr getWorkerContactMap() { std::lock_guard lockG(_mapMtx); return _contactMap; } + /// &&& doc + void sendActiveWorkersMessages(); + private: CzarRegistry() = delete; CzarRegistry(std::shared_ptr const& czarConfig); @@ -118,10 +92,10 @@ class CzarRegistry { void _registryWorkerInfoLoop(); /// Build a new WorkerContactMap from the json `response` - WorkerContactMapPtr _buildMapFromJson(nlohmann::json const& response); + http::WorkerContactInfo::WCMapPtr _buildMapFromJson(nlohmann::json const& response); - /// Return true if maps are the same size and all of the elements are the same(). - bool _compareMap(WorkerContactMap const& other) const; + /// Return true if maps are the same size and all of the elements have the same contact info. 
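A compact illustration of the comparison that comment describes, before the declaration itself: two maps count as equal when they have the same keys and, per key, the same contact fields; registry-update timestamps are deliberately ignored. WInfo is a hypothetical stand-in for http::WorkerContactInfo.

    #include <iostream>
    #include <string>
    #include <unordered_map>

    struct WInfo {
        std::string host;
        int port;
        bool isSame(WInfo const& o) const { return host == o.host && port == o.port; }
    };

    bool sameContactMaps(std::unordered_map<std::string, WInfo> const& a,
                         std::unordered_map<std::string, WInfo> const& b) {
        if (a.size() != b.size()) return false;  // differing key sets can never match
        for (auto const& [key, info] : a) {
            auto it = b.find(key);
            if (it == b.end() || !it->second.isSame(info)) return false;
        }
        return true;
    }

    int main() {
        std::unordered_map<std::string, WInfo> m1{{"w1", {"host-a", 3421}}};
        std::unordered_map<std::string, WInfo> m2{{"w1", {"host-a", 3421}}};
        std::cout << sameContactMaps(m1, m2) << "\n";  // prints 1
    }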
+ bool _compareMapContactInfo(http::WorkerContactInfo::WCMap const& other) const; std::shared_ptr const _czarConfig; ///< Pointer to the CzarConfig. @@ -130,9 +104,12 @@ class CzarRegistry { std::thread _czarWorkerInfoThrd; ///< This thread continuously collects worker contact information. /// Pointer to the map of worker contact information. - WorkerContactMapPtr _contactMap; - TIMEPOINT _latestUpdate; ///< The last time the _contactMap was updated. - std::mutex _mapMtx; /// Protects _contactMap, _latestUpdate. + http::WorkerContactInfo::WCMapPtr _contactMap; + TIMEPOINT _latestMapUpdate; ///< The last time the _contactMap was updated, unrelated to WorkerContactInfo update. + // &&& review how this _mapMtx is used, probably locks for too long a period. + std::mutex _mapMtx; /// Protects _contactMap, _latestUpdate, _activeWorkerMap + + ActiveWorkerMap _activeWorkerMap; ///< Map of workers czar considers active. }; } // namespace lsst::qserv::czar diff --git a/src/global/intTypes.h b/src/global/intTypes.h index c3a6f7fb0..f4b4197f7 100644 --- a/src/global/intTypes.h +++ b/src/global/intTypes.h @@ -39,6 +39,7 @@ typedef std::vector Int32Vector; typedef std::uint64_t QueryId; typedef std::int64_t JobId; typedef JobId UberJobId; // These must be the same type. +typedef std::uint32_t CzarIdType; // TODO:UJ remove qmeta::CzarId and rename this CzarId /// Class to provide a consistent format for QueryIds in the log file class QueryIdHelper { diff --git a/src/http/CMakeLists.txt b/src/http/CMakeLists.txt index 454d4ab88..61097f9f2 100644 --- a/src/http/CMakeLists.txt +++ b/src/http/CMakeLists.txt @@ -19,6 +19,7 @@ target_sources(http PRIVATE RequestBodyJSON.cc RequestQuery.cc Url.cc + WorkerQueryStatusData.cc ) target_link_libraries(http PUBLIC @@ -51,5 +52,6 @@ http_tests( testAsyncReq testRequestBodyJSON testRequestQuery + testStatusData testUrl ) diff --git a/src/http/WorkerQueryStatusData.cc b/src/http/WorkerQueryStatusData.cc new file mode 100644 index 000000000..cd254f7c0 --- /dev/null +++ b/src/http/WorkerQueryStatusData.cc @@ -0,0 +1,331 @@ +/* + * LSST Data Management System + * + * This product includes software developed by the + * LSST Project (http://www.lsst.org/). + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the LSST License Statement and + * the GNU General Public License along with this program. If not, + * see . 
+ */
+
+// Class header
+#include "http/WorkerQueryStatusData.h"
+
+// System headers
+#include <sstream>
+
+// Qserv headers
+#include "http/Client.h"
+#include "http/MetaModule.h"
+#include "http/RequestBodyJSON.h"
+#include "util/common.h"
+
+// LSST headers
+#include "lsst/log/Log.h"
+
+using namespace std;
+using namespace nlohmann;
+
+namespace {
+LOG_LOGGER _log = LOG_GET("lsst.qserv.http.WorkerQueryStatusData");
+}  // namespace
+
+namespace lsst::qserv::http {
+
+json CzarContactInfo::serializeJson() const {
+    json jsCzar;
+    jsCzar["name"] = czName;
+    jsCzar["id"] = czId;
+    jsCzar["management-port"] = czPort;
+    jsCzar["management-host-name"] = czHostName;
+    return jsCzar;
+}
+
+CzarContactInfo::Ptr CzarContactInfo::createJson(nlohmann::json const& czJson) {
+    try {
+        auto czName_ = RequestBodyJSON::required<string>(czJson, "name");
+        auto czId_ = RequestBodyJSON::required<CzarIdType>(czJson, "id");
+        auto czPort_ = RequestBodyJSON::required<int>(czJson, "management-port");
+        auto czHostName_ = RequestBodyJSON::required<string>(czJson, "management-host-name");
+        return create(czName_, czId_, czPort_, czHostName_);
+    } catch (invalid_argument const& exc) {
+        LOGS(_log, LOG_LVL_ERROR, string("CzarContactInfo::createJson invalid ") << exc.what());
+    }
+    return nullptr;
+}
+
+std::string CzarContactInfo::dump() const {
+    stringstream os;
+    os << "czName=" << czName << " czId=" << czId << " czPort=" << czPort << " czHostName=" << czHostName;
+    return os.str();
+}
+
+json WorkerContactInfo::serializeJson() const {
+    json jsWorker;
+    jsWorker["id"] = wId;
+    jsWorker["host"] = wHost;
+    jsWorker["management-host-name"] = wManagementHost;
+    jsWorker["management-port"] = wPort;
+    return jsWorker;
+}
+
+WorkerContactInfo::Ptr WorkerContactInfo::createJson(nlohmann::json const& wJson, TIMEPOINT updateTime_) {
+    LOGS(_log, LOG_LVL_ERROR, "WorkerContactInfo::createJson &&& a");
+    try {
+        auto wId_ = RequestBodyJSON::required<string>(wJson, "id");
+        LOGS(_log, LOG_LVL_ERROR, "WorkerContactInfo::createJson &&& b");
+        auto wHost_ = RequestBodyJSON::required<string>(wJson, "host");
+        LOGS(_log, LOG_LVL_ERROR, "WorkerContactInfo::createJson &&& c");
+        auto wManagementHost_ = RequestBodyJSON::required<string>(wJson, "management-host-name");
+        LOGS(_log, LOG_LVL_ERROR, "WorkerContactInfo::createJson &&& d");
+        auto wPort_ = RequestBodyJSON::required<int>(wJson, "management-port");
+        LOGS(_log, LOG_LVL_ERROR, "WorkerContactInfo::createJson &&& e");
+        return create(wId_, wHost_, wManagementHost_, wPort_, updateTime_);
+    } catch (invalid_argument const& exc) {
+        LOGS(_log, LOG_LVL_ERROR, string("WorkerContactInfo::createJson invalid ") << exc.what());
+    }
+    return nullptr;
+}
+
+string WorkerContactInfo::dump() const {
+    stringstream os;
+    os << "workerContactInfo{"
+       << "id=" << wId << " host=" << wHost << " mgHost=" << wManagementHost << " port=" << wPort << "}";
+    return os.str();
+}
+
+/* &&&
+string ActiveWorker::getStateStr(State st) {
+    switch (st) {
+        case ALIVE: return string("ALIVE");
+        case QUESTIONABLE: return string("QUESTIONABLE");
+        case DEAD: return string("DEAD");
+    }
+    return string("unknown");
+}
+
+bool WorkerQueryStatusData::compareContactInfo(WorkerContactInfo const& wcInfo) const {
+    return _wInfo->isSameContactInfo(wcInfo);
+}
+
+void WorkerQueryStatusData::setWorkerContactInfo(WorkerContactInfo::Ptr const& wcInfo) {
+    LOGS(_log, LOG_LVL_WARN, cName(__func__) << " new info=" << wcInfo->dump());
+    _wInfo = wcInfo;
+}
+*/
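serializeJson() and createJson() in this file are written as strict write/read pairs: every field one side emits, the other side reads back, so a round trip can be asserted in tests. A trimmed-down sketch of that contract (Contact is hypothetical; the field names echo the ones used above):

    #include <iostream>
    #include <string>
    #include <nlohmann/json.hpp>

    using nlohmann::json;

    // Hypothetical mirror of the serializeJson()/createJson() pairing:
    // parse(serialize(x)) reproduces x field for field.
    struct Contact {
        std::string name;
        int port;

        json serialize() const { return json{{"name", name}, {"management-port", port}}; }

        static Contact parse(json const& js) {
            // at() throws (like RequestBodyJSON::required) if a field is missing.
            return Contact{js.at("name").get<std::string>(), js.at("management-port").get<int>()};
        }
    };

    int main() {
        Contact a{"czar_name", 2022};
        Contact b = Contact::parse(a.serialize());
        std::cout << (a.name == b.name && a.port == b.port) << "\n";  // prints 1
    }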
+shared_ptr<json> WorkerQueryStatusData::serializeJson(double timeoutAliveSecs, double timeoutDeadSecs,
+                                                      double maxLifetime) {
+    // Go through the _qIdDoneKeepFiles, _qIdDoneDeleteFiles, and _qIdDeadUberJobs lists to build a
+    // message to send to the worker.
+    auto now = CLOCK::now();
+    //&&&auto const czarConfig = cconfig::CzarConfig::instance();
+
+    shared_ptr<json> jsWorkerReqPtr = make_shared<json>();
+    json& jsWorkerR = *jsWorkerReqPtr;
+    jsWorkerR["version"] = http::MetaModule::version;
+    /* &&&
+    jsWorkerR["instance_id"] = czarConfig->replicationInstanceId();
+    jsWorkerR["auth_key"] = czarConfig->replicationAuthKey();
+    */
+    jsWorkerR["instance_id"] = _replicationInstanceId;
+    jsWorkerR["auth_key"] = _replicationAuthKey;
+    //&&&jsWorkerR["worker"] = _wInfo->wId;
+    jsWorkerR["qiddonekeepfiles"] = json::array();
+    jsWorkerR["qiddonedeletefiles"] = json::array();
+    jsWorkerR["qiddeaduberjobs"] = json::array();
+    //&&&jsWorkerR["czar"] = json::object();
+    jsWorkerR["czar"] = _czInfo->serializeJson();
+    //&&&jsWorkerR["worker"] = json::object();
+    jsWorkerR["worker"] = _wInfo->serializeJson();
+
+    {
+        auto& jsDoneKeep = jsWorkerR["qiddonekeepfiles"];
+        auto iterDoneKeep = _qIdDoneKeepFiles.begin();
+        while (iterDoneKeep != _qIdDoneKeepFiles.end()) {
+            auto qId = iterDoneKeep->first;
+            jsDoneKeep.push_back(qId);
+            auto tmStamp = iterDoneKeep->second;
+            double ageSecs = std::chrono::duration<double>(now - tmStamp).count();
+            if (ageSecs > maxLifetime) {
+                iterDoneKeep = _qIdDoneKeepFiles.erase(iterDoneKeep);
+            } else {
+                ++iterDoneKeep;
+            }
+        }
+    }
+    {
+        auto& jsDoneDelete = jsWorkerR["qiddonedeletefiles"];
+        auto iterDoneDelete = _qIdDoneDeleteFiles.begin();
+        while (iterDoneDelete != _qIdDoneDeleteFiles.end()) {
+            auto qId = iterDoneDelete->first;
+            jsDoneDelete.push_back(qId);
+            auto tmStamp = iterDoneDelete->second;
+            double ageSecs = std::chrono::duration<double>(now - tmStamp).count();
+            if (ageSecs > maxLifetime) {
+                iterDoneDelete = _qIdDoneDeleteFiles.erase(iterDoneDelete);
+            } else {
+                ++iterDoneDelete;
+            }
+        }
+    }
+    {
+        auto& jsDeadUj = jsWorkerR["qiddeaduberjobs"];
+        auto iterDeadUjQid = _qIdDeadUberJobs.begin();
+        while (iterDeadUjQid != _qIdDeadUberJobs.end()) {
+            TIMEPOINT oldestTm;  // default is zero
+            auto qId = iterDeadUjQid->first;
+            auto& ujIdMap = iterDeadUjQid->second;
+
+            json jsQidUj = {{"qid", qId}, {"ujids", json::array()}};
+            auto& jsUjIds = jsQidUj["ujids"];
+
+            auto iterUjId = ujIdMap.begin();
+            bool addedUjId = false;
+            while (iterUjId != ujIdMap.end()) {
+                UberJobId ujId = iterUjId->first;
+                auto tmStamp = iterUjId->second;
+                if (tmStamp > oldestTm) {
+                    oldestTm = tmStamp;
+                }
+
+                jsUjIds.push_back(ujId);
+                addedUjId = true;
+                double ageSecs = std::chrono::duration<double>(now - tmStamp).count();
+                if (ageSecs > maxLifetime) {
+                    iterUjId = ujIdMap.erase(iterUjId);
+                } else {
+                    ++iterUjId;
+                }
+            }
+
+            if (addedUjId) {
+                jsDeadUj.push_back(jsQidUj);
+            }
+
+            if (ujIdMap.empty() ||
+                std::chrono::duration<double>(now - oldestTm).count() > maxLifetime) {
+                iterDeadUjQid = _qIdDeadUberJobs.erase(iterDeadUjQid);
+            } else {
+                ++iterDeadUjQid;
+            }
+        }
+    }
+
+    /* &&& happens in the caller now.
+    // Start a thread to send the message. (Maybe these should go on the qdisppool? &&&)
+    // put this in a different function and start the thread. &&&;
+    _sendStatusMsg(jsWorkerReqPtr);
+    */
+    return jsWorkerReqPtr;
+}
+
+WorkerQueryStatusData::Ptr WorkerQueryStatusData::createJson(nlohmann::json const& jsWorkerReq,
+                                                             std::string const& replicationInstanceId,
+                                                             std::string const& replicationAuthKey,
+                                                             TIMEPOINT updateTm) {
+    LOGS(_log, LOG_LVL_ERROR, "WorkerQueryStatusData::createJson &&& a");
+    try {
+        if (jsWorkerReq["version"] != http::MetaModule::version) {
+            LOGS(_log, LOG_LVL_ERROR, "WorkerQueryStatusData::createJson bad version");
+            return nullptr;
+        }
+
+        LOGS(_log, LOG_LVL_ERROR, "WorkerQueryStatusData::createJson &&& b");
+        auto czInfo_ = CzarContactInfo::createJson(jsWorkerReq["czar"]);
+        LOGS(_log, LOG_LVL_ERROR, "WorkerQueryStatusData::createJson &&& c");
+        auto wInfo_ = WorkerContactInfo::createJson(jsWorkerReq["worker"], updateTm);
+        LOGS(_log, LOG_LVL_ERROR, "WorkerQueryStatusData::createJson &&& d");
+        if (czInfo_ == nullptr || wInfo_ == nullptr) {
+            LOGS(_log, LOG_LVL_ERROR,
+                 "WorkerQueryStatusData::createJson czar or worker info could not be parsed in "
+                         << jsWorkerReq);
+        }
+        auto wqsData =
+                WorkerQueryStatusData::create(wInfo_, czInfo_, replicationInstanceId, replicationAuthKey);
+        LOGS(_log, LOG_LVL_ERROR, "WorkerQueryStatusData::createJson &&& e");
+
+        auto parseRes = wqsData->_parseLists(jsWorkerReq, updateTm);
+        if (!parseRes) {
+            LOGS(_log, LOG_LVL_ERROR,
+                 "WorkerQueryStatusData::createJson error reading lists in " << jsWorkerReq);
+            return nullptr;
+        }
+        LOGS(_log, LOG_LVL_ERROR, "WorkerQueryStatusData::createJson &&& end");
+        return wqsData;
+    } catch (invalid_argument const& exc) {
+        LOGS(_log, LOG_LVL_ERROR, string("WorkerQueryStatusData::createJson invalid ") << exc.what());
+    }
+    return nullptr;
+}
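The _parseLists() function below guards the dead-UberJob list with a size() check before iterating. A standalone sketch of reading that list shape defensively with nlohmann::json (the field names match the ones serialized above):

    #include <iostream>
    #include <nlohmann/json.hpp>

    using nlohmann::json;

    int main() {
        json req;
        req["qiddeaduberjobs"] = json::array();  // empty list, as the czar serializes it

        auto const& jsDead = req["qiddeaduberjobs"];
        // Guard before walking the list, mirroring the size() check in _parseLists;
        // each element is expected to look like {"qid": ..., "ujids": [...]}.
        if (jsDead.is_array() && jsDead.size() > 0) {
            for (auto const& entry : jsDead) {
                std::cout << "qid=" << entry.at("qid") << " ujids=" << entry.at("ujids") << "\n";
            }
        } else {
            std::cout << "no dead UberJobs\n";
        }
    }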
+bool WorkerQueryStatusData::_parseLists(nlohmann::json const& jsWorkerReq, TIMEPOINT updateTm) {
+    try {
+        LOGS(_log, LOG_LVL_ERROR, cName(__func__) << " &&& a");
+        auto& jsQIdDoneKeepFiles = jsWorkerReq["qiddonekeepfiles"];
+        LOGS(_log, LOG_LVL_ERROR, cName(__func__) << " &&& b");
+        for (auto const& qidKeep : jsQIdDoneKeepFiles) {
+            LOGS(_log, LOG_LVL_ERROR, cName(__func__) << " &&& b1");
+            _qIdDoneKeepFiles[qidKeep] = updateTm;
+        }
+
+        LOGS(_log, LOG_LVL_ERROR, cName(__func__) << " &&& c");
+        auto& jsQIdDoneDeleteFiles = jsWorkerReq["qiddonedeletefiles"];
+        LOGS(_log, LOG_LVL_ERROR, cName(__func__) << " &&& d");
+        for (auto const& qidDelete : jsQIdDoneDeleteFiles) {
+            LOGS(_log, LOG_LVL_ERROR, cName(__func__) << " &&& d1");
+            _qIdDoneDeleteFiles[qidDelete] = updateTm;
+        }
+
+        LOGS(_log, LOG_LVL_ERROR, cName(__func__) << " &&& e");
+        auto& jsQIdDeadUberJobs = jsWorkerReq["qiddeaduberjobs"];
+        LOGS(_log, LOG_LVL_ERROR, cName(__func__) << " &&& f jsQIdDeadUberJobs=" << jsQIdDeadUberJobs);
+        // Interestingly, !jsQIdDeadUberJobs.empty() doesn't work, but .size() > 0 does.
+        // Not having the size() check causes issues with the for loop trying to read the
+        // first element of an empty list, which goes badly.
+        if (jsQIdDeadUberJobs.size() > 0) {
+            LOGS(_log, LOG_LVL_ERROR, cName(__func__) << " &&& f1");
+            for (auto const& qDeadUjs : jsQIdDeadUberJobs) {
+                LOGS(_log, LOG_LVL_ERROR, cName(__func__) << " &&& f1a qDeadUjs=" << qDeadUjs);
+                QueryId qId = qDeadUjs["qid"];
+                auto const& ujIds = qDeadUjs["ujids"];
+                auto& mapOfUj = _qIdDeadUberJobs[qId];
+                for (auto const& ujId : ujIds) {
+                    LOGS(_log, LOG_LVL_ERROR,
+                         cName(__func__) << " &&& f1d1 qId=" << qId << " ujId=" << ujId);
+                    mapOfUj[ujId] = updateTm;
+                }
+            }
+        }
+    } catch (invalid_argument const& exc) {
+        LOGS(_log, LOG_LVL_ERROR, string("WorkerQueryStatusData::_parseLists invalid ") << exc.what());
+        return false;
+    }
+    return true;
+}
+
+void WorkerQueryStatusData::addDeadUberJobs(QueryId qId, std::vector<UberJobId> ujIds, TIMEPOINT tm) {
+    auto& ujMap = _qIdDeadUberJobs[qId];
+    for (auto const ujId : ujIds) {
+        ujMap[ujId] = tm;
+    }
+}
+
+string WorkerQueryStatusData::dump() const {
+    stringstream os;
+    os << "ActiveWorker " << ((_wInfo == nullptr) ? "?" : _wInfo->dump());
+    return os.str();
+}
+
+}  // namespace lsst::qserv::http
diff --git a/src/http/WorkerQueryStatusData.h b/src/http/WorkerQueryStatusData.h
new file mode 100644
index 000000000..f0f6c1aaa
--- /dev/null
+++ b/src/http/WorkerQueryStatusData.h
@@ -0,0 +1,234 @@
+/*
+ * LSST Data Management System
+ *
+ * This product includes software developed by the
+ * LSST Project (http://www.lsst.org/).
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the LSST License Statement and
+ * the GNU General Public License along with this program. If not,
+ * see <http://www.lsstcorp.org/LegalNotices/>.
+ */
+#ifndef LSST_QSERV_HTTP_WORKERQUERYSTATUSDATA_H
+#define LSST_QSERV_HTTP_WORKERQUERYSTATUSDATA_H
+
+// System headers
+#include <map>
+#include <memory>
+#include <mutex>
+#include <string>
+#include <unordered_map>
+
+// Third party headers
+#include "nlohmann/json.hpp"
+
+// qserv headers
+#include "global/clock_defs.h"
+#include "global/intTypes.h"
+
+// This header declarations
+namespace lsst::qserv::http {
+
+/// &&& doc
+class CzarContactInfo {
+public:
+    using Ptr = std::shared_ptr<CzarContactInfo>;
+    std::string cName(const char* fnc) const { return std::string("CzarContactInfo::") + fnc; }
+
+    CzarContactInfo() = delete;
+    CzarContactInfo(CzarContactInfo const&) = default;
+    CzarContactInfo& operator=(CzarContactInfo const&) = default;
+
+    /// &&& doc
+    bool compare(CzarContactInfo const& other) {
+        return (czName == other.czName && czId == other.czId && czPort == other.czPort &&
+                czHostName == other.czHostName);
+    }
+
+    static Ptr create(std::string const& czName_, CzarIdType czId_, int czPort_,
+                      std::string const& czHostName_) {
+        return Ptr(new CzarContactInfo(czName_, czId_, czPort_, czHostName_));
+    }
+
+    static Ptr createJson(nlohmann::json const& czarJson);
+
+    std::string const czName;      ///< czar "name"
+    CzarIdType const czId;         ///< czar "id"
+    int const czPort;              ///< czar "management-port"
+    std::string const czHostName;  ///< czar "management-host-name"
+
+    /// &&& doc
+    nlohmann::json serializeJson() const;
+
+    /// &&& doc
+    //&&&bool parse(nlohmann::json const& czarJson);
+
+    std::string dump() const;
+    /* &&&
+    auto& jsWCzar = jsWorkerR["czar"];
+    jsWCzar["name"] = czarConfig->name();
+    jsWCzar["id"] = czarConfig->id();
+    jsWCzar["management-port"] = czarConfig->replicationHttpPort();
+    jsWCzar["management-host-name"] = util::get_current_host_fqdn();
+    */
+private:
+    CzarContactInfo(std::string const& czName_, CzarIdType czId_, int czPort_,
+                    std::string const& czHostName_)
+            : czName(czName_), czId(czId_), czPort(czPort_), czHostName(czHostName_) {}
+};
+
+/// &&& doc This class just contains the worker id and network communication
+///     information, but it may be desirable to store connections to the
+///     worker here as well.
+class WorkerContactInfo {
+public:
+    using Ptr = std::shared_ptr<WorkerContactInfo>;
+
+    using WCMap = std::unordered_map<std::string, Ptr>;
+    using WCMapPtr = std::shared_ptr<WCMap>;
+
+    static Ptr create(std::string const& wId_, std::string const& wHost_,
+                      std::string const& wManagementHost_, int wPort_, TIMEPOINT updateTime_) {
+        return Ptr(new WorkerContactInfo(wId_, wHost_, wManagementHost_, wPort_, updateTime_));
+    }
+
+    /// &&& doc
+    static Ptr createJson(nlohmann::json const& workerJson, TIMEPOINT updateTime);
+
+    /// &&& doc
+    nlohmann::json serializeJson() const;
+
+    std::string cName(const char* fn) { return std::string("WorkerContactInfo::") + fn; }
+
+    /// &&& make private
+    WorkerContactInfo(std::string const& wId_, std::string const& wHost_,
+                      std::string const& wManagementHost_, int wPort_, TIMEPOINT updateTime_)
+            : wId(wId_), wHost(wHost_), wManagementHost(wManagementHost_), wPort(wPort_) {
+        regUpdateTime(updateTime_);
+    }
+    std::string const wId;              ///< key
+    std::string const wHost;            ///< "host-addr" entry.
+    std::string const wManagementHost;  ///< "management-host-name" entry.
+    int const wPort;                    ///< "management-port" entry.
+
+    /// Return true if all members, aside from updateTime, are equal.
+    bool isSameContactInfo(WorkerContactInfo const& other) const {
+        return (wId == other.wId && wHost == other.wHost && wManagementHost == other.wManagementHost &&
+                wPort == other.wPort);
+    }
+
+    void regUpdateTime(TIMEPOINT updateTime) {
+        std::lock_guard<std::mutex> lg(_rMtx);
+        _regUpdate = updateTime;
+    }
+
+    double timeSinceRegUpdateSeconds() const {
+        std::lock_guard<std::mutex> lg(_rMtx);
+        double secs = std::chrono::duration<double>(CLOCK::now() - _regUpdate).count();
+        return secs;
+    }
+
+    TIMEPOINT getRegUpdate() const {
+        std::lock_guard<std::mutex> lg(_rMtx);
+        return _regUpdate;
+    }
+
+    std::string dump() const;
+
+private:
+    /// Last time the registry heard from this worker. The ActiveWorker class
+    /// will use this to determine the worker's state.
+    /// &&& Store in seconds since epoch to make atomic?
+    TIMEPOINT _regUpdate;
+
+    mutable std::mutex _rMtx;  ///< protects _regUpdate
+};
+
+/// &&& doc
+class WorkerQueryStatusData {
+public:
+    using Ptr = std::shared_ptr<WorkerQueryStatusData>;
+
+    /* &&&
+    enum State {
+        ALIVE = 0,
+        QUESTIONABLE,
+        DEAD
+    };
+    */
+
+    WorkerQueryStatusData() = delete;
+    WorkerQueryStatusData(WorkerQueryStatusData const&) = delete;
+    WorkerQueryStatusData& operator=(WorkerQueryStatusData const&) = delete;
+
+    std::string cName(const char* fName) {
+        return std::string("WorkerQueryStatusData::") + fName + " " +
+               ((_wInfo == nullptr) ? "?" : _wInfo->wId);
+    }
+
+    //&&&static std::string getStateStr(State st);
+
+    static Ptr create(WorkerContactInfo::Ptr const& wInfo, CzarContactInfo::Ptr const& czInfo,
+                      std::string const& replicationInstanceId, std::string const& replicationAuthKey) {
+        return Ptr(new WorkerQueryStatusData(wInfo, czInfo, replicationInstanceId, replicationAuthKey));
+    }
+
+    /// &&& doc
+    static Ptr createJson(nlohmann::json const& czarJson, std::string const& replicationInstanceId,
+                          std::string const& replicationAuthKey, TIMEPOINT updateTm);
+
+    ~WorkerQueryStatusData() = default;
+
+    WorkerContactInfo::Ptr getWInfo() const { return _wInfo; }
+
+    /// &&& doc
+    void addDeadUberJobs(QueryId qId, std::vector<UberJobId> ujIds, TIMEPOINT tm);
+
+    std::string dump() const;
+
+//&&&private:
+    WorkerQueryStatusData(WorkerContactInfo::Ptr const& wInfo, CzarContactInfo::Ptr const& czInfo,
+                          std::string const& replicationInstanceId, std::string const& replicationAuthKey)
+            : _wInfo(wInfo),
+              _czInfo(czInfo),
+              _replicationInstanceId(replicationInstanceId),
+              _replicationAuthKey(replicationAuthKey) {}
+
+    std::map<QueryId, TIMEPOINT> _qIdDoneKeepFiles;    ///< &&& doc - limit reached
+    std::map<QueryId, TIMEPOINT> _qIdDoneDeleteFiles;  ///< &&& doc - cancelled/finished
+    std::map<QueryId, std::map<UberJobId, TIMEPOINT>> _qIdDeadUberJobs;  ///< &&& doc
+
+    /// &&& TODO:UJ Worth the effort to inform worker of killed UberJobs?
+    //std::map<QueryId, std::vector<UberJobId>> _killedUberJobs;
+
+    WorkerContactInfo::Ptr _wInfo;  ///< &&& doc
+    CzarContactInfo::Ptr _czInfo;   ///< &&& doc
+
+    std::string const _replicationInstanceId;  ///< &&& doc
+    std::string const _replicationAuthKey;     ///< &&& doc
+
+    /// &&& doc
+    std::shared_ptr<nlohmann::json> serializeJson(double timeoutAliveSecs, double timeoutDeadSecs,
+                                                  double maxLifetime);
+
+    /// &&& doc
+    /// @throws std::invalid_argument
+    bool _parseLists(nlohmann::json const& jsWorkerReq, TIMEPOINT updateTm);  // &&& delete after basic testing
+};
+
+}  // namespace lsst::qserv::http
+
+#endif  // LSST_QSERV_HTTP_WORKERQUERYSTATUSDATA_H
diff --git a/src/http/testStatusData.cc b/src/http/testStatusData.cc
new file mode 100644
index 000000000..97767dd9f
--- /dev/null
+++ b/src/http/testStatusData.cc
@@ -0,0 +1,140 @@
+/*
+ * LSST Data Management System
+ *
+ * This product includes software developed by the
+ * LSST Project (http://www.lsst.org/).
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the LSST License Statement and
+ * the GNU General Public License along with this program. If not,
+ * see <http://www.lsstcorp.org/LegalNotices/>.
+ */
+
+// System headers
+#include <chrono>
+#include <iostream>
+#include <memory>
+#include <string>
+#include <vector>
+
+// Qserv headers
+#include "global/clock_defs.h"
+#include "http/WorkerQueryStatusData.h"
+
+// LSST headers
+#include "lsst/log/Log.h"
+
+// Boost unit test header
+#define BOOST_TEST_MODULE StatusData
+#include <boost/test/unit_test.hpp>
+
+using namespace std;
+namespace test = boost::test_tools;
+using namespace lsst::qserv::http;
+
+BOOST_AUTO_TEST_SUITE(Suite)
+
+BOOST_AUTO_TEST_CASE(CzarContactInfo) {
+    string const replicationInstanceId = "repliInstId";
+    string const replicationAuthKey = "repliIAuthKey";
+
+    string const cName("czar_name");
+    lsst::qserv::CzarIdType const cId = 32;
+    int cPort = 2022;
+    string const cHost("cz_host");
+
+    auto czarA = lsst::qserv::http::CzarContactInfo::create(cName, cId, cPort, cHost);
+    LOGS_ERROR("&&& a czarA=" << czarA->dump());
+
+    auto czarAJs = czarA->serializeJson();
+    LOGS_ERROR("&&& b czarAJs=" << czarAJs);
+
+    auto czarB = lsst::qserv::http::CzarContactInfo::createJson(czarAJs);
+    LOGS_ERROR("&&& c czarB=" << czarB);
+    BOOST_REQUIRE(czarA->compare(*czarB));
+
+    auto czarC = lsst::qserv::http::CzarContactInfo::create("different", cId, cPort, cHost);
+    BOOST_REQUIRE(!czarA->compare(*czarC));
+
+    auto start = lsst::qserv::CLOCK::now();
+    auto workerA = WorkerContactInfo::create("sd_workerA", "host_w1", "mgmhost_a", 3421, start);
+    auto workerB = WorkerContactInfo::create("sd_workerB", "host_w2", "mgmhost_a", 3421, start);
+    auto workerC = WorkerContactInfo::create("sd_workerC", "host_w3", "mgmhost_b", 3422, start);
+    LOGS_ERROR("&&& d workerA=" << workerA->dump());
+
+    auto jsWorkerA = workerA->serializeJson();
+    LOGS_ERROR("&&& e jsWorkerA=" << jsWorkerA);
+    auto start1Sec = start + 1s;
+    auto workerA1 = WorkerContactInfo::createJson(jsWorkerA, start1Sec);
+    LOGS_ERROR("&&& f workerA1=" << workerA1->dump());
+    BOOST_REQUIRE(workerA->isSameContactInfo(*workerA1));
+
+    // WorkerQueryStatusData
+    auto wqsdA = lsst::qserv::http::WorkerQueryStatusData::create(workerA, czarA, replicationInstanceId,
+                                                                  replicationAuthKey);
+    LOGS_ERROR("&&& g wqsdA=" << wqsdA->dump());
+
+    double timeoutAliveSecs = 100.0;
+    double timeoutDeadSecs = 2 * timeoutAliveSecs;
+    double maxLifetime = 300.0;
+    auto jsDataA = wqsdA->serializeJson(timeoutAliveSecs, timeoutDeadSecs, maxLifetime);
+    LOGS_ERROR("&&& h jsDataA=" << *jsDataA);
+
+    // Check that empty lists work.
+    auto wqsdA1 = lsst::qserv::http::WorkerQueryStatusData::createJson(*jsDataA, replicationInstanceId,
+                                                                       replicationAuthKey, start1Sec);
+    LOGS_ERROR("&&& i wqsdA1=" << wqsdA1->dump());
+    auto jsDataA1 = wqsdA1->serializeJson(timeoutAliveSecs, timeoutDeadSecs, maxLifetime);
+    BOOST_REQUIRE(*jsDataA == *jsDataA1);
+
+    vector<lsst::qserv::QueryId> qIdsDelFiles = {7, 8, 9, 15, 25, 26, 27, 30};
+    vector<lsst::qserv::QueryId> qIdsKeepFiles = {1, 2, 3, 4, 6, 10, 13, 19, 33};
+    for (auto const qIdDF : qIdsDelFiles) {
+        wqsdA->_qIdDoneDeleteFiles[qIdDF] = start;
+    }
+
+    jsDataA = wqsdA->serializeJson(timeoutAliveSecs, timeoutDeadSecs, maxLifetime);
+    LOGS_ERROR("&&& j jsDataA=" << jsDataA);
+    BOOST_REQUIRE(*jsDataA != *jsDataA1);
+
+    for (auto const qIdKF : qIdsKeepFiles) {
+        wqsdA->_qIdDoneKeepFiles[qIdKF] = start;
+    }
+
+    wqsdA->addDeadUberJobs(12, {1, 3}, start);
+
+    LOGS_ERROR("&&& i wqsdA=" << wqsdA->dump());
+
+    jsDataA = wqsdA->serializeJson(timeoutAliveSecs, timeoutDeadSecs, maxLifetime);
+    LOGS_ERROR("&&& j jsDataA=" << *jsDataA);
+
+    auto start5Sec = start + 5s;
+    auto workerAFromJson = lsst::qserv::http::WorkerQueryStatusData::createJson(
+            *jsDataA, replicationInstanceId, replicationAuthKey, start5Sec);
+    auto jsWorkerAFromJson = workerAFromJson->serializeJson(timeoutAliveSecs, timeoutDeadSecs, maxLifetime);
+    BOOST_REQUIRE(*jsDataA == *jsWorkerAFromJson);
+
+    wqsdA->addDeadUberJobs(12, {34}, start5Sec);
+    wqsdA->addDeadUberJobs(91, {77}, start5Sec);
+    wqsdA->addDeadUberJobs(1059, {1, 4, 6, 7, 8, 10, 3, 22, 93}, start5Sec);
+
+    jsDataA = wqsdA->serializeJson(timeoutAliveSecs, timeoutDeadSecs, maxLifetime);
+    LOGS_ERROR("&&& k jsDataA=" << *jsDataA);
+    BOOST_REQUIRE(*jsDataA != *jsWorkerAFromJson);
+
+    workerAFromJson = lsst::qserv::http::WorkerQueryStatusData::createJson(
+            *jsDataA, replicationInstanceId, replicationAuthKey, start5Sec);
+    jsWorkerAFromJson = workerAFromJson->serializeJson(timeoutAliveSecs, timeoutDeadSecs, maxLifetime);
+    LOGS_ERROR("&&& l jsWorkerAFromJson=" << *jsWorkerAFromJson);
+    BOOST_REQUIRE(*jsDataA == *jsWorkerAFromJson);
+}
+
+BOOST_AUTO_TEST_SUITE_END()
diff --git a/src/qdisp/UberJob.h b/src/qdisp/UberJob.h
index 66e631be1..06f45ba72 100644
--- a/src/qdisp/UberJob.h
+++ b/src/qdisp/UberJob.h
@@ -67,7 +67,7 @@ class UberJob : public JobBase {
         return _uberJobId;
     }  // TODO:UJ change name when JobBase no longer needed.
     std::string const& getIdStr() const override { return _idStr; }
-    std::shared_ptr<QdispPool> getQdispPool() override { return _qdispPool; }  // TODO:UJ relocate to JobBase
+    std::shared_ptr<QdispPool> getQdispPool() override { return _qdispPool; }
     //&&&std::string const& getPayload() const override { return _payload; }  // TODO:UJ delete when possible.
     std::shared_ptr<ResponseHandler> getRespHandler() override { return _respHandler; }
     std::shared_ptr<JobStatus> getStatus() override {
@@ -95,7 +95,7 @@ class UberJob : public JobBase {
 
     /// Set the worker information needed to send messages to the worker believed to
     /// be responsible for the chunks handled in this UberJob.
-    void setWorkerContactInfo(czar::WorkerContactInfo::Ptr const& wContactInfo) {  // Change to ActiveWorker &&& ???
+    void setWorkerContactInfo(http::WorkerContactInfo::Ptr const& wContactInfo) {  // Change to ActiveWorker &&& ???
         _wContactInfo = wContactInfo;
     }
 
@@ -159,7 +159,7 @@ class UberJob : public JobBase {
     czar::CzarChunkMap::WorkerChunksData::Ptr _workerData;  // TODO:UJ this may not be needed
 
     // Contact information for the target worker.
-    czar::WorkerContactInfo::Ptr _wContactInfo;  // Change to ActiveWorker &&& ???
+    http::WorkerContactInfo::Ptr _wContactInfo;  // Change to ActiveWorker &&& ???
 };
 
 }  // namespace lsst::qserv::qdisp
diff --git a/src/qmeta/types.h b/src/qmeta/types.h
index 28e8338fa..6f55562b3 100644
--- a/src/qmeta/types.h
+++ b/src/qmeta/types.h
@@ -38,7 +38,7 @@ namespace lsst::qserv::qmeta {
  */
 
 /// Typedef for Czar ID in query metadata.
-typedef std::uint32_t CzarId;
+typedef CzarIdType CzarId;  // uint32_t TODO:UJ Replace qmeta::CzarId with global
 
 }  // namespace lsst::qserv::qmeta
diff --git a/src/xrdsvc/HttpSvc.cc b/src/xrdsvc/HttpSvc.cc
index 49781fc24..392f5e6b8 100644
--- a/src/xrdsvc/HttpSvc.cc
+++ b/src/xrdsvc/HttpSvc.cc
@@ -141,6 +141,12 @@ uint16_t HttpSvc::start() {
                       HttpWorkerCzarModule::process(::serviceName, self->_foreman, req, resp, "QUERYJOB",
                                                     http::AuthType::REQUIRED);
                   }}});
+    _httpServerPtr->addHandlers(
+            {{"POST", "/querystatus",
+              [self](shared_ptr<qhttp::Request> const& req, shared_ptr<qhttp::Response> const& resp) {
+                  HttpWorkerCzarModule::process(::serviceName, self->_foreman, req, resp, "QUERYSTATUS",
+                                                http::AuthType::REQUIRED);
+              }}});
     _httpServerPtr->start();
 
     // Initialize the I/O context and start the service threads. At this point
diff --git a/src/xrdsvc/HttpWorkerCzarModule.cc b/src/xrdsvc/HttpWorkerCzarModule.cc
index af6f741da..856bd4455 100644
--- a/src/xrdsvc/HttpWorkerCzarModule.cc
+++ b/src/xrdsvc/HttpWorkerCzarModule.cc
@@ -89,6 +89,7 @@ json HttpWorkerCzarModule::executeImpl(string const& subModuleName) {
     enforceInstanceId(func, wconfig::WorkerConfig::instance()->replicationInstanceId());
     enforceWorkerId(func);
     if (subModuleName == "QUERYJOB") return _queryJob();
+    if (subModuleName == "QUERYSTATUS") return _queryStatus();
     throw invalid_argument(context() + func + " unsupported sub-module");
 }
 
@@ -210,4 +211,18 @@ json HttpWorkerCzarModule::_handleQueryJob(string const& func) {
     return jsRet;
 }
 
+json HttpWorkerCzarModule::_queryStatus() {
+    debug(__func__);
+    checkApiVersion(__func__, 34);
+    // At this point, API version, correct worker, and auth have been checked.
+    json jsRet = _handleQueryStatus(__func__);
+    return jsRet;
+}
+
+json HttpWorkerCzarModule::_handleQueryStatus(std::string const& func) {
+    LOGS(_log, LOG_LVL_ERROR, "&&& NEED CODE HttpWorkerCzarModule::_handleQueryStatus");
+    throw util::Bug(ERR_LOC, "&&& NEED CODE HttpWorkerCzarModule::_handleQueryStatus");
+}
+
 }  // namespace lsst::qserv::xrdsvc
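_handleQueryStatus() above is still a stub that throws NEED CODE. One plausible skeleton for it, inferred from the design notes in ActiveWorker.h rather than from finished code: read the czar's lists (field names follow WorkerQueryStatusData::serializeJson) and acknowledge them. All of the real module plumbing, such as auth checks and actual Task cancellation, is omitted.

    #include <iostream>
    #include <nlohmann/json.hpp>

    using nlohmann::json;

    // Hypothetical sketch of a worker-side query-status handler.
    json handleQueryStatus(json const& req) {
        for (auto const& qid : req.at("qiddonedeletefiles")) {
            std::cout << "qid=" << qid << ": cancel tasks, delete result files\n";
        }
        for (auto const& qid : req.at("qiddonekeepfiles")) {
            std::cout << "qid=" << qid << ": query done, keep result files for now\n";
        }
        json resp;
        resp["success"] = 1;  // the czar checks this field before pruning its lists
        return resp;
    }

    int main() {
        json req;
        req["qiddonedeletefiles"] = {7, 8};
        req["qiddonekeepfiles"] = {1, 2};
        std::cout << handleQueryStatus(req) << "\n";
    }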
diff --git a/src/xrdsvc/HttpWorkerCzarModule.h b/src/xrdsvc/HttpWorkerCzarModule.h
index bb75a63c5..94b7f934a 100644
--- a/src/xrdsvc/HttpWorkerCzarModule.h
+++ b/src/xrdsvc/HttpWorkerCzarModule.h
@@ -81,6 +81,12 @@ class HttpWorkerCzarModule : public xrdsvc::HttpModule {
     /// Handle an UberJob message from the czar to run it on this worker; this does
     /// the work of deciphering the message, creating UberJobData objects and Task objects.
     nlohmann::json _handleQueryJob(std::string const& func);
+
+    /// &&& doc
+    nlohmann::json _queryStatus();
+
+    /// &&& doc
+    nlohmann::json _handleQueryStatus(std::string const& func);
 };
 
 }  // namespace lsst::qserv::xrdsvc

From ac9afad3230db06f07b68d9e4b0c37ccfa3fe4a8 Mon Sep 17 00:00:00 2001
From: John Gates
Date: Tue, 3 Sep 2024 11:52:38 -0700
Subject: [PATCH 05/22] Added cancellation code for queries, uberjobs, and czar
 restart.

---
 src/ccontrol/UserQuerySelect.cc     |  10 +-
 src/czar/ActiveWorker.cc            | 284 ++++++++++-------------------
 src/czar/ActiveWorker.h             | 148 ++++++---------
 src/czar/Czar.cc                    |  23 ++-
 src/czar/CzarChunkMap.cc            |   6 +-
 src/czar/CzarRegistry.cc            |  21 +-
 src/czar/CzarRegistry.h             |   8 +-
 src/global/ResourceUnit.h           |   2 +-
 src/global/intTypes.h               |   4 +-
 src/http/WorkerQueryStatusData.cc   | 230 ++++++++++++++--------
 src/http/WorkerQueryStatusData.h    | 142 ++++++++------
 src/http/testStatusData.cc          |  61 +++---
 src/proto/ScanTableInfo.h           |   2 +-
 src/proto/worker.proto              |   2 +-
 src/qdisp/Executive.cc              |   8 +-
 src/qdisp/Executive.h               |   8 +-
 src/qdisp/JobBase.h                 |   3 +-
 src/qdisp/JobDescription.cc         |   4 +-
 src/qdisp/JobQuery.cc               |   2 +-
 src/qdisp/UberJob.h                 |   5 +-
 src/qdisp/testQDisp.cc              |   6 +-
 src/wbase/FileChannelShared.cc      |  15 +-
 src/wbase/FileChannelShared.h       |   2 +-
 src/wbase/MsgProcessor.h            |  10 +-
 src/wbase/Task.cc                   |  63 +++++-
 src/wbase/Task.h                    |  22 ++-
 src/wbase/UberJobData.cc            |   8 +
 src/wbase/UberJobData.h             |   6 +
 src/wbase/UserQueryInfo.cc          |  76 ++++----
 src/wbase/UserQueryInfo.h           |  40 ++--
 src/wcontrol/Foreman.cc             |   2 +
 src/wcontrol/Foreman.h              |   8 +-
 src/wdb/QueryRunner.cc              |   3 +-
 src/wdb/testQueryRunner.cc          |  32 ++--
 src/wpublish/QueriesAndChunks.cc    |  20 +-
 src/wpublish/QueriesAndChunks.h     |   7 +-
 src/wpublish/QueryStatistics.cc     |  10 +-
 src/wpublish/QueryStatistics.h      |  12 +-
 src/wsched/testSchedulers.cc        |  23 ++-
 src/xrdreq/QueryManagementAction.h  |   2 +-
 src/xrdreq/QueryManagementRequest.h |   2 +-
 src/xrdsvc/ChannelStream.h          |   2 +-
 src/xrdsvc/HttpSvc.cc               |   4 +-
 src/xrdsvc/HttpWorkerCzarModule.cc  | 119 +++++++++++-
 src/xrdsvc/SsiRequest.cc            |  13 +-
 src/xrdsvc/SsiRequest.h             |   3 +-
 46 files changed, 862 insertions(+), 621 deletions(-)

diff --git a/src/ccontrol/UserQuerySelect.cc b/src/ccontrol/UserQuerySelect.cc
index d7d4fb5a0..7627fb960 100644
--- a/src/ccontrol/UserQuerySelect.cc
+++ b/src/ccontrol/UserQuerySelect.cc
@@ -364,8 +364,6 @@ void UserQuerySelect::buildAndSendUberJobs() {
     // Make a map of all jobs in the executive.
     // TODO:UJ Maybe a check should be made that all databases are in the same family?
-
-
     // keep cycling through workers until no more chunks to place.
     //  - create a map of UberJobs key=, val=>
     //  - for chunkId in `unassignedChunksInQuery`
@@ -509,7 +507,7 @@ QueryState UserQuerySelect::join() {
     if (finalRows < 0) finalRows = collectedRows;
     // Notify workers on the query completion/cancellation to ensure
     // resources are properly cleaned over there as well.
- proto::QueryManagement::Operation operation = proto::QueryManagement::COMPLETE; + proto::QueryManagement::Operation operation = proto::QueryManagement::COMPLETE; //&&&QM QueryState state = SUCCESS; if (successful) { _qMetaUpdateStatus(qmeta::QInfo::COMPLETED, collectedRows, collectedBytes, finalRows); @@ -517,18 +515,18 @@ QueryState UserQuerySelect::join() { } else if (_killed) { // status is already set to ABORTED LOGS(_log, LOG_LVL_ERROR, "Joined everything (killed)"); - operation = proto::QueryManagement::CANCEL; + operation = proto::QueryManagement::CANCEL; //&&&QM state = ERROR; } else { _qMetaUpdateStatus(qmeta::QInfo::FAILED, collectedRows, collectedBytes, finalRows); LOGS(_log, LOG_LVL_ERROR, "Joined everything (failure!)"); - operation = proto::QueryManagement::CANCEL; + operation = proto::QueryManagement::CANCEL; //&&&QM state = ERROR; } auto const czarConfig = cconfig::CzarConfig::instance(); if (czarConfig->notifyWorkersOnQueryFinish()) { try { - // &&& do this another way, also see executive::squash + // &&& do this another way, also see executive::squash &&&QM xrdreq::QueryManagementAction::notifyAllWorkers(czarConfig->getXrootdFrontendUrl(), operation, _qMetaCzarId, _qMetaQueryId); } catch (std::exception const& ex) { diff --git a/src/czar/ActiveWorker.cc b/src/czar/ActiveWorker.cc index 78b7d04f0..39aa042ef 100644 --- a/src/czar/ActiveWorker.cc +++ b/src/czar/ActiveWorker.cc @@ -54,9 +54,12 @@ string WorkerContactInfo::dump() const { string ActiveWorker::getStateStr(State st) { switch (st) { - case ALIVE: return string("ALIVE"); - case QUESTIONABLE: return string("QUESTIONABLE"); - case DEAD: return string("DEAD"); + case ALIVE: + return string("ALIVE"); + case QUESTIONABLE: + return string("QUESTIONABLE"); + case DEAD: + return string("DEAD"); } return string("unknown"); } @@ -74,202 +77,74 @@ void ActiveWorker::setWorkerContactInfo(http::WorkerContactInfo::Ptr const& wcIn void ActiveWorker::_changeStateTo(State newState, double secsSinceUpdate, string const& note) { auto lLvl = (newState == DEAD) ? LOG_LVL_ERROR : LOG_LVL_INFO; - LOGS(_log, lLvl, note << " oldState=" << getStateStr(_state) << " newState=" << getStateStr(newState) << " secsSince=" << secsSinceUpdate); + LOGS(_log, lLvl, + note << " oldState=" << getStateStr(_state) << " newState=" << getStateStr(newState) + << " secsSince=" << secsSinceUpdate); _state = newState; } -void ActiveWorker::updateStateAndSendMessages(double timeoutAliveSecs, double timeoutDeadSecs, double maxLifetime) { - // &&& function too long +void ActiveWorker::updateStateAndSendMessages(double timeoutAliveSecs, double timeoutDeadSecs, + double maxLifetime) { lock_guard lg(_aMtx); double secsSinceUpdate = _wqsData->_wInfo->timeSinceRegUpdateSeconds(); // Update the last time the registry contacted this worker. switch (_state) { - case ALIVE: { - if (secsSinceUpdate > timeoutAliveSecs) { - _changeStateTo(QUESTIONABLE, secsSinceUpdate, cName(__func__)); - // Anything that should be done here? - } - break; - } - case QUESTIONABLE: { - if (secsSinceUpdate < timeoutAliveSecs) { - _changeStateTo(ALIVE, secsSinceUpdate, cName(__func__)); - } - if (secsSinceUpdate > timeoutDeadSecs) { - _changeStateTo(DEAD, secsSinceUpdate, cName(__func__)); - // &&& TODO:UJ all uberjobs for this worker need to die. 
- } - break; - } - case DEAD: { - LOGS(_log, LOG_LVL_ERROR, "&&& NEED CODE"); - if (secsSinceUpdate < timeoutAliveSecs) { - _changeStateTo(ALIVE, secsSinceUpdate, cName(__func__)); - } else { - // Don't waste time on this worker until the registry has heard from it. - return; + case ALIVE: { + if (secsSinceUpdate > timeoutAliveSecs) { + _changeStateTo(QUESTIONABLE, secsSinceUpdate, cName(__func__)); + // &&& Anything else that should be done here? + } + break; } - break; - } - - } - - // Check how many messages are currently being sent to the worker, if at the limit, return - if (_wqsData->_qIdDoneKeepFiles.empty() && _wqsData->_qIdDoneDeleteFiles.empty() && _wqsData->_qIdDeadUberJobs.empty()) { - return; - } - int tCount = _conThreadCount; - if (tCount > _maxConThreadCount) { - LOGS(_log, LOG_LVL_DEBUG, cName(__func__) << " not sending message since at max threads " << tCount); - return; - } - - // Go through the _qIdDoneKeepFiles, _qIdDoneDeleteFiles, and _qIdDeadUberJobs lists to build a - // message to send to the worker. -#if 0 // &&& - auto now = CLOCK::now(); - auto const czarConfig = cconfig::CzarConfig::instance(); - - shared_ptr jsWorkerReqPtr = make_shared(); - json& jsWorkerR = *jsWorkerReqPtr; - jsWorkerR["version"] = http::MetaModule::version; - jsWorkerR["instance_id"] = czarConfig->replicationInstanceId(); - jsWorkerR["auth_key"] = czarConfig->replicationAuthKey(); - jsWorkerR["worker"] = _wInfo->wId; - jsWorkerR["qiddonekeepfiles"] = json::array(); - jsWorkerR["qiddonedeletefiles"] = json::array(); - jsWorkerR["qiddeaduberjobs"] = json::array(); - jsWorkerR["czar"] = json::object(); - auto& jsWCzar = jsWorkerR["czar"]; - jsWCzar["name"] = czarConfig->name(); - jsWCzar["id"]= czarConfig->id(); - jsWCzar["management-port"] = czarConfig->replicationHttpPort(); - jsWCzar["management-host-name"] = util::get_current_host_fqdn(); - - - { - auto& jsDoneKeep = jsWorkerR["qiddonekeepfiles"]; - auto iterDoneKeep = _qIdDoneKeepFiles.begin(); - while (iterDoneKeep != _qIdDoneKeepFiles.end()) { - auto qId = iterDoneKeep->first; - jsDoneKeep.push_back(qId); - auto tmStamp = iterDoneKeep->second; - double ageSecs = std::chrono::duration(now - tmStamp).count(); - if (ageSecs > maxLifetime) { - iterDoneKeep = _qIdDoneKeepFiles.erase(iterDoneKeep); - } else { - ++iterDoneKeep; + case QUESTIONABLE: { + if (secsSinceUpdate < timeoutAliveSecs) { + _changeStateTo(ALIVE, secsSinceUpdate, cName(__func__)); + } + if (secsSinceUpdate > timeoutDeadSecs) { + _changeStateTo(DEAD, secsSinceUpdate, cName(__func__)); + // &&& TODO:UJ all uberjobs for this worker need to die. } + break; } - } - { - auto& jsDoneDelete = jsWorkerR["qiddonedeletefiles"]; - auto iterDoneDelete = _qIdDoneDeleteFiles.begin(); - while (iterDoneDelete != _qIdDoneDeleteFiles.end()) { - auto qId = iterDoneDelete->first; - jsDoneDelete.push_back(qId); - auto tmStamp = iterDoneDelete->second; - double ageSecs = std::chrono::duration(now - tmStamp).count(); - if (ageSecs > maxLifetime) { - iterDoneDelete = _qIdDoneDeleteFiles.erase(iterDoneDelete); + case DEAD: { + LOGS(_log, LOG_LVL_ERROR, "&&& NEED CODE"); + if (secsSinceUpdate < timeoutAliveSecs) { + _changeStateTo(ALIVE, secsSinceUpdate, cName(__func__)); } else { - ++iterDoneDelete; + // Don't waste time on this worker until the registry has heard from it. 
+ return; } + break; } } - { - auto& jsDeadUj = jsWorkerR["qiddeaduberjobs"]; - auto iterDeadUjQid = _qIdDeadUberJobs.begin(); - while (iterDeadUjQid != _qIdDeadUberJobs.end()) { - TIMEPOINT oldestTm; // default is zero - auto qId = iterDeadUjQid->first; - auto& ujIdMap = iterDeadUjQid->second; - - json jsQidUj = {{"qid", qId}, {"ujids", json::array()}}; - auto& jsUjIds = jsQidUj["ujids"]; - - auto iterUjId = ujIdMap.begin(); - bool addedUjId = false; - while (iterUjId != ujIdMap.end()) { - UberJobId ujId = iterUjId->first; - auto tmStamp = iterUjId->second; - if (tmStamp > oldestTm) { - oldestTm = tmStamp; - } - - jsUjIds.push_back(ujId); - addedUjId = true; - double ageSecs = std::chrono::duration(now - tmStamp).count(); - if (ageSecs > maxLifetime) { - iterUjId = ujIdMap.erase(iterUjId); - } else { - ++iterUjId; - } - } - - if (addedUjId) { - jsDeadUj.push_back(jsQidUj); - } - if (ujIdMap.empty() - || std::chrono::duration(now - oldestTm).count() > maxLifetime) { - iterDeadUjQid = _qIdDeadUberJobs.erase(iterDeadUjQid); - } else { - ++iterDeadUjQid; - } + shared_ptr jsWorkerReqPtr; + { + lock_guard mapLg(_wqsData->_mapMtx); + // Check how many messages are currently being sent to the worker, if at the limit, return + if (_wqsData->_qIdDoneKeepFiles.empty() && _wqsData->_qIdDoneDeleteFiles.empty() && + _wqsData->_qIdDeadUberJobs.empty()) { + return; + } + int tCount = _conThreadCount; + if (tCount > _maxConThreadCount) { + LOGS(_log, LOG_LVL_DEBUG, + cName(__func__) << " not sending message since at max threads " << tCount); + return; } - } -#endif // &&& - auto jsWorkerReqPtr = _wqsData->serializeJson(timeoutAliveSecs, timeoutDeadSecs, maxLifetime); + // Go through the _qIdDoneKeepFiles, _qIdDoneDeleteFiles, and _qIdDeadUberJobs lists to build a + // message to send to the worker. + jsWorkerReqPtr = _wqsData->serializeJson(maxLifetime); + } + // &&& Maybe only send the status message if the lists are not empty ??? // Start a thread to send the message. (Maybe these should go on the qdisppool? 
&&&) // put this in a different function and start the thread.&&&; _sendStatusMsg(jsWorkerReqPtr); } -#if 0 // &&& -bool ActiveWorker::_parse(nlohmann::json const& jsWorkerReq) { - auto const czarConfig = cconfig::CzarConfig::instance(); - - http::RequestBodyJSON rbWReq(jsWorkerReq); - if (jsWorkerReq["version"] != http::MetaModule::version) { - LOGS(_log, LOG_LVL_ERROR, cName(__func__) << " bad version"); - return false; - } - - - http::RequestBodyJSON rbCzar(rbWReq.required("czar")); - auto czarName = rbCzar.required("name"); - auto czarId = rbCzar.required("id"); - auto czarPort = rbCzar.required("management-port"); - auto czarHostName = rbCzar.required("management-host-name"); - /* &&& - jsWorkerReq["instance_id"] != czarConfig->replicationInstanceId(); - jsWorkerReq["auth_key"] != czarConfig->replicationAuthKey(); - jsWorkerReq["worker"] != _wInfo->wId; - auto& jsWCzar = jsWorkerReq["czar"]; - jsWCzar["name"] != czarConfig->name(); - jsWCzar["id"] != czarConfig->id(); - jsWCzar["management-port"] != czarConfig->replicationHttpPort(); - jsWCzar["management-host-name"] != util::get_current_host_fqdn(); - */ - - - auto& jsQIdDoneKeepFiles = jsWorkerReq["qiddonekeepfiles"]; - for (auto const& qidKeep : jsQIdDoneKeepFiles) { - - } - - auto& jsQIdDoneDeleteFiles = jsWorkerReq["qiddonedeletefiles"]; - - auto& jsQIdDeadUberJobs = jsWorkerReq["qiddeaduberjobs"]; - -} -#endif // &&& - void ActiveWorker::_sendStatusMsg(std::shared_ptr const& jsWorkerReqPtr) { - auto& jsWorkerReq = *jsWorkerReqPtr; auto const method = http::Method::POST; auto const& wInf = _wqsData->_wInfo; @@ -277,7 +152,6 @@ void ActiveWorker::_sendStatusMsg(std::shared_ptr const& jsWorke vector const headers = {"Content-Type: application/json"}; auto const& czarConfig = cconfig::CzarConfig::instance(); - LOGS(_log, LOG_LVL_DEBUG, cName(__func__) << " REQ " << jsWorkerReq); string const requestContext = "Czar: '" + http::method2string(method) + "' stat request to '" + url + "'"; LOGS(_log, LOG_LVL_TRACE, @@ -289,7 +163,7 @@ void ActiveWorker::_sendStatusMsg(std::shared_ptr const& jsWorke try { json const response = client.readAsJson(); if (0 != response.at("success").get()) { - transmitSuccess = true; + transmitSuccess = _wqsData->handleResponseJson(response); } else { LOGS(_log, LOG_LVL_WARN, cName(__func__) << " response success=0"); } @@ -299,12 +173,14 @@ void ActiveWorker::_sendStatusMsg(std::shared_ptr const& jsWorke } if (!transmitSuccess) { LOGS(_log, LOG_LVL_ERROR, cName(__func__) << " transmit failure"); - } else { - // parse the return statement and remove the indicated entries from the list - //HERE &&&; } } +void ActiveWorker::addToDoneDeleteFiles(QueryId qId) { _wqsData->addToDoneDeleteFiles(qId); } + +void ActiveWorker::addToDoneKeepFiles(QueryId qId) { _wqsData->addToDoneKeepFiles(qId); } + +void ActiveWorker::removeDeadUberJobsFor(QueryId qId) { _wqsData->removeDeadUberJobsFor(qId); } string ActiveWorker::dump() const { lock_guard lg(_aMtx); @@ -317,8 +193,10 @@ string ActiveWorker::_dump() const { return os.str(); } - -void ActiveWorkerMap::updateMap(http::WorkerContactInfo::WCMap const& wcMap, http::CzarContactInfo::Ptr const& czInfo, std::string const& replicationInstanceId, std::string const& replicationAuthKey) { +void ActiveWorkerMap::updateMap(http::WorkerContactInfo::WCMap const& wcMap, + http::CzarContactInfo::Ptr const& czInfo, + std::string const& replicationInstanceId, + std::string const& replicationAuthKey) { // Go through wcMap, update existing entries in _awMap, create new entries for those that 
don't exist, lock_guard awLg(_awMapMtx); for (auto const& [wcKey, wcVal] : wcMap) { @@ -326,11 +204,16 @@ void ActiveWorkerMap::updateMap(http::WorkerContactInfo::WCMap const& wcMap, htt if (iter == _awMap.end()) { auto newAW = ActiveWorker::create(wcVal, czInfo, replicationInstanceId, replicationAuthKey); _awMap[wcKey] = newAW; + if (_czarCancelAfterRestart) { + newAW->setCzarCancelAfterRestart(_czarCancelAfterRestartCzId, _czarCancelAfterRestartQId); + } } else { auto aWorker = iter->second; if (!aWorker->compareContactInfo(*wcVal)) { // This should not happen, but try to handle it gracefully if it does. - LOGS(_log, LOG_LVL_WARN, cName(__func__) << " worker contact info changed for " << wcKey << " new=" << wcVal->dump() << " old=" << aWorker->dump()); + LOGS(_log, LOG_LVL_WARN, + cName(__func__) << " worker contact info changed for " << wcKey + << " new=" << wcVal->dump() << " old=" << aWorker->dump()); aWorker->setWorkerContactInfo(wcVal); } } @@ -351,13 +234,46 @@ void ActiveWorkerMap::pruneMap() { } */ +void ActiveWorkerMap::setCzarCancelAfterRestart(CzarIdType czId, QueryId lastQId) { + _czarCancelAfterRestart = true; + _czarCancelAfterRestartCzId = czId; + _czarCancelAfterRestartQId = lastQId; +} + void ActiveWorkerMap::sendActiveWorkersMessages() { // Send messages to each active worker as needed lock_guard lck(_awMapMtx); - for(auto&& [wName, awPtr] : _awMap) { + for (auto&& [wName, awPtr] : _awMap) { awPtr->updateStateAndSendMessages(_timeoutAliveSecs, _timeoutDeadSecs, _maxLifetime); } } +/// &&& doc +void ActiveWorkerMap::addToDoneDeleteFiles(QueryId qId) { + lock_guard lck(_awMapMtx); + for (auto const& [wName, awPtr] : _awMap) { + awPtr->addToDoneDeleteFiles(qId); + awPtr->removeDeadUberJobsFor(qId); + } +} + +/// &&& doc +void ActiveWorkerMap::addToDoneKeepFiles(QueryId qId) { + lock_guard lck(_awMapMtx); + for (auto const& [wName, awPtr] : _awMap) { + awPtr->addToDoneKeepFiles(qId); + awPtr->removeDeadUberJobsFor(qId); + } +} + +/* &&& +/// &&& doc +void ActiveWorkerMap::removeDeadUberJobsFor(QueryId qId) { + lock_guard lck(_awMapMtx); + for (auto const& [wName, awPtr] : _awMap) { + awPtr->removeDeadUberJobsFor(qId); + } +} +*/ } // namespace lsst::qserv::czar diff --git a/src/czar/ActiveWorker.h b/src/czar/ActiveWorker.h index 0db7a0d76..0c05e0180 100644 --- a/src/czar/ActiveWorker.h +++ b/src/czar/ActiveWorker.h @@ -32,74 +32,12 @@ #include "nlohmann/json.hpp" // qserv headers -// &&& #include "global/clock_defs.h" -// &&& #include "global/intTypes.h" #include "http/WorkerQueryStatusData.h" - +#include "util/Bug.h" // This header declarations namespace lsst::qserv::czar { - -/* &&& -/// &&& doc This class just contains the worker id and network communication -/// information, but it may be desirable to store connections to the -/// worker here as well. -class WorkerContactInfo { -public: - using Ptr = std::shared_ptr; - - using WCMap = std::unordered_map; - using WCMapPtr = std::shared_ptr; - - WorkerContactInfo(std::string const& wId_, std::string const& wHost_, - std::string const& wManagementHost_, int wPort_, TIMEPOINT updateTime_) - : wId(wId_), - wHost(wHost_), - wManagementHost(wManagementHost_), - wPort(wPort_) { - regUpdateTime(updateTime_); - } - std::string const wId; ///< key - std::string const wHost; ///< "host-addr" entry. - std::string const wManagementHost; ///< "management-host-name" entry. - int const wPort; ///< "management-port" entry. - - - /// Return true if all members, aside from updateTime, are equal. 
- bool isSameContactInfo(WorkerContactInfo const& other) const { - return (wId == other.wId && wHost == other.wHost && wManagementHost == other.wManagementHost && - wPort == other.wPort); - } - - void regUpdateTime(TIMEPOINT updateTime) { - std::lock_guard lg(_rMtx); - _regUpdate = updateTime; - } - - double timeSinceRegUpdateSeconds() const { - std::lock_guard lg(_rMtx); - double secs = std::chrono::duration(CLOCK::now() - _regUpdate).count(); - return secs; - } - - TIMEPOINT getRegUpdate() const { - std::lock_guard lg(_rMtx); - return _regUpdate; - } - - std::string dump() const; - -private: - /// Last time the registry heard from this worker. The ActiveWorker class - /// will use this to determine the worker's state. - /// &&& Store in seconds since epoch to make atomic? - TIMEPOINT _regUpdate; - - mutable std::mutex _rMtx; ///< protects _regUpdate -}; -*/ - /// &&& doc - maintain list of done/cancelled queries for an active worker, and send /// that list to the worker. Once the worker has accepted the list, remove /// all of those queryId's from the list. @@ -137,11 +75,7 @@ class ActiveWorker : public std::enable_shared_from_this { public: using Ptr = std::shared_ptr; - enum State { - ALIVE = 0, - QUESTIONABLE, - DEAD - }; + enum State { ALIVE = 0, QUESTIONABLE, DEAD }; ActiveWorker() = delete; ActiveWorker(ActiveWorker const&) = delete; @@ -154,10 +88,18 @@ class ActiveWorker : public std::enable_shared_from_this { static std::string getStateStr(State st); static Ptr create(http::WorkerContactInfo::Ptr const& wInfo, http::CzarContactInfo::Ptr const& czInfo, - std::string const& replicationInstanceId, std::string const& replicationAuthKey) { + std::string const& replicationInstanceId, std::string const& replicationAuthKey) { return Ptr(new ActiveWorker(wInfo, czInfo, replicationInstanceId, replicationAuthKey)); } + /// This function should only be called before the _monitor thread is started + /// and shortly after czar startup: it tells all workers to delete all + /// query information for queries with czarId `czId` and queryId less than + /// or equal to `lastQId`. + void setCzarCancelAfterRestart(CzarIdType czId, QueryId lastQId) { + _wqsData->setCzarCancelAfterRestart(czId, lastQId); + } + http::WorkerContactInfo::Ptr getWInfo() const { if (_wqsData == nullptr) return nullptr; return _wqsData->_wInfo; @@ -173,13 +115,27 @@ class ActiveWorker : public std::enable_shared_from_this { /// &&& doc void updateStateAndSendMessages(double timeoutAliveSecs, double timeoutDeadSecs, double maxLifetime); + /// &&& doc + void addToDoneDeleteFiles(QueryId qId); + + /// &&& doc + void addToDoneKeepFiles(QueryId qId); + + /// &&& doc + void removeDeadUberJobsFor(QueryId qId); + std::string dump() const; private: ///&&&ActiveWorker(WorkerContactInfo::Ptr const& wInfo) : _wInfo(wInfo) {} ActiveWorker(http::WorkerContactInfo::Ptr const& wInfo, http::CzarContactInfo::Ptr const& czInfo, - std::string const& replicationInstanceId, std::string const& replicationAuthKey) - : _wqsData(http::WorkerQueryStatusData::create(wInfo, czInfo, replicationInstanceId, replicationAuthKey)) {} + std::string const& replicationInstanceId, std::string const& replicationAuthKey) + : _wqsData(http::WorkerQueryStatusData::create(wInfo, czInfo, replicationInstanceId, + replicationAuthKey)) { + if (_wqsData == nullptr) { + throw util::Bug(ERR_LOC, "ActiveWorker _wqsData null"); + } + } /// &&& doc /// _aMtx must be held before calling. 
@@ -192,21 +148,13 @@ class ActiveWorker : public std::enable_shared_from_this { /// _aMtx must be held before calling. std::string _dump() const; - /* &&& - std::map _qIdDoneKeepFiles; ///< &&& doc - limit reached - std::map _qIdDoneDeleteFiles; ///< &&& doc -cancelled/finished - std::map> _qIdDeadUberJobs; ///< &&& doc + /// Contains data that needs to be sent to workers about finished/cancelled + /// user queries and UberJobs. It must not be null. + http::WorkerQueryStatusData::Ptr const _wqsData; - /// &&& TODO:UJ Worth the effort to inform worker of killed UberJobs? - //std::map> _killedUberJobs; + State _state{QUESTIONABLE}; ///< current state of this worker. - WorkerContactInfo::Ptr _wInfo; ///< &&& doc - */ - http::WorkerQueryStatusData::Ptr _wqsData; ///< &&& doc - - State _state{QUESTIONABLE}; ///< current state of this worker. - - mutable std::mutex _aMtx; ///< protects _wInfo, _state, _qIdDoneKeepFiles, _qIdDoneDeleteFiles + mutable std::mutex _aMtx; ///< protects _wInfo, _state, _qIdDoneKeepFiles, _qIdDoneDeleteFiles /// The number of communication threads currently in use by this class instance. std::atomic _conThreadCount{0}; @@ -214,7 +162,7 @@ class ActiveWorker : public std::enable_shared_from_this { /// &&& doc /// @throws std::invalid_argument - bool _parse(nlohmann::json const& jsWorkerReq); // &&& delete after basic testing + bool _parse(nlohmann::json const& jsWorkerReq); // &&& delete after basic testing }; /// &&& doc @@ -229,26 +177,38 @@ class ActiveWorkerMap { ActiveWorkerMap operator=(ActiveWorkerMap const&) = delete; ~ActiveWorkerMap() = default; - std::string cName(const char* fName) { - return std::string("ActiveWorkerMap::") + fName + " "; - } + std::string cName(const char* fName) { return std::string("ActiveWorkerMap::") + fName + " "; } /// &&& doc - void updateMap(http::WorkerContactInfo::WCMap const& wcMap, http::CzarContactInfo::Ptr const& czInfo, std::string const& replicationInstanceId, std::string const& replicationAuthKey); + void updateMap(http::WorkerContactInfo::WCMap const& wcMap, http::CzarContactInfo::Ptr const& czInfo, + std::string const& replicationInstanceId, std::string const& replicationAuthKey); - //&&&void pruneMap(); /// &&& may not be needed ??? + /// If this is to be called, it must be called before Czar::_monitor is started: + /// It tells the workers all queries from `czId` with QueryIds less than `lastQId` + /// should be cancelled. + void setCzarCancelAfterRestart(CzarIdType czId, QueryId lastQId); // &&& doc void sendActiveWorkersMessages(); + /// &&& doc + void addToDoneDeleteFiles(QueryId qId); + + /// &&& doc + void addToDoneKeepFiles(QueryId qId); + private: std::map _awMap; - std::mutex _awMapMtx; ///< protects _awMap; + std::mutex _awMapMtx; ///< protects _awMap; //&&&double const _maxDeadTimeSeconds = 60.0 * 15.0; ///< &&& set from config. - double _timeoutAliveSecs = 60.0 * 5.0; ///< &&& set from config. 5min - double _timeoutDeadSecs = 60.0 * 10.0; ///< &&& set from config. 10min - double _maxLifetime = 60.0 * 60.0; ///< &&& set from config. 1hr + double _timeoutAliveSecs = 60.0 * 5.0; ///< &&& set from config. 5min + double _timeoutDeadSecs = 60.0 * 10.0; ///< &&& set from config. 10min + double _maxLifetime = 60.0 * 60.0; ///< &&& set from config. 
1hr
+
+    bool _czarCancelAfterRestart = false;
+    CzarIdType _czarCancelAfterRestartCzId = 0;
+    QueryId _czarCancelAfterRestartQId = 0;
 };
 
 }  // namespace lsst::qserv::czar
diff --git a/src/czar/Czar.cc b/src/czar/Czar.cc
index 75bae4107..b9f35cb98 100644
--- a/src/czar/Czar.cc
+++ b/src/czar/Czar.cc
@@ -146,8 +146,6 @@ void Czar::_monitor() {
         // &&& Go through the ActiveWorkerMap. Each ActiveWorker instance has a list of QueryIds
         // that have not yet been acknowledged by the worker, so send a message to each worker
         // with that list.
-
-
     }
 }
 
@@ -158,7 +156,7 @@ Czar::Czar(string const& configFilePath, string const& czarName)
           _idCounter(),
           _uqFactory(),
           _clientToQuery(),
-          _activeWorkerMap(new ActiveWorkerMap()){
+          _activeWorkerMap(new ActiveWorkerMap()) {
    // set id counter to milliseconds since the epoch, mod 1 year.
    struct timeval tv;
    gettimeofday(&tv, nullptr);
@@ -175,9 +173,6 @@ Czar::Czar(string const& configFilePath, string const& czarName)
     // the name of the Czar gets translated into a numeric identifier.
     _czarConfig->setId(_uqFactory->userQuerySharedResources()->qMetaCzarId);
 
-    // This will block until there is a successful read of the database tables.
-    _czarFamilyMap = CzarFamilyMap::create(_uqFactory->userQuerySharedResources()->queryMetadata);
-
     // Tell workers to cancel any queries that were submitted before this restart of Czar.
     // Figure out which query (if any) was recorded in Czar database before the restart.
     // The id will be used as the high-watermark for queries that need to be cancelled.
     //
     if (_czarConfig->notifyWorkersOnCzarRestart()) {
         try {
+            QueryId lastQId = _lastQueryIdBeforeRestart();
+            _activeWorkerMap->setCzarCancelAfterRestart(_czarConfig->id(), lastQId);
+        } catch (std::exception const& ex) {
+            LOGS(_log, LOG_LVL_WARN, ex.what());
+        }
+    }
+    /* &&& (moved this and czar crashed instantly, why?)
+
+    if (_czarConfig->notifyWorkersOnCzarRestart()) {
+        try {
+            // &&&QM use http - Add flag to each worker in _activeWorkerMap
+
+            // TODO:UJ - Workers need to contact the registry and kill queries if the associated czar dies.
             xrdreq::QueryManagementAction::notifyAllWorkers(_czarConfig->getXrootdFrontendUrl(),
                                                             proto::QueryManagement::CANCEL_AFTER_RESTART,
                                                             _czarConfig->id(), _lastQueryIdBeforeRestart());
@@ -193,6 +200,10 @@ Czar::Czar(string const& configFilePath, string const& czarName)
             LOGS(_log, LOG_LVL_WARN, ex.what());
         }
     }
+    */
+
+    // This will block until there is a successful read of the database tables.
+    _czarFamilyMap = CzarFamilyMap::create(_uqFactory->userQuerySharedResources()->queryMetadata);
 
     int qPoolSize = _czarConfig->getQdispPoolSize();
     int maxPriority = std::max(0, _czarConfig->getQdispMaxPriority());
diff --git a/src/czar/CzarChunkMap.cc b/src/czar/CzarChunkMap.cc
index 7116aa1cc..3e8607b54 100644
--- a/src/czar/CzarChunkMap.cc
+++ b/src/czar/CzarChunkMap.cc
@@ -300,8 +300,10 @@ bool CzarFamilyMap::_read() {
         return false;
     }
 
-    // &&& TODO:UJ Before makeNewMaps(), get a list of workers considered to be alive by czar::_activeWorkerMap
-    // give that list to makeNewMaps, and don't and workers to the maps that aren't on the list.&&& !!!
+    // &&& TODO:UJ Before makeNewMaps(), get a list of workers considered to be alive by
+    // czar::_activeWorkerMap
+    // give that list to makeNewMaps, and don't add workers to the maps that aren't on the
+    // list.&&& !!!
 
     // Make the new maps.
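(Illustrative sketch for the TODO above, not part of this patch: the liveness filter could pull
worker ids from the czar's ActiveWorkerMap. `getAliveWorkerIds()` and the filtering overload of
`makeNewMaps()` are assumed, hypothetical names.)

    // Collect the ids of workers the czar currently considers alive...
    std::set<std::string> aliveIds = czar::Czar::getCzar()->getAliveWorkerIds();  // assumed accessor
    // ...and skip any worker that is not in the set while building the new maps.
    auto familyMapPtr = makeNewMaps(qChunkMap, aliveIds);  // assumed overload taking the filter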
shared_ptr familyMapPtr = makeNewMaps(qChunkMap); diff --git a/src/czar/CzarRegistry.cc b/src/czar/CzarRegistry.cc index 0824d0ad8..c37b5da47 100644 --- a/src/czar/CzarRegistry.cc +++ b/src/czar/CzarRegistry.cc @@ -125,13 +125,15 @@ void CzarRegistry::_registryWorkerInfoLoop() { http::WorkerContactInfo::WCMapPtr wMap = _buildMapFromJson(response); // Compare the new map to the existing map and replace if different. { - auto czInfo = http::CzarContactInfo::create(_czarConfig->name(), _czarConfig->id(), _czarConfig->replicationHttpPort(), util::get_current_host_fqdn()); + auto czInfo = http::CzarContactInfo::create(_czarConfig->name(), _czarConfig->id(), + _czarConfig->replicationHttpPort(), + util::get_current_host_fqdn()); lock_guard lck(_mapMtx); if (wMap != nullptr && !_compareMapContactInfo(*wMap)) { _contactMap = wMap; _latestMapUpdate = CLOCK::now(); - _activeWorkerMap.updateMap(*_contactMap, czInfo, replicationInstanceId, replicationAuthKey); - + _activeWorkerMap.updateMap(*_contactMap, czInfo, replicationInstanceId, + replicationAuthKey); } } } @@ -200,4 +202,17 @@ void CzarRegistry::sendActiveWorkersMessages() { _activeWorkerMap.sendActiveWorkersMessages(); } +void CzarRegistry::endUserQuery(QueryId qId, bool deleteWorkerResults) { + lock_guard lck(_mapMtx); + // Add query id to the appropriate list. + if (deleteWorkerResults) { + _activeWorkerMap.addToDoneDeleteFiles(qId); + } else { + _activeWorkerMap.addToDoneKeepFiles(qId); + } + + // With lists updated, send out messages. + _activeWorkerMap.sendActiveWorkersMessages(); +} + } // namespace lsst::qserv::czar diff --git a/src/czar/CzarRegistry.h b/src/czar/CzarRegistry.h index e1e52a6e1..c743c6001 100644 --- a/src/czar/CzarRegistry.h +++ b/src/czar/CzarRegistry.h @@ -77,6 +77,9 @@ class CzarRegistry { /// &&& doc void sendActiveWorkersMessages(); + /// &&& doc + void endUserQuery(QueryId qId, bool deleteWorkerResults); + private: CzarRegistry() = delete; CzarRegistry(std::shared_ptr const& czarConfig); @@ -105,9 +108,10 @@ class CzarRegistry { /// Pointer to the map of worker contact information. http::WorkerContactInfo::WCMapPtr _contactMap; - TIMEPOINT _latestMapUpdate; ///< The last time the _contactMap was updated, unrelated to WorkerContactInfo update. + TIMEPOINT _latestMapUpdate; ///< The last time the _contactMap was updated, unrelated to + ///< WorkerContactInfo update. // &&& review how this _mapMtx is used, probably locks for too long a period. - std::mutex _mapMtx; /// Protects _contactMap, _latestUpdate, _activeWorkerMap + std::mutex _mapMtx; /// Protects _contactMap, _latestUpdate, _activeWorkerMap ActiveWorkerMap _activeWorkerMap; ///< Map of workers czar considers active. }; diff --git a/src/global/ResourceUnit.h b/src/global/ResourceUnit.h index ad4a1ef0b..c9f983740 100644 --- a/src/global/ResourceUnit.h +++ b/src/global/ResourceUnit.h @@ -42,7 +42,7 @@ namespace lsst::qserv { /// construction, the code for generating a path that includes the key-value /// portion is not implemented. It is unclear whether we need the generation /// capability, now that key-value pairs can be packed in protobufs messages. -class ResourceUnit { +class ResourceUnit { // TODO:UJ &&& delete if possible public: class Checker; enum UnitType { GARBAGE, DBCHUNK, UNKNOWN, QUERY }; diff --git a/src/global/intTypes.h b/src/global/intTypes.h index f4b4197f7..8463644e5 100644 --- a/src/global/intTypes.h +++ b/src/global/intTypes.h @@ -38,8 +38,8 @@ typedef std::vector Int32Vector; /// Typedef for Query ID in query metadata. 
typedef std::uint64_t QueryId; typedef std::int64_t JobId; -typedef JobId UberJobId; // These must be the same type. -typedef std::uint32_t CzarIdType; // TODO:UJ remove qmeta::CzarId and rename this CzarId +typedef JobId UberJobId; // These must be the same type. +typedef std::uint32_t CzarIdType; // TODO:UJ remove qmeta::CzarId and rename this CzarId /// Class to provide a consistent format for QueryIds in the log file class QueryIdHelper { diff --git a/src/http/WorkerQueryStatusData.cc b/src/http/WorkerQueryStatusData.cc index cd254f7c0..aed6bf73b 100644 --- a/src/http/WorkerQueryStatusData.cc +++ b/src/http/WorkerQueryStatusData.cc @@ -46,7 +46,7 @@ namespace lsst::qserv::http { json CzarContactInfo::serializeJson() const { json jsCzar; jsCzar["name"] = czName; - jsCzar["id"]= czId; + jsCzar["id"] = czId; jsCzar["management-port"] = czPort; jsCzar["management-host-name"] = czHostName; return jsCzar; @@ -71,11 +71,9 @@ std::string CzarContactInfo::dump() const { return os.str(); } - - json WorkerContactInfo::serializeJson() const { json jsWorker; - jsWorker["id"]= wId; + jsWorker["id"] = wId; jsWorker["host"] = wHost; jsWorker["management-host-name"] = wManagementHost; jsWorker["management-port"] = wPort; @@ -100,8 +98,6 @@ WorkerContactInfo::Ptr WorkerContactInfo::createJson(nlohmann::json const& wJson return nullptr; } - - string WorkerContactInfo::dump() const { stringstream os; os << "workerContactInfo{" @@ -130,41 +126,44 @@ void WorkerQueryStatusData::setWorkerContactInfo(WorkerContactInfo::Ptr const& w } */ - -shared_ptr WorkerQueryStatusData::serializeJson(double timeoutAliveSecs, double timeoutDeadSecs, double maxLifetime) { - +shared_ptr WorkerQueryStatusData::serializeJson(double maxLifetime) { // Go through the _qIdDoneKeepFiles, _qIdDoneDeleteFiles, and _qIdDeadUberJobs lists to build a // message to send to the worker. 
auto now = CLOCK::now(); - //&&&auto const czarConfig = cconfig::CzarConfig::instance(); - shared_ptr jsWorkerReqPtr = make_shared(); json& jsWorkerR = *jsWorkerReqPtr; jsWorkerR["version"] = http::MetaModule::version; - /* &&& - jsWorkerR["instance_id"] = czarConfig->replicationInstanceId(); - jsWorkerR["auth_key"] = czarConfig->replicationAuthKey(); - */ jsWorkerR["instance_id"] = _replicationInstanceId; jsWorkerR["auth_key"] = _replicationAuthKey; - //&&&jsWorkerR["worker"] = _wInfo->wId; - jsWorkerR["qiddonekeepfiles"] = json::array(); - jsWorkerR["qiddonedeletefiles"] = json::array(); - jsWorkerR["qiddeaduberjobs"] = json::array(); - //&&&jsWorkerR["czar"] = json::object(); jsWorkerR["czar"] = _czInfo->serializeJson(); - //&&&jsWorkerR["worker"] = json::object(); jsWorkerR["worker"] = _wInfo->serializeJson(); + addListsToJson(jsWorkerR, now, maxLifetime); + if (_czarCancelAfterRestart) { + jsWorkerR["czarrestart"] = true; + lock_guard mapLg(_mapMtx); + jsWorkerR["czarrestartcancelczid"] = _czarCancelAfterRestartCzId; + jsWorkerR["czarrestartcancelqid"] = _czarCancelAfterRestartQId; + } else { + jsWorkerR["czarrestart"] = false; + } + + return jsWorkerReqPtr; +} +void WorkerQueryStatusData::addListsToJson(json& jsWR, TIMEPOINT tm, double maxLifetime) { + jsWR["qiddonekeepfiles"] = json::array(); + jsWR["qiddonedeletefiles"] = json::array(); + jsWR["qiddeaduberjobs"] = json::array(); + lock_guard mapLg(_mapMtx); { - auto& jsDoneKeep = jsWorkerR["qiddonekeepfiles"]; + auto& jsDoneKeep = jsWR["qiddonekeepfiles"]; auto iterDoneKeep = _qIdDoneKeepFiles.begin(); while (iterDoneKeep != _qIdDoneKeepFiles.end()) { auto qId = iterDoneKeep->first; jsDoneKeep.push_back(qId); auto tmStamp = iterDoneKeep->second; - double ageSecs = std::chrono::duration(now - tmStamp).count(); + double ageSecs = std::chrono::duration(tm - tmStamp).count(); if (ageSecs > maxLifetime) { iterDoneKeep = _qIdDoneKeepFiles.erase(iterDoneKeep); } else { @@ -173,13 +172,13 @@ shared_ptr WorkerQueryStatusData::serializeJson(double timeoutAliveSecs, d } } { - auto& jsDoneDelete = jsWorkerR["qiddonedeletefiles"]; + auto& jsDoneDelete = jsWR["qiddonedeletefiles"]; auto iterDoneDelete = _qIdDoneDeleteFiles.begin(); while (iterDoneDelete != _qIdDoneDeleteFiles.end()) { auto qId = iterDoneDelete->first; jsDoneDelete.push_back(qId); auto tmStamp = iterDoneDelete->second; - double ageSecs = std::chrono::duration(now - tmStamp).count(); + double ageSecs = std::chrono::duration(tm - tmStamp).count(); if (ageSecs > maxLifetime) { iterDoneDelete = _qIdDoneDeleteFiles.erase(iterDoneDelete); } else { @@ -188,10 +187,10 @@ shared_ptr WorkerQueryStatusData::serializeJson(double timeoutAliveSecs, d } } { - auto& jsDeadUj = jsWorkerR["qiddeaduberjobs"]; + auto& jsDeadUj = jsWR["qiddeaduberjobs"]; auto iterDeadUjQid = _qIdDeadUberJobs.begin(); while (iterDeadUjQid != _qIdDeadUberJobs.end()) { - TIMEPOINT oldestTm; // default is zero + TIMEPOINT oldestTm; // default is zero auto qId = iterDeadUjQid->first; auto& ujIdMap = iterDeadUjQid->second; @@ -209,7 +208,7 @@ shared_ptr WorkerQueryStatusData::serializeJson(double timeoutAliveSecs, d jsUjIds.push_back(ujId); addedUjId = true; - double ageSecs = std::chrono::duration(now - tmStamp).count(); + double ageSecs = std::chrono::duration(tm - tmStamp).count(); if (ageSecs > maxLifetime) { iterUjId = ujIdMap.erase(iterUjId); } else { @@ -221,25 +220,19 @@ shared_ptr WorkerQueryStatusData::serializeJson(double timeoutAliveSecs, d jsDeadUj.push_back(jsQidUj); } - if (ujIdMap.empty() - || 
std::chrono::duration(now - oldestTm).count() > maxLifetime) { + if (ujIdMap.empty() || std::chrono::duration(tm - oldestTm).count() > maxLifetime) { iterDeadUjQid = _qIdDeadUberJobs.erase(iterDeadUjQid); } else { ++iterDeadUjQid; } } } - - /* &&& happens in the caller now. - // Start a thread to send the message. (Maybe these should go on the qdisppool? &&&) - // put this in a different function and start the thread.&&&; - _sendStatusMsg(jsWorkerReqPtr); - */ - return jsWorkerReqPtr; } WorkerQueryStatusData::Ptr WorkerQueryStatusData::createJson(nlohmann::json const& jsWorkerReq, - std::string const& replicationInstanceId, std::string const& replicationAuthKey, TIMEPOINT updateTm) { + std::string const& replicationInstanceId, + std::string const& replicationAuthKey, + TIMEPOINT updateTm) { LOGS(_log, LOG_LVL_ERROR, "WorkerQueryStatusData::createJson &&& a"); try { if (jsWorkerReq["version"] != http::MetaModule::version) { @@ -253,17 +246,21 @@ WorkerQueryStatusData::Ptr WorkerQueryStatusData::createJson(nlohmann::json cons auto wInfo_ = WorkerContactInfo::createJson(jsWorkerReq["worker"], updateTm); LOGS(_log, LOG_LVL_ERROR, "WorkerQueryStatusData::createJson &&& d"); if (czInfo_ == nullptr || wInfo_ == nullptr) { - LOGS(_log, LOG_LVL_ERROR, "WorkerQueryStatusData::createJson czar or worker info could not be parsed in " << jsWorkerReq); + LOGS(_log, LOG_LVL_ERROR, + "WorkerQueryStatusData::createJson czar or worker info could not be parsed in " + << jsWorkerReq); } - auto wqsData = WorkerQueryStatusData::create(wInfo_, czInfo_, replicationInstanceId, replicationAuthKey); + auto wqsData = + WorkerQueryStatusData::create(wInfo_, czInfo_, replicationInstanceId, replicationAuthKey); LOGS(_log, LOG_LVL_ERROR, "WorkerQueryStatusData::createJson &&& e"); - - auto parseRes = wqsData->_parseLists(jsWorkerReq, updateTm); - if (!parseRes) { - LOGS(_log, LOG_LVL_ERROR, "WorkerQueryStatusData::createJson error reading lists in " << jsWorkerReq); - return nullptr; - } + wqsData->parseLists(jsWorkerReq, updateTm); LOGS(_log, LOG_LVL_ERROR, "WorkerQueryStatusData::createJson &&& end"); + bool czarRestart = RequestBodyJSON::required(jsWorkerReq, "czarrestart"); + if (czarRestart) { + auto restartCzarId = RequestBodyJSON::required(jsWorkerReq, "czarrestartcancelczid"); + auto restartQueryId = RequestBodyJSON::required(jsWorkerReq, "czarrestartcancelqid"); + wqsData->setCzarCancelAfterRestart(restartCzarId, restartQueryId); + } return wqsData; } catch (invalid_argument const& exc) { LOGS(_log, LOG_LVL_ERROR, string("WorkerQueryStatusData::createJson invalid ") << exc.what()); @@ -271,48 +268,52 @@ WorkerQueryStatusData::Ptr WorkerQueryStatusData::createJson(nlohmann::json cons return nullptr; } -bool WorkerQueryStatusData::_parseLists(nlohmann::json const& jsWorkerReq, TIMEPOINT updateTm) { - try { - LOGS(_log, LOG_LVL_ERROR, cName(__func__) << " &&& a"); - auto& jsQIdDoneKeepFiles = jsWorkerReq["qiddonekeepfiles"]; - LOGS(_log, LOG_LVL_ERROR, cName(__func__) << " &&& b"); - for (auto const& qidKeep : jsQIdDoneKeepFiles) { - LOGS(_log, LOG_LVL_ERROR, cName(__func__) << " &&& b1"); - _qIdDoneKeepFiles[qidKeep] = updateTm; - } +void WorkerQueryStatusData::parseLists(nlohmann::json const& jsWR, TIMEPOINT updateTm) { + lock_guard mapLg(_mapMtx); + parseListsInto(jsWR, updateTm, _qIdDoneKeepFiles, _qIdDoneDeleteFiles, _qIdDeadUberJobs); +} - LOGS(_log, LOG_LVL_ERROR, cName(__func__) << " &&& c"); - auto& jsQIdDoneDeleteFiles = jsWorkerReq["qiddonedeletefiles"]; - LOGS(_log, LOG_LVL_ERROR, cName(__func__) << " 
&&& d"); - for (auto const& qidDelete : jsQIdDoneDeleteFiles) { - LOGS(_log, LOG_LVL_ERROR, cName(__func__) << " &&& d1"); - _qIdDoneDeleteFiles[qidDelete] = updateTm; - } +void WorkerQueryStatusData::parseListsInto(nlohmann::json const& jsWR, TIMEPOINT updateTm, + std::map& doneKeepF, + std::map& doneDeleteF, + std::map>& deadUberJobs) { + LOGS(_log, LOG_LVL_ERROR, "WorkerQueryStatusData::parseListsInto &&& a"); + auto& jsQIdDoneKeepFiles = jsWR["qiddonekeepfiles"]; + LOGS(_log, LOG_LVL_ERROR, "WorkerQueryStatusData::parseListsInto &&& b"); + for (auto const& qidKeep : jsQIdDoneKeepFiles) { + LOGS(_log, LOG_LVL_ERROR, "WorkerQueryStatusData::parseListsInto &&& b1"); + doneKeepF[qidKeep] = updateTm; + } - LOGS(_log, LOG_LVL_ERROR, cName(__func__) << " &&& e"); - auto& jsQIdDeadUberJobs = jsWorkerReq["qiddeaduberjobs"]; - LOGS(_log, LOG_LVL_ERROR, cName(__func__) << " &&& f jsQIdDeadUberJobs=" << jsQIdDeadUberJobs); - // Interestingly, !jsQIdDeadUberJobs.empty() doesn't work, but .size() > 0 does. - // Not having the size() check causes issues with the for loop trying to read the - // first element of an empty list, which goes badly. - if (jsQIdDeadUberJobs.size() > 0) { - LOGS(_log, LOG_LVL_ERROR, cName(__func__) << " &&& f1"); - for (auto const& qDeadUjs : jsQIdDeadUberJobs) { - LOGS(_log, LOG_LVL_ERROR, cName(__func__) << " &&& f1a qDeadUjs=" << qDeadUjs); - QueryId qId = qDeadUjs["qid"]; - auto const& ujIds = qDeadUjs["ujids"]; - auto& mapOfUj = _qIdDeadUberJobs[qId]; - for (auto const& ujId : ujIds) { - LOGS(_log, LOG_LVL_ERROR, cName(__func__) << " &&& f1d1 qId=" << qId << " ujId=" << ujId); - mapOfUj[ujId] = updateTm; - } + LOGS(_log, LOG_LVL_ERROR, "WorkerQueryStatusData::parseListsInto &&& c"); + auto& jsQIdDoneDeleteFiles = jsWR["qiddonedeletefiles"]; + LOGS(_log, LOG_LVL_ERROR, "WorkerQueryStatusData::parseListsInto &&& d"); + for (auto const& qidDelete : jsQIdDoneDeleteFiles) { + LOGS(_log, LOG_LVL_ERROR, "WorkerQueryStatusData::parseListsInto &&& d1"); + doneDeleteF[qidDelete] = updateTm; + } + + LOGS(_log, LOG_LVL_ERROR, "WorkerQueryStatusData::parseListsInto &&& e"); + auto& jsQIdDeadUberJobs = jsWR["qiddeaduberjobs"]; + LOGS(_log, LOG_LVL_ERROR, + "WorkerQueryStatusData::parseListsInto &&& f jsQIdDeadUberJobs=" << jsQIdDeadUberJobs); + // Interestingly, !jsQIdDeadUberJobs.empty() doesn't work, but .size() > 0 does. + // Not having the size() check causes issues with the for loop trying to read the + // first element of an empty list, which goes badly. 
+    if (jsQIdDeadUberJobs.size() > 0) {
+        LOGS(_log, LOG_LVL_ERROR, "WorkerQueryStatusData::parseListsInto &&& f1");
+        for (auto const& qDeadUjs : jsQIdDeadUberJobs) {
+            LOGS(_log, LOG_LVL_ERROR, "WorkerQueryStatusData::parseListsInto &&& f1a qDeadUjs=" << qDeadUjs);
+            QueryId qId = qDeadUjs["qid"];
+            auto const& ujIds = qDeadUjs["ujids"];
+            auto& mapOfUj = deadUberJobs[qId];
+            for (auto const& ujId : ujIds) {
+                LOGS(_log, LOG_LVL_ERROR,
+                     "WorkerQueryStatusData::parseListsInto &&& f1d1 qId=" << qId << " ujId=" << ujId);
+                mapOfUj[ujId] = updateTm;
             }
         }
-    } catch (invalid_argument const& exc) {
-        LOGS(_log, LOG_LVL_ERROR, string("WorkerQueryStatusData::_parseLists invalid ") << exc.what());
-        return false;
     }
-    return true;
 }
 
 void WorkerQueryStatusData::addDeadUberJobs(QueryId qId, std::vector<UberJobId> ujIds, TIMEPOINT tm) {
@@ -322,10 +323,71 @@ void WorkerQueryStatusData::addDeadUberJobs(QueryId qId, std::vector<UberJobId>
     }
 }
 
+void WorkerQueryStatusData::addToDoneDeleteFiles(QueryId qId) {
+    lock_guard<mutex> mapLg(_mapMtx);
+    _qIdDoneDeleteFiles[qId] = CLOCK::now();
+}
+
+void WorkerQueryStatusData::addToDoneKeepFiles(QueryId qId) {
+    lock_guard<mutex> mapLg(_mapMtx);
+    _qIdDoneKeepFiles[qId] = CLOCK::now();
+}
+
+void WorkerQueryStatusData::removeDeadUberJobsFor(QueryId qId) {
+    lock_guard<mutex> mapLg(_mapMtx);
+    _qIdDeadUberJobs.erase(qId);
+}
+
+json WorkerQueryStatusData::serializeResponseJson() {
+    // Go through the _qIdDoneKeepFiles, _qIdDoneDeleteFiles, and _qIdDeadUberJobs lists to build a
+    // response. Nothing should be deleted and time is irrelevant for this, so maxLifetime is enormous
+    // and any time could be used, but now is easy.
+    double maxLifetime = std::numeric_limits<double>::max();
+    auto now = CLOCK::now();
+    json jsResp = {{"success", 1}, {"errortype", "none"}, {"note", ""}};
+    addListsToJson(jsResp, now, maxLifetime);
+    return jsResp;
+}
+
+bool WorkerQueryStatusData::handleResponseJson(nlohmann::json const& jsResp) {
+    auto now = CLOCK::now();
+    std::map<QueryId, TIMEPOINT> doneKeepF;
+    std::map<QueryId, TIMEPOINT> doneDeleteF;
+    std::map<QueryId, std::map<UberJobId, TIMEPOINT>> deadUberJobs;
+    parseListsInto(jsResp, now, doneKeepF, doneDeleteF, deadUberJobs);
+
+    lock_guard<mutex> mapLg(_mapMtx);
+    // Remove entries from _qIdDoneKeepFiles
+    for (auto const& [qId, tm] : doneKeepF) {
+        _qIdDoneKeepFiles.erase(qId);
+    }
+
+    // Remove entries from _qIdDoneDeleteFiles
+    for (auto const& [qId, tm] : doneDeleteF) {
+        _qIdDoneDeleteFiles.erase(qId);
+    }
+
+    // Remove entries from _qIdDeadUberJobs
+    for (auto const& [qId, ujMap] : deadUberJobs) {
+        auto iter = _qIdDeadUberJobs.find(qId);
+        if (iter != _qIdDeadUberJobs.end()) {
+            auto& deadMap = iter->second;
+            for (auto const& [ujId, tm] : ujMap) {
+                deadMap.erase(ujId);
+            }
+            if (deadMap.empty()) {
+                _qIdDeadUberJobs.erase(iter);
+            }
+        }
+    }
+
+    return true;
+}
+
 string WorkerQueryStatusData::dump() const {
     stringstream os;
     os << "ActiveWorker " << ((_wInfo == nullptr) ? "?"
: _wInfo->dump()); return os.str(); } -} // namespace lsst::qserv::czar +} // namespace lsst::qserv::http diff --git a/src/http/WorkerQueryStatusData.h b/src/http/WorkerQueryStatusData.h index f0f6c1aaa..44282462c 100644 --- a/src/http/WorkerQueryStatusData.h +++ b/src/http/WorkerQueryStatusData.h @@ -35,7 +35,6 @@ #include "global/clock_defs.h" #include "global/intTypes.h" - // This header declarations namespace lsst::qserv::http { @@ -43,9 +42,7 @@ namespace lsst::qserv::http { class CzarContactInfo { public: using Ptr = std::shared_ptr; - std::string cName(const char* fnc) const { - return std::string("CzarContactInfo") + fnc; - } + std::string cName(const char* fnc) const { return std::string("CzarContactInfo") + fnc; } CzarContactInfo() = delete; CzarContactInfo(CzarContactInfo const&) = default; @@ -53,19 +50,21 @@ class CzarContactInfo { /// &&& doc bool compare(CzarContactInfo const& other) { - return (czName == other.czName && czId == other.czId && czPort == other.czPort && czHostName == other.czHostName); + return (czName == other.czName && czId == other.czId && czPort == other.czPort && + czHostName == other.czHostName); } - static Ptr create(std::string const& czName_, CzarIdType czId_, int czPort_, std::string const& czHostName_) { + static Ptr create(std::string const& czName_, CzarIdType czId_, int czPort_, + std::string const& czHostName_) { return Ptr(new CzarContactInfo(czName_, czId_, czPort_, czHostName_)); } static Ptr createJson(nlohmann::json const& czarJson); - std::string const czName; ///< czar "name" - CzarIdType const czId; ///< czar "id" - int const czPort; ///< czar "management-port" - std::string const czHostName; ///< czar "management-host-name" + std::string const czName; ///< czar "name" + CzarIdType const czId; ///< czar "id" + int const czPort; ///< czar "management-port" + std::string const czHostName; ///< czar "management-host-name" /// &&& doc nlohmann::json serializeJson() const; @@ -83,10 +82,9 @@ class CzarContactInfo { */ private: CzarContactInfo(std::string const& czName_, CzarIdType czId_, int czPort_, std::string const& czHostName_) - : czName(czName_), czId(czId_), czPort(czPort_), czHostName(czHostName_) {} + : czName(czName_), czId(czId_), czPort(czPort_), czHostName(czHostName_) {} }; - /// &&& doc This class just contains the worker id and network communication /// information, but it may be desirable to store connections to the /// worker here as well. 
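(Usage sketch for this class, mirroring the unit test later in this patch; the worker id, host
names, and port below are made up.)

    auto now = lsst::qserv::CLOCK::now();
    auto wA = lsst::qserv::http::WorkerContactInfo::create("worker_1", "host_a", "mgmt_a", 3421, now);
    auto wB = lsst::qserv::http::WorkerContactInfo::create("worker_1", "host_a", "mgmt_a", 3421, now);
    bool same = wA->isSameContactInfo(*wB);  // true: the registration time is deliberately ignored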
@@ -97,10 +95,9 @@ class WorkerContactInfo { using WCMap = std::unordered_map; using WCMapPtr = std::shared_ptr; - static Ptr create(std::string const& wId_, std::string const& wHost_, - std::string const& wManagementHost_, int wPort_, TIMEPOINT updateTime_) { - return Ptr(new WorkerContactInfo(wId_, wHost_, - wManagementHost_, wPort_, updateTime_)); + static Ptr create(std::string const& wId_, std::string const& wHost_, std::string const& wManagementHost_, + int wPort_, TIMEPOINT updateTime_) { + return Ptr(new WorkerContactInfo(wId_, wHost_, wManagementHost_, wPort_, updateTime_)); } /// &&& doc @@ -112,12 +109,9 @@ class WorkerContactInfo { std::string cName(const char* fn) { return std::string("WorkerContactInfo::") + fn; } /// &&& make private - WorkerContactInfo(std::string const& wId_, std::string const& wHost_, - std::string const& wManagementHost_, int wPort_, TIMEPOINT updateTime_) - : wId(wId_), - wHost(wHost_), - wManagementHost(wManagementHost_), - wPort(wPort_) { + WorkerContactInfo(std::string const& wId_, std::string const& wHost_, std::string const& wManagementHost_, + int wPort_, TIMEPOINT updateTime_) + : wId(wId_), wHost(wHost_), wManagementHost(wManagementHost_), wPort(wPort_) { regUpdateTime(updateTime_); } std::string const wId; ///< key @@ -125,7 +119,6 @@ class WorkerContactInfo { std::string const wManagementHost; ///< "management-host-name" entry. int const wPort; ///< "management-port" entry. - /// Return true if all members, aside from updateTime, are equal. bool isSameContactInfo(WorkerContactInfo const& other) const { return (wId == other.wId && wHost == other.wHost && wManagementHost == other.wManagementHost && @@ -156,42 +149,31 @@ class WorkerContactInfo { /// &&& Store in seconds since epoch to make atomic? TIMEPOINT _regUpdate; - mutable std::mutex _rMtx; ///< protects _regUpdate + mutable std::mutex _rMtx; ///< protects _regUpdate }; - /// &&& doc -class WorkerQueryStatusData { +class WorkerQueryStatusData { public: using Ptr = std::shared_ptr; - /* &&& - enum State { - ALIVE = 0, - QUESTIONABLE, - DEAD - }; - */ - WorkerQueryStatusData() = delete; WorkerQueryStatusData(WorkerQueryStatusData const&) = delete; WorkerQueryStatusData& operator=(WorkerQueryStatusData const&) = delete; std::string cName(const char* fName) { - return std::string("WorkerQueryStatusData::") + fName + " " + ((_wInfo == nullptr) ? "?" : _wInfo->wId); + return std::string("WorkerQueryStatusData::") + fName + " " + + ((_wInfo == nullptr) ? "?" 
: _wInfo->wId); } - //&&&static std::string getStateStr(State st); - static Ptr create(WorkerContactInfo::Ptr const& wInfo, CzarContactInfo::Ptr const& czInfo, - std::string const& replicationInstanceId, std::string const& replicationAuthKey) { + std::string const& replicationInstanceId, std::string const& replicationAuthKey) { return Ptr(new WorkerQueryStatusData(wInfo, czInfo, replicationInstanceId, replicationAuthKey)); } /// &&& doc - static Ptr createJson(nlohmann::json const& czarJson, - std::string const& replicationInstanceId, std::string const& replicationAuthKey, TIMEPOINT updateTm); - + static Ptr createJson(nlohmann::json const& czarJson, std::string const& replicationInstanceId, + std::string const& replicationAuthKey, TIMEPOINT updateTm); ~WorkerQueryStatusData() = default; @@ -200,33 +182,79 @@ class WorkerQueryStatusData { /// &&& doc void addDeadUberJobs(QueryId qId, std::vector ujIds, TIMEPOINT tm); + /// &&& doc + void addToDoneDeleteFiles(QueryId qId); + + /// &&& doc + void addToDoneKeepFiles(QueryId qId); + + /// &&& doc + void removeDeadUberJobsFor(QueryId qId); + + void setCzarCancelAfterRestart(CzarIdType czId, QueryId lastQId) { + std::lock_guard mapLg(_mapMtx); + _czarCancelAfterRestart = true; + _czarCancelAfterRestartCzId = czId; + _czarCancelAfterRestartQId = lastQId; + } + + bool isCzarRestart() const { return _czarCancelAfterRestart; } + CzarIdType getCzarRestartCzarId() const { return _czarCancelAfterRestartCzId; } + QueryId getCzarRestartQueryId() const { return _czarCancelAfterRestartQId; } + std::string dump() const; -//&&&private: + //&&&private: // &&& Most of this needs to be made private again. WorkerQueryStatusData(WorkerContactInfo::Ptr const& wInfo, CzarContactInfo::Ptr const& czInfo, - std::string const& replicationInstanceId, std::string const& replicationAuthKey) - : _wInfo(wInfo), _czInfo(czInfo), - _replicationInstanceId(replicationInstanceId), _replicationAuthKey(replicationAuthKey) {} + std::string const& replicationInstanceId, std::string const& replicationAuthKey) + : _wInfo(wInfo), + _czInfo(czInfo), + _replicationInstanceId(replicationInstanceId), + _replicationAuthKey(replicationAuthKey) {} - std::map _qIdDoneKeepFiles; ///< &&& doc - limit reached - std::map _qIdDoneDeleteFiles; ///< &&& doc -cancelled/finished - std::map> _qIdDeadUberJobs; ///< &&& doc + std::map _qIdDoneKeepFiles; ///< &&& doc - limit reached + std::map _qIdDoneDeleteFiles; ///< &&& doc -cancelled/finished + std::map> _qIdDeadUberJobs; ///< &&& doc + std::atomic _czarCancelAfterRestart = false; + CzarIdType _czarCancelAfterRestartCzId = 0; + QueryId _czarCancelAfterRestartQId = 0; - /// &&& TODO:UJ Worth the effort to inform worker of killed UberJobs? - //std::map> _killedUberJobs; + /// Protects _qIdDoneKeepFiles, _qIdDoneDeleteFiles, _qIdDeadUberJobs, + /// and czarCancelAfter variables. + mutable std::mutex _mapMtx; - WorkerContactInfo::Ptr _wInfo; ///< &&& doc - CzarContactInfo::Ptr _czInfo; //< &&& doc + WorkerContactInfo::Ptr _wInfo; ///< &&& doc make const??? + CzarContactInfo::Ptr _czInfo; //< &&& doc make const??? - std::string const _replicationInstanceId; ///< &&& doc - std::string const _replicationAuthKey; ///< &&& doc + std::string const _replicationInstanceId; ///< &&& doc + std::string const _replicationAuthKey; ///< &&& doc - /// &&& doc - std::shared_ptr serializeJson(double timeoutAliveSecs, double timeoutDeadSecs, double maxLifetime); + /// Create a json object held by a shared pointer to use as a message. 
+ /// Old objects in this instance will be removed after being added to the + /// json message. + std::shared_ptr serializeJson(double maxLifetime); + + /// Add contents of qIdDoneKeepFiles, _qIdDoneDeleteFiles, and _qIdDeadUberJobs to `jsWR` + void addListsToJson(nlohmann::json& jsWR, TIMEPOINT tm, double maxLifetime); /// &&& doc /// @throws std::invalid_argument - bool _parseLists(nlohmann::json const& jsWorkerReq, TIMEPOINT updateTm); // &&& delete after basic testing + void parseLists(nlohmann::json const& jsWR, TIMEPOINT updateTm); + + /// &&& doc + nlohmann::json serializeResponseJson(); + + /// &&& doc + bool handleResponseJson(nlohmann::json const& jsResp); + + /// &&& doc + ///&&&void handleCzarRestart(); + + /// &&& doc + static void parseListsInto(nlohmann::json const& jsWR, TIMEPOINT updateTm, + std::map& doneKeepF, + std::map& doneDeleteF, + std::map>& deadUberJobs); }; } // namespace lsst::qserv::http diff --git a/src/http/testStatusData.cc b/src/http/testStatusData.cc index 97767dd9f..191053631 100644 --- a/src/http/testStatusData.cc +++ b/src/http/testStatusData.cc @@ -44,16 +44,15 @@ using namespace lsst::qserv::http; BOOST_AUTO_TEST_SUITE(Suite) BOOST_AUTO_TEST_CASE(CzarContactInfo) { - string const replicationInstanceId = "repliInstId"; string const replicationAuthKey = "repliIAuthKey"; - string const cName("czar_name"); - lsst::qserv::CzarIdType const cId = 32; - int cPort = 2022; - string const cHost("cz_host"); + string const czrName("czar_name"); + lsst::qserv::CzarIdType const czrId = 32; + int czrPort = 2022; + string const czrHost("cz_host"); - auto czarA = lsst::qserv::http::CzarContactInfo::create(cName, cId, cPort, cHost); + auto czarA = lsst::qserv::http::CzarContactInfo::create(czrName, czrId, czrPort, czrHost); LOGS_ERROR("&&& a czarA=" << czarA->dump()); auto czarAJs = czarA->serializeJson(); @@ -63,7 +62,7 @@ BOOST_AUTO_TEST_CASE(CzarContactInfo) { LOGS_ERROR("&&& c czarB=" << czarB); BOOST_REQUIRE(czarA->compare(*czarB)); - auto czarC = lsst::qserv::http::CzarContactInfo::create("different", cId, cPort, cHost); + auto czarC = lsst::qserv::http::CzarContactInfo::create("different", czrId, czrPort, czrHost); BOOST_REQUIRE(!czarA->compare(*czarC)); auto start = lsst::qserv::CLOCK::now(); @@ -80,29 +79,30 @@ BOOST_AUTO_TEST_CASE(CzarContactInfo) { BOOST_REQUIRE(workerA->isSameContactInfo(*workerA1)); // WorkerQueryStatusData - auto wqsdA = lsst::qserv::http::WorkerQueryStatusData::create(workerA, czarA, replicationInstanceId, replicationAuthKey); + auto wqsdA = lsst::qserv::http::WorkerQueryStatusData::create(workerA, czarA, replicationInstanceId, + replicationAuthKey); LOGS_ERROR("&&& g wqsdA=" << wqsdA->dump()); - double timeoutAliveSecs = 100.0; - double timeoutDeadSecs = 2*timeoutAliveSecs; + //&&&double timeoutAliveSecs = 100.0; + //&&&double timeoutDeadSecs = 2*timeoutAliveSecs; double maxLifetime = 300.0; - auto jsDataA = wqsdA->serializeJson(timeoutAliveSecs, timeoutDeadSecs, maxLifetime); + auto jsDataA = wqsdA->serializeJson(maxLifetime); LOGS_ERROR("&&& h jsDataA=" << *jsDataA); // Check that empty lists work. 
- auto wqsdA1 = lsst::qserv::http::WorkerQueryStatusData::createJson(*jsDataA, replicationInstanceId, replicationAuthKey, start1Sec); + auto wqsdA1 = lsst::qserv::http::WorkerQueryStatusData::createJson(*jsDataA, replicationInstanceId, + replicationAuthKey, start1Sec); LOGS_ERROR("&&& i wqsdA1=" << wqsdA1->dump()); - auto jsDataA1 = wqsdA1->serializeJson(timeoutAliveSecs, timeoutDeadSecs, maxLifetime); + auto jsDataA1 = wqsdA1->serializeJson(maxLifetime); BOOST_REQUIRE(*jsDataA == *jsDataA1); - - vector qIdsDelFiles = { 7, 8, 9, 15, 25, 26, 27, 30 }; - vector qIdsKeepFiles = { 1, 2, 3, 4, 6, 10, 13, 19, 33 }; + vector qIdsDelFiles = {7, 8, 9, 15, 25, 26, 27, 30}; + vector qIdsKeepFiles = {1, 2, 3, 4, 6, 10, 13, 19, 33}; for (auto const qIdDF : qIdsDelFiles) { wqsdA->_qIdDoneDeleteFiles[qIdDF] = start; } - jsDataA = wqsdA->serializeJson(timeoutAliveSecs, timeoutDeadSecs, maxLifetime); + jsDataA = wqsdA->serializeJson(maxLifetime); LOGS_ERROR("&&& j jsDataA=" << jsDataA); BOOST_REQUIRE(*jsDataA != *jsDataA1); @@ -114,27 +114,42 @@ BOOST_AUTO_TEST_CASE(CzarContactInfo) { LOGS_ERROR("&&& i wqsdA=" << wqsdA->dump()); - jsDataA = wqsdA->serializeJson(timeoutAliveSecs, timeoutDeadSecs, maxLifetime); + jsDataA = wqsdA->serializeJson(maxLifetime); LOGS_ERROR("&&& j jsDataA=" << *jsDataA); auto start5Sec = start + 5s; - auto workerAFromJson = lsst::qserv::http::WorkerQueryStatusData::createJson(*jsDataA, replicationInstanceId, replicationAuthKey, start5Sec); - auto jsWorkerAFromJson = workerAFromJson->serializeJson(timeoutAliveSecs, timeoutDeadSecs, maxLifetime); + auto workerAFromJson = lsst::qserv::http::WorkerQueryStatusData::createJson( + *jsDataA, replicationInstanceId, replicationAuthKey, start5Sec); + auto jsWorkerAFromJson = workerAFromJson->serializeJson(maxLifetime); BOOST_REQUIRE(*jsDataA == *jsWorkerAFromJson); wqsdA->addDeadUberJobs(12, {34}, start5Sec); wqsdA->addDeadUberJobs(91, {77}, start5Sec); wqsdA->addDeadUberJobs(1059, {1, 4, 6, 7, 8, 10, 3, 22, 93}, start5Sec); - jsDataA = wqsdA->serializeJson(timeoutAliveSecs, timeoutDeadSecs, maxLifetime); + jsDataA = wqsdA->serializeJson(maxLifetime); LOGS_ERROR("&&& k jsDataA=" << *jsDataA); BOOST_REQUIRE(*jsDataA != *jsWorkerAFromJson); - workerAFromJson = lsst::qserv::http::WorkerQueryStatusData::createJson(*jsDataA, replicationInstanceId, replicationAuthKey, start5Sec); - jsWorkerAFromJson = workerAFromJson->serializeJson(timeoutAliveSecs, timeoutDeadSecs, maxLifetime); + workerAFromJson = lsst::qserv::http::WorkerQueryStatusData::createJson(*jsDataA, replicationInstanceId, + replicationAuthKey, start5Sec); + jsWorkerAFromJson = workerAFromJson->serializeJson(maxLifetime); LOGS_ERROR("&&& l jsWorkerAFromJson=" << *jsWorkerAFromJson); BOOST_REQUIRE(*jsDataA == *jsWorkerAFromJson); + // Make the response, which contains lists of the items handled by the workers. + auto jsWorkerResp = workerAFromJson->serializeResponseJson(); + + // test removal of elements after response. 
+ BOOST_REQUIRE(!wqsdA->_qIdDoneDeleteFiles.empty()); + BOOST_REQUIRE(!wqsdA->_qIdDoneKeepFiles.empty()); + BOOST_REQUIRE(!wqsdA->_qIdDeadUberJobs.empty()); + + wqsdA->handleResponseJson(jsWorkerResp); + + BOOST_REQUIRE(wqsdA->_qIdDoneDeleteFiles.empty()); + BOOST_REQUIRE(wqsdA->_qIdDoneKeepFiles.empty()); + BOOST_REQUIRE(wqsdA->_qIdDeadUberJobs.empty()); } BOOST_AUTO_TEST_SUITE_END() diff --git a/src/proto/ScanTableInfo.h b/src/proto/ScanTableInfo.h index bb362c51d..76d03e5f4 100644 --- a/src/proto/ScanTableInfo.h +++ b/src/proto/ScanTableInfo.h @@ -35,7 +35,7 @@ namespace lsst::qserv::proto { /// Structure to store shared scan information for a single table. /// -struct ScanTableInfo { // &&& check if still useful +struct ScanTableInfo { // &&& check if still useful using ListOf = std::vector; ScanTableInfo() = default; diff --git a/src/proto/worker.proto b/src/proto/worker.proto index 4ef2ae4e7..0310420ed 100644 --- a/src/proto/worker.proto +++ b/src/proto/worker.proto @@ -89,7 +89,7 @@ message WorkerCommandStatus { optional string error = 2 [default = ""]; // Optional error message (depends on the code) } -// &&& try to eliminate this +// &&&QM try to eliminate this message QueryManagement { enum Operation { CANCEL_AFTER_RESTART = 1; // Cancel older queries before the specified query (excluding that one). diff --git a/src/qdisp/Executive.cc b/src/qdisp/Executive.cc index 125d1987a..3d5463a88 100644 --- a/src/qdisp/Executive.cc +++ b/src/qdisp/Executive.cc @@ -489,14 +489,20 @@ void Executive::_squashSuperfluous() { LOGS(_log, LOG_LVL_DEBUG, "Executive::squashSuperfluous done"); } -void Executive::sendWorkerCancelMsg(bool deleteResults) { +void Executive::sendWorkerCancelMsg(bool deleteResults) { // &&&QM rename sendEndMsgs // TODO:UJ need to send a message to the worker that the query is cancelled and all result files // should be delete + // &&&QM + // TODO:UJ &&& worker needs to monitor registry to see if czar dies + // &&& - worker will need to kill related queries/uberjobs and store info to send to the + // &&& dead czar in case it comes back to life. LOGS(_log, LOG_LVL_ERROR, "TODO:UJ NEED CODE Executive::sendWorkerCancelMsg to send messages to workers to cancel this czarId " "+ " "queryId. " << deleteResults); + + czar::Czar::getCzar()->getCzarRegistry()->endUserQuery(_id, deleteResults); // &&&QM } int Executive::getNumInflight() const { diff --git a/src/qdisp/Executive.h b/src/qdisp/Executive.h index 5c4beba84..cdbb967f6 100644 --- a/src/qdisp/Executive.h +++ b/src/qdisp/Executive.h @@ -52,7 +52,7 @@ #include "util/ThreadPool.h" // TODO:UJ replace with better enable/disable feature, or just use only UberJobs -#define uberJobsEnabled 1 // &&& delete +#define uberJobsEnabled 1 // &&& delete namespace lsst::qserv { @@ -255,8 +255,8 @@ class Executive : public std::enable_shared_from_this { std::atomic _empty{true}; std::shared_ptr _messageStore; ///< MessageStore for logging - JobMap _jobMap; ///< Contains information about all jobs. - JobMap _incompleteJobs; ///< Map of incomplete jobs. + JobMap _jobMap; ///< Contains information about all jobs. + JobMap _incompleteJobs; ///< Map of incomplete jobs. /// How many jobs are used in this query. 1 avoids possible 0 of 0 jobs completed race condition. /// The correct value is set when it is available. 
std::atomic _totalJobs{1}; @@ -267,7 +267,7 @@ class Executive : public std::enable_shared_from_this { /** Execution errors */ util::MultiError _multiError; - std::atomic _requestCount{0}; ///< Count of submitted jobs + std::atomic _requestCount{0}; ///< Count of submitted jobs util::Flag _cancelled{false}; ///< Has execution been cancelled. // Mutexes diff --git a/src/qdisp/JobBase.h b/src/qdisp/JobBase.h index 1a4239457..a77476daa 100644 --- a/src/qdisp/JobBase.h +++ b/src/qdisp/JobBase.h @@ -59,7 +59,8 @@ class JobBase : public std::enable_shared_from_this { virtual UberJobId getJobId() const = 0; virtual std::string const& getIdStr() const = 0; virtual std::shared_ptr getQdispPool() = 0; - //&&& virtual std::string const& getPayload() const = 0; ///< const& in return type is essential for xrootd + //&&& virtual std::string const& getPayload() const = 0; ///< const& in return type is essential for + //xrootd virtual std::shared_ptr getRespHandler() = 0; virtual std::shared_ptr getStatus() = 0; virtual bool getScanInteractive() const = 0; diff --git a/src/qdisp/JobDescription.cc b/src/qdisp/JobDescription.cc index ad8d3e62b..ab4234545 100644 --- a/src/qdisp/JobDescription.cc +++ b/src/qdisp/JobDescription.cc @@ -102,8 +102,8 @@ bool JobDescription::getScanInteractive() const { return _chunkQuerySpec->scanIn int JobDescription::getScanRating() const { return _chunkQuerySpec->scanInfo.scanRating; } ostream& operator<<(ostream& os, JobDescription const& jd) { - os << "job(id=" << jd._jobId << " ru=" << jd._resource.path() - << " attemptCount=" << jd._attemptCount << ")"; + os << "job(id=" << jd._jobId << " ru=" << jd._resource.path() << " attemptCount=" << jd._attemptCount + << ")"; return os; } diff --git a/src/qdisp/JobQuery.cc b/src/qdisp/JobQuery.cc index 7245e60a2..ad28f5c7e 100644 --- a/src/qdisp/JobQuery.cc +++ b/src/qdisp/JobQuery.cc @@ -63,7 +63,7 @@ JobQuery::~JobQuery() { } /// Cancel response handling. Return true if this is the first time cancel has been called. -bool JobQuery::cancel(bool superfluous) { /// &&& This can probably be simplified more +bool JobQuery::cancel(bool superfluous) { /// &&& This can probably be simplified more QSERV_LOGCONTEXT_QUERY_JOB(getQueryId(), getJobId()); LOGS(_log, LOG_LVL_DEBUG, "JobQuery::cancel()"); if (_cancelled.exchange(true) == false) { diff --git a/src/qdisp/UberJob.h b/src/qdisp/UberJob.h index 06f45ba72..b29baf266 100644 --- a/src/qdisp/UberJob.h +++ b/src/qdisp/UberJob.h @@ -95,7 +95,8 @@ class UberJob : public JobBase { /// Set the worker information needed to send messages to the worker believed to /// be responsible for the chunks handled in this UberJob. - void setWorkerContactInfo(http::WorkerContactInfo::Ptr const& wContactInfo) { // Change to ActiveWorker &&& ??? + void setWorkerContactInfo( + http::WorkerContactInfo::Ptr const& wContactInfo) { // Change to ActiveWorker &&& ??? _wContactInfo = wContactInfo; } @@ -159,7 +160,7 @@ class UberJob : public JobBase { czar::CzarChunkMap::WorkerChunksData::Ptr _workerData; // TODO:UJ this may not be needed // Contact information for the target worker. - http::WorkerContactInfo::Ptr _wContactInfo; // Change to ActiveWorker &&& ??? + http::WorkerContactInfo::Ptr _wContactInfo; // Change to ActiveWorker &&& ??? 
}; } // namespace lsst::qserv::qdisp diff --git a/src/qdisp/testQDisp.cc b/src/qdisp/testQDisp.cc index b750af776..ce3a4069d 100644 --- a/src/qdisp/testQDisp.cc +++ b/src/qdisp/testQDisp.cc @@ -337,7 +337,8 @@ BOOST_AUTO_TEST_CASE(ExecutiveCancel) { { LOGS_DEBUG("ExecutiveCancel: squash it test"); SetupTest tEnv("respdata"); - //&&&qdisp::XrdSsiServiceMock::setGo(false); // Can't let jobs run or they are untracked before squash + //&&&qdisp::XrdSsiServiceMock::setGo(false); // Can't let jobs run or they are untracked before + //squash SequentialInt sequence(0); tEnv.jqTest = executiveTest(tEnv.ex, sequence, chunkId, tEnv.qrMsg, 1); tEnv.ex->squash(); @@ -355,7 +356,8 @@ BOOST_AUTO_TEST_CASE(ExecutiveCancel) { { LOGS_DEBUG("ExecutiveCancel: squash 20 test"); SetupTest tEnv("respdata"); - //&&&qdisp::XrdSsiServiceMock::setGo(false); // Can't let jobs run or they are untracked before squash + //&&&qdisp::XrdSsiServiceMock::setGo(false); // Can't let jobs run or they are untracked before + //squash SequentialInt sequence(0); executiveTest(tEnv.ex, sequence, chunkId, tEnv.qrMsg, 20); tEnv.ex->squash(); diff --git a/src/wbase/FileChannelShared.cc b/src/wbase/FileChannelShared.cc index 722d4ea0c..cf8c06fc3 100644 --- a/src/wbase/FileChannelShared.cc +++ b/src/wbase/FileChannelShared.cc @@ -365,15 +365,9 @@ bool FileChannelShared::buildAndTransmitError(util::MultiError& multiErr, shared } return true; } else { - auto ujData = _uberJobData.lock(); - if (ujData == nullptr) { - LOGS(_log, LOG_LVL_WARN, - __func__ << " not sending error as ujData is null " << multiErr.toString()); - return false; - } // Delete the result file as nobody will come looking for it. _kill(tMtxLock, " buildAndTransmitError"); - return ujData->responseError(multiErr, task, cancelled); + return _uberJobData->responseError(multiErr, task, cancelled); } return false; } @@ -660,13 +654,8 @@ bool FileChannelShared::_sendResponse(lock_guard const& tMtxLock, shared_ return false; } } else { - auto ujData = _uberJobData.lock(); - if (ujData == nullptr) { - LOGS(_log, LOG_LVL_WARN, __func__ << " uberJobData is nullptr for ujId=" << _uberJobId); - return false; - } string httpFileUrl = task->resultFileHttpUrl(); - ujData->responseFileReady(httpFileUrl, _rowcount, _transmitsize, _headerCount); + _uberJobData->responseFileReady(httpFileUrl, _rowcount, _transmitsize, _headerCount); } return true; } diff --git a/src/wbase/FileChannelShared.h b/src/wbase/FileChannelShared.h index 102f87fe2..69e4268fe 100644 --- a/src/wbase/FileChannelShared.h +++ b/src/wbase/FileChannelShared.h @@ -243,7 +243,7 @@ class FileChannelShared { bool _isUberJob; ///< true if this is using UberJob http. To be removed when _sendChannel goes away. std::shared_ptr const _sendChannel; ///< Used to pass encoded information to XrdSsi. - std::weak_ptr _uberJobData; ///< Pointer to UberJobData + std::shared_ptr _uberJobData; ///< Contains czar contact info. UberJobId const _uberJobId; ///< The UberJobId qmeta::CzarId const _czarId; ///< id of the czar that requested this task(s). TODO:UJ delete diff --git a/src/wbase/MsgProcessor.h b/src/wbase/MsgProcessor.h index 8b48de7ec..4f875f93e 100644 --- a/src/wbase/MsgProcessor.h +++ b/src/wbase/MsgProcessor.h @@ -42,21 +42,23 @@ class WorkerCommand; namespace lsst::qserv::wbase { /// MsgProcessor implementations handle incoming Task objects. -struct MsgProcessor { // &&& delete file if possible +struct MsgProcessor { // &&& delete file if possible virtual ~MsgProcessor() {} /// Process a group of query processing tasks. 
-    virtual void processTasks(std::vector<std::shared_ptr<Task>> const& tasks) = 0; // &&& delete
+    virtual void processTasks(std::vector<std::shared_ptr<Task>> const& tasks) = 0;  // &&& delete
 
     /// Process a management command
-    virtual void processCommand(std::shared_ptr<WorkerCommand> const& command) = 0; // &&& can this be deleted
+    virtual void processCommand(
+            std::shared_ptr<WorkerCommand> const& command) = 0;  // &&& can this be deleted
 
     /**
      * Retrieve the status of queries being processed by the worker.
      * @param taskSelector Task selection criteria.
      * @return a JSON representation of the object's status for the monitoring
      */
-    virtual nlohmann::json statusToJson(wbase::TaskSelector const& taskSelector) = 0; // &&& can this be deleted
+    virtual nlohmann::json statusToJson(
+            wbase::TaskSelector const& taskSelector) = 0;  // &&& can this be deleted
 };
 
 }  // namespace lsst::qserv::wbase
diff --git a/src/wbase/Task.cc b/src/wbase/Task.cc
index cc7c1668f..b3c4f8818 100644
--- a/src/wbase/Task.cc
+++ b/src/wbase/Task.cc
@@ -120,13 +120,21 @@ atomic<uint64_t> taskSequence{0};  ///< Unique identifier source for Task.
 /// available to define the action to take when this task is run, so
 /// Command::setFunc() is used to set the action later. This is why
 /// the util::CommandThreadPool is not called here.
+/* &&&
 Task::Task(UberJobData::Ptr const& ujData, int jobId, int attemptCount, int chunkId, int fragmentNumber,
            shared_ptr<UserQueryInfo> const& userQueryInfo, size_t templateId, bool hasSubchunks,
            int subchunkId, string const& db, proto::ScanInfo const& scanInfo, bool scanInteractive,
            int maxTableSize, vector<TaskDbTbl> const& fragSubTables, vector<int> const& fragSubchunkIds,
-           shared_ptr<FileChannelShared> const& sc, uint16_t resultsHttpPort)
-    : _userQueryInfo(userQueryInfo),
-      _sendChannel(sc),
+           shared_ptr<FileChannelShared> const& sc, std::shared_ptr<wpublish::QueryStatistics> const&
+queryStats_, uint16_t resultsHttpPort) : _userQueryInfo(userQueryInfo),
+*/
+Task::Task(UberJobData::Ptr const& ujData, int jobId, int attemptCount, int chunkId, int fragmentNumber,
+           size_t templateId, bool hasSubchunks, int subchunkId, string const& db,
+           proto::ScanInfo const& scanInfo, bool scanInteractive, int maxTableSize,
+           vector<TaskDbTbl> const& fragSubTables, vector<int> const& fragSubchunkIds,
+           shared_ptr<FileChannelShared> const& sc,
+           std::shared_ptr<wpublish::QueryStatistics> const& queryStats_, uint16_t resultsHttpPort)
+    : _sendChannel(sc),
       _tSeq(++taskSequence),
       _qId(ujData->getQueryId()),
       _templateId(templateId),
@@ -141,6 +149,7 @@ Task::Task(UberJobData::Ptr const& ujData, int jobId, int attemptCount, int chun
       _czarId(ujData->getCzarId()),
       _scanInfo(scanInfo),
       _scanInteractive(scanInteractive),
+      _queryStats(queryStats_),
       _maxTableSize(maxTableSize * ::MB_SIZE_BYTES) {
     // These attributes will be passed back to Czar in the Protobuf response
     // to advise which result delivery channel to use.
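(The hunks below move Task to a non-owning `std::weak_ptr<wpublish::QueryStatistics>` member; they
rely on the usual lock-and-check idiom, sketched generically here. `Stats`, `owner`, `getStats()`,
and `touch()` are stand-in names, not part of this patch.)

    struct Stats { void touch(); };               // stand-in for wpublish::QueryStatistics
    std::weak_ptr<Stats> wp = owner->getStats();  // non-owning reference held by the task
    if (auto sp = wp.lock()) {                    // promote to shared_ptr for the duration of use
        sp->touch();                              // safe: the object cannot be destroyed mid-call
    } else {
        // owner already destroyed; log and fall back instead of dereferencing
    }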
@@ -191,14 +200,15 @@ Task::Task(UberJobData::Ptr const& ujData, int jobId, int attemptCount, int chun } Task::~Task() { + /* &&& _userQueryInfo.reset(); UserQueryInfo::uqMapErase(_qId); if (UserQueryInfo::uqMapGet(_qId) == nullptr) { LOGS(_log, LOG_LVL_TRACE, "~Task Cleared uqMap entry for _qId=" << _qId); } + */ } - std::vector Task::createTasksForChunk( std::shared_ptr const& ujData, nlohmann::json const& jsJobs, std::shared_ptr const& sendChannel, proto::ScanInfo const& scanInfo, @@ -209,7 +219,9 @@ std::vector Task::createTasksForChunk( QueryId qId = ujData->getQueryId(); UberJobId ujId = ujData->getUberJobId(); - UserQueryInfo::Ptr userQueryInfo = UserQueryInfo::uqMapInsert(qId); + //&&&UserQueryInfo::Ptr userQueryInfo = UserQueryInfo::uqMapInsert(qId); + wpublish::QueryStatistics::Ptr queryStats = queriesAndChunks->addQueryId(qId); + UserQueryInfo::Ptr userQueryInfo = queryStats->getUserQueryInfo(); string funcN(__func__); funcN += " QID=" + to_string(qId) + " "; @@ -274,19 +286,35 @@ std::vector Task::createTasksForChunk( if (fragSubchunkIds.empty()) { bool const noSubchunks = false; int const subchunkId = -1; + /* &&& auto task = Task::Ptr(new Task( ujData, jdJobId, jdAttemptCount, jdChunkId, fragmentNumber, userQueryInfo, templateId, noSubchunks, subchunkId, jdQuerySpecDb, scanInfo, scanInteractive, - maxTableSizeMb, fragSubTables, fragSubchunkIds, sendChannel, resultsHttpPort)); + maxTableSizeMb, fragSubTables, fragSubchunkIds, sendChannel, queryStats, + resultsHttpPort)); + */ + auto task = Task::Ptr(new Task( + ujData, jdJobId, jdAttemptCount, jdChunkId, fragmentNumber, templateId, + noSubchunks, subchunkId, jdQuerySpecDb, scanInfo, scanInteractive, maxTableSizeMb, + fragSubTables, fragSubchunkIds, sendChannel, queryStats, resultsHttpPort)); + vect.push_back(task); } else { for (auto subchunkId : fragSubchunkIds) { bool const hasSubchunks = true; + auto task = Task::Ptr(new Task(ujData, jdJobId, jdAttemptCount, jdChunkId, + fragmentNumber, templateId, hasSubchunks, subchunkId, + jdQuerySpecDb, scanInfo, scanInteractive, + maxTableSizeMb, fragSubTables, fragSubchunkIds, + sendChannel, queryStats, resultsHttpPort)); + /* &&& auto task = Task::Ptr(new Task(ujData, jdJobId, jdAttemptCount, jdChunkId, fragmentNumber, userQueryInfo, templateId, hasSubchunks, subchunkId, jdQuerySpecDb, scanInfo, scanInteractive, maxTableSizeMb, fragSubTables, - fragSubchunkIds, sendChannel, resultsHttpPort)); + fragSubchunkIds, sendChannel, queryStats, + resultsHttpPort)); + */ vect.push_back(task); } } @@ -340,13 +368,30 @@ void Task::action(util::CmdData* data) { } string Task::getQueryString() const { - string qs = _userQueryInfo->getTemplate(_templateId); + //&&&string qs = _userQueryInfo->getTemplate(_templateId); + auto qStats = _queryStats.lock(); + if (qStats == nullptr) { + LOGS(_log, LOG_LVL_ERROR, cName(__func__) << " _queryStats could not be locked"); + return string(""); + } + + // auto uQInfo = _userQueryInfo.lock(); + auto uQInfo = qStats->getUserQueryInfo(); + /* &&& + if (uQInfo == nullptr) { + LOGS(_log, LOG_LVL_ERROR, cName(__func__) << " _userQueryInfo could not be locked"); + return string(""); + } + */ + string qs = uQInfo->getTemplate(_templateId); + LOGS(_log, LOG_LVL_WARN, cName(__func__) << " &&& a qs=" << qs); boost::algorithm::replace_all(qs, CHUNK_TAG, to_string(_chunkId)); boost::algorithm::replace_all(qs, SUBCHUNK_TAG, to_string(_subchunkId)); + LOGS(_log, LOG_LVL_WARN, cName(__func__) << " &&& b qs=" << qs); return qs; } -void 
Task::setQueryStatistics(wpublish::QueryStatistics::Ptr const& qStats) { _queryStats = qStats; } +//&&&void Task::setQueryStatistics(wpublish::QueryStatistics::Ptr const& qStats) { _queryStats = qStats; } wpublish::QueryStatistics::Ptr Task::getQueryStats() const { auto qStats = _queryStats.lock(); diff --git a/src/wbase/Task.h b/src/wbase/Task.h index 460a31c06..fa519732e 100644 --- a/src/wbase/Task.h +++ b/src/wbase/Task.h @@ -167,18 +167,29 @@ class Task : public util::CommandForThreadPool { bool operator()(Ptr const& x, Ptr const& y); }; + std::string cName(const char* func) const { return std::string("Task::") + func; } + // TODO:UJ too many parameters. // - fragmentNumber seems pointless // - hasSubchunks seems redundant. // Hopefully, many are the same for all tasks and can be moved to ujData and userQueryInfo. // Candidates: scanInfo, maxTableSizeMb, FileChannelShared, resultsHttpPort. // Unfortunately, this will be much easier if it is done after xrootd method is removed. + /* &&& Task(std::shared_ptr const& ujData, int jobId, int attemptCount, int chunkId, int fragmentNumber, std::shared_ptr const& userQueryInfo, size_t templateId, bool hasSubchunks, int subchunkId, std::string const& db, proto::ScanInfo const& scanInfo, bool scanInteractive, int maxTableSizeMb, std::vector const& fragSubTables, std::vector const& fragSubchunkIds, std::shared_ptr const& sc, + std::shared_ptr const& queryStats_, uint16_t resultsHttpPort = 8080); + */ + Task(std::shared_ptr const& ujData, int jobId, int attemptCount, int chunkId, + int fragmentNumber, size_t templateId, bool hasSubchunks, int subchunkId, std::string const& db, + proto::ScanInfo const& scanInfo, bool scanInteractive, int maxTableSizeMb, + std::vector const& fragSubTables, std::vector const& fragSubchunkIds, + std::shared_ptr const& sc, + std::shared_ptr const& queryStats_, uint16_t resultsHttpPort = 8080); Task& operator=(const Task&) = delete; Task(const Task&) = delete; @@ -194,7 +205,7 @@ class Task : public util::CommandForThreadPool { std::shared_ptr const& queriesAndChunks, uint16_t resultsHttpPort = 8080); - void setQueryStatistics(std::shared_ptr const& qC); + //&&&void setQueryStatistics(std::shared_ptr const& qC); std::shared_ptr getSendChannel() const { return _sendChannel; } void resetSendChannel() { _sendChannel.reset(); } ///< reset the shared pointer for FileChannelShared @@ -334,7 +345,7 @@ class Task : public util::CommandForThreadPool { } private: - std::shared_ptr _userQueryInfo; ///< Details common to Tasks in this UserQuery. + //&&&std::weak_ptr _userQueryInfo; ///< Details common to Tasks in this UserQuery. std::shared_ptr _sendChannel; ///< Send channel. uint64_t const _tSeq = 0; ///< identifier for the specific task @@ -372,6 +383,10 @@ class Task : public util::CommandForThreadPool { bool _scanInteractive; ///< True if the czar thinks this query should be interactive. bool _onInteractive{ false}; ///< True if the scheduler put this task on the interactive (group) scheduler. + + /// Stores information on the query's resource usage. + std::weak_ptr const _queryStats; + int64_t _maxTableSize = 0; std::atomic _memHandle{memman::MemMan::HandleType::INVALID}; memman::MemMan::Ptr _memMan; @@ -387,9 +402,6 @@ class Task : public util::CommandForThreadPool { std::chrono::system_clock::time_point _finishTime; ///< data transmission to Czar finished size_t _totalSize = 0; ///< Total size of the result so far. - /// Stores information on the query's resource usage.
- std::weak_ptr _queryStats; - std::atomic _mysqlThreadId{0}; ///< 0 if not connected to MySQL std::atomic _booted{false}; ///< Set to true if this task takes too long and is booted. diff --git a/src/wbase/UberJobData.cc b/src/wbase/UberJobData.cc index 64538fc6c..ac828fa4d 100644 --- a/src/wbase/UberJobData.cc +++ b/src/wbase/UberJobData.cc @@ -185,4 +185,12 @@ bool UberJobData::responseError(util::MultiError& multiErr, std::shared_ptr lg(_ujTasksMtx); + for (auto const& task : _ujTasks) { + task->cancel(); + } +} + } // namespace lsst::qserv::wbase diff --git a/src/wbase/UberJobData.h b/src/wbase/UberJobData.h index f4ab4e303..03813979e 100644 --- a/src/wbase/UberJobData.h +++ b/src/wbase/UberJobData.h @@ -81,6 +81,7 @@ class UberJobData { /// Add the tasks defined in the UberJob to this UberJobData object. void addTasks(std::vector> const& tasks) { + std::lock_guard tLg(_ujTasksMtx); _ujTasks.insert(_ujTasks.end(), tasks.begin(), tasks.end()); } @@ -94,6 +95,9 @@ class UberJobData { std::string getIdStr() const { return _idStr; } std::string cName(std::string const& funcName) { return "UberJobData::" + funcName + " " + getIdStr(); } + /// &&& doc + void cancelAllTasks(); + private: UberJobData(UberJobId uberJobId, std::string const& czarName, qmeta::CzarId czarId, std::string czarHost, int czarPort, uint64_t queryId, std::string const& workerId, @@ -113,6 +117,8 @@ class UberJobData { std::vector> _ujTasks; std::shared_ptr _fileChannelShared; + std::mutex _ujTasksMtx; ///< Protects _ujTasks. + std::string const _idStr; }; diff --git a/src/wbase/UserQueryInfo.cc b/src/wbase/UserQueryInfo.cc index 79c24f07e..72d148060 100644 --- a/src/wbase/UserQueryInfo.cc +++ b/src/wbase/UserQueryInfo.cc @@ -39,45 +39,6 @@ namespace lsst::qserv::wbase { UserQueryInfo::UserQueryInfo(QueryId qId) : _qId(qId) {} -UserQueryInfo::Ptr UserQueryInfo::uqMapInsert(QueryId qId) { - Ptr uqi; - lock_guard lg(_uqMapMtx); - auto iter = _uqMap.find(qId); - if (iter != _uqMap.end()) { - uqi = iter->second.lock(); - } - // If uqi is invalid at this point, a new one needs to be made. 
- if (uqi == nullptr) { - uqi = make_shared(qId); - _uqMap[qId] = uqi; - } - return uqi; -} - -UserQueryInfo::Ptr UserQueryInfo::uqMapGet(QueryId qId) { - lock_guard lg(_uqMapMtx); - auto iter = _uqMap.find(qId); - if (iter != _uqMap.end()) { - return iter->second.lock(); - } - return nullptr; -} - -void UserQueryInfo::uqMapErase(QueryId qId) { - lock_guard lg(_uqMapMtx); - auto iter = _uqMap.find(qId); - if (iter != _uqMap.end()) { - // If the weak pointer has 0 real references - if (iter->second.expired()) { - _uqMap.erase(qId); - } - } -} - -UserQueryInfo::Map UserQueryInfo::_uqMap; - -mutex UserQueryInfo::_uqMapMtx; - size_t UserQueryInfo::addTemplate(std::string const& templateStr) { size_t j = 0; { @@ -108,4 +69,41 @@ void UserQueryInfo::addUberJob(std::shared_ptr const& ujData) { _uberJobMap[ujId] = ujData; } +/// &&& doc +void UserQueryInfo::cancelFromCzar() { + if (_cancelledByCzar.exchange(true)) { + LOGS(_log, LOG_LVL_DEBUG, cName(__func__) << " already cancelledByCzar"); + return; + } + lock_guard lockUq(_uberJobMapMtx); + for (auto const& [ujId, weakUjPtr] : _uberJobMap) { + LOGS(_log, LOG_LVL_INFO, cName(__func__) << " cancelling ujId=" << ujId); + auto ujPtr = weakUjPtr.lock(); + if (ujPtr != nullptr) { + ujPtr->cancelAllTasks(); + } + } +} + +/// &&& doc +void UserQueryInfo::cancelUberJob(UberJobId ujId) { + LOGS(_log, LOG_LVL_INFO, cName(__func__) << " cancelling ujId=" << ujId); + lock_guard lockUq(_uberJobMapMtx); + _deadUberJobSet.insert(ujId); + auto iter = _uberJobMap.find(ujId); + if (iter != _uberJobMap.end()) { + auto weakUjPtr = iter->second; + auto ujPtr = weakUjPtr.lock(); + if (ujPtr != nullptr) { + ujPtr->cancelAllTasks(); + } + } +} + +bool UserQueryInfo::isUberJobDead(UberJobId ujId) const { + lock_guard lockUq(_uberJobMapMtx); + auto iter = _deadUberJobSet.find(ujId); + return iter != _deadUberJobSet.end(); +} + } // namespace lsst::qserv::wbase diff --git a/src/wbase/UserQueryInfo.h b/src/wbase/UserQueryInfo.h index 4b7a799f0..4694d8834 100644 --- a/src/wbase/UserQueryInfo.h +++ b/src/wbase/UserQueryInfo.h @@ -24,6 +24,7 @@ #define LSST_QSERV_WBASE_USERQUERYINFO_H // System headers +#include #include #include #include @@ -44,20 +45,18 @@ class UserQueryInfo { using Ptr = std::shared_ptr; using Map = std::map>; - static Ptr uqMapInsert(QueryId qId); - static Ptr uqMapGet(QueryId qId); - /// Erase the entry for `qId` in the map, as long as there are only - /// weak references to the UserQueryInfoObject. - /// Clear appropriate local and member references before calling this. - static void uqMapErase(QueryId qId); - - UserQueryInfo(QueryId qId); UserQueryInfo() = delete; UserQueryInfo(UserQueryInfo const&) = delete; UserQueryInfo& operator=(UserQueryInfo const&) = delete; + static Ptr create(QueryId qId) { return std::shared_ptr(new UserQueryInfo(qId)); } + ~UserQueryInfo() = default; + std::string cName(const char* func) { + return std::string("UserQueryInfo::") + func + " qId=" + std::to_string(_qId); + } + /// Add a query template to the map of templates for this user query. size_t addTemplate(std::string const& templateStr); @@ -68,9 +67,21 @@ class UserQueryInfo { /// Add an UberJobData object to the UserQueryInfo. 
void addUberJob(std::shared_ptr const& ujData); + /// &&& doc + bool getCancelledByCzar() const { return _cancelledByCzar; } + + /// &&& doc + void cancelFromCzar(); + + /// &&& doc + void cancelUberJob(UberJobId ujId); + + bool isUberJobDead(UberJobId ujId) const; + + QueryId getQueryId() const { return _qId; } + private: - static Map _uqMap; - static std::mutex _uqMapMtx; ///< protects _uqMap + UserQueryInfo(QueryId qId); QueryId const _qId; ///< The User Query Id number. @@ -78,11 +89,14 @@ class UserQueryInfo { /// This must be a vector. New entries are always added to the end so as not /// to alter existing indexes into the vector. std::vector _templates; - std::mutex _uqMtx; ///< protects _templates; + std::mutex _uqMtx; ///< protects _templates /// Map of all UberJobData objects on this worker for this User Query. - std::map> _uberJobMap; - std::mutex _uberJobMapMtx; ///< protects _uberJobMap; + std::map> _uberJobMap; + std::set _deadUberJobSet; ///< Set of cancelled UberJob Ids. + mutable std::mutex _uberJobMapMtx; ///< protects _uberJobMap, _deadUberJobSet + + std::atomic _cancelledByCzar{false}; }; } // namespace lsst::qserv::wbase diff --git a/src/wcontrol/Foreman.cc b/src/wcontrol/Foreman.cc index df3ed4063..288ed67e8 100644 --- a/src/wcontrol/Foreman.cc +++ b/src/wcontrol/Foreman.cc @@ -146,6 +146,8 @@ Foreman::~Foreman() { _httpServer->stop(); } +wpublish::QueryStatistics::Ptr Foreman::addQueryId(QueryId qId) { return _queries->addQueryId(qId); } + void Foreman::processTasks(vector const& tasks) { std::vector cmds; for (auto const& task : tasks) { diff --git a/src/wcontrol/Foreman.h b/src/wcontrol/Foreman.h index 6fe5ca439..5045cfe96 100644 --- a/src/wcontrol/Foreman.h +++ b/src/wcontrol/Foreman.h @@ -66,6 +66,7 @@ class QueryRunner; namespace lsst::qserv::wpublish { class ChunkInventory; class QueriesAndChunks; +class QueryStatistics; } // namespace lsst::qserv::wpublish // This header declarations @@ -128,11 +129,14 @@ class Foreman : public wbase::MsgProcessor { /// Process a group of query processing tasks. 
/// @see MsgProcessor::processTasks() - void processTasks(std::vector> const& tasks) override; // &&& delete + void processTasks(std::vector> const& tasks) override; // &&& delete /// Implement the corresponding method of the base class /// @see MsgProcessor::processCommand() - void processCommand(std::shared_ptr const& command) override; // &&& delete + void processCommand(std::shared_ptr const& command) override; // &&& delete + + /// &&& doc + std::shared_ptr addQueryId(QueryId qId); /// Implement the corresponding method of the base class /// @see MsgProcessor::statusToJson() diff --git a/src/wdb/QueryRunner.cc b/src/wdb/QueryRunner.cc index 0e73f664d..eb76be91b 100644 --- a/src/wdb/QueryRunner.cc +++ b/src/wdb/QueryRunner.cc @@ -281,7 +281,8 @@ bool QueryRunner::_dispatchChannel() { if (taskSched != nullptr) { taskSched->histTimeOfRunningTasks->addEntry(primeT.getElapsed()); LOGS(_log, LOG_LVL_DEBUG, "QR " << taskSched->histTimeOfRunningTasks->getString("run")); - LOGS(_log, LOG_LVL_WARN, "&&&DASH QR " << taskSched->histTimeOfRunningTasks->getString("run")); + LOGS(_log, LOG_LVL_WARN, + "&&&DASH QR " << taskSched->histTimeOfRunningTasks->getString("run")); } else { LOGS(_log, LOG_LVL_ERROR, "QR runtaskSched == nullptr"); LOGS(_log, LOG_LVL_ERROR, "&&&DASH QR runtaskSched == nullptr"); diff --git a/src/wdb/testQueryRunner.cc b/src/wdb/testQueryRunner.cc index 276beaace..c59182858 100644 --- a/src/wdb/testQueryRunner.cc +++ b/src/wdb/testQueryRunner.cc @@ -146,7 +146,6 @@ struct Fixture { {"tblScanRating", mInfo.scanRating}}; chunkScanTables.push_back(move(cst)); - auto& jsFragments = jsJobMsg["queryFragments"]; /* &&& if (chunkQuerySpec.nextFragment.get()) { @@ -170,8 +169,8 @@ struct Fixture { for (unsigned int t = 0; t < (chunkQuerySpec.queries).size(); t++) { LOGS(_log, LOG_LVL_TRACE, (chunkQuerySpec.queries).at(t)); } - _addFragmentJson(jsFragments, resultTable, chunkQuerySpec.subChunkTables, chunkQuerySpec.subChunkIds, - chunkQuerySpec.queries); + _addFragmentJson(jsFragments, resultTable, chunkQuerySpec.subChunkTables, + chunkQuerySpec.subChunkIds, chunkQuerySpec.queries); } */ nlohmann::json jsFrag = {{"resultTable", mInfo.resultName}, @@ -230,22 +229,20 @@ BOOST_AUTO_TEST_CASE(Simple) { MsgInfo mInfo; auto msgJson = newTaskJson(mInfo); shared_ptr sendC(SendChannel::newNopChannel()); - auto sc = FileChannelShared::create(sendC, mInfo.czarId); + auto sChannel = FileChannelShared::create(sendC, mInfo.czarId); FakeBackend::Ptr backend = make_shared(); shared_ptr crm = ChunkResourceMgr::newMgr(backend); SqlConnMgr::Ptr sqlConnMgr = make_shared(20, 15); auto const queries = queriesAndChunks(); auto ujData = lsst::qserv::wbase::UberJobData::create(mInfo.uberJobId, mInfo.czarName, mInfo.czarId, - mInfo.czarHostName, mInfo.czarPort, mInfo.queryId, mInfo.targWorkerId, mInfo.foreman, mInfo.authKey); + mInfo.czarHostName, mInfo.czarPort, mInfo.queryId, + mInfo.targWorkerId, mInfo.foreman, mInfo.authKey); lsst::qserv::proto::ScanInfo scanInfo; scanInfo.scanRating = mInfo.scanRating; scanInfo.infoTables.emplace_back(mInfo.db, mInfo.table, mInfo.lockInMemory, mInfo.scanRating); - vector taskVect = Task::createTasksForChunk( - ujData, *msgJson, sc, scanInfo, - mInfo.scanInteractive, mInfo.maxTableSize, - crm, - newMySqlConfig(), sqlConnMgr, - queries); + vector taskVect = + Task::createTasksForChunk(ujData, *msgJson, sChannel, scanInfo, mInfo.scanInteractive, + mInfo.maxTableSize, crm, newMySqlConfig(), sqlConnMgr, queries); Task::Ptr task = taskVect[0]; QueryRunner::Ptr 
a(QueryRunner::newQueryRunner(task, crm, newMySqlConfig(), sqlConnMgr, queries)); BOOST_CHECK(a->runQuery()); @@ -278,20 +275,17 @@ BOOST_AUTO_TEST_CASE(Output) { SqlConnMgr::Ptr sqlConnMgr = make_shared(20, 15); auto const queries = queriesAndChunks(); auto ujData = lsst::qserv::wbase::UberJobData::create(mInfo.uberJobId, mInfo.czarName, mInfo.czarId, - mInfo.czarHostName, mInfo.czarPort, mInfo.queryId, mInfo.targWorkerId, mInfo.foreman, mInfo.authKey); + mInfo.czarHostName, mInfo.czarPort, mInfo.queryId, + mInfo.targWorkerId, mInfo.foreman, mInfo.authKey); lsst::qserv::proto::ScanInfo scanInfo; scanInfo.scanRating = mInfo.scanRating; scanInfo.infoTables.emplace_back(mInfo.db, mInfo.table, mInfo.lockInMemory, mInfo.scanRating); - vector taskVect = Task::createTasksForChunk( - ujData, *msgJson, sc, scanInfo, - mInfo.scanInteractive, mInfo.maxTableSize, - crm, - newMySqlConfig(), sqlConnMgr, - queries); + vector taskVect = + Task::createTasksForChunk(ujData, *msgJson, sc, scanInfo, mInfo.scanInteractive, + mInfo.maxTableSize, crm, newMySqlConfig(), sqlConnMgr, queries); Task::Ptr task = taskVect[0]; QueryRunner::Ptr a(QueryRunner::newQueryRunner(task, crm, newMySqlConfig(), sqlConnMgr, queries)); BOOST_CHECK(a->runQuery()); - } BOOST_AUTO_TEST_SUITE_END() diff --git a/src/wpublish/QueriesAndChunks.cc b/src/wpublish/QueriesAndChunks.cc index 2499a6267..940be6698 100644 --- a/src/wpublish/QueriesAndChunks.cc +++ b/src/wpublish/QueriesAndChunks.cc @@ -119,21 +119,39 @@ void QueriesAndChunks::setBlendScheduler(shared_ptr cons void QueriesAndChunks::setRequiredTasksCompleted(unsigned int value) { _requiredTasksCompleted = value; } +QueryStatistics::Ptr QueriesAndChunks::addQueryId(QueryId qId) { + unique_lock guardStats(_queryStatsMapMtx); + auto itr = _queryStatsMap.find(qId); + QueryStatistics::Ptr stats; + if (_queryStatsMap.end() == itr) { + stats = QueryStatistics::create(qId); + _queryStatsMap[qId] = stats; + } else { + stats = itr->second; + } + return stats; +} + /// Add statistics for the Task, creating a QueryStatistics object if needed. void QueriesAndChunks::addTask(wbase::Task::Ptr const& task) { auto qid = task->getQueryId(); +#if 0 // &&& delete upper block unique_lock guardStats(_queryStatsMapMtx); auto itr = _queryStatsMap.find(qid); QueryStatistics::Ptr stats; if (_queryStatsMap.end() == itr) { stats = QueryStatistics::create(qid); _queryStatsMap[qid] = stats; + throw util::Bug(ERR_LOC, "&&& QueriesAndChunks::addTask entry should already be there"); // &&& replace with error message ??? } else { stats = itr->second; } guardStats.unlock(); +#else // &&& + auto stats = addQueryId(qid); +#endif // &&& stats->addTask(task); - task->setQueryStatistics(stats); + //&&&task->setQueryStatistics(stats); } /// Update statistics for the Task that was just queued. diff --git a/src/wpublish/QueriesAndChunks.h b/src/wpublish/QueriesAndChunks.h index a51e1d24d..83bcddf36 100644 --- a/src/wpublish/QueriesAndChunks.h +++ b/src/wpublish/QueriesAndChunks.h @@ -193,9 +193,14 @@ class QueriesAndChunks { void removeDead(); void removeDead(QueryStatistics::Ptr const& queryStats); - /// Return the statistics for a user query. + /// Return the statistics for a user query, may be nullptr + /// @see addQueryId() QueryStatistics::Ptr getStats(QueryId const& qId) const; + /// Return the statistics for a user query, creating if needed. 
+ /// @see getStats() + QueryStatistics::Ptr addQueryId(QueryId qId); + void addTask(wbase::Task::Ptr const& task); void queuedTask(wbase::Task::Ptr const& task); void startedTask(wbase::Task::Ptr const& task); diff --git a/src/wpublish/QueryStatistics.cc b/src/wpublish/QueryStatistics.cc index 576effdee..607288658 100644 --- a/src/wpublish/QueryStatistics.cc +++ b/src/wpublish/QueryStatistics.cc @@ -50,7 +50,8 @@ LOG_LOGGER _log = LOG_GET("lsst.qserv.wpublish.QueriesAndChunks"); namespace lsst::qserv::wpublish { -QueryStatistics::QueryStatistics(QueryId const& qId_) : creationTime(CLOCK::now()), queryId(qId_) { +QueryStatistics::QueryStatistics(QueryId const& qId_) + : creationTime(CLOCK::now()), queryId(qId_), _userQueryInfo(wbase::UserQueryInfo::create(qId_)) { /// For all of the histograms, all entries should be kept at least until the work is finished. string qidStr = to_string(queryId); _histSizePerTask = util::Histogram::Ptr(new util::Histogram( @@ -186,6 +187,13 @@ QueryStatistics::SchedTasksInfoMap QueryStatistics::getSchedulerTasksInfoMap() { return _taskSchedInfoMap; } +/* &&& +void QueryStatistics::touch(TIMEPOINT const now) { + lock_guard lock(_qStatsMtx); + _touched = now; +} +*/ + void QueryStatistics::addTask(TIMEPOINT const now) { lock_guard lock(_qStatsMtx); _touched = now; diff --git a/src/wpublish/QueryStatistics.h b/src/wpublish/QueryStatistics.h index dc26a9da4..dbacd5d53 100644 --- a/src/wpublish/QueryStatistics.h +++ b/src/wpublish/QueryStatistics.h @@ -41,10 +41,12 @@ #include "global/intTypes.h" #include "wbase/Task.h" #include "wsched/SchedulerBase.h" +#include "util/InstanceCount.h" //&&& namespace lsst::qserv::wbase { -class Histogram; -} +//&&&class Histogram; +class UserQueryInfo; +} // namespace lsst::qserv::wbase // This header declarations namespace lsst::qserv::wpublish { @@ -73,6 +75,8 @@ class QueryStatistics { return _queryBooted; } + std::shared_ptr getUserQueryInfo() const { return _userQueryInfo; } + void setQueryBooted(bool booted, TIMEPOINT now); /// Add statistics related to the running of the query in the task. @@ -93,6 +97,7 @@ class QueryStatistics { void addTaskTransmit(double timeSeconds, int64_t bytesTransmitted, int64_t rowsTransmitted, double bufferFillSecs); + //&&&void touch(TIMEPOINT const now); void addTask(TIMEPOINT const now); void addTaskRunning(TIMEPOINT const now); bool addTaskCompleted(TIMEPOINT const now, double const taskDuration); @@ -194,6 +199,9 @@ class QueryStatistics { std::shared_ptr _histRowsPerTask; ///< Histogram of rows per Task. SchedTasksInfoMap _taskSchedInfoMap; ///< Map of task information ordered by scheduler name. + + std::shared_ptr const _userQueryInfo; ///< &&& doc + util::InstanceCount _ic{"QueryStatiscs_&&&"}; }; } // namespace lsst::qserv::wpublish diff --git a/src/wsched/testSchedulers.cc b/src/wsched/testSchedulers.cc index 2b3c4df5b..4bf41ec08 100644 --- a/src/wsched/testSchedulers.cc +++ b/src/wsched/testSchedulers.cc @@ -162,7 +162,6 @@ struct SchedulerFixture { } */ - int counter; }; @@ -222,7 +221,7 @@ struct SchedFixture { // TODO: DM-33302 replace this test case BOOST_AUTO_TEST_CASE(Grouping) { -#if 0 // &&& fix and re-enable +#if 0 // &&& fix and re-enable SchedFixture f(60.0, 1); // Values to keep QueriesAndChunk from triggering. LOGS(_log, LOG_LVL_DEBUG, "Test_case grouping"); @@ -307,7 +306,7 @@ BOOST_AUTO_TEST_CASE(Grouping) { } BOOST_AUTO_TEST_CASE(GroupMaxThread) { -#if 0 // &&& fix and re-enable +#if 0 // &&& fix and re-enable // Test that maxThreads is meaningful. 
LOGS(_log, LOG_LVL_WARN, "Test_case GroupMaxThread"); auto queries = QueriesAndChunks::setupGlobal(chrono::seconds(1), chrono::seconds(300), maxBootedC, @@ -342,7 +341,7 @@ BOOST_AUTO_TEST_CASE(GroupMaxThread) { } BOOST_AUTO_TEST_CASE(ScanScheduleTest) { -#if 0 // &&& fix and re-enable +#if 0 // &&& fix and re-enable LOGS(_log, LOG_LVL_DEBUG, "Test_case ScanScheduleTest"); auto queries = QueriesAndChunks::setupGlobal(chrono::seconds(1), chrono::seconds(300), maxBootedC, maxDarkTasksC, resetForTestingC); @@ -407,7 +406,7 @@ BOOST_AUTO_TEST_CASE(ScanScheduleTest) { } BOOST_AUTO_TEST_CASE(BlendScheduleTest) { -#if 0 // &&& fix and re-enable +#if 0 // &&& fix and re-enable LOGS(_log, LOG_LVL_DEBUG, "Test_case BlendScheduleTest"); // Test that space is appropriately reserved for each scheduler as Tasks are started and finished. // In this case, memMan->lock(..) always returns true (really HandleType::ISEMPTY). @@ -606,11 +605,11 @@ BOOST_AUTO_TEST_CASE(BlendScheduleTest) { BOOST_CHECK(f.blend->calcAvailableTheads() == 5); BOOST_CHECK(f.blend->getInFlight() == 0); LOGS(_log, LOG_LVL_DEBUG, "BlendScheduleTest-1 done"); -#endif // &&& fix and re-enable +#endif // &&& fix and re-enable } BOOST_AUTO_TEST_CASE(BlendScheduleThreadLimitingTest) { -#if 0 // &&& fix and re-enable +#if 0 // &&& fix and re-enable LOGS(_log, LOG_LVL_DEBUG, "Test_case BlendScheduleThreadLimitingTest"); SchedFixture f(60.0, 1); // Values to keep QueriesAndChunk from triggering. // Test that only 6 threads can be started on a single ScanScheduler @@ -682,7 +681,7 @@ BOOST_AUTO_TEST_CASE(BlendScheduleThreadLimitingTest) { } BOOST_AUTO_TEST_CASE(BlendScheduleQueryRemovalTest) { -#if 0 // &&& fix and re-enable +#if 0 // &&& fix and re-enable // Test that space is appropriately reserved for each scheduler as Tasks are started and finished. // In this case, memMan->lock(..) always returns true (really HandleType::ISEMPTY). // ChunkIds matter as they control the order Tasks come off individual schedulers. @@ -744,7 +743,7 @@ BOOST_AUTO_TEST_CASE(BlendScheduleQueryRemovalTest) { } BOOST_AUTO_TEST_CASE(BlendScheduleQueryBootTaskTest) { -#if 0 // &&& fix and re-enable +#if 0 // &&& fix and re-enable // Test if a task is removed if it takes too long. // Give the user query 0.1 seconds to run and run it for a second; it should get removed.
double tenthOfSecInMinutes = 1.0 / 600.0; // task @@ -830,7 +829,7 @@ BOOST_AUTO_TEST_CASE(BlendScheduleQueryBootTaskTest) { } BOOST_AUTO_TEST_CASE(SlowTableHeapTest) { -#if 0 // &&& fix and re-enable +#if 0 // &&& fix and re-enable LOGS(_log, LOG_LVL_DEBUG, "Test_case SlowTableHeapTest start"); auto queries = QueriesAndChunks::setupGlobal(chrono::seconds(1), chrono::seconds(300), maxBootedC, maxDarkTasksC, resetForTestingC); @@ -867,7 +866,7 @@ BOOST_AUTO_TEST_CASE(SlowTableHeapTest) { } BOOST_AUTO_TEST_CASE(ChunkTasksTest) { -#if 0 // &&& fix and re-enable +#if 0 // &&& fix and re-enable LOGS(_log, LOG_LVL_DEBUG, "Test_case ChunkTasksTest start"); auto queries = QueriesAndChunks::setupGlobal(chrono::seconds(1), chrono::seconds(300), maxBootedC, maxDarkTasksC, resetForTestingC); @@ -942,7 +941,7 @@ BOOST_AUTO_TEST_CASE(ChunkTasksTest) { } BOOST_AUTO_TEST_CASE(ChunkTasksQueueTest) { -#if 0 // &&& fix and re-enable +#if 0 // &&& fix and re-enable LOGS(_log, LOG_LVL_DEBUG, "Test_case ChunkTasksQueueTest start"); auto queries = QueriesAndChunks::setupGlobal(chrono::seconds(1), chrono::seconds(300), maxBootedC, maxDarkTasksC, resetForTestingC); diff --git a/src/xrdreq/QueryManagementAction.h b/src/xrdreq/QueryManagementAction.h index ec5ff9158..c624ecf88 100644 --- a/src/xrdreq/QueryManagementAction.h +++ b/src/xrdreq/QueryManagementAction.h @@ -39,7 +39,7 @@ namespace lsst::qserv::xrdreq { * Class QueryManagementAction is an interface for managing query completion/cancellation * at all Qserv workers that are connected as "publishers" to the XROOTD redirector. */ -// &&& need to get the same functionality using json messages, and not in xrdreq. +// &&&QM need to get the same functionality using json messages, and not in xrdreq. class QueryManagementAction : public std::enable_shared_from_this { public: /// The response type represents errors reported by the workers, where worker diff --git a/src/xrdreq/QueryManagementRequest.h b/src/xrdreq/QueryManagementRequest.h index 9c92fcfe6..0e366afe2 100644 --- a/src/xrdreq/QueryManagementRequest.h +++ b/src/xrdreq/QueryManagementRequest.h @@ -41,7 +41,7 @@ namespace lsst::qserv::xrdreq { * the error messages in case of any problems in delivering or processing * notifications. */ -class QueryManagementRequest : public QservRequest { +class QueryManagementRequest : public QservRequest { //&&&QM public: /// The pointer type for instances of the class typedef std::shared_ptr Ptr; diff --git a/src/xrdsvc/ChannelStream.h b/src/xrdsvc/ChannelStream.h index 61c8777e7..db9290fb9 100644 --- a/src/xrdsvc/ChannelStream.h +++ b/src/xrdsvc/ChannelStream.h @@ -40,7 +40,7 @@ namespace lsst::qserv::xrdsvc { /// ChannelStream is an implementation of an XrdSsiStream that accepts /// SendChannel streamed data.
-class ChannelStream : public XrdSsiStream { // &&& delete +class ChannelStream : public XrdSsiStream { // &&& delete public: ChannelStream(); virtual ~ChannelStream(); diff --git a/src/xrdsvc/HttpSvc.cc b/src/xrdsvc/HttpSvc.cc index 392f5e6b8..0908efcaa 100644 --- a/src/xrdsvc/HttpSvc.cc +++ b/src/xrdsvc/HttpSvc.cc @@ -138,13 +138,13 @@ uint16_t HttpSvc::start() { _httpServerPtr->addHandlers( {{"POST", "/queryjob", [self](shared_ptr const& req, shared_ptr const& resp) { - HttpWorkerCzarModule::process(::serviceName, self->_foreman, req, resp, "QUERYJOB", + HttpWorkerCzarModule::process(::serviceName, self->_foreman, req, resp, "/queryjob", http::AuthType::REQUIRED); }}}); _httpServerPtr->addHandlers( {{"POST", "/querystatus", [self](shared_ptr const& req, shared_ptr const& resp) { - HttpWorkerCzarModule::process(::serviceName, self->_foreman, req, resp, "QUERYSTATUS", + HttpWorkerCzarModule::process(::serviceName, self->_foreman, req, resp, "/querystatus", http::AuthType::REQUIRED); }}}); _httpServerPtr->start(); diff --git a/src/xrdsvc/HttpWorkerCzarModule.cc b/src/xrdsvc/HttpWorkerCzarModule.cc index 856bd4455..3408aa4cd 100644 --- a/src/xrdsvc/HttpWorkerCzarModule.cc +++ b/src/xrdsvc/HttpWorkerCzarModule.cc @@ -36,6 +36,7 @@ #include "http/MetaModule.h" #include "http/RequestBodyJSON.h" #include "http/RequestQuery.h" +#include "http/WorkerQueryStatusData.h" #include "mysql/MySqlUtils.h" #include "qmeta/types.h" #include "util/String.h" @@ -48,6 +49,8 @@ #include "wcontrol/Foreman.h" #include "wcontrol/ResourceMonitor.h" #include "wpublish/ChunkInventory.h" +#include "wpublish/QueriesAndChunks.h" +#include "wpublish/QueryStatistics.h" #include "xrdsvc/SsiProvider.h" #include "xrdsvc/XrdName.h" @@ -88,8 +91,8 @@ json HttpWorkerCzarModule::executeImpl(string const& subModuleName) { string const func = string(__func__) + "[sub-module='" + subModuleName + "']"; enforceInstanceId(func, wconfig::WorkerConfig::instance()->replicationInstanceId()); enforceWorkerId(func); - if (subModuleName == "QUERYJOB") return _queryJob(); - if (subModuleName == "QUERYSTATUS") return _queryStatus(); + if (subModuleName == "/queryjob") return _queryJob(); + if (subModuleName == "/querystatus") return _queryStatus(); throw invalid_argument(context() + func + " unsupported sub-module"); } @@ -126,13 +129,24 @@ json HttpWorkerCzarModule::_handleQueryJob(string const& func) { LOGS(_log, LOG_LVL_TRACE, __func__ << " uj qid=" << ujQueryId << " ujid=" << ujId << " czid=" << ujCzarId); + // Get or create QueryStatistics and UserQueryInfo instances. + auto queryStats = foreman()->addQueryId(ujQueryId); + auto userQueryInfo = queryStats->getUserQueryInfo(); + + if (userQueryInfo->getCancelledByCzar()) { + throw wbase::TaskException( + ERR_LOC, string("Already cancelled by czar. ujQueryId=") + to_string(ujQueryId)); + } + if (userQueryInfo->isUberJobDead(ujId)) { + throw wbase::TaskException(ERR_LOC, string("UberJob already dead. ujQueryId=") + + to_string(ujQueryId) + " ujId=" + to_string(ujId)); + } + auto ujData = wbase::UberJobData::create(ujId, czarName, czarId, czarHostName, czarPort, ujQueryId, targetWorkerId, foreman(), authKey()); // Find the entry for this queryId, creating a new one if needed.
- wbase::UserQueryInfo::Ptr userQueryInfo = wbase::UserQueryInfo::uqMapInsert(ujQueryId); userQueryInfo->addUberJob(ujData); - auto channelShared = wbase::FileChannelShared::create(ujData, czarId, czarHostName, czarPort, targetWorkerId); ujData->setFileChannelShared(channelShared); @@ -220,9 +234,100 @@ json HttpWorkerCzarModule::_queryStatus() { } json HttpWorkerCzarModule::_handleQueryStatus(std::string const& func) { - LOGS(_log, LOG_LVL_ERROR, "&&& NEED CODE HttpWorkerCzarModule::_handleQueryStatus"); - throw util::Bug(ERR_LOC, "&&& NEED CODE HttpWorkerCzarModule::_handleQueryStatus"); -} + LOGS(_log, LOG_LVL_ERROR, "&&& HttpWorkerCzarModule::_handleQueryStatus"); + + json jsRet; + auto now = CLOCK::now(); + auto const workerConfig = wconfig::WorkerConfig::instance(); + auto const replicationInstanceId = workerConfig->replicationInstanceId(); + auto const replicationAuthKey = workerConfig->replicationAuthKey(); + + auto const& jsReq = body().objJson; + auto wqsData = + http::WorkerQueryStatusData::createJson(jsReq, replicationInstanceId, replicationAuthKey, now); + + // For all queryId and czarId items, if the item can't be found, it is simply ignored. Anything that + // is missed will eventually be picked up by other mechanisms, such as results being rejected + // by the czar. + + // If a czar was restarted, cancel and/or delete the abandoned items. + if (wqsData->isCzarRestart()) { + auto restartCzarId = wqsData->getCzarRestartCzarId(); + auto restartQId = wqsData->getCzarRestartQueryId(); + if (restartCzarId > 0 && restartQId > 0) { + wbase::FileChannelShared::cleanUpResultsOnCzarRestart(wqsData->getCzarRestartCzarId(), + wqsData->getCzarRestartQueryId()); + } + } + + // Take the values from the lists in the message to cancel the + // appropriate queries and tasks as needed. + auto const queriesAndChunks = foreman()->queriesAndChunks(); + vector cancelledList; + // Cancelled queries where we want to keep the files + lock_guard mapLg(wqsData->_mapMtx); + for (auto const& [dkQid, dkTm] : wqsData->_qIdDoneKeepFiles) { + auto qStats = queriesAndChunks->addQueryId(dkQid); + if (qStats != nullptr) { + auto uqInfo = qStats->getUserQueryInfo(); + if (uqInfo != nullptr) { + if (!uqInfo->getCancelledByCzar()) { + cancelledList.push_back(uqInfo); + } + } + } + } + vector deleteFilesList; + for (auto const& [dkQid, dkTm] : wqsData->_qIdDoneDeleteFiles) { + auto qStats = queriesAndChunks->addQueryId(dkQid); + if (qStats != nullptr) { + auto uqInfo = qStats->getUserQueryInfo(); + if (uqInfo != nullptr) { + if (!uqInfo->getCancelledByCzar()) { + cancelledList.push_back(uqInfo); + } + deleteFilesList.push_back(uqInfo); + } + } + } + + // Cancel everything in the cancelled list. + for (auto const& canUqInfo : cancelledList) { + canUqInfo->cancelFromCzar(); + } + + // For dead UberJobs, add them to a list of dead uberjobs within UserQueryInfo. + // UserQueryInfo will cancel the tasks in the uberjobs if they exist. + // New UberJob Id's will be checked against the list, and immediately be + // killed if they are on it. 
(see HttpWorkerCzarModule::_handleQueryJob) + for (auto const& [ujQid, ujIdMap] : wqsData->_qIdDeadUberJobs) { + auto qStats = queriesAndChunks->addQueryId(ujQid); + if (qStats != nullptr) { + auto uqInfo = qStats->getUserQueryInfo(); + if (uqInfo != nullptr) { + if (!uqInfo->getCancelledByCzar()) { + for (auto const& [ujId, tm] : ujIdMap) { + uqInfo->cancelUberJob(ujId); + } + } + } + } + } + + // Delete files that should be deleted + CzarIdType czarId = wqsData->_czInfo->czId; + for (wbase::UserQueryInfo::Ptr uqiPtr : deleteFilesList) { + if (uqiPtr == nullptr) continue; + QueryId qId = uqiPtr->getQueryId(); + wbase::FileChannelShared::cleanUpResults(czarId, qId); + } + + // Syntax errors in the message would throw invalid_argument, which is handled elsewhere. + + // Return a message containing lists of the queries that were cancelled. + jsRet = wqsData->serializeResponseJson(); + return jsRet; +} } // namespace lsst::qserv::xrdsvc diff --git a/src/xrdsvc/SsiRequest.cc b/src/xrdsvc/SsiRequest.cc index 1b4ca9aeb..724c098f9 100644 --- a/src/xrdsvc/SsiRequest.cc +++ b/src/xrdsvc/SsiRequest.cc @@ -115,20 +115,22 @@ void SsiRequest::execute(XrdSsiRequest& req) { // Process the request switch (ru.unitType()) { - case ResourceUnit::DBCHUNK: { // &&& delete + case ResourceUnit::DBCHUNK: { // &&& delete // Increment the counter of the database/chunk resources in use - _foreman->resourceMonitor()->increment(_resourceName); // &&& TODO:UJ make sure this is implemented elsewhere. + _foreman->resourceMonitor()->increment( + _resourceName); // &&& TODO:UJ make sure this is implemented elsewhere. reportError("&&& DBCHUNK requests are no longer available resource db=" + ru.db() + - " chunkId=" + std::to_string(ru.chunk())); + " chunkId=" + std::to_string(ru.chunk())); + throw util::Bug(ERR_LOC, "&&& ResourceUnit::DBCHUNK"); break; } - case ResourceUnit::QUERY: { // &&& delete + case ResourceUnit::QUERY: { // &&& delete LOGS(_log, LOG_LVL_DEBUG, "Parsing request details for resource=" << _resourceName); reportError("&&& QUERY requests are no longer available"); - /* &&& + /* &&&QM proto::QueryManagement request; try { // reqData has the entire request, so we can unpack it without waiting for @@ -187,7 +189,6 @@ void SsiRequest::execute(XrdSsiRequest& req) { } // Note that upon exit the _finMutex will be unlocked allowing Finished() // to actually do something once everything is actually setup. - } /// Called by SSI to free resources. diff --git a/src/xrdsvc/SsiRequest.h b/src/xrdsvc/SsiRequest.h index 5850d18bf..29a600bd3 100644 --- a/src/xrdsvc/SsiRequest.h +++ b/src/xrdsvc/SsiRequest.h @@ -60,7 +60,8 @@ class StreamBuffer; /// qserv worker services. The SSI interface encourages such an approach, and /// object lifetimes are explicitly stated in the documentation which we /// adhere to using BindRequest() and UnBindRequest() responder methods. -class SsiRequest : public XrdSsiResponder, public std::enable_shared_from_this { // &&& delete if possible +class SsiRequest : public XrdSsiResponder, + public std::enable_shared_from_this { // &&& delete if possible public: // Smart pointer definitions From 68d591a7c06512ec461c5586e98543294017eb79 Mon Sep 17 00:00:00 2001 From: John Gates Date: Tue, 10 Sep 2024 14:45:48 -0700 Subject: [PATCH 06/22] More cancellation code added. 
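The czar now tracks worker liveness itself. A minimal sketch of the state transitions this patch implements in ActiveWorker::updateStateAndSendMessages (the enum and free function below are illustrative only; timeoutAliveSecs and timeoutDeadSecs are meant to come from configuration):

    // Illustrative only: a worker moves ALIVE -> QUESTIONABLE -> DEAD as the time
    // since the registry last heard from it grows. A DEAD worker that reports in
    // again is revived, and a newly DEAD worker has its incomplete UberJobs killed
    // so their jobs can be reassigned.
    enum class WState { ALIVE, QUESTIONABLE, DEAD };

    WState nextState(WState cur, double secsSinceUpdate, double timeoutAliveSecs,
                     double timeoutDeadSecs) {
        switch (cur) {
            case WState::ALIVE:
                return (secsSinceUpdate >= timeoutAliveSecs) ? WState::QUESTIONABLE
                                                             : WState::ALIVE;
            case WState::QUESTIONABLE:
                if (secsSinceUpdate < timeoutAliveSecs) return WState::ALIVE;
                if (secsSinceUpdate >= timeoutDeadSecs) return WState::DEAD;
                return WState::QUESTIONABLE;
            case WState::DEAD:
                // Revive only when the registry hears from the worker again.
                return (secsSinceUpdate < timeoutAliveSecs) ? WState::ALIVE : WState::DEAD;
        }
        return cur;
    }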
--- src/CMakeLists.txt | 2 - src/ccontrol/CMakeLists.txt | 2 - src/ccontrol/UserQuerySelect.cc | 23 +- src/czar/ActiveWorker.cc | 156 ++++++------ src/czar/ActiveWorker.h | 19 +- src/czar/Czar.cc | 34 +-- src/czar/Czar.h | 12 +- src/czar/CzarChunkMap.cc | 2 - src/czar/CzarRegistry.cc | 32 ++- src/czar/CzarRegistry.h | 9 +- src/global/clock_defs.h | 4 + src/http/WorkerQueryStatusData.cc | 214 +++++++++------- src/http/WorkerQueryStatusData.h | 279 ++++++++++++++------- src/http/testStatusData.cc | 50 ++-- src/proto/worker.proto | 33 --- src/qana/CMakeLists.txt | 1 - src/qdisp/CMakeLists.txt | 1 - src/qdisp/Executive.cc | 21 ++ src/qdisp/Executive.h | 9 +- src/qdisp/JobBase.h | 2 +- src/qdisp/UberJob.cc | 46 +++- src/qdisp/UberJob.h | 17 +- src/qdisp/testQDisp.cc | 4 +- src/qproc/CMakeLists.txt | 1 - src/query/CMakeLists.txt | 1 - src/replica/CMakeLists.txt | 1 - src/rproc/CMakeLists.txt | 1 - src/wbase/CMakeLists.txt | 1 - src/wbase/FileChannelShared.cc | 3 +- src/wbase/MsgProcessor.h | 66 ----- src/wbase/SendChannel.cc | 80 +----- src/wbase/SendChannel.h | 49 +--- src/wbase/Task.cc | 31 +-- src/wbase/Task.h | 46 +--- src/wbase/WorkerCommand.cc | 49 ---- src/wbase/WorkerCommand.h | 96 -------- src/wconfig/WorkerConfig.h | 6 +- src/wcontrol/Foreman.cc | 5 - src/wcontrol/Foreman.h | 23 +- src/wdb/QueryRunner.cc | 15 +- src/wdb/QueryRunner.h | 11 +- src/xrdreq/CMakeLists.txt | 45 ---- src/xrdreq/QservRequest.cc | 216 ----------------- src/xrdreq/QservRequest.h | 120 --------- src/xrdreq/QueryManagementAction.cc | 137 ----------- src/xrdreq/QueryManagementAction.h | 97 -------- src/xrdreq/QueryManagementRequest.cc | 91 ------- src/xrdreq/QueryManagementRequest.h | 95 -------- src/xrdreq/qserv-query-management.cc | 154 ------------ src/xrdsvc/CMakeLists.txt | 3 - src/xrdsvc/ChannelStream.cc | 115 --------- src/xrdsvc/ChannelStream.h | 75 ------ src/xrdsvc/HttpReplicaMgtModule.cc | 11 +- src/xrdsvc/HttpReplicaMgtModule.h | 6 +- src/xrdsvc/HttpWorkerCzarModule.cc | 16 +- src/xrdsvc/SsiRequest.cc | 349 --------------------------- src/xrdsvc/SsiRequest.h | 129 ---------- src/xrdsvc/SsiService.cc | 13 +- src/xrdsvc/StreamBuffer.cc | 176 -------------- src/xrdsvc/StreamBuffer.h | 128 ---------- 60 files changed, 671 insertions(+), 2762 deletions(-) delete mode 100644 src/wbase/MsgProcessor.h delete mode 100644 src/wbase/WorkerCommand.cc delete mode 100644 src/wbase/WorkerCommand.h delete mode 100644 src/xrdreq/CMakeLists.txt delete mode 100644 src/xrdreq/QservRequest.cc delete mode 100644 src/xrdreq/QservRequest.h delete mode 100644 src/xrdreq/QueryManagementAction.cc delete mode 100644 src/xrdreq/QueryManagementAction.h delete mode 100644 src/xrdreq/QueryManagementRequest.cc delete mode 100644 src/xrdreq/QueryManagementRequest.h delete mode 100644 src/xrdreq/qserv-query-management.cc delete mode 100644 src/xrdsvc/ChannelStream.cc delete mode 100644 src/xrdsvc/ChannelStream.h delete mode 100644 src/xrdsvc/SsiRequest.cc delete mode 100644 src/xrdsvc/SsiRequest.h delete mode 100644 src/xrdsvc/StreamBuffer.cc delete mode 100644 src/xrdsvc/StreamBuffer.h diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 79fcdc26d..f5d8a98ee 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -89,7 +89,6 @@ add_subdirectory(wpublish) add_subdirectory(wsched) add_subdirectory(www) add_subdirectory(xrdlog) -add_subdirectory(xrdreq) add_subdirectory(xrdsvc) #----------------------------------------------------------------------------- @@ -143,7 +142,6 @@ target_link_libraries(qserv_czar PUBLIC rproc qserv_css 
qserv_meta - xrdreq ) install( diff --git a/src/ccontrol/CMakeLists.txt b/src/ccontrol/CMakeLists.txt index 60a042e2c..da91bee4c 100644 --- a/src/ccontrol/CMakeLists.txt +++ b/src/ccontrol/CMakeLists.txt @@ -32,7 +32,6 @@ target_link_libraries(ccontrol PUBLIC parser replica sphgeom - xrdreq XrdCl ) @@ -51,7 +50,6 @@ FUNCTION(ccontrol_tests) qserv_meta query rproc - xrdreq Boost::unit_test_framework Threads::Threads ) diff --git a/src/ccontrol/UserQuerySelect.cc b/src/ccontrol/UserQuerySelect.cc index 7627fb960..b432ddd15 100644 --- a/src/ccontrol/UserQuerySelect.cc +++ b/src/ccontrol/UserQuerySelect.cc @@ -108,7 +108,6 @@ #include "util/Bug.h" #include "util/IterableFormatter.h" #include "util/ThreadPriority.h" -#include "xrdreq/QueryManagementAction.h" #include "qdisp/UberJob.h" namespace { @@ -453,7 +452,8 @@ void UserQuerySelect::buildAndSendUberJobs() { } // Add worker contact info to UberJobs. - auto const wContactMap = czRegistry->getWorkerContactMap(); + //&&& auto const wContactMap = czRegistry->getWorkerContactMap(); + auto const wContactMap = czRegistry->waitForWorkerContactMap(); //&&&Z LOGS(_log, LOG_LVL_DEBUG, funcN << " " << _executive->dumpUberJobCounts()); for (auto const& [wIdKey, ujVect] : workerJobMap) { auto iter = wContactMap->find(wIdKey); @@ -505,9 +505,7 @@ QueryState UserQuerySelect::join() { // finalRows < 0 indicates there was no postprocessing, so collected rows and final rows should be the // same. if (finalRows < 0) finalRows = collectedRows; - // Notify workers on the query completion/cancellation to ensure - // resources are properly cleaned over there as well. - proto::QueryManagement::Operation operation = proto::QueryManagement::COMPLETE; //&&&QM + QueryState state = SUCCESS; if (successful) { _qMetaUpdateStatus(qmeta::QInfo::COMPLETED, collectedRows, collectedBytes, finalRows); @@ -515,24 +513,17 @@ QueryState UserQuerySelect::join() { } else if (_killed) { // status is already set to ABORTED LOGS(_log, LOG_LVL_ERROR, "Joined everything (killed)"); - operation = proto::QueryManagement::CANCEL; //&&&QM state = ERROR; } else { _qMetaUpdateStatus(qmeta::QInfo::FAILED, collectedRows, collectedBytes, finalRows); LOGS(_log, LOG_LVL_ERROR, "Joined everything (failure!)"); - operation = proto::QueryManagement::CANCEL; //&&&QM state = ERROR; } auto const czarConfig = cconfig::CzarConfig::instance(); - if (czarConfig->notifyWorkersOnQueryFinish()) { - try { - // &&& do this another way, also see executive::squash &&&QM - xrdreq::QueryManagementAction::notifyAllWorkers(czarConfig->getXrootdFrontendUrl(), operation, - _qMetaCzarId, _qMetaQueryId); - } catch (std::exception const& ex) { - LOGS(_log, LOG_LVL_WARN, ex.what()); - } - } + + // Notify workers on the query completion/cancellation to ensure + // resources are properly cleaned over there as well. 
+ czar::Czar::getCzar()->getActiveWorkerMap()->addToDoneDeleteFiles(_executive->getId()); return state; } diff --git a/src/czar/ActiveWorker.cc b/src/czar/ActiveWorker.cc index 39aa042ef..a5a745c2e 100644 --- a/src/czar/ActiveWorker.cc +++ b/src/czar/ActiveWorker.cc @@ -27,6 +27,7 @@ // Qserv headers #include "cconfig/CzarConfig.h" +#include "czar/Czar.h" #include "http/Client.h" #include "http/MetaModule.h" #include "util/common.h" @@ -43,15 +44,6 @@ LOG_LOGGER _log = LOG_GET("lsst.qserv.czar.ActiveWorker"); namespace lsst::qserv::czar { -/* &&& -string WorkerContactInfo::dump() const { - stringstream os; - os << "workerContactInfo{" - << "id=" << wId << " host=" << wHost << " mgHost=" << wManagementHost << " port=" << wPort << "}"; - return os.str(); -} -*/ - string ActiveWorker::getStateStr(State st) { switch (st) { case ALIVE: @@ -66,13 +58,15 @@ string ActiveWorker::getStateStr(State st) { bool ActiveWorker::compareContactInfo(http::WorkerContactInfo const& wcInfo) const { lock_guard lg(_aMtx); - return _wqsData->_wInfo->isSameContactInfo(wcInfo); + auto wInfo_ = _wqsData->getWInfo(); + if (wInfo_ == nullptr) return false; + return wInfo_->isSameContactInfo(wcInfo); } void ActiveWorker::setWorkerContactInfo(http::WorkerContactInfo::Ptr const& wcInfo) { - LOGS(_log, LOG_LVL_WARN, cName(__func__) << " new info=" << wcInfo->dump()); + LOGS(_log, LOG_LVL_INFO, cName(__func__) << " new info=" << wcInfo->dump()); lock_guard lg(_aMtx); - _wqsData->_wInfo = wcInfo; + _wqsData->setWInfo(wcInfo); } void ActiveWorker::_changeStateTo(State newState, double secsSinceUpdate, string const& note) { @@ -85,45 +79,64 @@ void ActiveWorker::_changeStateTo(State newState, double secsSinceUpdate, string void ActiveWorker::updateStateAndSendMessages(double timeoutAliveSecs, double timeoutDeadSecs, double maxLifetime) { - lock_guard lg(_aMtx); - double secsSinceUpdate = _wqsData->_wInfo->timeSinceRegUpdateSeconds(); - // Update the last time the registry contacted this worker. - switch (_state) { - case ALIVE: { - if (secsSinceUpdate > timeoutAliveSecs) { - _changeStateTo(QUESTIONABLE, secsSinceUpdate, cName(__func__)); - // &&& Anything else that should be done here? - } - break; + bool newlyDeadWorker = false; + http::WorkerContactInfo::Ptr wInfo_; + { + lock_guard lg(_aMtx); + wInfo_ = _wqsData->getWInfo(); + if (wInfo_ == nullptr) { + LOGS(_log, LOG_LVL_ERROR, cName(__func__) << " no WorkerContactInfo"); + return; } - case QUESTIONABLE: { - if (secsSinceUpdate < timeoutAliveSecs) { - _changeStateTo(ALIVE, secsSinceUpdate, cName(__func__)); + double secsSinceUpdate = (wInfo_ == nullptr) ? timeoutDeadSecs : wInfo_->timeSinceRegUpdateSeconds(); + + // Update the last time the registry contacted this worker. + switch (_state) { + case ALIVE: { + if (secsSinceUpdate >= timeoutAliveSecs) { + _changeStateTo(QUESTIONABLE, secsSinceUpdate, cName(__func__)); + // &&& Anything else that should be done here? + } + break; } - if (secsSinceUpdate > timeoutDeadSecs) { - _changeStateTo(DEAD, secsSinceUpdate, cName(__func__)); - // &&& TODO:UJ all uberjobs for this worker need to die. + case QUESTIONABLE: { + if (secsSinceUpdate < timeoutAliveSecs) { + _changeStateTo(ALIVE, secsSinceUpdate, cName(__func__)); + } + if (secsSinceUpdate >= timeoutDeadSecs) { + _changeStateTo(DEAD, secsSinceUpdate, cName(__func__)); + // All uberjobs for this worker need to die. 
+ newlyDeadWorker = true; + } + break; } - break; - } - case DEAD: { - LOGS(_log, LOG_LVL_ERROR, "&&& NEED CODE"); - if (secsSinceUpdate < timeoutAliveSecs) { - _changeStateTo(ALIVE, secsSinceUpdate, cName(__func__)); - } else { - // Don't waste time on this worker until the registry has heard from it. - return; + case DEAD: { + if (secsSinceUpdate < timeoutAliveSecs) { + _changeStateTo(ALIVE, secsSinceUpdate, cName(__func__)); + } else { + // Don't waste time on this worker until the registry has heard from it. + // &&& If it's been a really really long time, maybe delete this entry ??? + return; + } + break; } - break; } } + // _aMtx must not be held when calling this. + if (newlyDeadWorker) { + LOGS(_log, LOG_LVL_WARN, + cName(__func__) << " worker " << wInfo_->wId << " appears to have died, reassigning its jobs."); + czar::Czar::getCzar()->killIncompleteUbjerJobsOn(wInfo_->wId); + } + shared_ptr jsWorkerReqPtr; { - lock_guard mapLg(_wqsData->_mapMtx); + lock_guard lg(_aMtx); //&&& needed ??? + lock_guard mapLg(_wqsData->mapMtx); // Check how many messages are currently being sent to the worker, if at the limit, return - if (_wqsData->_qIdDoneKeepFiles.empty() && _wqsData->_qIdDoneDeleteFiles.empty() && - _wqsData->_qIdDeadUberJobs.empty()) { + if (_wqsData->qIdDoneKeepFiles.empty() && _wqsData->qIdDoneDeleteFiles.empty() && + _wqsData->qIdDeadUberJobs.empty()) { return; } int tCount = _conThreadCount; @@ -141,14 +154,20 @@ void ActiveWorker::updateStateAndSendMessages(double timeoutAliveSecs, double ti // &&& Maybe only send the status message if the lists are not empty ??? // Start a thread to send the message. (Maybe these should go on the qdisppool? &&&) // put this in a different function and start the thread.&&&; - _sendStatusMsg(jsWorkerReqPtr); + _sendStatusMsg(wInfo_, jsWorkerReqPtr); } -void ActiveWorker::_sendStatusMsg(std::shared_ptr const& jsWorkerReqPtr) { +void ActiveWorker::_sendStatusMsg(http::WorkerContactInfo::Ptr const& wInf, + std::shared_ptr const& jsWorkerReqPtr) { auto& jsWorkerReq = *jsWorkerReqPtr; auto const method = http::Method::POST; - auto const& wInf = _wqsData->_wInfo; - string const url = "http://" + wInf->wHost + ":" + to_string(wInf->wPort) + "/querystatus"; + //&&&auto const wInf = _wqsData->getWInfo(); + if (wInf == nullptr) { + LOGS(_log, LOG_LVL_ERROR, cName(__func__) << " wInfo was null."); + return; + } + auto [ciwId, ciwHost, ciwManag, ciwPort] = wInf->getAll(); + string const url = "http://" + ciwHost + ":" + to_string(ciwPort) + "/querystatus"; vector const headers = {"Content-Type: application/json"}; auto const& czarConfig = cconfig::CzarConfig::instance(); @@ -163,7 +182,13 @@ void ActiveWorker::_sendStatusMsg(std::shared_ptr const& jsWorke try { json const response = client.readAsJson(); if (0 != response.at("success").get()) { - transmitSuccess = _wqsData->handleResponseJson(response); + bool startupTimeChanged = false; + tie(transmitSuccess, startupTimeChanged) = _wqsData->handleResponseJson(response); + if (startupTimeChanged) { + LOGS(_log, LOG_LVL_WARN, cName(__func__) << " worker startupTime changed, likely rebooted."); + // kill all incomplete UberJobs on this worker. 
+ czar::Czar::getCzar()->killIncompleteUbjerJobsOn(wInf->wId); + } } else { LOGS(_log, LOG_LVL_WARN, cName(__func__) << " response success=0"); } @@ -182,6 +207,11 @@ void ActiveWorker::addToDoneKeepFiles(QueryId qId) { _wqsData->addToDoneKeepFile void ActiveWorker::removeDeadUberJobsFor(QueryId qId) { _wqsData->removeDeadUberJobsFor(qId); } +void ActiveWorker::addDeadUberJob(QueryId qId, UberJobId ujId) { + auto now = CLOCK::now(); + _wqsData->addDeadUberJob(qId, ujId, now); +} + string ActiveWorker::dump() const { lock_guard lg(_aMtx); return _dump(); @@ -214,32 +244,26 @@ void ActiveWorkerMap::updateMap(http::WorkerContactInfo::WCMap const& wcMap, LOGS(_log, LOG_LVL_WARN, cName(__func__) << " worker contact info changed for " << wcKey << " new=" << wcVal->dump() << " old=" << aWorker->dump()); + // If there is existing information, only host and port values will change. aWorker->setWorkerContactInfo(wcVal); } } } } -/* &&& -void ActiveWorkerMap::pruneMap() { - lock_guard awLg(_awMapMtx); - for (auto iter = _awMap.begin(); iter != _awMap.end();) { - auto aWorker = iter->second; - if (aWorker->getWInfo()->timeSinceTouchSeconds() > _maxDeadTimeSeconds) { - iter = _awMap.erase(iter); - } else { - ++iter; - } - } -} -*/ - void ActiveWorkerMap::setCzarCancelAfterRestart(CzarIdType czId, QueryId lastQId) { _czarCancelAfterRestart = true; _czarCancelAfterRestartCzId = czId; _czarCancelAfterRestartQId = lastQId; } +ActiveWorker::Ptr ActiveWorkerMap::getActiveWorker(string const& workerId) const { + lock_guard lck(_awMapMtx); + auto iter = _awMap.find(workerId); + if (iter == _awMap.end()) return nullptr; + return iter->second; +} + void ActiveWorkerMap::sendActiveWorkersMessages() { // Send messages to each active worker as needed lock_guard lck(_awMapMtx); @@ -248,7 +272,6 @@ void ActiveWorkerMap::sendActiveWorkersMessages() { } } -/// &&& doc void ActiveWorkerMap::addToDoneDeleteFiles(QueryId qId) { lock_guard lck(_awMapMtx); for (auto const& [wName, awPtr] : _awMap) { @@ -257,7 +280,6 @@ void ActiveWorkerMap::addToDoneDeleteFiles(QueryId qId) { } } -/// &&& doc void ActiveWorkerMap::addToDoneKeepFiles(QueryId qId) { lock_guard lck(_awMapMtx); for (auto const& [wName, awPtr] : _awMap) { @@ -266,14 +288,4 @@ void ActiveWorkerMap::addToDoneKeepFiles(QueryId qId) { } } -/* &&& -/// &&& doc -void ActiveWorkerMap::removeDeadUberJobsFor(QueryId qId) { - lock_guard lck(_awMapMtx); - for (auto const& [wName, awPtr] : _awMap) { - awPtr->removeDeadUberJobsFor(qId); - } -} -*/ - } // namespace lsst::qserv::czar diff --git a/src/czar/ActiveWorker.h b/src/czar/ActiveWorker.h index 0c05e0180..f02ca1a63 100644 --- a/src/czar/ActiveWorker.h +++ b/src/czar/ActiveWorker.h @@ -102,7 +102,7 @@ class ActiveWorker : public std::enable_shared_from_this { http::WorkerContactInfo::Ptr getWInfo() const { if (_wqsData == nullptr) return nullptr; - return _wqsData->_wInfo; + return _wqsData->getWInfo(); } ~ActiveWorker() = default; @@ -121,13 +121,15 @@ class ActiveWorker : public std::enable_shared_from_this { /// &&& doc void addToDoneKeepFiles(QueryId qId); + /// &&&doc + void addDeadUberJob(QueryId qId, UberJobId ujId); + /// &&& doc void removeDeadUberJobsFor(QueryId qId); std::string dump() const; private: - ///&&&ActiveWorker(WorkerContactInfo::Ptr const& wInfo) : _wInfo(wInfo) {} ActiveWorker(http::WorkerContactInfo::Ptr const& wInfo, http::CzarContactInfo::Ptr const& czInfo, std::string const& replicationInstanceId, std::string const& replicationAuthKey) : _wqsData(http::WorkerQueryStatusData::create(wInfo, 
czInfo, replicationInstanceId, @@ -142,7 +144,8 @@ class ActiveWorker : public std::enable_shared_from_this { void _changeStateTo(State newState, double secsSinceUpdate, std::string const& note); /// &&& doc - void _sendStatusMsg(std::shared_ptr const& jsWorkerReqPtr); + void _sendStatusMsg(http::WorkerContactInfo::Ptr const& wInf, + std::shared_ptr const& jsWorkerReqPtr); /// &&& doc /// _aMtx must be held before calling. @@ -160,9 +163,11 @@ class ActiveWorker : public std::enable_shared_from_this { std::atomic _conThreadCount{0}; int _maxConThreadCount{2}; + /* &&& /// &&& doc /// @throws std::invalid_argument bool _parse(nlohmann::json const& jsWorkerReq); // &&& delete after basic testing + */ }; /// &&& doc @@ -188,6 +193,9 @@ class ActiveWorkerMap { /// should be cancelled. void setCzarCancelAfterRestart(CzarIdType czId, QueryId lastQId); + /// &&& doc + ActiveWorker::Ptr getActiveWorker(std::string const& workerId) const; + // &&& doc void sendActiveWorkersMessages(); @@ -198,10 +206,9 @@ class ActiveWorkerMap { void addToDoneKeepFiles(QueryId qId); private: - std::map _awMap; - std::mutex _awMapMtx; ///< protects _awMap; + std::map _awMap; ///< Key is worker id. + mutable std::mutex _awMapMtx; ///< protects _awMap; - //&&&double const _maxDeadTimeSeconds = 60.0 * 15.0; ///< &&& set from config. double _timeoutAliveSecs = 60.0 * 5.0; ///< &&& set from config. 5min double _timeoutDeadSecs = 60.0 * 10.0; ///< &&& set from config. 10min double _maxLifetime = 60.0 * 60.0; ///< &&& set from config. 1hr diff --git a/src/czar/Czar.cc b/src/czar/Czar.cc index b9f35cb98..260e59998 100644 --- a/src/czar/Czar.cc +++ b/src/czar/Czar.cc @@ -67,7 +67,6 @@ #include "util/FileMonitor.h" #include "util/IterableFormatter.h" #include "util/String.h" -#include "xrdreq/QueryManagementAction.h" using namespace lsst::qserv; using namespace nlohmann; @@ -82,6 +81,7 @@ LOG_LOGGER _log = LOG_GET("lsst.qserv.czar.Czar"); namespace lsst::qserv::czar { Czar::Ptr Czar::_czar; +uint64_t const Czar::czarStartupTime = millisecSinceEpoch(CLOCK::now()); Czar::Ptr Czar::createCzar(string const& configFilePath, string const& czarName) { _czar.reset(new Czar(configFilePath, czarName)); @@ -187,20 +187,6 @@ Czar::Czar(string const& configFilePath, string const& czarName) LOGS(_log, LOG_LVL_WARN, ex.what()); } } - /* &&& (moved this and czar crashed instantly, why?) - - if (_czarConfig->notifyWorkersOnCzarRestart()) { - try { - // &&&QM use http - Add flag to each worker in _activeWorkerMap - // TODO:UJ - Workers need to contact the registry and kill queries if the associated czar dies. - xrdreq::QueryManagementAction::notifyAllWorkers(_czarConfig->getXrootdFrontendUrl(), - proto::QueryManagement::CANCEL_AFTER_RESTART, - _czarConfig->id(), _lastQueryIdBeforeRestart()); - } catch (std::exception const& ex) { - LOGS(_log, LOG_LVL_WARN, ex.what()); - } - } - */ // This will block until there is a successful read of the database tables. _czarFamilyMap = CzarFamilyMap::create(_uqFactory->userQuerySharedResources()->queryMetadata); @@ -709,4 +695,22 @@ std::shared_ptr Czar::getExecutiveFromMap(QueryId qId) { return exec; } +void Czar::killIncompleteUbjerJobsOn(std::string const& restartedWorkerId) { + // Copy list of executives so the mutex isn't held forever. 
+ std::map> execMap; + { + lock_guard lgMap(_executiveMapMtx); + execMap = _executiveMap; + } + + // For each executive, go through its list of uberjobs and cancel those jobs + // with workerId == restartedWorkerId + for (auto const& [eKey, wPtrExec] : execMap) { + auto exec = wPtrExec.lock(); + if (exec != nullptr) { + exec->killIncompleteUberJobsOn(restartedWorkerId); + } + } +} + } // namespace lsst::qserv::czar diff --git a/src/czar/Czar.h b/src/czar/Czar.h index bf4131a2b..35a1088d7 100644 --- a/src/czar/Czar.h +++ b/src/czar/Czar.h @@ -38,6 +38,7 @@ #include "ccontrol/UserQuery.h" #include "ccontrol/UserQueryFactory.h" #include "czar/SubmitResult.h" +#include "global/clock_defs.h" #include "global/intTypes.h" #include "global/stringTypes.h" #include "mysql/MySqlConfig.h" @@ -144,6 +145,15 @@ class Czar { /// Get the executive associated with `qId`; this may be nullptr. std::shared_ptr getExecutiveFromMap(QueryId qId); + std::shared_ptr getActiveWorkerMap() const { return _activeWorkerMap; } + + /// &&& doc + void killIncompleteUbjerJobsOn(std::string const& workerId); + + /// Startup time of czar, sent to workers so they can detect that the czar + /// was restarted when this value changes. + static uint64_t const czarStartupTime; + private: /// Private constructor for singleton. Czar(std::string const& configFilePath, std::string const& czarName); @@ -220,7 +230,7 @@ class Czar { /// Wait time between checks. TODO:UJ set from config std::chrono::milliseconds _monitorSleepTime{15000}; - std::unique_ptr _activeWorkerMap; + std::shared_ptr _activeWorkerMap; }; } // namespace lsst::qserv::czar diff --git a/src/czar/CzarChunkMap.cc b/src/czar/CzarChunkMap.cc index 3e8607b54..7dd1e407a 100644 --- a/src/czar/CzarChunkMap.cc +++ b/src/czar/CzarChunkMap.cc @@ -302,8 +302,6 @@ bool CzarFamilyMap::_read() { // &&& TODO:UJ Before makeNewMaps(), get a list of workers considered to be alive by // czar::_activeWorkerMap - // give that list to makeNewMaps, and don't and workers to the maps that aren't on the - // list.&&& !!! // Make the new maps.
    shared_ptr familyMapPtr = makeNewMaps(qChunkMap);
diff --git a/src/czar/CzarRegistry.cc b/src/czar/CzarRegistry.cc
index c37b5da47..5ef8748d6 100644
--- a/src/czar/CzarRegistry.cc
+++ b/src/czar/CzarRegistry.cc
@@ -105,6 +105,7 @@ void CzarRegistry::_registryWorkerInfoLoop() {
     // Get worker information from the registry
     string const replicationInstanceId = _czarConfig->replicationInstanceId();
     string const replicationAuthKey = _czarConfig->replicationAuthKey();
+    uint64_t const czarStartTime = Czar::czarStartupTime;
     vector<string> const headers;
     auto const method = http::Method::GET;
@@ -127,7 +128,7 @@ void CzarRegistry::_registryWorkerInfoLoop() {
         {
             auto czInfo = http::CzarContactInfo::create(_czarConfig->name(), _czarConfig->id(),
                                                         _czarConfig->replicationHttpPort(),
-                                                        util::get_current_host_fqdn());
+                                                        util::get_current_host_fqdn(), czarStartTime);
             lock_guard lck(_mapMtx);
             if (wMap != nullptr && !_compareMapContactInfo(*wMap)) {
                 _contactMap = wMap;
@@ -152,14 +153,11 @@ http::WorkerContactInfo::WCMapPtr CzarRegistry::_buildMapFromJson(nlohmann::json
     for (auto const& [key, value] : jsWorkers.items()) {
         auto const& jsQserv = value.at("qserv");
         LOGS(_log, LOG_LVL_DEBUG, __func__ << " key=" << key << " jsQ=" << jsQserv);
-        string wHost = jsQserv.at("host-addr").get<string>();
-        string wManagementHost = jsQserv.at("management-host-name").get<string>();
-        int wPort = jsQserv.at("management-port").get<int>();
-        uint64_t updateTimeInt = jsQserv.at("update-time-ms").get<uint64_t>();
-        TIMEPOINT updateTime = TIMEPOINT(chrono::milliseconds(updateTimeInt));
-        auto wInfo = make_shared<http::WorkerContactInfo>(key, wHost, wManagementHost, wPort, updateTime);
-        LOGS(_log, LOG_LVL_DEBUG,
-             __func__ << " wHost=" << wHost << " wPort=" << wPort << " updateTime=" << updateTimeInt);
+
+        // The names for items here are different than the names used by workers.
+        auto wInfo = http::WorkerContactInfo::createFromJsonRegistry(key, jsQserv);
+
+        LOGS(_log, LOG_LVL_DEBUG, __func__ << " wInfo=" << wInfo->dump());
         auto iter = wMap->find(key);
         if (iter != wMap->end()) {
             LOGS(_log, LOG_LVL_ERROR, __func__ << " duplicate key " << key << " in " << response);
@@ -196,6 +194,22 @@ bool CzarRegistry::_compareMapContactInfo(http::WorkerContactInfo::WCMap const&
     return true;
 }

+http::WorkerContactInfo::WCMapPtr CzarRegistry::waitForWorkerContactMap() const {
+    http::WorkerContactInfo::WCMapPtr contMap = nullptr;
+    while (contMap == nullptr) {
+        {
+            std::lock_guard lockG(_mapMtx);
+            contMap = _contactMap;
+        }
+        if (contMap == nullptr) {
+            // This should only ever happen at startup if there's trouble getting data.
+            LOGS(_log, LOG_LVL_WARN, "waitForWorkerContactMap() _contactMap unavailable waiting for info");
+            this_thread::sleep_for(1s);
+        }
+    }
+    return contMap;
+}
+
 void CzarRegistry::sendActiveWorkersMessages() {
     // Send messages to each active worker as needed
     lock_guard lck(_mapMtx);
diff --git a/src/czar/CzarRegistry.h b/src/czar/CzarRegistry.h
index c743c6001..302b5a3f0 100644
--- a/src/czar/CzarRegistry.h
+++ b/src/czar/CzarRegistry.h
@@ -69,11 +69,16 @@ class CzarRegistry {

     /// Return _contactMap, the object that the returned pointer points to is
     /// constant and no attempts should be made to change it.
-    http::WorkerContactInfo::WCMapPtr getWorkerContactMap() {
+    http::WorkerContactInfo::WCMapPtr getWorkerContactMap() const {
         std::lock_guard lockG(_mapMtx);
         return _contactMap;
     }

+    /// Return _contactMap, the object that the returned pointer points to is
+    /// constant and no attempts should be made to change it. This
+    /// function will wait forever for a valid contact map to be ready.
+    http::WorkerContactInfo::WCMapPtr waitForWorkerContactMap() const;
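+    // Illustrative use from a dispatch path (the exact call site is an
+    // assumption, not part of this header):
+    //   auto wMap = czarRegistry->waitForWorkerContactMap();
+    //   for (auto const& [wId, wInfo] : *wMap) { /* contact worker wId */ }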
+
     /// &&& doc
     void sendActiveWorkersMessages();

@@ -111,7 +116,7 @@ class CzarRegistry {
     TIMEPOINT _latestMapUpdate;  ///< The last time the _contactMap was updated, unrelated to
                                  ///< WorkerContactInfo update.
     // &&& review how this _mapMtx is used, probably locks for too long a period.
-    std::mutex _mapMtx;  /// Protects _contactMap, _latestUpdate, _activeWorkerMap
+    mutable std::mutex _mapMtx;  /// Protects _contactMap, _latestUpdate, _activeWorkerMap

     ActiveWorkerMap _activeWorkerMap;  ///< Map of workers czar considers active.
 };
diff --git a/src/global/clock_defs.h b/src/global/clock_defs.h
index 9db4dadbc..25d3b08bf 100644
--- a/src/global/clock_defs.h
+++ b/src/global/clock_defs.h
@@ -37,6 +37,10 @@ namespace lsst::qserv {
 using CLOCK = std::chrono::system_clock;
 using TIMEPOINT = std::chrono::time_point<CLOCK>;

+inline uint64_t millisecSinceEpoch(TIMEPOINT tm) {
+    return std::chrono::duration_cast<std::chrono::milliseconds>(tm.time_since_epoch()).count();
+}
+
 /// RAII class to help track a changing sum through a begin and end time.
 template 
 class TimeCountTracker {
diff --git a/src/http/WorkerQueryStatusData.cc b/src/http/WorkerQueryStatusData.cc
index aed6bf73b..e9524a26e 100644
--- a/src/http/WorkerQueryStatusData.cc
+++ b/src/http/WorkerQueryStatusData.cc
@@ -49,16 +49,19 @@ json CzarContactInfo::serializeJson() const {
     jsCzar["id"] = czId;
     jsCzar["management-port"] = czPort;
     jsCzar["management-host-name"] = czHostName;
+    jsCzar["czar-startup-time"] = czStartupTime;
     return jsCzar;
 }

-CzarContactInfo::Ptr CzarContactInfo::createJson(nlohmann::json const& czJson) {
+CzarContactInfo::Ptr CzarContactInfo::createFromJson(nlohmann::json const& czJson) {
     try {
         auto czName_ = RequestBodyJSON::required<std::string>(czJson, "name");
         auto czId_ = RequestBodyJSON::required<CzarIdType>(czJson, "id");
         auto czPort_ = RequestBodyJSON::required<int>(czJson, "management-port");
         auto czHostName_ = RequestBodyJSON::required<std::string>(czJson, "management-host-name");
-        return create(czName_, czId_, czPort_, czHostName_);
+        auto czStartupTime_ = RequestBodyJSON::required<uint64_t>(czJson, "czar-startup-time");
+        return create(czName_, czId_, czPort_, czHostName_, czStartupTime_);
+        //&&& return create(czName_, czId_, czPort_, czHostName_);
     } catch (invalid_argument const& exc) {
         LOGS(_log, LOG_LVL_ERROR, string("CzarContactInfo::createFromJson invalid ") << exc.what());
     }
@@ -67,30 +70,52 @@ CzarContactInfo::Ptr CzarContactInfo::createJson(nlohmann::json const& czJson) {

 std::string CzarContactInfo::dump() const {
     stringstream os;
-    os << "czName=" << czName << " czId=" << czId << " czPort=" << czPort << " czHostName=" << czHostName;
+    //&&& os << "czName=" << czName << " czId=" << czId << " czPort=" << czPort << " czHostName=" <<
+    //czHostName;
+    os << "czName=" << czName << " czId=" << czId << " czPort=" << czPort << " czHostName=" << czHostName
+       << " czStartupTime=" << czStartupTime;
     return os.str();
 }

 json WorkerContactInfo::serializeJson() const {
+    lock_guard lg(_rMtx);
+    return _serializeJson();
+}
+
+json WorkerContactInfo::_serializeJson() const {
     json jsWorker;
     jsWorker["id"] = wId;
-    jsWorker["host"] = wHost;
-    jsWorker["management-host-name"] = wManagementHost;
-    jsWorker["management-port"] = wPort;
+    jsWorker["host"] = _wHost;
+    jsWorker["management-host-name"] = _wManagementHost;
+    jsWorker["management-port"] = _wPort;
+    jsWorker["w-startup-time"] = _wStartupTime;
     return jsWorker;
 }

-WorkerContactInfo::Ptr WorkerContactInfo::createJson(nlohmann::json const& wJson, TIMEPOINT updateTime_) {
-    LOGS(_log, LOG_LVL_ERROR, "WorkerContactInfo::createJson &&& a");
+WorkerContactInfo::Ptr WorkerContactInfo::createFromJsonRegistry(string const& wId_,
+                                                                 nlohmann::json const& regJson) {
+    try {
+        auto wHost_ = RequestBodyJSON::required<string>(regJson, "host-addr");
+        auto wManagementHost_ = RequestBodyJSON::required<string>(regJson, "management-host-name");
+        auto wPort_ = RequestBodyJSON::required<int>(regJson, "management-port");
+        auto updateTimeInt = RequestBodyJSON::required<uint64_t>(regJson, "update-time-ms");
+        TIMEPOINT updateTime_ = TIMEPOINT(chrono::milliseconds(updateTimeInt));
+
+        return create(wId_, wHost_, wManagementHost_, wPort_, updateTime_);
+    } catch (invalid_argument const& exc) {
+        LOGS(_log, LOG_LVL_ERROR, string("WorkerContactInfo::createFromJsonRegistry invalid ") << exc.what());
+    }
+    return nullptr;
+}
+
+WorkerContactInfo::Ptr WorkerContactInfo::createFromJsonWorker(nlohmann::json const& wJson,
+                                                               TIMEPOINT updateTime_) {
     try {
         auto wId_ = RequestBodyJSON::required<string>(wJson, "id");
-        LOGS(_log, LOG_LVL_ERROR, "WorkerContactInfo::createJson &&& b");
         auto wHost_ = RequestBodyJSON::required<string>(wJson, "host");
-        LOGS(_log, LOG_LVL_ERROR, "WorkerContactInfo::createJson &&& c");
         auto wManagementHost_ = RequestBodyJSON::required<string>(wJson, "management-host-name");
-        LOGS(_log, LOG_LVL_ERROR, "WorkerContactInfo::createJson &&& d");
         auto wPort_ = RequestBodyJSON::required<int>(wJson, "management-port");
-        LOGS(_log, LOG_LVL_ERROR, "WorkerContactInfo::createJson &&& e");
+
         return create(wId_, wHost_, wManagementHost_, wPort_, updateTime_);
     } catch (invalid_argument const& exc) {
         LOGS(_log, LOG_LVL_ERROR, string("WorkerContactInfo::createFromJsonWorker invalid ") << exc.what());
@@ -99,33 +124,17 @@ WorkerContactInfo::Ptr WorkerContactInfo::createJson(nlohmann::json const& wJson
 }

 string WorkerContactInfo::dump() const {
+    lock_guard lg(_rMtx);
+    return _dump();
+}
+
+string WorkerContactInfo::_dump() const {
     stringstream os;
     os << "workerContactInfo{"
-       << "id=" << wId << " host=" << wHost << " mgHost=" << wManagementHost << " port=" << wPort << "}";
+       << "id=" << wId << " host=" << _wHost << " mgHost=" << _wManagementHost << " port=" << _wPort << "}";
     return os.str();
 }

-/* &&&
-string ActiveWorker::getStateStr(State st) {
-    switch (st) {
-        case ALIVE: return string("ALIVE");
-        case QUESTIONABLE: return string("QUESTIONABLE");
-        case DEAD: return string("DEAD");
-    }
-    return string("unknown");
-}
-
-
-bool WorkerQueryStatusData::compareContactInfo(WorkerContactInfo const& wcInfo) const {
-    return _wInfo->isSameContactInfo(wcInfo);
-}
-
-void WorkerQueryStatusData::setWorkerContactInfo(WorkerContactInfo::Ptr const& wcInfo) {
-    LOGS(_log, LOG_LVL_WARN, cName(__func__) << " new info=" << wcInfo->dump());
-    _wInfo = wcInfo;
-}
-*/
-
 shared_ptr<json> WorkerQueryStatusData::serializeJson(double maxLifetime) {
     // Go through the _qIdDoneKeepFiles, _qIdDoneDeleteFiles, and _qIdDeadUberJobs lists to build a
     // message to send to the worker.
@@ -136,14 +145,23 @@ shared_ptr<json> WorkerQueryStatusData::serializeJson(double maxLifetime) {
     jsWorkerR["instance_id"] = _replicationInstanceId;
     jsWorkerR["auth_key"] = _replicationAuthKey;
     jsWorkerR["czar"] = _czInfo->serializeJson();
-    jsWorkerR["worker"] = _wInfo->serializeJson();
+    {
+        lock_guard lgI(_infoMtx);
+        if (_wInfo != nullptr) {
+            jsWorkerR["worker"] = _wInfo->serializeJson();
+        } else {
+            LOGS(_log, LOG_LVL_ERROR, cName(__func__) << " wInfo is null");
+        }
+    }
+
+    // Note, old elements in the maps will be deleted after being added to the message
+    // to keep the czar from keeping track of these forever.
     addListsToJson(jsWorkerR, now, maxLifetime);

-    if (_czarCancelAfterRestart) {
+    if (czarCancelAfterRestart) {
         jsWorkerR["czarrestart"] = true;
-        lock_guard mapLg(_mapMtx);
-        jsWorkerR["czarrestartcancelczid"] = _czarCancelAfterRestartCzId;
-        jsWorkerR["czarrestartcancelqid"] = _czarCancelAfterRestartQId;
+        lock_guard mapLg(mapMtx);
+        jsWorkerR["czarrestartcancelczid"] = czarCancelAfterRestartCzId;
+        jsWorkerR["czarrestartcancelqid"] = czarCancelAfterRestartQId;
     } else {
         jsWorkerR["czarrestart"] = false;
     }
@@ -151,21 +169,21 @@ shared_ptr<json> WorkerQueryStatusData::serializeJson(double maxLifetime) {
     return jsWorkerReqPtr;
 }

-void WorkerQueryStatusData::addListsToJson(json& jsWR, TIMEPOINT tm, double maxLifetime) {
+void WorkerQueryStatusData::addListsToJson(json& jsWR, TIMEPOINT tmMark, double maxLifetime) {
     jsWR["qiddonekeepfiles"] = json::array();
     jsWR["qiddonedeletefiles"] = json::array();
     jsWR["qiddeaduberjobs"] = json::array();
-    lock_guard mapLg(_mapMtx);
+    lock_guard mapLg(mapMtx);
     {
         auto& jsDoneKeep = jsWR["qiddonekeepfiles"];
-        auto iterDoneKeep = _qIdDoneKeepFiles.begin();
-        while (iterDoneKeep != _qIdDoneKeepFiles.end()) {
+        auto iterDoneKeep = qIdDoneKeepFiles.begin();
+        while (iterDoneKeep != qIdDoneKeepFiles.end()) {
             auto qId = iterDoneKeep->first;
             jsDoneKeep.push_back(qId);
-            auto tmStamp = iterDoneKeep->second;
-            double ageSecs = std::chrono::duration<double>(tm - tmStamp).count();
+            auto tmTouched = iterDoneKeep->second;
+            double ageSecs = std::chrono::duration<double>(tmMark - tmTouched).count();
             if (ageSecs > maxLifetime) {
-                iterDoneKeep = _qIdDoneKeepFiles.erase(iterDoneKeep);
+                iterDoneKeep = qIdDoneKeepFiles.erase(iterDoneKeep);
             } else {
                 ++iterDoneKeep;
             }
@@ -173,14 +191,14 @@ void WorkerQueryStatusData::addListsToJson(json& jsWR, TIMEPOINT tm, double maxL
     }
     {
         auto& jsDoneDelete = jsWR["qiddonedeletefiles"];
-        auto iterDoneDelete = _qIdDoneDeleteFiles.begin();
-        while (iterDoneDelete != _qIdDoneDeleteFiles.end()) {
+        auto iterDoneDelete = qIdDoneDeleteFiles.begin();
+        while (iterDoneDelete != qIdDoneDeleteFiles.end()) {
             auto qId = iterDoneDelete->first;
             jsDoneDelete.push_back(qId);
             auto tmStamp = iterDoneDelete->second;
-            double ageSecs = std::chrono::duration<double>(tm - tmStamp).count();
+            double ageSecs = std::chrono::duration<double>(tmMark - tmStamp).count();
             if (ageSecs > maxLifetime) {
-                iterDoneDelete = _qIdDoneDeleteFiles.erase(iterDoneDelete);
+                iterDoneDelete = qIdDoneDeleteFiles.erase(iterDoneDelete);
             } else {
                 ++iterDoneDelete;
             }
@@ -188,9 +206,9 @@ void WorkerQueryStatusData::addListsToJson(json& jsWR, TIMEPOINT tm, double maxL
     }
     {
         auto& jsDeadUj = jsWR["qiddeaduberjobs"];
-        auto iterDeadUjQid = _qIdDeadUberJobs.begin();
-        while (iterDeadUjQid != _qIdDeadUberJobs.end()) {
-            TIMEPOINT oldestTm;  // default is zero
+        auto iterDeadUjQid = qIdDeadUberJobs.begin();
+        while (iterDeadUjQid != qIdDeadUberJobs.end()) {
+            TIMEPOINT youngestTm = TIMEPOINT::max();  // need to find the youngest
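+            // The per-query map is pruned as a whole below: youngestTm ends up
+            // holding the most recent timestamp in ujIdMap, so the map is only
+            // erased once even its newest dead-UberJob entry has outlived
+            // maxLifetime.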
            auto qId = iterDeadUjQid->first;
             auto& ujIdMap = iterDeadUjQid->second;
@@ -199,16 +217,17 @@ void WorkerQueryStatusData::addListsToJson(json& jsWR, TIMEPOINT tm, double maxL
             auto iterUjId = ujIdMap.begin();
             bool addedUjId = false;
+
             while (iterUjId != ujIdMap.end()) {
                 UberJobId ujId = iterUjId->first;
                 auto tmStamp = iterUjId->second;
-                if (tmStamp > oldestTm) {
-                    oldestTm = tmStamp;
+                if (tmStamp < youngestTm) {
+                    youngestTm = tmStamp;
                 }

                 jsUjIds.push_back(ujId);
                 addedUjId = true;
-                double ageSecs = std::chrono::duration<double>(tm - tmStamp).count();
+                double ageSecs = std::chrono::duration<double>(tmMark - tmStamp).count();
                 if (ageSecs > maxLifetime) {
                     iterUjId = ujIdMap.erase(iterUjId);
                 } else {
@@ -220,8 +239,9 @@ void WorkerQueryStatusData::addListsToJson(json& jsWR, TIMEPOINT tm, double maxL
                 jsDeadUj.push_back(jsQidUj);
             }

-            if (ujIdMap.empty() || std::chrono::duration<double>(tm - oldestTm).count() > maxLifetime) {
-                iterDeadUjQid = _qIdDeadUberJobs.erase(iterDeadUjQid);
+            // If the youngest element was too old, delete the map.
+            if (ujIdMap.empty() || std::chrono::duration<double>(tmMark - youngestTm).count() > maxLifetime) {
+                iterDeadUjQid = qIdDeadUberJobs.erase(iterDeadUjQid);
             } else {
                 ++iterDeadUjQid;
             }
@@ -229,10 +249,10 @@ void WorkerQueryStatusData::addListsToJson(json& jsWR, TIMEPOINT tm, double maxL
     }
 }

-WorkerQueryStatusData::Ptr WorkerQueryStatusData::createJson(nlohmann::json const& jsWorkerReq,
-                                                             std::string const& replicationInstanceId,
-                                                             std::string const& replicationAuthKey,
-                                                             TIMEPOINT updateTm) {
+WorkerQueryStatusData::Ptr WorkerQueryStatusData::createFromJson(nlohmann::json const& jsWorkerReq,
+                                                                 std::string const& replicationInstanceId_,
+                                                                 std::string const& replicationAuthKey_,
+                                                                 TIMEPOINT updateTm) {
     LOGS(_log, LOG_LVL_ERROR, "WorkerQueryStatusData::createJson &&& a");
     try {
         if (jsWorkerReq["version"] != http::MetaModule::version) {
@@ -241,9 +261,9 @@ WorkerQueryStatusData::Ptr WorkerQueryStatusData::createJson(nlohmann::json cons
         }
         LOGS(_log, LOG_LVL_ERROR, "WorkerQueryStatusData::createJson &&& b");

-        auto czInfo_ = CzarContactInfo::createJson(jsWorkerReq["czar"]);
+        auto czInfo_ = CzarContactInfo::createFromJson(jsWorkerReq["czar"]);
         LOGS(_log, LOG_LVL_ERROR, "WorkerQueryStatusData::createJson &&& c");
-        auto wInfo_ = WorkerContactInfo::createJson(jsWorkerReq["worker"], updateTm);
+        auto wInfo_ = WorkerContactInfo::createFromJsonWorker(jsWorkerReq["worker"], updateTm);
         LOGS(_log, LOG_LVL_ERROR, "WorkerQueryStatusData::createJson &&& d");
         if (czInfo_ == nullptr || wInfo_ == nullptr) {
             LOGS(_log, LOG_LVL_ERROR,
@@ -251,7 +271,7 @@ WorkerQueryStatusData::Ptr WorkerQueryStatusData::createJson(nlohmann::json cons
                  << jsWorkerReq);
         }
         auto wqsData =
-            WorkerQueryStatusData::create(wInfo_, czInfo_, replicationInstanceId, replicationAuthKey);
+            WorkerQueryStatusData::create(wInfo_, czInfo_, replicationInstanceId_, replicationAuthKey_);
         LOGS(_log, LOG_LVL_ERROR, "WorkerQueryStatusData::createJson &&& e");
         wqsData->parseLists(jsWorkerReq, updateTm);
         LOGS(_log, LOG_LVL_ERROR, "WorkerQueryStatusData::createJson &&& end");
@@ -269,8 +289,8 @@ WorkerQueryStatusData::Ptr WorkerQueryStatusData::createJson(nlohmann::json cons
 }

 void WorkerQueryStatusData::parseLists(nlohmann::json const& jsWR, TIMEPOINT updateTm) {
-    lock_guard mapLg(_mapMtx);
-    parseListsInto(jsWR, updateTm, _qIdDoneKeepFiles, _qIdDoneDeleteFiles, _qIdDeadUberJobs);
+    lock_guard mapLg(mapMtx);
+    parseListsInto(jsWR, updateTm, qIdDoneKeepFiles, qIdDoneDeleteFiles, qIdDeadUberJobs);
 }

 void WorkerQueryStatusData::parseListsInto(nlohmann::json const& jsWR, TIMEPOINT updateTm,
@@ -317,74 +337,98 @@ void WorkerQueryStatusData::parseListsInto(nlohmann::json const& jsWR, TIMEPOINT
 }

 void WorkerQueryStatusData::addDeadUberJobs(QueryId qId, std::vector<UberJobId> ujIds, TIMEPOINT tm) {
-    auto& ujMap = _qIdDeadUberJobs[qId];
+    lock_guard mapLg(mapMtx);
+    auto& ujMap = qIdDeadUberJobs[qId];
     for (auto const ujId : ujIds) {
         ujMap[ujId] = tm;
     }
 }

+void WorkerQueryStatusData::addDeadUberJob(QueryId qId, UberJobId ujId, TIMEPOINT tm) {
+    lock_guard mapLg(mapMtx);
+    auto& ujMap = qIdDeadUberJobs[qId];
+    ujMap[ujId] = tm;
+}
+
 void WorkerQueryStatusData::addToDoneDeleteFiles(QueryId qId) {
-    lock_guard mapLg(_mapMtx);
-    _qIdDoneDeleteFiles[qId] = CLOCK::now();
+    lock_guard mapLg(mapMtx);
+    qIdDoneDeleteFiles[qId] = CLOCK::now();
 }

 void WorkerQueryStatusData::addToDoneKeepFiles(QueryId qId) {
-    lock_guard mapLg(_mapMtx);
-    _qIdDoneKeepFiles[qId] = CLOCK::now();
+    lock_guard mapLg(mapMtx);
+    qIdDoneKeepFiles[qId] = CLOCK::now();
 }

 void WorkerQueryStatusData::removeDeadUberJobsFor(QueryId qId) {
-    lock_guard mapLg(_mapMtx);
-    _qIdDeadUberJobs.erase(qId);
+    lock_guard mapLg(mapMtx);
+    qIdDeadUberJobs.erase(qId);
 }

-json WorkerQueryStatusData::serializeResponseJson() {
+json WorkerQueryStatusData::serializeResponseJson(uint64_t workerStartupTime) {
     // Go through the _qIdDoneKeepFiles, _qIdDoneDeleteFiles, and _qIdDeadUberJobs lists to build a
-    // reponse. Nothing should be deleted and time is irrelevant for this, so maxLifetime is enormous
-    // and any time could be used, but now is easy.
+    // response. Nothing should be deleted and time is irrelevant for this, so maxLifetime is enormous
+    // and any time could be used for last contact, but now() is easy.
+    // This is only called by the worker. As such nothing should be deleted here as the lifetime of
+    // these elements is determined by the lifetime of the owning UserQueryInfo instance.
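+    // The response echoes the parsed lists back to the czar as an acknowledgment:
+    // handleResponseJson() on the czar side erases every entry named here.
+    // Rough response shape:
+    //   {"success": 1, "errortype": "none", "note": "", "w-startup-time": <uint64>,
+    //    "qiddonekeepfiles": [...], "qiddonedeletefiles": [...], "qiddeaduberjobs": [...]}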
    double maxLifetime = std::numeric_limits<double>::max();
     auto now = CLOCK::now();
     json jsResp = {{"success", 1}, {"errortype", "none"}, {"note", ""}};
+    jsResp["w-startup-time"] = workerStartupTime;
     addListsToJson(jsResp, now, maxLifetime);
     return jsResp;
 }

-bool WorkerQueryStatusData::handleResponseJson(nlohmann::json const& jsResp) {
+std::pair<bool, bool> WorkerQueryStatusData::handleResponseJson(nlohmann::json const& jsResp) {
     auto now = CLOCK::now();
     std::map<QueryId, TIMEPOINT> doneKeepF;
     std::map<QueryId, TIMEPOINT> doneDeleteF;
     std::map<QueryId, std::map<UberJobId, TIMEPOINT>> deadUberJobs;

     parseListsInto(jsResp, now, doneKeepF, doneDeleteF, deadUberJobs);

-    lock_guard mapLg(_mapMtx);
+    lock_guard mapLg(mapMtx);
     // Remove entries from _qIdDoneKeepFiles
     for (auto const& [qId, tm] : doneKeepF) {
-        _qIdDoneKeepFiles.erase(qId);
+        qIdDoneKeepFiles.erase(qId);
     }

     // Remove entries from _qIdDoneDeleteFiles
     for (auto const& [qId, tm] : doneDeleteF) {
-        _qIdDoneDeleteFiles.erase(qId);
+        qIdDoneDeleteFiles.erase(qId);
     }

     // Remove entries from _qIdDeadUberJobs
     for (auto const& [qId, ujMap] : deadUberJobs) {
-        auto iter = _qIdDeadUberJobs.find(qId);
-        if (iter != _qIdDeadUberJobs.end()) {
+        auto iter = qIdDeadUberJobs.find(qId);
+        if (iter != qIdDeadUberJobs.end()) {
             auto& deadMap = iter->second;
             for (auto const& [ujId, tm] : ujMap) {
                 deadMap.erase(ujId);
             }
             if (deadMap.empty()) {
-                _qIdDeadUberJobs.erase(iter);
+                qIdDeadUberJobs.erase(iter);
             }
         }
     }

-    return true;
+    bool workerRestarted = false;
+    auto workerStartupTime = RequestBodyJSON::required<uint64_t>(jsResp, "w-startup-time");
+    LOGS(_log, LOG_LVL_ERROR, cName(__func__) << " workerStartupTime=" << workerStartupTime);
+    if (!_wInfo->checkWStartupTime(workerStartupTime)) {
+        LOGS(_log, LOG_LVL_ERROR,
+             cName(__func__) << " startup time for worker=" << _wInfo->dump()
+                             << " changed to=" << workerStartupTime << " Assuming worker restarted");
+        workerRestarted = true;
+    }
+    return {true, workerRestarted};
 }

 string WorkerQueryStatusData::dump() const {
+    lock_guard lgI(_infoMtx);
+    return _dump();
+}
+
+string WorkerQueryStatusData::_dump() const {
     stringstream os;
     os << "ActiveWorker " << ((_wInfo == nullptr) ? "?" : _wInfo->dump());
     return os.str();
diff --git a/src/http/WorkerQueryStatusData.h b/src/http/WorkerQueryStatusData.h
index 44282462c..21b3fe448 100644
--- a/src/http/WorkerQueryStatusData.h
+++ b/src/http/WorkerQueryStatusData.h
@@ -38,7 +38,7 @@
 // This header declarations
 namespace lsst::qserv::http {

-/// &&& doc
+/// This class just contains the czar id and network contact information.
class CzarContactInfo { public: using Ptr = std::shared_ptr; @@ -55,39 +55,34 @@ class CzarContactInfo { } static Ptr create(std::string const& czName_, CzarIdType czId_, int czPort_, - std::string const& czHostName_) { - return Ptr(new CzarContactInfo(czName_, czId_, czPort_, czHostName_)); + std::string const& czHostName_, uint64_t czStartupTime_) { + return Ptr(new CzarContactInfo(czName_, czId_, czPort_, czHostName_, czStartupTime_)); } - static Ptr createJson(nlohmann::json const& czarJson); + static Ptr createFromJson(nlohmann::json const& czarJson); std::string const czName; ///< czar "name" CzarIdType const czId; ///< czar "id" int const czPort; ///< czar "management-port" std::string const czHostName; ///< czar "management-host-name" + uint64_t const czStartupTime; ///< czar startup time /// &&& doc nlohmann::json serializeJson() const; - /// &&& doc - //&&&bool parse(nlohmann::json const& czarJson); - std::string dump() const; - /* &&& - auto& jsWCzar = jsWorkerR["czar"]; - jsWCzar["name"] = czarConfig->name(); - jsWCzar["id"]= czarConfig->id(); - jsWCzar["management-port"] = czarConfig->replicationHttpPort(); - jsWCzar["management-host-name"] = util::get_current_host_fqdn(); - */ + private: - CzarContactInfo(std::string const& czName_, CzarIdType czId_, int czPort_, std::string const& czHostName_) - : czName(czName_), czId(czId_), czPort(czPort_), czHostName(czHostName_) {} + CzarContactInfo(std::string const& czName_, CzarIdType czId_, int czPort_, std::string const& czHostName_, + uint64_t czStartupTime_) + : czName(czName_), + czId(czId_), + czPort(czPort_), + czHostName(czHostName_), + czStartupTime(czStartupTime_) {} }; -/// &&& doc This class just contains the worker id and network communication -/// information, but it may be desirable to store connections to the -/// worker here as well. +/// This class just contains the worker id and network communication information. class WorkerContactInfo { public: using Ptr = std::shared_ptr; @@ -100,29 +95,61 @@ class WorkerContactInfo { return Ptr(new WorkerContactInfo(wId_, wHost_, wManagementHost_, wPort_, updateTime_)); } - /// &&& doc - static Ptr createJson(nlohmann::json const& workerJson, TIMEPOINT updateTime); + /// &&& doc Used to create WorkerQueryStatusData object from a registry json message. + static Ptr createFromJsonRegistry(std::string const& wId_, nlohmann::json const& regJson); + + /// &&& doc Used to create WorkerQueryStatusData object from a worker json message. + static Ptr createFromJsonWorker(nlohmann::json const& workerJson, TIMEPOINT updateTime); /// &&& doc nlohmann::json serializeJson() const; std::string cName(const char* fn) { return std::string("WorkerContactInfo::") + fn; } - /// &&& make private - WorkerContactInfo(std::string const& wId_, std::string const& wHost_, std::string const& wManagementHost_, - int wPort_, TIMEPOINT updateTime_) - : wId(wId_), wHost(wHost_), wManagementHost(wManagementHost_), wPort(wPort_) { - regUpdateTime(updateTime_); + // &&&QM &&&HERE should all of these be constant??? + std::string const wId; ///< key + //&&&std::string const wHost; ///< "host-addr" entry. + //&&&std::string const wManagementHost; ///< "management-host-name" entry. + //&&&int const wPort; ///< "management-port" entry. + + std::string getWHost() const { + std::lock_guard lg(_rMtx); + return _wHost; + } + + std::string getWManagementHost() const { + std::lock_guard lg(_rMtx); + return _wManagementHost; } - std::string const wId; ///< key - std::string const wHost; ///< "host-addr" entry. 
-    std::string const wManagementHost;  ///< "management-host-name" entry.
-    int const wPort;                    ///< "management-port" entry.

-    /// Return true if all members, aside from updateTime, are equal.
+    int getWPort() const {
+        std::lock_guard lg(_rMtx);
+        return _wPort;
+    }
+
+    /// &&& doc
+    void changeBaseInfo(WorkerContactInfo const& other) {
+        auto [oWId, oWHost, oWManagementHost, oWPort] = other.getAll();
+        std::lock_guard lg(_rMtx);
+        _wHost = oWHost;
+        _wManagementHost = oWManagementHost;
+        _wPort = oWPort;
+    }
+
+    /// @return wId - workerId
+    /// @return _wHost - worker host
+    /// @return _wManagementHost - management host
+    /// @return _wPort - worker port
+    std::tuple<std::string, std::string, std::string, int> getAll() const {
+        std::lock_guard lg(_rMtx);
+        return {wId, _wHost, _wManagementHost, _wPort};
+    }
+
+    /// Return true if communication related items are the same.
     bool isSameContactInfo(WorkerContactInfo const& other) const {
-        return (wId == other.wId && wHost == other.wHost && wManagementHost == other.wManagementHost &&
-                wPort == other.wPort);
+        auto [oWId, oWHost, oWManagementHost, oWPort] = other.getAll();
+        std::lock_guard lg(_rMtx);
+        return (wId == oWId && _wHost == oWHost && _wManagementHost == oWManagementHost && _wPort == oWPort);
     }

     void regUpdateTime(TIMEPOINT updateTime) {
@@ -141,18 +168,79 @@ class WorkerContactInfo {
         return _regUpdate;
     }

+    /* &&&
+    /// Sets _wStartupTime to startupTime, but only if _wStartupTime was 0.
+    /// @returns true if _wStartupTime was set.
+    bool setWStartupTime(uint64_t startupTime) { //&&& del if not used
+        std::lock_guard lg(_rMtx);
+        if (_wStartupTime == 0) {
+            _wStartupTime = startupTime;
+            return true;
+        }
+        return false;
+    }
+    */
+
+    /// @return true if startupTime equals _wStartupTime, or if _wStartupTime was
+    ///         never set (in which case it is set to startupTime).
+    /// @return false indicates the worker was restarted and all associated jobs need
+    ///         re-assignment.
+    bool checkWStartupTime(uint64_t startupTime) {
+        std::lock_guard lg(_rMtx);
+        if (_wStartupTime == startupTime) {
+            return true;
+        }
+        if (_wStartupTime == 0) {
+            _wStartupTime = startupTime;
+            return true;
+        }
+        _wStartupTime = startupTime;
+        return false;
+    }
+
+    uint64_t getWStartupTime() const {
+        std::lock_guard lg(_rMtx);
+        return _wStartupTime;
+    }
+
     std::string dump() const;

 private:
+    WorkerContactInfo(std::string const& wId_, std::string const& wHost_, std::string const& wManagementHost_,
+                      int wPort_, TIMEPOINT updateTime_)
+            : wId(wId_), _wHost(wHost_), _wManagementHost(wManagementHost_), _wPort(wPort_) {
+        regUpdateTime(updateTime_);
+    }
+
+    // _rMtx must be locked before calling
+    std::string _dump() const;
+
+    // _rMtx must be locked before calling
+    nlohmann::json _serializeJson() const;
+
+    std::string _wHost;            ///< "host-addr" entry.
+    std::string _wManagementHost;  ///< "management-host-name" entry.
+    int _wPort;                    ///< "management-port" entry.
+
     /// Last time the registry heard from this worker. The ActiveWorker class
     /// will use this to determine the worker's state.
     /// &&& Store in seconds since epoch to make atomic?
     TIMEPOINT _regUpdate;

+    /// "w-startup-time", its value is set to zero until the real value is
+    /// received from the worker. Once it is non-zero, any change indicates
+    /// the worker was restarted and all UberJobs that were assigned there
+    /// need to be unassigned. On the worker, this should always be set from
+    /// foreman()->getStartupTime();
+    uint64_t _wStartupTime = 0;
+
     mutable std::mutex _rMtx;  ///< protects _regUpdate
 };

-/// &&& doc
+/// This class's purpose is to be a structure to store and transfer information
+/// about which queries have been completed or cancelled on the worker. This
+/// class contains the functions that encode and decode the data they contain
+/// to and from a json format.
 class WorkerQueryStatusData {
 public:
     using Ptr = std::shared_ptr<WorkerQueryStatusData>;

     WorkerQueryStatusData() = delete;
     WorkerQueryStatusData(WorkerQueryStatusData const&) = delete;
     WorkerQueryStatusData& operator=(WorkerQueryStatusData const&) = delete;

-    std::string cName(const char* fName) {
-        return std::string("WorkerQueryStatusData::") + fName + " " +
-               ((_wInfo == nullptr) ? "?" : _wInfo->wId);
-    }
+    std::string cName(const char* fName) { return std::string("WorkerQueryStatusData::") + fName; }

-    static Ptr create(WorkerContactInfo::Ptr const& wInfo, CzarContactInfo::Ptr const& czInfo,
-                      std::string const& replicationInstanceId, std::string const& replicationAuthKey) {
-        return Ptr(new WorkerQueryStatusData(wInfo, czInfo, replicationInstanceId, replicationAuthKey));
+    static Ptr create(WorkerContactInfo::Ptr const& wInfo_, CzarContactInfo::Ptr const& czInfo_,
+                      std::string const& replicationInstanceId_, std::string const& replicationAuthKey_) {
+        return Ptr(new WorkerQueryStatusData(wInfo_, czInfo_, replicationInstanceId_, replicationAuthKey_));
     }

-    /// &&& doc
-    static Ptr createJson(nlohmann::json const& czarJson, std::string const& replicationInstanceId,
-                          std::string const& replicationAuthKey, TIMEPOINT updateTm);
+    /// &&& doc Used to create WorkerQueryStatusData object from a worker json message.
+    static Ptr createFromJson(nlohmann::json const& czarJson, std::string const& replicationInstanceId_,
+                              std::string const& replicationAuthKey_, TIMEPOINT updateTm);

     ~WorkerQueryStatusData() = default;

-    WorkerContactInfo::Ptr getWInfo() const { return _wInfo; }
+    void setWInfo(WorkerContactInfo::Ptr const& wInfo_) {
+        std::lock_guard lgI(_infoMtx);
+        if (_wInfo == nullptr) {
+            _wInfo = wInfo_;
+            return;
+        }
+        if (wInfo_ != nullptr) {
+            // This only changes the host and port values of _wInfo.
+            _wInfo->changeBaseInfo(*wInfo_);
+        }
+    }
+
+    WorkerContactInfo::Ptr getWInfo() const {
+        std::lock_guard lgI(_infoMtx);
+        return _wInfo;
+    }
+
     CzarContactInfo::Ptr getCzInfo() const { return _czInfo; }

+    /// &&& doc
+    void addDeadUberJob(QueryId qId, UberJobId ujId, TIMEPOINT tm);
+
     /// &&& doc
     void addDeadUberJobs(QueryId qId, std::vector<UberJobId> ujIds, TIMEPOINT tm);
@@ -192,60 +296,37 @@ class WorkerQueryStatusData {
     void removeDeadUberJobsFor(QueryId qId);

     void setCzarCancelAfterRestart(CzarIdType czId, QueryId lastQId) {
-        std::lock_guard mapLg(_mapMtx);
-        _czarCancelAfterRestart = true;
-        _czarCancelAfterRestartCzId = czId;
-        _czarCancelAfterRestartQId = lastQId;
+        std::lock_guard mapLg(mapMtx);
+        czarCancelAfterRestart = true;
+        czarCancelAfterRestartCzId = czId;
+        czarCancelAfterRestartQId = lastQId;
     }

-    bool isCzarRestart() const { return _czarCancelAfterRestart; }
-    CzarIdType getCzarRestartCzarId() const { return _czarCancelAfterRestartCzId; }
-    QueryId getCzarRestartQueryId() const { return _czarCancelAfterRestartQId; }
-
-    std::string dump() const;
-
-    //&&&private: // &&& Most of this needs to be made private again.
-    WorkerQueryStatusData(WorkerContactInfo::Ptr const& wInfo, CzarContactInfo::Ptr const& czInfo,
-                          std::string const& replicationInstanceId, std::string const& replicationAuthKey)
-            : _wInfo(wInfo),
-              _czInfo(czInfo),
-              _replicationInstanceId(replicationInstanceId),
-              _replicationAuthKey(replicationAuthKey) {}
-
-    std::map<QueryId, TIMEPOINT> _qIdDoneKeepFiles;    ///< &&& doc - limit reached
-    std::map<QueryId, TIMEPOINT> _qIdDoneDeleteFiles;  ///< &&& doc - cancelled/finished
-    std::map<QueryId, std::map<UberJobId, TIMEPOINT>> _qIdDeadUberJobs;  ///< &&& doc
-    std::atomic<bool> _czarCancelAfterRestart = false;
-    CzarIdType _czarCancelAfterRestartCzId = 0;
-    QueryId _czarCancelAfterRestartQId = 0;
-
-    /// Protects _qIdDoneKeepFiles, _qIdDoneDeleteFiles, _qIdDeadUberJobs,
-    /// and czarCancelAfter variables.
-    mutable std::mutex _mapMtx;
-
-    WorkerContactInfo::Ptr _wInfo;  ///< &&& doc make const???
-    CzarContactInfo::Ptr _czInfo;   //< &&& doc make const???
-
-    std::string const _replicationInstanceId;  ///< &&& doc
-    std::string const _replicationAuthKey;     ///< &&& doc
+    bool isCzarRestart() const { return czarCancelAfterRestart; }
+    CzarIdType getCzarRestartCzarId() const { return czarCancelAfterRestartCzId; }
+    QueryId getCzarRestartQueryId() const { return czarCancelAfterRestartQId; }

     /// Create a json object held by a shared pointer to use as a message.
     /// Old objects in this instance will be removed after being added to the
     /// json message.
     std::shared_ptr<nlohmann::json> serializeJson(double maxLifetime);

-    /// Add contents of qIdDoneKeepFiles, _qIdDoneDeleteFiles, and _qIdDeadUberJobs to `jsWR`
-    void addListsToJson(nlohmann::json& jsWR, TIMEPOINT tm, double maxLifetime);
+    /// Add contents of qIdDoneKeepFiles, _qIdDoneDeleteFiles, and _qIdDeadUberJobs to `jsWR`,
+    /// and remove map elements that have an age (tmMark - element.touchTime) greater
+    /// than maxLifetime.
+    void addListsToJson(nlohmann::json& jsWR, TIMEPOINT tmMark, double maxLifetime);

     /// &&& doc
     /// @throws std::invalid_argument
     void parseLists(nlohmann::json const& jsWR, TIMEPOINT updateTm);

     /// &&& doc
-    nlohmann::json serializeResponseJson();
+    //&&&nlohmann::json serializeResponseJson();
+    nlohmann::json serializeResponseJson(uint64_t workerStartupTime);

     /// &&& doc
-    bool handleResponseJson(nlohmann::json const& jsResp);
+    //&&&bool handleResponseJson(nlohmann::json const& jsResp);
+    std::pair<bool, bool> handleResponseJson(nlohmann::json const& jsResp);

     /// &&& doc
     ///&&&void handleCzarRestart();
@@ -255,6 +336,38 @@ class WorkerQueryStatusData {
                                std::map<QueryId, TIMEPOINT>& doneKeepF,
                                std::map<QueryId, TIMEPOINT>& doneDeleteF,
                                std::map<QueryId, std::map<UberJobId, TIMEPOINT>>& deadUberJobs);
+
+    std::string dump() const;
+
+    // Making these private requires writing member functions, which currently
+    // causes issues with linking. All of the workarounds are ugly.
+    std::map<QueryId, TIMEPOINT> qIdDoneKeepFiles;    ///< &&& doc - limit reached
+    std::map<QueryId, TIMEPOINT> qIdDoneDeleteFiles;  ///< &&& doc - cancelled/finished
+    std::map<QueryId, std::map<UberJobId, TIMEPOINT>> qIdDeadUberJobs;  ///< &&& doc
+    std::atomic<bool> czarCancelAfterRestart = false;
+    CzarIdType czarCancelAfterRestartCzId = 0;
+    QueryId czarCancelAfterRestartQId = 0;
+
+    /// Protects _qIdDoneKeepFiles, _qIdDoneDeleteFiles, _qIdDeadUberJobs,
+    /// and czarCancelAfter variables.
+ mutable std::mutex mapMtx; + +private: + WorkerQueryStatusData(WorkerContactInfo::Ptr const& wInfo_, CzarContactInfo::Ptr const& czInfo_, + std::string const& replicationInstanceId_, std::string const& replicationAuthKey_) + : _wInfo(wInfo_), + _czInfo(czInfo_), + _replicationInstanceId(replicationInstanceId_), + _replicationAuthKey(replicationAuthKey_) {} + + WorkerContactInfo::Ptr _wInfo; ///< &&& doc + CzarContactInfo::Ptr const _czInfo; //< &&& doc + mutable std::mutex _infoMtx; ///< protects wInfo + + std::string const _replicationInstanceId; ///< &&& doc + std::string const _replicationAuthKey; ///< &&& doc + + /// _infoMtx must be locked before calling. + std::string _dump() const; }; } // namespace lsst::qserv::http diff --git a/src/http/testStatusData.cc b/src/http/testStatusData.cc index 191053631..d9f537711 100644 --- a/src/http/testStatusData.cc +++ b/src/http/testStatusData.cc @@ -47,34 +47,42 @@ BOOST_AUTO_TEST_CASE(CzarContactInfo) { string const replicationInstanceId = "repliInstId"; string const replicationAuthKey = "repliIAuthKey"; + uint64_t cxrStartTime = lsst::qserv::millisecSinceEpoch(lsst::qserv::CLOCK::now() - 5s); + uint64_t wkrStartTime = lsst::qserv::millisecSinceEpoch(lsst::qserv::CLOCK::now() - 10s); + string const czrName("czar_name"); lsst::qserv::CzarIdType const czrId = 32; int czrPort = 2022; string const czrHost("cz_host"); - auto czarA = lsst::qserv::http::CzarContactInfo::create(czrName, czrId, czrPort, czrHost); + //&&&auto czarA = lsst::qserv::http::CzarContactInfo::create(czrName, czrId, czrPort, czrHost); + auto czarA = lsst::qserv::http::CzarContactInfo::create(czrName, czrId, czrPort, czrHost, cxrStartTime); LOGS_ERROR("&&& a czarA=" << czarA->dump()); auto czarAJs = czarA->serializeJson(); LOGS_ERROR("&&& b czarAJs=" << czarAJs); - auto czarB = lsst::qserv::http::CzarContactInfo::createJson(czarAJs); + auto czarB = lsst::qserv::http::CzarContactInfo::createFromJson(czarAJs); LOGS_ERROR("&&& c czarB=" << czarB); BOOST_REQUIRE(czarA->compare(*czarB)); - auto czarC = lsst::qserv::http::CzarContactInfo::create("different", czrId, czrPort, czrHost); + //&&&auto czarC = lsst::qserv::http::CzarContactInfo::create("different", czrId, czrPort, czrHost); + auto czarC = + lsst::qserv::http::CzarContactInfo::create("different", czrId, czrPort, czrHost, cxrStartTime); BOOST_REQUIRE(!czarA->compare(*czarC)); auto start = lsst::qserv::CLOCK::now(); auto workerA = WorkerContactInfo::create("sd_workerA", "host_w1", "mgmhost_a", 3421, start); + auto workerB = WorkerContactInfo::create("sd_workerB", "host_w2", "mgmhost_a", 3421, start); auto workerC = WorkerContactInfo::create("sd_workerC", "host_w3", "mgmhost_b", 3422, start); + LOGS_ERROR("&&& d workerA=" << workerA->dump()); auto jsWorkerA = workerA->serializeJson(); LOGS_ERROR("&&& e jsWorkerA=" << jsWorkerA); auto start1Sec = start + 1s; - auto workerA1 = WorkerContactInfo::createJson(jsWorkerA, start1Sec); + auto workerA1 = WorkerContactInfo::createFromJsonWorker(jsWorkerA, start1Sec); LOGS_ERROR("&&& f workerA1=" << workerA1->dump()); BOOST_REQUIRE(workerA->isSameContactInfo(*workerA1)); @@ -90,16 +98,19 @@ BOOST_AUTO_TEST_CASE(CzarContactInfo) { LOGS_ERROR("&&& h jsDataA=" << *jsDataA); // Check that empty lists work. 
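    // Round-trip pattern used throughout this test: serialize -> createFromJson ->
    // serialize again, then compare the two json objects; any field that is written
    // but not read back (or vice versa) breaks the equality check.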
- auto wqsdA1 = lsst::qserv::http::WorkerQueryStatusData::createJson(*jsDataA, replicationInstanceId, - replicationAuthKey, start1Sec); + auto wqsdA1 = lsst::qserv::http::WorkerQueryStatusData::createFromJson(*jsDataA, replicationInstanceId, + replicationAuthKey, start1Sec); LOGS_ERROR("&&& i wqsdA1=" << wqsdA1->dump()); + LOGS_ERROR("&&& i wqsdA=" << wqsdA->dump()); auto jsDataA1 = wqsdA1->serializeJson(maxLifetime); + LOGS_ERROR("&&& i jsDataA1=" << *jsDataA1); + LOGS_ERROR("&&& i jsDataA=" << *jsDataA); BOOST_REQUIRE(*jsDataA == *jsDataA1); vector qIdsDelFiles = {7, 8, 9, 15, 25, 26, 27, 30}; vector qIdsKeepFiles = {1, 2, 3, 4, 6, 10, 13, 19, 33}; for (auto const qIdDF : qIdsDelFiles) { - wqsdA->_qIdDoneDeleteFiles[qIdDF] = start; + wqsdA->qIdDoneDeleteFiles[qIdDF] = start; } jsDataA = wqsdA->serializeJson(maxLifetime); @@ -107,7 +118,7 @@ BOOST_AUTO_TEST_CASE(CzarContactInfo) { BOOST_REQUIRE(*jsDataA != *jsDataA1); for (auto const qIdKF : qIdsKeepFiles) { - wqsdA->_qIdDoneKeepFiles[qIdKF] = start; + wqsdA->qIdDoneKeepFiles[qIdKF] = start; } wqsdA->addDeadUberJobs(12, {1, 3}, start); @@ -118,7 +129,7 @@ BOOST_AUTO_TEST_CASE(CzarContactInfo) { LOGS_ERROR("&&& j jsDataA=" << *jsDataA); auto start5Sec = start + 5s; - auto workerAFromJson = lsst::qserv::http::WorkerQueryStatusData::createJson( + auto workerAFromJson = lsst::qserv::http::WorkerQueryStatusData::createFromJson( *jsDataA, replicationInstanceId, replicationAuthKey, start5Sec); auto jsWorkerAFromJson = workerAFromJson->serializeJson(maxLifetime); BOOST_REQUIRE(*jsDataA == *jsWorkerAFromJson); @@ -131,25 +142,28 @@ BOOST_AUTO_TEST_CASE(CzarContactInfo) { LOGS_ERROR("&&& k jsDataA=" << *jsDataA); BOOST_REQUIRE(*jsDataA != *jsWorkerAFromJson); - workerAFromJson = lsst::qserv::http::WorkerQueryStatusData::createJson(*jsDataA, replicationInstanceId, - replicationAuthKey, start5Sec); + workerAFromJson = lsst::qserv::http::WorkerQueryStatusData::createFromJson( + *jsDataA, replicationInstanceId, replicationAuthKey, start5Sec); jsWorkerAFromJson = workerAFromJson->serializeJson(maxLifetime); LOGS_ERROR("&&& l jsWorkerAFromJson=" << *jsWorkerAFromJson); BOOST_REQUIRE(*jsDataA == *jsWorkerAFromJson); // Make the response, which contains lists of the items handled by the workers. - auto jsWorkerResp = workerAFromJson->serializeResponseJson(); + auto jsWorkerResp = workerAFromJson->serializeResponseJson(wkrStartTime); // test removal of elements after response. 
- BOOST_REQUIRE(!wqsdA->_qIdDoneDeleteFiles.empty()); - BOOST_REQUIRE(!wqsdA->_qIdDoneKeepFiles.empty()); - BOOST_REQUIRE(!wqsdA->_qIdDeadUberJobs.empty()); + BOOST_REQUIRE(!wqsdA->qIdDoneDeleteFiles.empty()); + BOOST_REQUIRE(!wqsdA->qIdDoneKeepFiles.empty()); + BOOST_REQUIRE(!wqsdA->qIdDeadUberJobs.empty()); wqsdA->handleResponseJson(jsWorkerResp); + auto [respSuccess, workerRestarted] = wqsdA->handleResponseJson(jsWorkerResp); + BOOST_REQUIRE(respSuccess == true); + BOOST_REQUIRE(workerRestarted == false); - BOOST_REQUIRE(wqsdA->_qIdDoneDeleteFiles.empty()); - BOOST_REQUIRE(wqsdA->_qIdDoneKeepFiles.empty()); - BOOST_REQUIRE(wqsdA->_qIdDeadUberJobs.empty()); + BOOST_REQUIRE(wqsdA->qIdDoneDeleteFiles.empty()); + BOOST_REQUIRE(wqsdA->qIdDoneKeepFiles.empty()); + BOOST_REQUIRE(wqsdA->qIdDeadUberJobs.empty()); } BOOST_AUTO_TEST_SUITE_END() diff --git a/src/proto/worker.proto b/src/proto/worker.proto index 0310420ed..e856a11dc 100644 --- a/src/proto/worker.proto +++ b/src/proto/worker.proto @@ -67,36 +67,3 @@ message ResponseData { required uint32 rowcount = 2; required uint64 transmitsize = 3; } - -///////////////////////////////////////////////////////////////// -// Protocol definition for the query management requests. These -// requests do not require any response messages to be explicitly -// sent by workers. -// -// ATTENTION: each message sent to a worker must be preceeded by -// an int32 size (network-byte-ordered) word carrying a size -// of the message. -//////////////////////////////////////////////////////////////// - -// &&& try to eliminate this -// The completion status to be sent back with responses to the query management requests. -message WorkerCommandStatus { - enum Code { - SUCCESS = 1; // The successful completion of a request. - ERROR = 2; // An error occurred during request execution. - } - optional Code code = 3 [default = SUCCESS]; - optional string error = 2 [default = ""]; // Optional error message (depends on the code) -} - -// &&&QM try to eliminate this -message QueryManagement { - enum Operation { - CANCEL_AFTER_RESTART = 1; // Cancel older queries before the specified query (excluding that one). - CANCEL = 2; // Cancel a specific query. - COMPLETE = 3; // Notify workers on the completion of the specified query. - } - required Operation op = 1; - required uint64 czar_id = 3; - required uint64 query_id = 2; -} diff --git a/src/qana/CMakeLists.txt b/src/qana/CMakeLists.txt index c9df3d8ad..0a9a320e0 100644 --- a/src/qana/CMakeLists.txt +++ b/src/qana/CMakeLists.txt @@ -36,7 +36,6 @@ FUNCTION(qana_tests) qserv_css qserv_meta rproc - xrdreq Boost::unit_test_framework Threads::Threads ) diff --git a/src/qdisp/CMakeLists.txt b/src/qdisp/CMakeLists.txt index 38daf54c1..2bc919dd4 100644 --- a/src/qdisp/CMakeLists.txt +++ b/src/qdisp/CMakeLists.txt @@ -40,7 +40,6 @@ target_link_libraries(testQDisp qserv_meta query rproc - xrdreq Boost::unit_test_framework Threads::Threads ) diff --git a/src/qdisp/Executive.cc b/src/qdisp/Executive.cc index 3d5463a88..b9a48145c 100644 --- a/src/qdisp/Executive.cc +++ b/src/qdisp/Executive.cc @@ -294,6 +294,27 @@ void Executive::addUberJobs(std::vector> const& uJobsTo } } +void Executive::killIncompleteUberJobsOn(std::string const& restartedWorkerId) { + // Work with a copy to reduce lock time. 
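+    // Only UberJobs that have not reached qmeta::JobStatus::COMPLETE are killed
+    // below; completed UberJobs have already delivered their results and their
+    // jobs must not be re-assigned.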
+    std::map<UberJobId, std::shared_ptr<UberJob>> ujobsMap;
+    {
+        lock_guard lck(_uberJobsMapMtx);
+        ujobsMap = _uberJobsMap;
+    }
+    for (auto&& [ujKey, uj] : ujobsMap) {
+        if (uj == nullptr) continue;
+        auto wContactInfo = uj->getWorkerContactInfo();
+        if (wContactInfo->wId == restartedWorkerId) {
+            if (uj->getStatus()->getState() != qmeta::JobStatus::COMPLETE) {
+                // All jobs in the uberjob will be set as unassigned, which
+                // will lead to Czar::_monitor() reassigning them to new
+                // UberJobs. (Unless this query was cancelled.)
+                uj->killUberJob();
+            }
+        }
+    }
+}
+
 string Executive::dumpUberJobCounts() const {
     stringstream os;
     os << "exec=" << getIdStr();
diff --git a/src/qdisp/Executive.h b/src/qdisp/Executive.h
index cdbb967f6..b954c3313 100644
--- a/src/qdisp/Executive.h
+++ b/src/qdisp/Executive.h
@@ -163,6 +163,7 @@ class Executive : public std::enable_shared_from_this<Executive> {
     std::string const& getIdStr() const { return _idStr; }

     void setScanInteractive(bool interactive) { _scanInteractive = interactive; }
+    bool getScanInteractive() const { return _scanInteractive; }

     /// @return number of jobs in flight.
     int getNumInflight() const;
@@ -223,6 +224,9 @@ class Executive : public std::enable_shared_from_this<Executive> {
     /// @param deleteResults - If true, delete all result files for this query on the workers.
     void sendWorkerCancelMsg(bool deleteResults);

+    /// &&& doc
+    void killIncompleteUberJobsOn(std::string const& restartedWorkerId);
+
 private:
     Executive(ExecutiveConfig const& c, std::shared_ptr const& ms,
               SharedResources::Ptr const& sharedResources, std::shared_ptr const& qStatus,
@@ -292,7 +296,8 @@ class Executive : public std::enable_shared_from_this<Executive> {
     std::chrono::seconds _secondsBetweenQMetaUpdates{60};
     std::mutex _lastQMetaMtx;  ///< protects _lastQMetaUpdate.

-    bool _scanInteractive = false;  ///< true for interactive scans.
+    /// true for interactive scans, once set it doesn't change.
+    bool _scanInteractive = false;

     // Add a job to the _chunkToJobMap
     // TODO:UJ This may need review as large changes were made to this part of the code.
@@ -330,7 +335,7 @@ class Executive : public std::enable_shared_from_this<Executive> {
     std::atomic<bool> _readyToExecute{false};
 };

-/// TODO:UJ delete - MarkCompleteFunc is not needed with uberjobs.
+/// TODO:UJ delete - MarkCompleteFunc is not needed with uberjobs. //&&&QM
 class MarkCompleteFunc {
 public:
     typedef std::shared_ptr<MarkCompleteFunc> Ptr;
diff --git a/src/qdisp/JobBase.h b/src/qdisp/JobBase.h
index a77476daa..88ac1fa98 100644
--- a/src/qdisp/JobBase.h
+++ b/src/qdisp/JobBase.h
@@ -60,7 +60,7 @@ class JobBase : public std::enable_shared_from_this<JobBase> {
     virtual std::string const& getIdStr() const = 0;
     virtual std::shared_ptr<QdispPool> getQdispPool() = 0;
     //&&& virtual std::string const& getPayload() const = 0;  ///< const& in return type is essential for
-    //xrootd
+    // xrootd
     virtual std::shared_ptr<ResponseHandler> getRespHandler() = 0;
     virtual std::shared_ptr<qmeta::JobStatus> getStatus() = 0;
     virtual bool getScanInteractive() const = 0;
diff --git a/src/qdisp/UberJob.cc b/src/qdisp/UberJob.cc
index 9440380a5..7da2079b0 100644
--- a/src/qdisp/UberJob.cc
+++ b/src/qdisp/UberJob.cc
@@ -31,6 +31,7 @@
 #include "nlohmann/json.hpp"

 // Qserv headers
+#include "czar/Czar.h"
 #include "cconfig/CzarConfig.h"
 #include "global/LogContext.h"
 #include "http/Client.h"
@@ -106,14 +107,15 @@ bool UberJob::runUberJob() {
     // Send the uberjob to the worker
     auto const method = http::Method::POST;
-    string const url = "http://" + _wContactInfo->wHost + ":" + to_string(_wContactInfo->wPort) + "/queryjob";
+    auto [ciwId, ciwHost, ciwManagement, ciwPort] = _wContactInfo->getAll();
+    string const url = "http://" + ciwHost + ":" + to_string(ciwPort) + "/queryjob";
     vector<string> const headers = {"Content-Type: application/json"};
     auto const& czarConfig = cconfig::CzarConfig::instance();
     // See xrdsvc::httpWorkerCzarModule::_handleQueryJob for json message parsing.
     json request = {{"version", http::MetaModule::version},
                     {"instance_id", czarConfig->replicationInstanceId()},
                     {"auth_key", czarConfig->replicationAuthKey()},
-                    {"worker", _wContactInfo->wId},
+                    {"worker", ciwId},
                     {"czar",
                      {{"name", czarConfig->name()},
                       {"id", czarConfig->id()},
@@ -212,6 +214,15 @@ bool UberJob::isQueryCancelled() {
     return exec->getCancelled();
 }

+bool UberJob::getScanInteractive() const {
+    auto exec = _executive.lock();
+    if (exec == nullptr) {
+        LOGS(_log, LOG_LVL_WARN, cName(__func__) << " _executive == nullptr");
+        return false;  // Safer to assume the worst.
+    }
+    return exec->getScanInteractive();
+}
+
 bool UberJob::_setStatusIfOk(qmeta::JobStatus::State newState, string const& msg) {
     // must be locked _jobsMtx
     auto currentState = _jobStatus->getState();
@@ -454,6 +465,37 @@ nlohmann::json UberJob::_workerErrorFinish(bool deleteData, std::string const& e
     return jsRet;
 }

+void UberJob::killUberJob() {
+    LOGS(_log, LOG_LVL_WARN, cName(__func__) << " stopping this UberJob and re-assigning jobs.");
+
+    auto exec = _executive.lock();
+    if (exec == nullptr || isQueryCancelled()) {
+        LOGS(_log, LOG_LVL_WARN, cName(__func__) << " no executive or cancelled");
+        return;
+    }
+
+    if (exec->isLimitRowComplete()) {
+        int dataIgnored = exec->incrDataIgnoredCount();
+        if ((dataIgnored - 1) % 1000 == 0) {
+            LOGS(_log, LOG_LVL_INFO, cName(__func__) << " ignoring, enough rows already.");
+        }
+        return;
+    }
+
+    // Put this UberJob on the list of UberJobs that the worker should drop.
+    auto activeWorkerMap = czar::Czar::getCzar()->getActiveWorkerMap();
+    auto activeWorker = activeWorkerMap->getActiveWorker(_wContactInfo->wId);
+    if (activeWorker != nullptr) {
+        activeWorker->addDeadUberJob(_queryId, _uberJobId);
+    }
+
+    _unassignJobs();
+    // Let Czar::_monitor reassign jobs - other UberJobs are probably being killed
+    // so waiting probably gets a better distribution. If this is deemed too slow,
+    // then exec->assignJobsToUberJobs() could be called here.
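+    // Between _unassignJobs() above and the next Czar::_monitor() pass, the
+    // affected jobs simply sit unassigned; the monitor folds them into fresh
+    // UberJobs on workers that are still alive (see killIncompleteUberJobsOn).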
+    return;
+}
+
 std::ostream& UberJob::dumpOS(std::ostream& os) const {
     os << "(jobs sz=" << _jobs.size() << "(";
     lock_guard lockJobsMtx(_jobsMtx);
diff --git a/src/qdisp/UberJob.h b/src/qdisp/UberJob.h
index b29baf266..6dc56f0a5 100644
--- a/src/qdisp/UberJob.h
+++ b/src/qdisp/UberJob.h
@@ -57,10 +57,13 @@ class UberJob : public JobBase {

     virtual ~UberJob() {};

+    std::string cName(const char* funcN) const { return std::string("UberJob::") + funcN + " " + getIdStr(); }
+
     bool addJob(std::shared_ptr<JobQuery> const& job);
     bool runUberJob();

-    std::string cName(const char* funcN) const { return std::string("UberJob::") + funcN + " " + getIdStr(); }
+    /// &&& doc
+    void killUberJob();

     QueryId getQueryId() const override { return _queryId; }
     UberJobId getJobId() const override {
@@ -68,13 +71,12 @@ class UberJob : public JobBase {
     }  // TODO:UJ change name when JobBase no longer needed.
     std::string const& getIdStr() const override { return _idStr; }
     std::shared_ptr<QdispPool> getQdispPool() override { return _qdispPool; }
-    //&&&std::string const& getPayload() const override { return _payload; }  // TODO:UJ delete when possible.
     std::shared_ptr<ResponseHandler> getRespHandler() override { return _respHandler; }
     std::shared_ptr<qmeta::JobStatus> getStatus() override {
         return _jobStatus;
-    }  // TODO:UJ relocate to JobBase
-    bool getScanInteractive() const override { return false; }  ///< UberJobs are never interactive.
-    bool isQueryCancelled() override;  // TODO:UJ relocate to JobBase
+    }                                          // TODO:UJ relocate to JobBase
+    bool getScanInteractive() const override;  ///< probably not called TODO:UJ
+    bool isQueryCancelled() override;          // TODO:UJ relocate to JobBase
     void callMarkCompleteFunc(bool success) override;  ///< call markComplete for all jobs in this UberJob.
     std::shared_ptr<Executive> getExecutive() override { return _executive.lock(); }

@@ -95,11 +97,12 @@ class UberJob : public JobBase {

     /// Set the worker information needed to send messages to the worker believed to
     /// be responsible for the chunks handled in this UberJob.
-    void setWorkerContactInfo(
-        http::WorkerContactInfo::Ptr const& wContactInfo) {  // Change to ActiveWorker &&& ???
+    void setWorkerContactInfo(http::WorkerContactInfo::Ptr const& wContactInfo) {
         _wContactInfo = wContactInfo;
     }

+    http::WorkerContactInfo::Ptr getWorkerContactInfo() { return _wContactInfo; }
+
     /// Get the data for the worker that should handle this UberJob.
czar::CzarChunkMap::WorkerChunksData::Ptr getWorkerData() { return _workerData; } diff --git a/src/qdisp/testQDisp.cc b/src/qdisp/testQDisp.cc index ce3a4069d..22934d587 100644 --- a/src/qdisp/testQDisp.cc +++ b/src/qdisp/testQDisp.cc @@ -338,7 +338,7 @@ BOOST_AUTO_TEST_CASE(ExecutiveCancel) { LOGS_DEBUG("ExecutiveCancel: squash it test"); SetupTest tEnv("respdata"); //&&&qdisp::XrdSsiServiceMock::setGo(false); // Can't let jobs run or they are untracked before - //squash + // squash SequentialInt sequence(0); tEnv.jqTest = executiveTest(tEnv.ex, sequence, chunkId, tEnv.qrMsg, 1); tEnv.ex->squash(); @@ -357,7 +357,7 @@ BOOST_AUTO_TEST_CASE(ExecutiveCancel) { LOGS_DEBUG("ExecutiveCancel: squash 20 test"); SetupTest tEnv("respdata"); //&&&qdisp::XrdSsiServiceMock::setGo(false); // Can't let jobs run or they are untracked before - //squash + // squash SequentialInt sequence(0); executiveTest(tEnv.ex, sequence, chunkId, tEnv.qrMsg, 20); tEnv.ex->squash(); diff --git a/src/qproc/CMakeLists.txt b/src/qproc/CMakeLists.txt index a27ad4db9..db311c4ab 100644 --- a/src/qproc/CMakeLists.txt +++ b/src/qproc/CMakeLists.txt @@ -32,7 +32,6 @@ FUNCTION(qproc_tests) qserv_css qserv_meta rproc - xrdreq Boost::unit_test_framework Threads::Threads ) diff --git a/src/query/CMakeLists.txt b/src/query/CMakeLists.txt index 6fcfbbb33..196a47efb 100644 --- a/src/query/CMakeLists.txt +++ b/src/query/CMakeLists.txt @@ -60,7 +60,6 @@ FUNCTION(query_tests) qserv_meta query rproc - xrdreq Boost::unit_test_framework Threads::Threads ) diff --git a/src/replica/CMakeLists.txt b/src/replica/CMakeLists.txt index 776c7bff9..1b994290b 100644 --- a/src/replica/CMakeLists.txt +++ b/src/replica/CMakeLists.txt @@ -25,7 +25,6 @@ target_link_libraries(replica PUBLIC replica_util replica_worker qserv_css - xrdreq xrdsvc XrdCl XrdSsiLib diff --git a/src/rproc/CMakeLists.txt b/src/rproc/CMakeLists.txt index 4c96284c9..13705b9ff 100644 --- a/src/rproc/CMakeLists.txt +++ b/src/rproc/CMakeLists.txt @@ -30,7 +30,6 @@ FUNCTION(rproc_tests) qserv_css qserv_meta rproc - xrdreq Boost::unit_test_framework Threads::Threads ) diff --git a/src/wbase/CMakeLists.txt b/src/wbase/CMakeLists.txt index ae1fd984a..205ebad5b 100644 --- a/src/wbase/CMakeLists.txt +++ b/src/wbase/CMakeLists.txt @@ -8,7 +8,6 @@ target_sources(wbase PRIVATE Task.cc UberJobData.cc UserQueryInfo.cc - WorkerCommand.cc ) target_include_directories(wbase PRIVATE diff --git a/src/wbase/FileChannelShared.cc b/src/wbase/FileChannelShared.cc index cf8c06fc3..a7ede98fd 100644 --- a/src/wbase/FileChannelShared.cc +++ b/src/wbase/FileChannelShared.cc @@ -46,7 +46,6 @@ #include "util/ResultFileNameParser.h" #include "util/Timer.h" #include "util/TimeUtils.h" -#include "xrdsvc/StreamBuffer.h" // LSST headers #include "lsst/log/Log.h" @@ -608,6 +607,7 @@ bool FileChannelShared::_sendResponse(lock_guard const& tMtxLock, shared_ // Prepare the response object and serialize in into a message that will // be sent to Czar. 
if (!_useHttp) { +#if 0 //&&& proto::ResponseSummary response; response.set_wname(_workerId); response.set_queryid(queryId); @@ -653,6 +653,7 @@ bool FileChannelShared::_sendResponse(lock_guard const& tMtxLock, shared_ _kill(streamMutexLock, "sendData"); return false; } +#endif //&&& } else { string httpFileUrl = task->resultFileHttpUrl(); _uberJobData->responseFileReady(httpFileUrl, _rowcount, _transmitsize, _headerCount); diff --git a/src/wbase/MsgProcessor.h b/src/wbase/MsgProcessor.h deleted file mode 100644 index 4f875f93e..000000000 --- a/src/wbase/MsgProcessor.h +++ /dev/null @@ -1,66 +0,0 @@ - -// -*- LSST-C++ -*- -/* - * LSST Data Management System - * Copyright 2011-2016 LSST Corporation. - * - * This product includes software developed by the - * LSST Project (http://www.lsst.org/). - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the LSST License Statement and - * the GNU General Public License along with this program. If not, - * see . - */ -/// MsgProcessor.h -#ifndef LSST_QSERV_WBASE_MSG_PROCESSOR_H -#define LSST_QSERV_WBASE_MSG_PROCESSOR_H - -// System headers -#include -#include - -// Third party headers -#include "nlohmann/json.hpp" - -// Forward declarations -namespace lsst::qserv::wbase { -class Task; -struct TaskSelector; -class WorkerCommand; -} // namespace lsst::qserv::wbase - -namespace lsst::qserv::wbase { - -/// MsgProcessor implementations handle incoming Task objects. -struct MsgProcessor { // &&& delete file if possible - virtual ~MsgProcessor() {} - - /// Process a group of query processing tasks. - virtual void processTasks(std::vector> const& tasks) = 0; // &&& delete - - /// Process a managememt command - virtual void processCommand( - std::shared_ptr const& command) = 0; // &&& can this be deleted - - /** - * Retreive the status of queries being processed by the worker. - * @param taskSelector Task selection criterias. - * @return a JSON representation of the object's status for the monitoring - */ - virtual nlohmann::json statusToJson( - wbase::TaskSelector const& taskSelector) = 0; // &&& can this be deleted -}; - -} // namespace lsst::qserv::wbase - -#endif // LSST_QSERV_WBASE_MSG_PROCESSOR_H diff --git a/src/wbase/SendChannel.cc b/src/wbase/SendChannel.cc index 21e459ee8..c07dd37f6 100644 --- a/src/wbase/SendChannel.cc +++ b/src/wbase/SendChannel.cc @@ -41,7 +41,6 @@ #include "global/LogContext.h" #include "util/common.h" #include "util/Timer.h" -#include "xrdsvc/SsiRequest.h" namespace { LOG_LOGGER _log = LOG_GET("lsst.qserv.wbase.SendChannel"); @@ -61,19 +60,9 @@ class NopChannel : public SendChannel { cout << "NopChannel send(" << (void*)buf << ", " << bufLen << ");\n"; return !isDead(); } - - bool sendError(string const& msg, int code) override { - if (kill("NopChannel")) return false; - cout << "NopChannel sendError(\"" << msg << "\", " << code << ");\n"; - return true; - } - bool sendStream(xrdsvc::StreamBuffer::Ptr const& sBuf, bool last) override { - cout << "NopChannel sendStream(" << (void*)sBuf.get() << ", " << (last ? 
"true" : "false") << ");\n"; - return !isDead(); - } }; -SendChannel::Ptr SendChannel::newNopChannel() { return make_shared(); } +SendChannel::Ptr SendChannel::newNopChannel() { return std::shared_ptr(new NopChannel()); } /// StringChannel is an almost-trivial implementation of a SendChannel that /// remembers what it has received. @@ -87,46 +76,12 @@ class StringChannel : public SendChannel { return true; } - bool sendError(string const& msg, int code) override { - if (kill("StringChannel")) return false; - ostringstream os; - os << "(" << code << "," << msg << ")"; - _dest.append(os.str()); - return true; - } - - bool sendStream(xrdsvc::StreamBuffer::Ptr const& sBuf, bool last) override { - if (isDead()) return false; - char const* buf = sBuf->data; - size_t bufLen = sBuf->getSize(); - _dest.append(buf, bufLen); - cout << "StringChannel sendStream(" << (void*)buf << ", " << bufLen << ", " - << (last ? "true" : "false") << ");\n"; - return true; - } - private: string& _dest; }; -SendChannel::Ptr SendChannel::newStringChannel(string& d) { return make_shared(d); } - -/// This is the standard definition of SendChannel which actually does something! -/// We vector responses posted to SendChannel via the tightly bound SsiRequest -/// object as this object knows how to effect Ssi responses. -/// -bool SendChannel::send(char const* buf, int bufLen) { - if (isDead()) return false; - if (_ssiRequest->reply(buf, bufLen)) return true; - kill("SendChannel::send"); - return false; -} - -bool SendChannel::sendError(string const& msg, int code) { - // Kill this send channel. If it wasn't already dead, send the error. - if (kill("SendChannel::sendError")) return false; - if (_ssiRequest->replyError(msg.c_str(), code)) return true; - return false; +SendChannel::Ptr SendChannel::newStringChannel(string& d) { + return std::shared_ptr(new StringChannel(d)); } bool SendChannel::kill(std::string const& note) { @@ -139,36 +94,7 @@ bool SendChannel::kill(std::string const& note) { bool SendChannel::isDead() { if (_dead) return true; - if (_ssiRequest == nullptr) return true; - if (_ssiRequest->isFinished()) kill("SendChannel::isDead"); return _dead; } -bool SendChannel::sendStream(xrdsvc::StreamBuffer::Ptr const& sBuf, bool last) { - if (isDead()) return false; - if (_ssiRequest->replyStream(sBuf, last)) return true; - LOGS(_log, LOG_LVL_ERROR, "_ssiRequest->replyStream failed, killing."); - kill("SendChannel::sendStream"); - return false; -} - -bool SendChannel::sendData(char const* buf, int bufLen) { - if (isDead()) return false; - if (_ssiRequest->reply(buf, bufLen)) return true; - LOGS(_log, LOG_LVL_ERROR, "_ssiRequest->reply failed, killing."); - kill("SendChannel::sendData"); - return false; -} - -bool SendChannel::setMetadata(const char* buf, int blen) { - if (isDead()) return false; - if (_ssiRequest->sendMetadata(buf, blen)) return true; - return false; -} - -uint64_t SendChannel::getSeq() const { - if (_ssiRequest == nullptr) return 0; - return _ssiRequest->getSeq(); -} - } // namespace lsst::qserv::wbase diff --git a/src/wbase/SendChannel.h b/src/wbase/SendChannel.h index 0753e0aef..de4724955 100644 --- a/src/wbase/SendChannel.h +++ b/src/wbase/SendChannel.h @@ -23,18 +23,12 @@ #define LSST_QSERV_WBASE_SENDCHANNEL_H // System headers +#include #include #include #include -// Qserv headers -#include "xrdsvc/StreamBuffer.h" - -namespace lsst::qserv { -namespace xrdsvc { -class SsiRequest; // Forward declaration -} -namespace wbase { +namespace lsst::qserv { namespace wbase { /// SendChannel objects 
abstract an byte-output mechanism. Provides a layer of /// abstraction to reduce coupling to the XrdSsi API. SendChannel generally @@ -44,35 +38,13 @@ class SendChannel { using Ptr = std::shared_ptr; using Size = long long; - SendChannel(std::shared_ptr const& s) : _ssiRequest(s) {} SendChannel() {} // Strictly for non-Request versions of this object. virtual ~SendChannel() {} - /// ****************************************************************** /// The following methods are used to send responses back to a request. - /// The "send" calls may vector the response via the tightly bound - /// SsiRequest object (the constructor default) or use some other - /// mechanism (see newNopChannel and newStringChannel). - /// - virtual bool send(char const* buf, int bufLen); - virtual bool sendError(std::string const& msg, int code); - - /// Send a bucket of bytes. - /// @param last true if no more sendStream calls will be invoked. - virtual bool sendStream(xrdsvc::StreamBuffer::Ptr const& sBuf, bool last); - - /// Send the data. - virtual bool sendData(char const* buf, int bufLen); - - /// - /// ****************************************************************** - - /// Set a function to be called when a resources from a deferred send* - /// operation may be released. This allows a caller to be - /// notified when the file descriptor may be closed and perhaps reclaimed. - void setReleaseFunc(std::function const& r) { _release = r; } - void release() { _release(); } + /// (see newNopChannel and newStringChannel). + virtual bool send(char const* buf, int bufLen) = 0; //&&& delete /// Construct a new NopChannel that ignores everything it is asked to send static SendChannel::Ptr newNopChannel(); @@ -81,10 +53,6 @@ class SendChannel { /// provided by reference at construction. static SendChannel::Ptr newStringChannel(std::string& dest); - /// @return true if metadata was set. - /// buff must remain valid until the transmit is complete. - bool setMetadata(const char* buf, int blen); - /// Kill this SendChannel /// @ return the previous value of _dead bool kill(std::string const& note); @@ -95,17 +63,10 @@ class SendChannel { /// Set just before destorying this object to prevent pointless error messages. void setDestroying() { _destroying = true; } - uint64_t getSeq() const; - -protected: - std::function _release = []() { ; }; ///< Function to release resources. - private: - std::shared_ptr _ssiRequest; std::atomic _dead{false}; ///< True if there were any failures using this SendChanel. std::atomic _destroying{false}; }; -} // namespace wbase -} // namespace lsst::qserv +}} // namespace lsst::qserv::wbase #endif // LSST_QSERV_WBASE_SENDCHANNEL_H diff --git a/src/wbase/Task.cc b/src/wbase/Task.cc index b3c4f8818..6acfd97f6 100644 --- a/src/wbase/Task.cc +++ b/src/wbase/Task.cc @@ -364,25 +364,17 @@ void Task::action(util::CmdData* data) { // 'task' contains statistics that are still useful. However, the resources used // by sendChannel need to be freed quickly. LOGS(_log, LOG_LVL_DEBUG, __func__ << " calling resetSendChannel() for " << tIdStr); - resetSendChannel(); // Frees its xrdsvc::SsiRequest object. 
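The SendChannel rework above strips the class down to an abstract byte sink plus the two test doubles that survive the patch: NopChannel, which discards whatever it is asked to send, and StringChannel, which appends everything to a caller-owned string for inspection in unit tests. A minimal self-contained sketch of that shape, using illustrative stand-in names rather than the actual Qserv classes:

#include <iostream>
#include <memory>
#include <string>

// Minimal sketch of the SendChannel shape left by this patch: an abstract
// byte sink plus a no-op and a recording implementation for tests. The names
// are illustrative stand-ins, not the actual Qserv classes.
class ChannelSketch {
public:
    using Ptr = std::shared_ptr<ChannelSketch>;
    virtual ~ChannelSketch() = default;
    virtual bool send(char const* buf, int bufLen) = 0;
};

// Discards everything; handy when a task must run without a live caller.
class NopChannelSketch : public ChannelSketch {
public:
    bool send(char const*, int) override { return true; }
};

// Appends everything to a caller-owned string; handy in unit tests.
class StringChannelSketch : public ChannelSketch {
public:
    explicit StringChannelSketch(std::string& dest) : _dest(dest) {}
    bool send(char const* buf, int bufLen) override {
        _dest.append(buf, bufLen);
        return true;
    }

private:
    std::string& _dest;
};

int main() {
    std::string captured;
    ChannelSketch::Ptr c = std::make_shared<StringChannelSketch>(captured);
    c->send("hello", 5);
    std::cout << captured << "\n";  // prints "hello"
}

Handing the doubles out through factory functions, as newNopChannel() and newStringChannel() do, keeps calling code coupled only to the abstract interface.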
+ resetSendChannel(); // Frees the SendChannel instance } string Task::getQueryString() const { - //&&&string qs = _userQueryInfo->getTemplate(_templateId); auto qStats = _queryStats.lock(); if (qStats == nullptr) { LOGS(_log, LOG_LVL_ERROR, cName(__func__) << " _queryStats could not be locked"); return string(""); } - // auto uQInfo = _userQueryInfo.lock(); auto uQInfo = qStats->getUserQueryInfo(); - /* &&& - if (uQInfo == nullptr) { - LOGS(_log, LOG_LVL_ERROR, cName(__func__) << " _userQueryInfo could not be locked"); - return string(""); - } - */ string qs = uQInfo->getTemplate(_templateId); LOGS(_log, LOG_LVL_WARN, cName(__func__) << " &&& a qs=" << qs); boost::algorithm::replace_all(qs, CHUNK_TAG, to_string(_chunkId)); @@ -391,8 +383,6 @@ string Task::getQueryString() const { return qs; } -//&&&void Task::setQueryStatistics(wpublish::QueryStatistics::Ptr const& qStats) { _queryStats = qStats; } - wpublish::QueryStatistics::Ptr Task::getQueryStats() const { auto qStats = _queryStats.lock(); if (qStats == nullptr) { @@ -600,23 +590,4 @@ ostream& operator<<(ostream& os, Task const& t) { return os; } -ostream& operator<<(ostream& os, IdSet const& idSet) { - // Limiting output as number of entries can be very large. - int maxDisp = idSet.maxDisp; // only affects the amount of data printed. - lock_guard lock(idSet.mx); - os << "showing " << maxDisp << " of count=" << idSet._ids.size() << " "; - bool first = true; - int i = 0; - for (auto id : idSet._ids) { - if (!first) { - os << ", "; - } else { - first = false; - } - os << id; - if (++i >= maxDisp) break; - } - return os; -} - } // namespace lsst::qserv::wbase diff --git a/src/wbase/Task.h b/src/wbase/Task.h index fa519732e..9580040d8 100644 --- a/src/wbase/Task.h +++ b/src/wbase/Task.h @@ -111,28 +111,6 @@ class TaskScheduler { util::HistogramRolling::Ptr histTimeOfTransmittingTasks; ///< Store information about transmitting tasks. }; -/// Used to find tasks that are in process for debugging with Task::_idStr. -/// This is largely meant to track down incomplete tasks in a possible intermittent -/// failure and should probably be removed when it is no longer needed. -/// It depends on code in BlendScheduler to work. If the decision is made to keep it -/// forever, dependency on BlendScheduler needs to be re-worked. -struct IdSet { // TODO:UJ delete if possible - void add(std::string const& id) { - std::lock_guard lock(mx); - _ids.insert(id); - } - void remove(std::string const& id) { - std::lock_guard lock(mx); - _ids.erase(id); - } - std::atomic maxDisp{5}; //< maximum number of entries to show with operator<< - friend std::ostream& operator<<(std::ostream& os, IdSet const& idSet); - -private: - std::set _ids; - mutable std::mutex mx; -}; - /// class Task defines a query task to be done, containing a TaskMsg /// (over-the-wire) additional concrete info related to physical /// execution conditions. @@ -175,15 +153,6 @@ class Task : public util::CommandForThreadPool { // Hopefully, many are the same for all tasks and can be moved to ujData and userQueryInfo. // Candidates: scanInfo, maxTableSizeMb, FileChannelShared, resultsHttpPort. // Unfortunately, this will be much easier if it is done after xrootd method is removed. 
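getQueryString() above rebuilds the per-chunk SQL by pulling the stored template from the user query info and substituting the concrete chunk and subchunk ids with boost::algorithm::replace_all. A dependency-free sketch of just that substitution step; the tag spellings below are placeholders, not the real CHUNK_TAG and SUBCHUNK_TAG values:

#include <iostream>
#include <string>

// Sketch of the per-chunk template substitution in getQueryString(): the
// stored query template contains placeholder tags that are replaced with the
// concrete chunk/subchunk ids when the task runs. Tag names are illustrative.
static void replaceAll(std::string& s, std::string const& from, std::string const& to) {
    for (size_t pos = 0; (pos = s.find(from, pos)) != std::string::npos; pos += to.size()) {
        s.replace(pos, from.size(), to);
    }
}

int main() {
    std::string qs = "SELECT * FROM Object_%CC% WHERE sc = %SS%";
    replaceAll(qs, "%CC%", std::to_string(1234));  // stand-in for CHUNK_TAG
    replaceAll(qs, "%SS%", std::to_string(7));     // stand-in for SUBCHUNK_TAG
    std::cout << qs << "\n";  // SELECT * FROM Object_1234 WHERE sc = 7
}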
- /* &&& - Task(std::shared_ptr const& ujData, int jobId, int attemptCount, int chunkId, - int fragmentNumber, std::shared_ptr const& userQueryInfo, size_t templateId, - bool hasSubchunks, int subchunkId, std::string const& db, proto::ScanInfo const& scanInfo, - bool scanInteractive, int maxTableSizeMb, std::vector const& fragSubTables, - std::vector const& fragSubchunkIds, std::shared_ptr const& sc, - std::shared_ptr const& queryStats_, - uint16_t resultsHttpPort = 8080); - */ Task(std::shared_ptr const& ujData, int jobId, int attemptCount, int chunkId, int fragmentNumber, size_t templateId, bool hasSubchunks, int subchunkId, std::string const& db, proto::ScanInfo const& scanInfo, bool scanInteractive, int maxTableSizeMb, @@ -205,8 +174,6 @@ class Task : public util::CommandForThreadPool { std::shared_ptr const& queriesAndChunks, uint16_t resultsHttpPort = 8080); - -//&&&void Task::setQueryStatistics(std::shared_ptr const& qC); - std::shared_ptr getSendChannel() const { return _sendChannel; } void resetSendChannel() { _sendChannel.reset(); } ///< reset the shared pointer for FileChannelShared std::string user; ///< Incoming username @@ -218,19 +185,18 @@ class Task : public util::CommandForThreadPool { void action(util::CmdData* data) override; /// Cancel the query in progress and set _cancelled. - /// Query cancellation on the worker is fairly complicated. This - /// function usually called by `SsiRequest::Finished` when xrootd - /// indicates the job is cancelled. This may come from: - /// - xrootd - in the case of communications issues + /// Query cancellation on the worker is fairly complicated. + /// This may come from: /// - czar - user query was cancelled, an error, or limit reached. /// This function may also be called by `Task::checkCancelled()` - `_sendChannel` - /// has been killed, usually a result of failed communication with xrootd. + /// has been killed, usually a result of failed czar communication. /// If a `QueryRunner` object for this task exists, it must /// be cancelled to free up threads and other resources. /// Otherwise `_cancelled` is set so that an attempt /// to run this `Task` will result in a rapid exit. /// This function also attempts to inform the scheduler for this - /// `Task` that is has been cancelled (scheduler currently does nothing in this case). + /// `Task` that it has been cancelled. The scheduler currently does + /// nothing in this case. void cancel(); /// Check if this task should be cancelled and call cancel() as needed. @@ -273,8 +239,6 @@ class Task : public util::CommandForThreadPool { bool getSafeToMoveRunning() { return _safeToMoveRunning; } void setSafeToMoveRunning(bool val) { _safeToMoveRunning = val; } ///< For testing only. - static IdSet allIds; // set of all task jobId numbers that are not complete. - /// @return true if qId and jId match this task's query and job ids. bool idsMatch(QueryId qId, int jId, uint64_t tseq) const { return (_qId == qId && _jId == jId && tseq == _tSeq); diff --git a/src/wbase/WorkerCommand.cc b/src/wbase/WorkerCommand.cc deleted file mode 100644 index cf79089a9..000000000 --- a/src/wbase/WorkerCommand.cc +++ /dev/null @@ -1,49 +0,0 @@ -// -*- LSST-C++ -*- -/* - * LSST Data Management System - * Copyright 2012-2018 AURA/LSST. - * - * This product includes software developed by the - * LSST Project (http://www.lsst.org/).
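The revised cancel() comment above boils down to a small protocol: set an atomic flag exactly once, cancel a live QueryRunner if one exists so its thread and resources are freed promptly, and rely on the flag alone to make a not-yet-started task exit quickly. A compilable sketch of that protocol; RunnerSketch stands in for wdb::QueryRunner, and the scheduler notification is omitted:

#include <atomic>
#include <iostream>
#include <memory>
#include <mutex>

// Sketch of the cancellation protocol described above: cancel() is idempotent,
// stops an in-flight runner if one exists, and otherwise just sets a flag so a
// later attempt to run the task exits quickly. Names are illustrative.
struct RunnerSketch {
    void cancel() { std::cout << "runner cancelled\n"; }
};

class TaskSketch {
public:
    void cancel() {
        if (_cancelled.exchange(true)) return;  // already cancelled, nothing to do
        std::lock_guard<std::mutex> lock(_mtx);
        if (auto r = _runner.lock()) r->cancel();  // free threads/resources now
        // A real Task would also inform its scheduler here.
    }
    bool cancelled() const { return _cancelled.load(); }
    void attachRunner(std::shared_ptr<RunnerSketch> const& r) {
        std::lock_guard<std::mutex> lock(_mtx);
        _runner = r;
    }

private:
    std::atomic<bool> _cancelled{false};
    std::mutex _mtx;
    std::weak_ptr<RunnerSketch> _runner;
};

int main() {
    TaskSketch t;
    auto runner = std::make_shared<RunnerSketch>();
    t.attachRunner(runner);
    t.cancel();                          // prints "runner cancelled"
    std::cout << t.cancelled() << "\n";  // 1; a second cancel() would be a no-op
}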
- * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the LSST License Statement and - * the GNU General Public License along with this program. If not, - * see . - */ - -// Class header -#include "wbase/WorkerCommand.h" - -// LSST headers -#include "lsst/log/Log.h" - -// Qserv headers -#include "wbase/SendChannel.h" - -namespace { - -LOG_LOGGER _log = LOG_GET("lsst.qserv.wbase.WorkerCommand"); - -} // namespace - -namespace lsst::qserv::wbase { - -WorkerCommand::WorkerCommand(SendChannel::Ptr const& sendChannel) - : util::Command([this](util::CmdData* data) { this->run(); }), _sendChannel(sendChannel) {} - -void WorkerCommand::sendSerializedResponse() { - std::string str(_frameBuf.data(), _frameBuf.size()); - _sendChannel->sendStream(xrdsvc::StreamBuffer::createWithMove(str), true); -} - -} // namespace lsst::qserv::wbase diff --git a/src/wbase/WorkerCommand.h b/src/wbase/WorkerCommand.h deleted file mode 100644 index c0934f479..000000000 --- a/src/wbase/WorkerCommand.h +++ /dev/null @@ -1,96 +0,0 @@ -// -*- LSST-C++ -*- -/* - * LSST Data Management System - * Copyright 2011-2018 LSST Corporation. - * - * This product includes software developed by the - * LSST Project (http://www.lsst.org/). - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the LSST License Statement and - * the GNU General Public License along with this program. If not, - * see . - */ -/// WorkerCommand.h -#ifndef LSST_QSERV_WBASE_WORKER_COMMAND_H -#define LSST_QSERV_WBASE_WORKER_COMMAND_H - -// System headers -#include -#include -#include -#include - -// Qserv headers -#include "proto/FrameBuffer.h" -#include "proto/worker.pb.h" -#include "util/Command.h" - -// Forward declarations -namespace lsst::qserv::wbase { -class SendChannel; -} // namespace lsst::qserv::wbase - -namespace lsst::qserv::wbase { - -/** - * Class WorkerCommand is the base class for a family of various worker - * management commmands. - */ -class WorkerCommand : public util::Command { -public: - using Ptr = std::shared_ptr; - - WorkerCommand& operator=(const WorkerCommand&) = delete; - WorkerCommand(const WorkerCommand&) = delete; - WorkerCommand() = delete; - virtual ~WorkerCommand() = default; - - /// @param sendChannel - communication channel for reporting results - explicit WorkerCommand(std::shared_ptr const& sendChannel); - -protected: - /// The actual behavior is provided by subclasses. - virtual void run() = 0; - - /** - * Fill in the status code and the message into the response message - * of the desired type and sent it back to a caller. 
- * @param error Mandatory error to be reported. - * @param code The optional error code if the one differes from the default one. - * @param extendedModsFunc The optional function to be provided if any additional modifications - * are required to be made to the response object. - */ - template - void reportError(std::string const& error, - proto::WorkerCommandStatus::Code code = proto::WorkerCommandStatus::ERROR, - std::function const& extendedModsFunc = nullptr) { - RESPONSE resp; - resp.mutable_status()->set_code(code); - resp.mutable_status()->set_error(error); - if (extendedModsFunc != nullptr) extendedModsFunc(resp); - _frameBuf.serialize(resp); - sendSerializedResponse(); - } - - /** - * Send the serialized payload stored within the frame buffer to a caller. - */ - void sendSerializedResponse(); - - std::shared_ptr _sendChannel; ///< For result reporting - proto::FrameBuffer _frameBuf; ///< Buffer for serializing a response -}; - -} // namespace lsst::qserv::wbase - -#endif // LSST_QSERV_WBASE_WORKER_COMMAND_H diff --git a/src/wconfig/WorkerConfig.h b/src/wconfig/WorkerConfig.h index 36c723fa3..9b6d682b5 100644 --- a/src/wconfig/WorkerConfig.h +++ b/src/wconfig/WorkerConfig.h @@ -210,8 +210,10 @@ class WorkerConfig { return _ReservedInteractiveSqlConnections->getVal(); } + /* &&& /// @return the maximum number of gigabytes that can be used by StreamBuffers - unsigned int getBufferMaxTotalGB() const { return _bufferMaxTotalGB->getVal(); } + unsigned int getBufferMaxTotalGB() const { return _bufferMaxTotalGB->getVal(); } //&&& delete + */ /// @return the maximum number of concurrent transmits to a czar unsigned int getMaxTransmits() const { return _maxTransmits->getVal(); } @@ -362,8 +364,10 @@ class WorkerConfig { util::ConfigValTUInt::create(_configValMap, "sqlconnections", "maxsqlconn", notReq, 800); CVTUIntPtr _ReservedInteractiveSqlConnections = util::ConfigValTUInt::create( _configValMap, "sqlconnections", "reservedinteractivesqlconn", notReq, 50); + /* &&& CVTUIntPtr _bufferMaxTotalGB = util::ConfigValTUInt::create(_configValMap, "transmit", "buffermaxtotalgb", notReq, 41); + */ CVTUIntPtr _maxTransmits = util::ConfigValTUInt::create(_configValMap, "transmit", "maxtransmits", notReq, 40); CVTIntPtr _maxPerQid = util::ConfigValTInt::create(_configValMap, "transmit", "maxperqid", notReq, 3); diff --git a/src/wcontrol/Foreman.cc b/src/wcontrol/Foreman.cc index 288ed67e8..653c40be3 100644 --- a/src/wcontrol/Foreman.cc +++ b/src/wcontrol/Foreman.cc @@ -39,7 +39,6 @@ #include "qhttp/Response.h" #include "qhttp/Server.h" #include "qhttp/Status.h" -#include "wbase/WorkerCommand.h" #include "wconfig/WorkerConfig.h" #include "wcontrol/ResourceMonitor.h" #include "wcontrol/SqlConnMgr.h" @@ -157,10 +156,6 @@ void Foreman::processTasks(vector const& tasks) { _scheduler->queCmd(cmds); } -void Foreman::processCommand(shared_ptr const& command) { - _workerCommandQueue->queCmd(command); -} - uint16_t Foreman::httpPort() const { return _httpServer->getPort(); } nlohmann::json Foreman::statusToJson(wbase::TaskSelector const& taskSelector) { diff --git a/src/wcontrol/Foreman.h b/src/wcontrol/Foreman.h index 5045cfe96..7ba1c47e3 100644 --- a/src/wcontrol/Foreman.h +++ b/src/wcontrol/Foreman.h @@ -40,7 +40,6 @@ #include "util/EventThread.h" #include "util/HoldTrack.h" #include "wbase/Base.h" -#include "wbase/MsgProcessor.h" #include "wbase/Task.h" // Forward declarations @@ -96,8 +95,9 @@ class Scheduler : public wbase::TaskScheduler, public util::CommandQueue { /// Foreman is used to maintain a 
thread pool and schedule Tasks for the thread pool. /// It also manages sub-chunk tables with the ChunkResourceMgr. /// The schedulers may limit the number of threads they will use from the thread pool. -class Foreman : public wbase::MsgProcessor { +class Foreman { public: + using Ptr = std::shared_ptr<Foreman>; /** * @param scheduler - pointer to the scheduler * @param poolSize - size of the thread pool @@ -111,7 +111,8 @@ class Foreman : public wbase::MsgProcessor { std::shared_ptr const& chunkInventory, std::shared_ptr const& sqlConnMgr); - virtual ~Foreman() override; + //&&& virtual ~Foreman() override; + ~Foreman(); // This class doesn't have the default construction or copy semantics Foreman() = delete; @@ -128,21 +129,21 @@ class Foreman : public wbase::MsgProcessor { uint16_t httpPort() const; /// Process a group of query processing tasks. - /// @see MsgProcessor::processTasks() - void processTasks(std::vector<std::shared_ptr<wbase::Task>> const& tasks) override; // &&& delete - - /// Implement the corresponding method of the base class - /// @see MsgProcessor::processCommand() - void processCommand(std::shared_ptr<wbase::WorkerCommand> const& command) override; // &&& delete + void processTasks(std::vector<std::shared_ptr<wbase::Task>> const& tasks); /// &&& doc std::shared_ptr addQueryId(QueryId qId); /// Implement the corresponding method of the base class - /// @see MsgProcessor::statusToJson() - virtual nlohmann::json statusToJson(wbase::TaskSelector const& taskSelector) override; + nlohmann::json statusToJson(wbase::TaskSelector const& taskSelector); + + uint64_t getWorkerStartupTime() const { return _workerStartupTime; } private: + /// Startup time of worker, sent to czars so they can detect that the worker + /// was restarted when this value changes. + uint64_t const _workerStartupTime = millisecSinceEpoch(CLOCK::now()); + std::shared_ptr _chunkResourceMgr; util::ThreadPool::Ptr _pool; diff --git a/src/wdb/QueryRunner.cc b/src/wdb/QueryRunner.cc index eb76be91b..35501c76f 100644 --- a/src/wdb/QueryRunner.cc +++ b/src/wdb/QueryRunner.cc @@ -69,7 +69,6 @@ #include "wcontrol/SqlConnMgr.h" #include "wdb/ChunkResource.h" #include "wpublish/QueriesAndChunks.h" -#include "xrdsvc/StreamBuffer.h" namespace { LOG_LOGGER _log = LOG_GET("lsst.qserv.wdb.QueryRunner"); @@ -132,18 +131,6 @@ void QueryRunner::_setDb() { } } -size_t QueryRunner::_getDesiredLimit() { - double percent = xrdsvc::StreamBuffer::percentOfMaxTotalBytesUsed(); - size_t minLimit = 1'000'000; - size_t maxLimit = proto::ProtoHeaderWrap::PROTOBUFFER_DESIRED_LIMIT; - if (percent < 0.1) return maxLimit; - double reduce = 1.0 - (percent + 0.2); // force minLimit when 80% of memory used. - if (reduce < 0.0) reduce = 0.0; - size_t lim = maxLimit * reduce; - if (lim < minLimit) lim = minLimit; - return lim; -} - util::TimerHistogram memWaitHisto("memWait Hist", {1, 5, 10, 20, 40}); bool QueryRunner::runQuery() { @@ -362,6 +349,7 @@ void QueryRunner::cancel() { } } + /* &&& auto streamB = _streamBuf.lock(); if (streamB != nullptr) { streamB->cancel(); @@ -369,6 +357,7 @@ void QueryRunner::cancel() { // The send channel will die naturally on its own when xrootd stops talking to it // or other tasks call _transmitCancelledError().
+ */ } QueryRunner::~QueryRunner() {} diff --git a/src/wdb/QueryRunner.h b/src/wdb/QueryRunner.h index b59b0e47b..785496772 100644 --- a/src/wdb/QueryRunner.h +++ b/src/wdb/QueryRunner.h @@ -45,10 +45,6 @@ #include "wbase/Task.h" #include "wdb/ChunkResource.h" -namespace lsst::qserv::xrdsvc { -class StreamBuffer; -} // namespace lsst::qserv::xrdsvc - namespace lsst::qserv::wcontrol { class SqlConnMgr; } // namespace lsst::qserv::wcontrol @@ -79,8 +75,8 @@ class QueryRunner : public wbase::TaskQueryRunner, public std::enable_shared_fro /// by Task::cancel(), so if this needs to be cancelled elsewhere, /// call Task::cancel(). /// This should kill an in progress SQL command. - /// It also tries to unblock `_streamBuf` to keep the thread - /// from being blocked forever. + //&&&/// It also tries to unblock `_streamBuf` to keep the thread + //&&&/// from being blocked forever. void cancel() override; protected: @@ -97,7 +93,7 @@ class QueryRunner : public wbase::TaskQueryRunner, public std::enable_shared_fro bool _dispatchChannel(); MYSQL_RES* _primeResult(std::string const& query); ///< Obtain a result handle for a query. - static size_t _getDesiredLimit(); + //&&&static size_t _getDesiredLimit(); wbase::Task::Ptr const _task; ///< Actual task @@ -107,7 +103,6 @@ class QueryRunner : public wbase::TaskQueryRunner, public std::enable_shared_fro ChunkResourceMgr::Ptr _chunkResourceMgr; std::string _dbName; std::atomic _cancelled{false}; - std::weak_ptr _streamBuf; ///< used release condition variable on cancel. std::atomic _removedFromThreadPool{false}; mysql::MySqlConfig const _mySqlConfig; std::unique_ptr _mysqlConn; diff --git a/src/xrdreq/CMakeLists.txt b/src/xrdreq/CMakeLists.txt deleted file mode 100644 index 14974da04..000000000 --- a/src/xrdreq/CMakeLists.txt +++ /dev/null @@ -1,45 +0,0 @@ -add_library(xrdreq OBJECT) -add_dependencies(xrdreq proto) - -target_sources(xrdreq PRIVATE - QservRequest.cc - QueryManagementAction.cc - QueryManagementRequest.cc -) - -target_include_directories(xrdreq PRIVATE - ${XROOTD_INCLUDE_DIRS} -) - -target_link_libraries(xrdreq PUBLIC - log - proto - protobuf - XrdSsiLib - XrdCl -) - -FUNCTION(XRDREQ_UTILS) - FOREACH(UTIL IN ITEMS ${ARGV}) - add_executable(${UTIL}) - target_sources(${UTIL} PRIVATE ${UTIL}.cc) - target_include_directories(${UTIL} PRIVATE ${XROOTD_INCLUDE_DIRS}) - target_link_libraries(${UTIL} PRIVATE - crypto - pthread - proto - util - global - xrdreq - ) - install(TARGETS ${UTIL}) - ENDFOREACH() -ENDFUNCTION() - -xrdreq_utils( - qserv-query-management -) - -install( - TARGETS xrdreq -) diff --git a/src/xrdreq/QservRequest.cc b/src/xrdreq/QservRequest.cc deleted file mode 100644 index 6310d1c09..000000000 --- a/src/xrdreq/QservRequest.cc +++ /dev/null @@ -1,216 +0,0 @@ -/* - * LSST Data Management System - * Copyright 2018 LSST Corporation. - * - * This product includes software developed by the - * LSST Project (http://www.lsst.org/). - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
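The _workerStartupTime member added to Foreman above enables restart detection: a worker reports one fixed millisecond timestamp for its entire life, so a czar that caches the last reported value sees a change exactly when the worker process was restarted. A small sketch of that check; millisecSinceEpochSketch and CzarSideView are illustrative stand-ins for the real helpers the patch calls:

#include <chrono>
#include <cstdint>
#include <iostream>

// Sketch of the restart-detection idea behind _workerStartupTime: the worker
// reports the same millisecond timestamp for its whole life, so a czar that
// caches the last seen value detects a restart when it changes.
static uint64_t millisecSinceEpochSketch() {
    using namespace std::chrono;
    return duration_cast<milliseconds>(system_clock::now().time_since_epoch()).count();
}

class CzarSideView {
public:
    // Returns true when the reported startup time differs from the cached one.
    bool workerRestarted(uint64_t reportedStartupTime) {
        bool const restarted = (_lastSeen != 0 && _lastSeen != reportedStartupTime);
        _lastSeen = reportedStartupTime;
        return restarted;
    }

private:
    uint64_t _lastSeen = 0;
};

int main() {
    uint64_t const startup = millisecSinceEpochSketch();  // fixed at worker construction
    CzarSideView czar;
    std::cout << czar.workerRestarted(startup) << "\n";         // 0: first contact
    std::cout << czar.workerRestarted(startup) << "\n";         // 0: same incarnation
    std::cout << czar.workerRestarted(startup + 5000) << "\n";  // 1: restarted worker
}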
- * - * You should have received a copy of the LSST License Statement and - * the GNU General Public License along with this program. If not, - * see . - */ - -// Class header -#include "xrdreq/QservRequest.h" - -// System headers -#include -#include - -// Qserv headers -#include "lsst/log/Log.h" - -using namespace std; - -namespace { - -LOG_LOGGER _log = LOG_GET("lsst.qserv.xrdreq.QservRequest"); - -// Set this parameter to some reasonable default -int const bufInitialSize = 1024; - -} // namespace - -namespace lsst::qserv::xrdreq { - -atomic QservRequest::_numClassInstances(0); - -QservRequest::~QservRequest() { - delete[] _buf; - - --_numClassInstances; - LOGS(_log, LOG_LVL_TRACE, "QservRequest destructed instances: " << _numClassInstances); -} - -QservRequest::QservRequest() - : _bufIncrementSize(bufInitialSize), - _bufSize(0), - _bufCapacity(bufInitialSize), - _buf(new char[bufInitialSize]) { - // This report is used solely for debugging purposes to allow tracking - // potential memory leaks within applications. - ++_numClassInstances; - LOGS(_log, LOG_LVL_TRACE, "QservRequest constructed instances: " << _numClassInstances); -} - -void QservRequest::cancel() { - // This will decrement the reference counter to the pointee at the end of the current - // block regardless of any exceptions that may be thrown below. - auto self = move(_refToSelf4keepAlive); - Finished(true); -} - -void QservRequest::setRefToSelf4keepAlive(shared_ptr ptr) { - if ((ptr == nullptr) || (this != ptr.get())) { - stringstream ss; - ss << "QservRequest::" << __func__ << ": the value of " << ptr - << " passed as an argument is not pointing to the current object."; - throw invalid_argument(ss.str()); - } - _refToSelf4keepAlive = ptr; -} - -char* QservRequest::GetRequest(int& dlen) { - // Ask a subclass to serialize its request into the frame buffer - onRequest(_frameBuf); - - // Tell SSI which data and how many bytes to send - dlen = _frameBuf.size(); - return _frameBuf.data(); -} - -bool QservRequest::ProcessResponse(const XrdSsiErrInfo& eInfo, const XrdSsiRespInfo& rInfo) { - string const context = "QservRequest::" + string(__func__) + " "; - - if (eInfo.hasError()) { - // This will decrement the reference counter to the pointee at the end of the current - // block regardless of any exceptions that may be thrown below. - auto self = move(_refToSelf4keepAlive); - - // Copy the argument before sending the upstream notification - // Otherwise the current object may get disposed before we even had - // a chance to notify XRootD/SSI by calling Finished(). - string const errorStr = rInfo.eMsg; - - LOGS(_log, LOG_LVL_ERROR, context << "** FAILED **, error: " << errorStr); - - // Tell XrootD to release all resources associated with this request - Finished(); - - // Notify a subclass on the abnormal condition - // WARNING: This has to be the last call as the object may get deleted - // downstream. - onError(errorStr); - return false; - } - LOGS(_log, LOG_LVL_TRACE, - context << " eInfo.rType: " << rInfo.rType << "(" << rInfo.State() << ")" - << ", eInfo.blen: " << rInfo.blen); - - switch (rInfo.rType) { - case XrdSsiRespInfo::isData: - case XrdSsiRespInfo::isStream: - - LOGS(_log, LOG_LVL_TRACE, context << "** REQUESTING RESPONSE DATA **"); - GetResponseData(_buf + _bufSize, _bufIncrementSize); - return true; - - default: - // This will decrement the reference counter to the pointee at the end of the current - // block regardless of any exceptions that may be thrown below. 
- auto self = move(_refToSelf4keepAlive); - - // Copy the argument before sending the upstream notification - // Otherwise the current object may get disposed before we even had - // a chance to notify XRootD/SSI by calling Finished(). - string const responseType = to_string(rInfo.rType); - - // Tell XrootD to release all resources associated with this request - Finished(); - - // Notify a subclass on the abnormal condition - // WARNING: This has to be the last call as the object may get deleted - // downstream. - onError("QservRequest::ProcessResponse ** ERROR ** unexpected response type: " + responseType); - return false; - } -} - -void QservRequest::ProcessResponseData(const XrdSsiErrInfo& eInfo, char* buff, int blen, bool last) { - string const context = "QservRequest::" + string(__func__) + " "; - - LOGS(_log, LOG_LVL_TRACE, context << "eInfo.isOK: " << eInfo.isOK()); - - if (not eInfo.isOK()) { - // This will decrement the reference counter to the pointee at the end of the current - // block regardless of any exceptions that may be thrown below. - auto self = move(_refToSelf4keepAlive); - - // Copy these arguments before sending the upstream notification. - // Otherwise the current object may get disposed before we even had - // a chance to notify XRootD/SSI by calling Finished(). - - string const errorStr = eInfo.Get(); - int const errorNum = eInfo.GetArg(); - - LOGS(_log, LOG_LVL_ERROR, - context << "** FAILED ** eInfo.Get(): " << errorStr << ", eInfo.GetArg(): " << errorNum); - - // Tell XrootD to realease all resources associated with this request - Finished(); - - // Notify a subclass on the ubnormal condition. - // WARNING: This has to be the last call as the object may get deleted - // downstream. - onError(errorStr); - - } else { - LOGS(_log, LOG_LVL_TRACE, context << "blen: " << blen << ", last: " << last); - - // Update the byte counter - _bufSize += blen; - - if (last) { - // This will decrement the reference counter to the pointee at the end of the current - // block regardless of any exceptions that may be thrown below. - auto self = move(_refToSelf4keepAlive); - - // Tell XrootD to release all resources associated with this request - Finished(); - - // Ask a subclass to process the response - // WARNING: This has to be the last call as the object may get deleted - // downstream. - proto::FrameBufferView view(_buf, _bufSize); - onResponse(view); - - } else { - // Double the buffer's capacity and copy over its previous content into the new location - int prevBufCapacity = _bufCapacity; - _bufIncrementSize = prevBufCapacity; - _bufCapacity += _bufIncrementSize; - - char* prevBuf = _buf; - _buf = new char[_bufCapacity]; - - copy(prevBuf, prevBuf + prevBufCapacity, _buf); - - delete[] prevBuf; - - // Keep reading - GetResponseData(_buf + _bufSize, _bufIncrementSize); - } - } -} - -} // namespace lsst::qserv::xrdreq diff --git a/src/xrdreq/QservRequest.h b/src/xrdreq/QservRequest.h deleted file mode 100644 index 4306d9131..000000000 --- a/src/xrdreq/QservRequest.h +++ /dev/null @@ -1,120 +0,0 @@ -// -*- LSST-C++ -*- -/* - * LSST Data Management System - * Copyright 2011-2018 LSST Corporation. - * - * This product includes software developed by the - * LSST Project (http://www.lsst.org/). - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. 
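ProcessResponseData() above grows its receive buffer whenever another increment of stream data is expected, doubling the capacity and copying the old bytes over, which keeps repeated appends amortized O(1). The deleted raw-pointer version distilled into a standalone sketch (in new code std::vector<char> would normally do this job by itself):

#include <algorithm>
#include <cstring>
#include <iostream>

// Sketch of the doubling receive buffer used by the deleted QservRequest:
// whenever the incoming stream outgrows the buffer, capacity doubles and the
// previous bytes are copied into the new allocation.
class GrowBuf {
public:
    GrowBuf() : _size(0), _capacity(1024), _buf(new char[_capacity]) {}
    ~GrowBuf() { delete[] _buf; }
    GrowBuf(GrowBuf const&) = delete;
    GrowBuf& operator=(GrowBuf const&) = delete;

    void append(char const* data, int len) {
        while (_size + len > _capacity) {
            int const newCapacity = _capacity * 2;  // double, as in the original
            char* newBuf = new char[newCapacity];
            std::copy(_buf, _buf + _size, newBuf);  // keep previous content
            delete[] _buf;
            _buf = newBuf;
            _capacity = newCapacity;
        }
        std::memcpy(_buf + _size, data, len);
        _size += len;
    }
    int size() const { return _size; }

private:
    int _size;
    int _capacity;
    char* _buf;
};

int main() {
    GrowBuf b;
    char chunk[600];
    std::memset(chunk, 'x', sizeof(chunk));
    for (int i = 0; i < 4; ++i) b.append(chunk, sizeof(chunk));
    std::cout << b.size() << "\n";  // 2400, after the buffer doubled twice
}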
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the LSST License Statement and - * the GNU General Public License along with this program. If not, - * see . - */ -#ifndef LSST_QSERV_XRDREQ_QSERV_REQUEST_H -#define LSST_QSERV_XRDREQ_QSERV_REQUEST_H - -// System headers -#include -#include -#include - -// Third party headers -#include "XrdSsi/XrdSsiRequest.hh" - -// Qserv headers -#include "proto/FrameBuffer.h" -#include "proto/worker.pb.h" - -namespace lsst::qserv::xrdreq { - -/** - * Class QservRequest is a base class for a family of the client-side requests - * (classes) to Qserv workers. - */ -class QservRequest : public XrdSsiRequest { -public: - QservRequest(QservRequest const&) = delete; - QservRequest& operator=(QservRequest const&) = delete; - virtual ~QservRequest() override; - - /** - * Do a proper request cancellation to ensure a pointer to the request gets deleted - * after calling XrdSsiRequest::Finished(true). - */ - void cancel(); - -protected: - QservRequest(); - - /** - * Setting a pointer to the object would guarantee that the life expectancy - * of the request be preserved before it's finished/failed and the corresponding - * notifications are sent to a subclass via the virtual methods QservRequest::onResponse() - * or QservRequest::onError(). The pointer will be reset after calling either of - * these methods, or the method QservRequest::cancel(). - * @param ptr The pointer to be set. - * @throws std::invalid_argument if the pointer is empty or pointing to a different - * request object. - */ - void setRefToSelf4keepAlive(std::shared_ptr ptr); - - /** - * Serialize a request into the provided buffer. The method is required to be - * provided by a subclass. - * @param buf A request buffer for serializing a request. - */ - virtual void onRequest(proto::FrameBuffer& buf) = 0; - - /** - * Process response from Qserv. The method is required to be provided by a subclass. - * @param view The buffer view for parsing results. - */ - virtual void onResponse(proto::FrameBufferView& view) = 0; - - /** - * Notify a base class about a failure occurred when sending a request data - * or receiving a response. - * @param error A message explaining a reason of the failure. - */ - virtual void onError(std::string const& msg) = 0; - - char* GetRequest(int& dlen) override; - bool ProcessResponse(const XrdSsiErrInfo& eInfo, const XrdSsiRespInfo& rInfo) override; - void ProcessResponseData(const XrdSsiErrInfo& eInfo, char* buff, int blen, bool last) override; - -private: - /// The global counter for the number of instances of any subclasses - static std::atomic _numClassInstances; - - /// Request buffer is prepared by subclasses before sending a request to a worker. - proto::FrameBuffer _frameBuf; - - // Response buffer is updated when receiving a response stream of data from a worker. - - /// The (very first and the) last increment of the capacity of the incoming - /// buffer is used to limit the amount of bytes to be received from a server. 
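setRefToSelf4keepAlive() above is a keep-alive idiom: the request stores a shared_ptr to itself so the object cannot be destroyed while a callback is still outstanding, and the completion path moves that pointer into a local so it is released exactly once on every exit route, exceptions included. A minimal sketch of the idiom; RequestSketch is illustrative and not the XrdSsiRequest API:

#include <iostream>
#include <memory>

// Sketch of the keep-alive pattern documented for setRefToSelf4keepAlive():
// the object pins its own lifetime with a shared_ptr to itself; moving that
// pointer into a local inside finish() releases it exactly once, whatever
// exit path the handler takes.
class RequestSketch {
public:
    using Ptr = std::shared_ptr<RequestSketch>;
    static Ptr create() {
        Ptr p(new RequestSketch());
        p->_self = p;  // pin lifetime until finish() runs
        return p;
    }
    ~RequestSketch() { std::cout << "destroyed\n"; }
    void finish() {
        auto self = std::move(_self);  // dropped at end of scope, even on throw
        std::cout << "finished\n";
    }

private:
    RequestSketch() = default;
    Ptr _self;
};

int main() {
    std::weak_ptr<RequestSketch> weak;
    {
        auto req = RequestSketch::create();
        weak = req;
    }  // caller's reference is gone, but the self-reference keeps it alive
    std::cout << (weak.expired() ? "gone" : "alive") << "\n";  // alive
    weak.lock()->finish();  // prints "finished", then "destroyed"
}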
- int _bufIncrementSize; - - int _bufSize; ///< actual (meaningful) number of bytes in the incoming buffer - int _bufCapacity; ///< total capacity of the incoming buffer - - char* _buf; ///< buffer for incomming data - - /// The reference to the object is needed to guarantee the life expectency of - /// the request object while the request is still being processed. - std::shared_ptr _refToSelf4keepAlive; -}; - -} // namespace lsst::qserv::xrdreq - -#endif // LSST_QSERV_XRDREQ_QSERV_REQUEST_H \ No newline at end of file diff --git a/src/xrdreq/QueryManagementAction.cc b/src/xrdreq/QueryManagementAction.cc deleted file mode 100644 index f63a013b1..000000000 --- a/src/xrdreq/QueryManagementAction.cc +++ /dev/null @@ -1,137 +0,0 @@ -// -*- LSST-C++ -*- -/* - * LSST Data Management System - * - * This product includes software developed by the - * LSST Project (http://www.lsst.org/). - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the LSST License Statement and - * the GNU General Public License along with this program. If not, - * see . - */ - -// Class header -#include "xrdreq/QueryManagementAction.h" - -// System headers -#include - -// Third party headers -#include "XrdCl/XrdClFile.hh" -#include "XrdCl/XrdClXRootDResponses.hh" -#include "XrdSsi/XrdSsiProvider.hh" -#include "XrdSsi/XrdSsiService.hh" - -// Qserv headers -#include "xrdreq/QueryManagementRequest.h" - -// LSST headers -#include "lsst/log/Log.h" - -/// This C++ symbol is provided by the SSI shared library -extern XrdSsiProvider* XrdSsiProviderClient; - -using namespace std; - -namespace { -LOG_LOGGER _log = LOG_GET("lsst.qserv.xrdreq.QueryManagementAction"); - -string xrootdStatus2str(XrdCl::XRootDStatus const& s) { - return "status=" + to_string(s.status) + ", code=" + to_string(s.code) + ", errNo=" + to_string(s.errNo) + - ", message='" + s.GetErrorMessage() + "'"; -} - -/// The RAII wrapper around the silly C pointer to facilitate proper deletion -/// of the object returned by the XROOTD API. 
-struct LocationInfoRAII { - XrdCl::LocationInfo* locationInfo = nullptr; - ~LocationInfoRAII() { delete locationInfo; } -}; - -} // namespace - -namespace lsst::qserv::xrdreq { - -void QueryManagementAction::notifyAllWorkers(string const& xrootdFrontendUrl, - proto::QueryManagement::Operation op, uint32_t czarId, - QueryId queryId, CallbackType onFinish) { - auto const ptr = shared_ptr(new QueryManagementAction()); - ptr->_notifyAllWorkers(xrootdFrontendUrl, op, czarId, queryId, onFinish); -} - -QueryManagementAction::QueryManagementAction() { - LOGS(_log, LOG_LVL_TRACE, "QueryManagementAction ** CONSTRUCTED **"); -} - -QueryManagementAction::~QueryManagementAction() { - LOGS(_log, LOG_LVL_TRACE, "QueryManagementAction ** DELETED **"); -} - -void QueryManagementAction::_notifyAllWorkers(std::string const& xrootdFrontendUrl, - proto::QueryManagement::Operation op, uint32_t czarId, - QueryId queryId, CallbackType onFinish) { - string const context = "QueryManagementAction::" + string(__func__) + " "; - - // Find all subscribers (worker XROOTD servers) serving this special resource. - // Throw an exception if no workers are registered. - ::LocationInfoRAII locationInfoHandler; - string const queryResourceName = "/query"; - XrdCl::FileSystem fileSystem(xrootdFrontendUrl); - XrdCl::XRootDStatus const status = fileSystem.Locate(queryResourceName, XrdCl::OpenFlags::Flags::None, - locationInfoHandler.locationInfo); - if (!status.IsOK()) { - throw runtime_error(context + "failed to locate subscribers for resource " + queryResourceName + - ", " + ::xrootdStatus2str(status)); - } - if (uint32_t const numLocations = locationInfoHandler.locationInfo->GetSize(); numLocations == 0) { - throw runtime_error(context + "no subscribers are serving resource " + queryResourceName); - } else { - // Fill worker addresses as keys into the response object. - for (uint32_t i = 0; i < numLocations; ++i) { - _response[locationInfoHandler.locationInfo->At(i).GetAddress()] = string(); - } - } - - // Send a request to each worker. Note capturing a copy of 'self' to ensure - // the curent object will still existr while the requests will be being processed. 
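LocationInfoRAII above is a one-field RAII guard around a raw pointer that XrdCl's Locate() fills in through an out-parameter, guaranteeing deletion on every exit path. std::unique_ptr gives the same guarantee if it adopts the pointer immediately after the call; the sketch below uses a stand-in LocationInfo type and a fake locate(), not the XrdCl API:

#include <iostream>
#include <memory>

// Stand-in for the XrdCl type whose ownership the deleted code had to manage.
struct LocationInfo {
    int size = 0;
};

// Stand-in for an API that allocates into a caller-supplied raw pointer.
void locate(LocationInfo*& out) { out = new LocationInfo{3}; }

int main() {
    LocationInfo* raw = nullptr;
    locate(raw);
    std::unique_ptr<LocationInfo> guard(raw);  // adopt ownership immediately
    std::cout << guard->size << "\n";          // 3; freed on return or throw
}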
- auto const self = shared_from_this(); - for (auto itr : _response) { - string const workerAddress = itr.first; - - // Connect to the worker service - XrdSsiErrInfo errInfo; - XrdSsiService* serviceProvider = XrdSsiProviderClient->GetService(errInfo, workerAddress); - if (nullptr == serviceProvider) { - throw runtime_error(context + " failed to contact worker service " + workerAddress + - ", error: " + errInfo.Get()); - } - - // Make and configure the request object - auto request = xrdreq::QueryManagementRequest::create( - op, czarId, queryId, - [self, workerAddress, onFinish](proto::WorkerCommandStatus::Code code, string const& error) { - if (code != proto::WorkerCommandStatus::SUCCESS) { - self->_response[workerAddress] = error; - } - if (++(self->_numWorkerRequestsFinished) == self->_response.size()) { - if (onFinish != nullptr) onFinish(self->_response); - } - }); - - // Initiate request processing - XrdSsiResource resource(queryResourceName); - serviceProvider->ProcessRequest(*request, resource); - } -} - -} // namespace lsst::qserv::xrdreq diff --git a/src/xrdreq/QueryManagementAction.h b/src/xrdreq/QueryManagementAction.h deleted file mode 100644 index c624ecf88..000000000 --- a/src/xrdreq/QueryManagementAction.h +++ /dev/null @@ -1,97 +0,0 @@ -// -*- LSST-C++ -*- -/* - * LSST Data Management System - * - * This product includes software developed by the - * LSST Project (http://www.lsst.org/). - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the LSST License Statement and - * the GNU General Public License along with this program. If not, - * see . - */ -#ifndef LSST_QSERV_XRDREQ_QUERY_MANAGEMENT_ACTION_H -#define LSST_QSERV_XRDREQ_QUERY_MANAGEMENT_ACTION_H - -// System headers -#include -#include -#include -#include -#include - -// Qserv headers -#include "global/intTypes.h" -#include "proto/worker.pb.h" - -namespace lsst::qserv::xrdreq { - -/** - * Class QueryManagementAction is an interface for managing query completion/cancellation - * at all Qserv workers that are connected as "publishers" to the XROOTD redirector. - */ -// &&&QM need to get the same functionality using json messages, and not in xrdreq. -class QueryManagementAction : public std::enable_shared_from_this { -public: - /// The reponse type represents errors reported by the workers, where worker - /// names are the keys. And the values are the error messages. Empty strings - /// indicate the succesful completion of the requests. - using Response = std::map; - - /// The callback function type to be used for notifications on the operation completion. - using CallbackType = std::function; - - /** - * The front-end method for initiating the operation at all workers. - * - * @note The only way to track the completion of the requests sent via - * this interface is by providing the callback function. The request delivery - * is not guaranteeded in case if the XROOTD/SSI network will be clogged by - * the heavy traffic. It's safe to call the same operation many times if needed. 
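The per-worker loop above fans one request out to every located worker and fans the results back in through an atomic counter: each completion callback records its error, and the callback that brings the counter up to the number of workers fires the caller's onFinish exactly once. A synchronous sketch of that bookkeeping; FanIn, the worker names, and the simulated replies are all illustrative:

#include <atomic>
#include <cstddef>
#include <functional>
#include <iostream>
#include <map>
#include <memory>
#include <string>

// Sketch of the fan-out/fan-in above: one callback per worker, an atomic
// counter of finished requests, and a single onFinish call when the last
// callback lands. The "replies" arrive synchronously here for brevity.
struct FanIn : std::enable_shared_from_this<FanIn> {
    using Response = std::map<std::string, std::string>;  // worker -> error ("" = ok)
    Response response;
    std::atomic<std::size_t> finished{0};

    void run(std::function<void(Response const&)> const& onFinish) {
        auto self = shared_from_this();  // keep alive until the last callback
        for (auto const& entry : response) {
            std::string const worker = entry.first;
            auto callback = [self, worker, onFinish](std::string const& err) {
                if (!err.empty()) self->response[worker] = err;
                if (++(self->finished) == self->response.size()) onFinish(self->response);
            };
            callback(worker == "w2" ? "timeout" : "");  // simulated reply
        }
    }
};

int main() {
    auto f = std::make_shared<FanIn>();
    f->response = {{"w1", ""}, {"w2", ""}, {"w3", ""}};
    f->run([](FanIn::Response const& r) {
        for (auto const& e : r) {
            std::cout << e.first << ": " << (e.second.empty() ? "ok" : e.second) << "\n";
        }
    });
}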
- * - * @param xrootdFrontendUrl A location of the XROOTD redirector. - * @param op An operation be initiated at the workers. - * @param onFinish The optional callback to be fired upon the completion of - * the requested operation. - * - * @throws std::runtime_error For failures encountered when connecting to - * the manager or initiating the requesed operation. - */ - static void notifyAllWorkers(std::string const& xrootdFrontendUrl, proto::QueryManagement::Operation op, - uint32_t czarId, QueryId queryId, CallbackType onFinish = nullptr); - - QueryManagementAction(QueryManagementAction const&) = delete; - QueryManagementAction& operator=(QueryManagementAction const&) = delete; - virtual ~QueryManagementAction(); - -private: - QueryManagementAction(); - - /** - * The actual implementation of the request processor. - * @see QueryManagementAction::notifyAllWorkers() - */ - void _notifyAllWorkers(std::string const& xrootdFrontendUrl, proto::QueryManagement::Operation op, - uint32_t czarId, QueryId queryId, CallbackType onFinish); - - /// The collection of worker responses. - Response _response; - - /// The counter will get incremented as worker responses will be received. - /// User-provided callback function (if any) will be called when all requests - /// will finish (succeed or fail). - std::atomic _numWorkerRequestsFinished{0}; -}; - -} // namespace lsst::qserv::xrdreq - -#endif // LSST_QSERV_XRDREQ_QUERY_MANAGEMENT_ACTION_H diff --git a/src/xrdreq/QueryManagementRequest.cc b/src/xrdreq/QueryManagementRequest.cc deleted file mode 100644 index 82860cdd5..000000000 --- a/src/xrdreq/QueryManagementRequest.cc +++ /dev/null @@ -1,91 +0,0 @@ -// -*- LSST-C++ -*- -/* - * LSST Data Management System - * - * This product includes software developed by the - * LSST Project (http://www.lsst.org/). - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the LSST License Statement and - * the GNU General Public License along with this program. If not, - * see . 
- */ - -// Class header -#include "xrdreq/QueryManagementRequest.h" - -// LSST headers -#include "lsst/log/Log.h" - -using namespace std; - -namespace { -LOG_LOGGER _log = LOG_GET("lsst.qserv.xrdreq.QueryManagementRequest"); -} // namespace - -namespace lsst::qserv::xrdreq { - -QueryManagementRequest::Ptr QueryManagementRequest::create(proto::QueryManagement::Operation op, - uint32_t czarId, QueryId queryId, - QueryManagementRequest::CallbackType onFinish) { - QueryManagementRequest::Ptr ptr(new QueryManagementRequest(op, czarId, queryId, onFinish)); - ptr->setRefToSelf4keepAlive(ptr); - return ptr; -} - -QueryManagementRequest::QueryManagementRequest(proto::QueryManagement::Operation op, uint32_t czarId, - QueryId queryId, QueryManagementRequest::CallbackType onFinish) - : _op(op), _czarId(czarId), _queryId(queryId), _onFinish(onFinish) { - LOGS(_log, LOG_LVL_TRACE, "QueryManagementRequest ** CONSTRUCTED **"); -} - -QueryManagementRequest::~QueryManagementRequest() { - LOGS(_log, LOG_LVL_TRACE, "QueryManagementRequest ** DELETED **"); -} - -void QueryManagementRequest::onRequest(proto::FrameBuffer& buf) { - proto::QueryManagement message; - message.set_op(_op); - message.set_czar_id(_czarId); - message.set_query_id(_queryId); - buf.serialize(message); -} - -void QueryManagementRequest::onResponse(proto::FrameBufferView& view) { - if (nullptr != _onFinish) { - // Clearing the stored callback after finishing the up-stream notification - // has two purposes: - // - // 1. it guaranties (exactly) one time notification - // 2. it breaks the up-stream dependency on a caller object if a shared - // pointer to the object was mentioned as the lambda-function's closure - auto onFinish = move(_onFinish); - _onFinish = nullptr; - onFinish(proto::WorkerCommandStatus::SUCCESS, string()); - } -} - -void QueryManagementRequest::onError(string const& error) { - if (nullptr != _onFinish) { - // Clearing the stored callback after finishing the up-stream notification - // has two purposes: - // - // 1. it guaranties (exactly) one time notification - // 2. it breaks the up-stream dependency on a caller object if a shared - // pointer to the object was mentioned as the lambda-function's closure - auto onFinish = move(_onFinish); - _onFinish = nullptr; - onFinish(proto::WorkerCommandStatus::ERROR, error); - } -} - -} // namespace lsst::qserv::xrdreq diff --git a/src/xrdreq/QueryManagementRequest.h b/src/xrdreq/QueryManagementRequest.h deleted file mode 100644 index 0e366afe2..000000000 --- a/src/xrdreq/QueryManagementRequest.h +++ /dev/null @@ -1,95 +0,0 @@ -// -*- LSST-C++ -*- -/* - * LSST Data Management System - * - * This product includes software developed by the - * LSST Project (http://www.lsst.org/). - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the LSST License Statement and - * the GNU General Public License along with this program. If not, - * see . 
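onResponse() and onError() above both move the stored callback into a local and null the member before invoking it, which guarantees at most one notification and releases any shared_ptr the closure holds back to the caller as soon as it has fired. The same shape in isolation, with Notifier as an illustrative stand-in:

#include <functional>
#include <iostream>
#include <string>
#include <utility>

// Sketch of the "fire exactly once" callback handling in onResponse()/onError():
// moving the stored std::function into a local and nulling the member makes a
// second notification a no-op, and the closure is freed as soon as it has run.
class Notifier {
public:
    explicit Notifier(std::function<void(std::string const&)> onFinish)
            : _onFinish(std::move(onFinish)) {}
    void notify(std::string const& msg) {
        if (_onFinish == nullptr) return;  // already fired
        auto onFinish = std::exchange(_onFinish, nullptr);
        onFinish(msg);  // last use; the closure is destroyed after this call
    }

private:
    std::function<void(std::string const&)> _onFinish;
};

int main() {
    Notifier n([](std::string const& m) { std::cout << "got: " << m << "\n"; });
    n.notify("SUCCESS");  // prints once
    n.notify("SUCCESS");  // ignored
}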
- */ -#ifndef LSST_QSERV_XRDREQ_QUERY_MANAGEMENT_REQUEST_H -#define LSST_QSERV_XRDREQ_QUERY_MANAGEMENT_REQUEST_H - -// System headers -#include -#include -#include - -// Qserv headers -#include "global/intTypes.h" -#include "proto/worker.pb.h" -#include "xrdreq/QservRequest.h" - -namespace lsst::qserv::xrdreq { - -/** - * Class QueryManagementRequest represents requests for managing query - * completion/cancellation at Qserv workers. - * @note No actuall responses are expected from these requests beyond - * the error messages in case of any problems in delivering or processing - * notifications. - */ -class QueryManagementRequest : public QservRequest { //&&&QM -public: - /// The pointer type for instances of the class - typedef std::shared_ptr Ptr; - - /// The callback function type to be used for notifications on - /// the operation completion. - using CallbackType = std::function; // error message (if failed) - - /** - * Static factory method is needed to prevent issues with the lifespan - * and memory management of instances created otherwise (as values or via - * low-level pointers). - * @param op An operation to be initiated. - * @param queryId An uinque identifier of a query affected by the request. - * Note that a cole of the identifier depends on which operation - * was requested. - * @param onFinish (optional) callback function to be called upon the completion - * (successful or not) of the request. - * @return the smart pointer to the object of the class - */ - static Ptr create(proto::QueryManagement::Operation op, uint32_t czarId, QueryId queryId, - CallbackType onFinish = nullptr); - - QueryManagementRequest() = delete; - QueryManagementRequest(QueryManagementRequest const&) = delete; - QueryManagementRequest& operator=(QueryManagementRequest const&) = delete; - - virtual ~QueryManagementRequest() override; - -protected: - /// @see QueryManagementRequest::create() - QueryManagementRequest(proto::QueryManagement::Operation op, uint32_t czarId, QueryId queryId, - CallbackType onFinish); - - virtual void onRequest(proto::FrameBuffer& buf) override; - virtual void onResponse(proto::FrameBufferView& view) override; - virtual void onError(std::string const& error) override; - -private: - // Parameters of the object - - proto::QueryManagement::Operation _op = proto::QueryManagement::CANCEL_AFTER_RESTART; - uint32_t _czarId = 0; - QueryId _queryId = 0; - CallbackType _onFinish; -}; - -} // namespace lsst::qserv::xrdreq - -#endif // LSST_QSERV_XRDREQ_QUERY_MANAGEMENT_REQUEST_H diff --git a/src/xrdreq/qserv-query-management.cc b/src/xrdreq/qserv-query-management.cc deleted file mode 100644 index 0e410ff5e..000000000 --- a/src/xrdreq/qserv-query-management.cc +++ /dev/null @@ -1,154 +0,0 @@ -// -*- LSST-C++ -*- -/* - * LSST Data Management System - * - * This product includes software developed by the - * LSST Project (http://www.lsst.org/). - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the LSST License Statement and - * the GNU General Public License along with this program. 
If not, - * see . - */ -// System header -#include -#include -#include -#include -#include - -// Third party headers -#include "XrdSsi/XrdSsiProvider.hh" -#include "XrdSsi/XrdSsiService.hh" - -// Qserv headers -#include "global/intTypes.h" -#include "proto/worker.pb.h" -#include "util/BlockPost.h" -#include "util/CmdLineParser.h" -#include "xrdreq/QueryManagementAction.h" -#include "xrdreq/QueryManagementRequest.h" - -/// This C++ symbol is provided by the SSI shared library -extern XrdSsiProvider* XrdSsiProviderClient; - -namespace global = lsst::qserv; -namespace proto = lsst::qserv::proto; -namespace util = lsst::qserv::util; -namespace xrdreq = lsst::qserv::xrdreq; - -using namespace std; - -namespace { - -// Command line parameters - -vector const allowedOperations = {"CANCEL_AFTER_RESTART", "CANCEL", "COMPLETE"}; -proto::QueryManagement::Operation operation = proto::QueryManagement::CANCEL_AFTER_RESTART; -uint32_t czarId; -global::QueryId queryId; -bool allWorkers = false; -string serviceProviderLocation; - -proto::QueryManagement::Operation str2operation(string const& str) { - if (str == "CANCEL_AFTER_RESTART") { - return proto::QueryManagement::CANCEL_AFTER_RESTART; - } else if (str == "CANCEL") { - return proto::QueryManagement::CANCEL; - } else if (str == "COMPLETE") { - return proto::QueryManagement::COMPLETE; - } - throw invalid_argument("error: unknown operation '" + str + "'"); -} - -int test() { - bool finished = false; - if (allWorkers) { - xrdreq::QueryManagementAction::notifyAllWorkers( - serviceProviderLocation, operation, czarId, queryId, - [&finished](xrdreq::QueryManagementAction::Response const& response) { - for (auto itr : response) { - cout << "worker: " << itr.first << " error: " << itr.second << endl; - } - finished = true; - }); - } else { - // Connect to a service provider - XrdSsiErrInfo errInfo; - auto serviceProvider = XrdSsiProviderClient->GetService(errInfo, serviceProviderLocation); - if (nullptr == serviceProvider) { - cerr << "failed to contact service provider at: " << serviceProviderLocation - << ", error: " << errInfo.Get() << endl; - return 1; - } - cout << "connected to service provider at: " << serviceProviderLocation << endl; - - // Prepare the request - auto request = xrdreq::QueryManagementRequest::create( - operation, czarId, queryId, - [&finished](proto::WorkerCommandStatus::Code code, string const& error) { - cout << "code=" << proto::WorkerCommandStatus_Code_Name(code) << ", error='" << error - << "'" << endl; - finished = true; - }); - - // Submit the request - XrdSsiResource resource("/query"); - serviceProvider->ProcessRequest(*request, resource); - } - - // Wait before the request will finish or fail - util::BlockPost blockPost(1000, 2000); - while (!finished) { - blockPost.wait(200); - } - return 0; -} -} // namespace - -int main(int argc, const char* const argv[]) { - // Verify that the version of the library that we linked against is - // compatible with the version of the headers we compiled against. - - GOOGLE_PROTOBUF_VERIFY_VERSION; - - // Parse command line parameters - try { - util::CmdLineParser parser( - argc, argv, - "\n" - "Usage:\n" - " \n" - " [--service=]\n" - "\n" - "Flags an options:\n" - " --all-workers - The flag indicating if the operation had to involve all workers.\n" - " --service= - A location of the service provider (default: 'localhost:1094').\n" - "\n" - "Parameters:\n" - " - An operation over the query (queries). 
Allowed values of\n" - " the parameter are: CANCEL_AFTER_RESTART, CANCEL, COMPLETE.\n" - " - The unique identifier of Czar.\n" - " - User query identifier.\n"); - - ::operation = ::str2operation(parser.parameterRestrictedBy(1, ::allowedOperations)); - ::czarId = parser.parameter(2); - ::queryId = parser.parameter(3); - ::allWorkers = parser.flag("all-workers"); - ::serviceProviderLocation = parser.option("service", "localhost:1094"); - - } catch (exception const& ex) { - cerr << ex.what() << endl; - return 1; - } - return ::test(); -} diff --git a/src/xrdsvc/CMakeLists.txt b/src/xrdsvc/CMakeLists.txt index 072fdd99f..d650acb9d 100644 --- a/src/xrdsvc/CMakeLists.txt +++ b/src/xrdsvc/CMakeLists.txt @@ -2,16 +2,13 @@ add_library(qserv_xrdsvc OBJECT) add_dependencies(qserv_xrdsvc proto) target_sources(qserv_xrdsvc PRIVATE - ChannelStream.cc HttpModule.cc HttpMonitorModule.cc HttpReplicaMgtModule.cc HttpWorkerCzarModule.cc HttpSvc.cc SsiProvider.cc - SsiRequest.cc SsiService.cc - StreamBuffer.cc ) target_include_directories(qserv_xrdsvc PRIVATE diff --git a/src/xrdsvc/ChannelStream.cc b/src/xrdsvc/ChannelStream.cc deleted file mode 100644 index 2c02610b4..000000000 --- a/src/xrdsvc/ChannelStream.cc +++ /dev/null @@ -1,115 +0,0 @@ -// -*- LSST-C++ -*- -/* - * LSST Data Management System - * Copyright 2014-2016 LSST Corporation. - * - * This product includes software developed by the - * LSST Project (http://www.lsst.org/). - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the LSST License Statement and - * the GNU General Public License along with this program. If not, - * see . - */ - -// Class header -#include "xrdsvc/ChannelStream.h" - -// Third-party headers -#include "boost/utility.hpp" - -// LSST headers -#include "lsst/log/Log.h" - -// Qserv headers -#include "global/debugUtil.h" -#include "util/Bug.h" -#include "util/common.h" - -namespace { -LOG_LOGGER _log = LOG_GET("lsst.qserv.xrdsvc.ChannelStream"); -} - -using namespace std; - -namespace lsst::qserv::xrdsvc { - -/// Provide each Channel stream with a unique identifier. -atomic ChannelStream::_sequenceSource{0}; - -/// Constructor -ChannelStream::ChannelStream() : XrdSsiStream(isActive), _closed(false), _seq(_sequenceSource++) {} - -/// Destructor -ChannelStream::~ChannelStream() { clearMsgs(); } - -/// Push in a data packet -void ChannelStream::append(StreamBuffer::Ptr const &streamBuffer, bool last) { - if (_closed) { - throw util::Bug(ERR_LOC, - "ChannelStream::append: Stream closed, append(...,last=true) already received"); - } - LOGS(_log, LOG_LVL_DEBUG, - "seq=" << _seq << " ChannelStream::append last=" << last << " " - << util::prettyCharBuf(streamBuffer->data, streamBuffer->getSize(), 5)); - { - unique_lock lock(_mutex); - ++_appendCount; - LOGS(_log, LOG_LVL_DEBUG, - "seq=" << to_string(_seq) << " Trying to append message (flowing) appC=" << _appendCount - << " getBC=" << _getBufCount); - _msgs.push_back(streamBuffer); - _closed = last; // if last is true, then we are closed. 
- } - _hasDataCondition.notify_one(); -} - -/// Pull out a data packet as a Buffer object (called by XrdSsi code) -XrdSsiStream::Buffer *ChannelStream::GetBuff(XrdSsiErrInfo &eInfo, int &dlen, bool &last) { - ++_getBufCount; - // This InstanceCount should be fairly quiet as there should only be one at a time. - util::InstanceCount inst("GetBuf seq=" + to_string(_seq)); - unique_lock lock(_mutex); - while (_msgs.empty() && !_closed) { // No msgs, but we aren't done - // wait. - LOGS(_log, LOG_LVL_INFO, "seq=" << _seq << " Waiting, no data ready "); - _hasDataCondition.wait(lock); - } - if (_msgs.empty() && _closed) { - // It's closed and no more msgs are available. - LOGS(_log, LOG_LVL_INFO, "seq=" << _seq << " Not waiting, but closed"); - dlen = 0; - eInfo.Set("Not an active stream", EOPNOTSUPP); - return 0; - } - - StreamBuffer::Ptr sb = _msgs.front(); - dlen = sb->getSize(); - _msgs.pop_front(); - last = _closed && _msgs.empty(); - LOGS(_log, LOG_LVL_INFO, - "seq=" << to_string(_seq) << " returning buffer (" << dlen << ", " << (last ? "(last)" : "(more)") - << ")" - << " getBufCount=" << _getBufCount); - return sb.get(); -} - -void ChannelStream::clearMsgs() { - LOGS(_log, LOG_LVL_DEBUG, "seq=" << to_string(_seq) << " ChannelStream::clearMsgs()"); - unique_lock lock(_mutex); - while (!_msgs.empty()) { - _msgs.front()->Recycle(); - _msgs.pop_front(); - } -} - -} // namespace lsst::qserv::xrdsvc diff --git a/src/xrdsvc/ChannelStream.h b/src/xrdsvc/ChannelStream.h deleted file mode 100644 index db9290fb9..000000000 --- a/src/xrdsvc/ChannelStream.h +++ /dev/null @@ -1,75 +0,0 @@ -// -*- LSST-C++ -*- -/* - * LSST Data Management System - * Copyright 2014-2018 LSST Corporation. - * - * This product includes software developed by the - * LSST Project (http://www.lsst.org/). - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the LSST License Statement and - * the GNU General Public License along with this program. If not, - * see . - */ -#ifndef LSST_QSERV_XRDSVC_CHANNELSTREAM_H -#define LSST_QSERV_XRDSVC_CHANNELSTREAM_H - -// System headers -#include -#include -#include -#include - -// qserv headers -#include "xrdsvc/StreamBuffer.h" - -// Third-party headers -#include "XrdSsi/XrdSsiErrInfo.hh" // required by XrdSsiStream -#include "XrdSsi/XrdSsiStream.hh" - -namespace lsst::qserv::xrdsvc { - -/// ChannelStream is an implementation of an XrdSsiStream that accepts -/// SendChannel streamed data. -class ChannelStream : public XrdSsiStream { // &&& delete -public: - ChannelStream(); - virtual ~ChannelStream(); - - /// Push in a data packet - void append(StreamBuffer::Ptr const &StreamBuffer, bool last); - - /// Empty _msgs, calling StreamBuffer::Recycle() where needed. - void clearMsgs(); - - /// Pull out a data packet as a Buffer object (called by XrdSsi code) - Buffer *GetBuff(XrdSsiErrInfo &eInfo, int &dlen, bool &last) override; - - bool closed() const { return _closed; } - - uint64_t getSeq() const { return _seq; } - -private: - bool _closed; ///< Closed to new append() calls? 
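// [Editor's note] A minimal, self-contained sketch of the producer/consumer
// handshake that the deleted ChannelStream implements above: append() queues
// buffers under a mutex and marks the stream closed on the last one, while
// the consumer blocks on a condition variable until data arrives or the
// stream closes. The names here (DemoStream) are illustrative stand-ins,
// not Qserv or XrdSsi APIs.
#include <condition_variable>
#include <deque>
#include <mutex>
#include <optional>
#include <string>

class DemoStream {
public:
    // Producer side, like ChannelStream::append(): push a buffer and
    // close the stream when 'last' is true.
    void append(std::string buf, bool last) {
        {
            std::lock_guard<std::mutex> lock(_mtx);
            _msgs.push_back(std::move(buf));
            _closed = last;
        }
        _hasData.notify_one();  // wake a consumer blocked in next()
    }

    // Consumer side, like GetBuff(): block until data is available or the
    // stream is closed; std::nullopt plays the role of returning 0 with
    // dlen = 0 once the stream is closed and fully drained.
    std::optional<std::string> next() {
        std::unique_lock<std::mutex> lock(_mtx);
        _hasData.wait(lock, [this] { return !_msgs.empty() || _closed; });
        if (_msgs.empty()) return std::nullopt;  // closed and drained
        std::string buf = std::move(_msgs.front());
        _msgs.pop_front();
        return buf;
    }

private:
    std::deque<std::string> _msgs;     // message queue (_msgs in ChannelStream)
    std::mutex _mtx;                   // protects _msgs and _closed
    std::condition_variable _hasData;  // _hasDataCondition in ChannelStream
    bool _closed = false;              // closed to new append() calls
};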
- // Can keep a deque of (buf, bufsize) to reduce copying, if needed. - std::deque _msgs; ///< Message queue - std::mutex _mutex; ///< _msgs protection - std::condition_variable _hasDataCondition; ///< _msgs condition - uint64_t const _seq; ///< Unique identifier for this instance. - static std::atomic _sequenceSource; ///< Source of unique identifiers. - std::atomic _appendCount{0}; ///< number of appends - std::atomic _getBufCount{0}; ///< number of buffers -}; - -} // namespace lsst::qserv::xrdsvc - -#endif // LSST_QSERV_XRDSVC_CHANNELSTREAM_H diff --git a/src/xrdsvc/HttpReplicaMgtModule.cc b/src/xrdsvc/HttpReplicaMgtModule.cc index 14fdde32a..e7d61d95b 100644 --- a/src/xrdsvc/HttpReplicaMgtModule.cc +++ b/src/xrdsvc/HttpReplicaMgtModule.cc @@ -78,10 +78,13 @@ HttpReplicaMgtModule::HttpReplicaMgtModule(string const& context, shared_ptr const& foreman, shared_ptr const& req, shared_ptr const& resp) + : HttpModule(context, foreman, req, resp) {} +/* &&& : HttpModule(context, foreman, req, resp), _providerServer(dynamic_cast(XrdSsiProviderLookup)), _clusterManager(_providerServer->GetClusterManager()), _dataContext(_clusterManager->DataContext()) {} + */ json HttpReplicaMgtModule::executeImpl(string const& subModuleName) { string const func = string(__func__) + "[sub-module='" + subModuleName + "']"; @@ -334,12 +337,12 @@ void HttpReplicaMgtModule::_modifyChunk(string const& func, int chunk, string co // copy of the inventory. After that modify both (persistent and // transient) inventories. if (Direction::ADD == direction) { - _clusterManager->Added(resource.data()); - if (_dataContext) _providerServer->GetChunkInventory().add(database, chunk); + //&&&_clusterManager->Added(resource.data()); + //&&&if (_dataContext) _providerServer->GetChunkInventory().add(database, chunk); foreman()->chunkInventory()->add(database, chunk, foreman()->mySqlConfig()); } else { - _clusterManager->Removed(resource.data()); - if (_dataContext) _providerServer->GetChunkInventory().remove(database, chunk); + //&&&_clusterManager->Removed(resource.data()); + //&&&if (_dataContext) _providerServer->GetChunkInventory().remove(database, chunk); foreman()->chunkInventory()->remove(database, chunk, foreman()->mySqlConfig()); } } catch (wpublish::InvalidParamError const& ex) { diff --git a/src/xrdsvc/HttpReplicaMgtModule.h b/src/xrdsvc/HttpReplicaMgtModule.h index efda8acff..b089d069c 100644 --- a/src/xrdsvc/HttpReplicaMgtModule.h +++ b/src/xrdsvc/HttpReplicaMgtModule.h @@ -184,11 +184,7 @@ class HttpReplicaMgtModule : public xrdsvc::HttpModule { */ void _modifyChunk(std::string const& func, int chunk, std::string const& database, Direction direction); - // XROOTD/SSI service context. 
- - xrdsvc::SsiProviderServer* _providerServer = nullptr; - XrdSsiCluster* _clusterManager = nullptr; - bool _dataContext = false; + bool _dataContext = false; // &&& }; } // namespace lsst::qserv::xrdsvc diff --git a/src/xrdsvc/HttpWorkerCzarModule.cc b/src/xrdsvc/HttpWorkerCzarModule.cc index 3408aa4cd..97cb34b54 100644 --- a/src/xrdsvc/HttpWorkerCzarModule.cc +++ b/src/xrdsvc/HttpWorkerCzarModule.cc @@ -243,8 +243,8 @@ json HttpWorkerCzarModule::_handleQueryStatus(std::string const& func) { auto const replicationAuthKey = workerConfig->replicationAuthKey(); auto const& jsReq = body().objJson; - auto wqsData = - http::WorkerQueryStatusData::createJson(jsReq, replicationInstanceId, replicationAuthKey, now); + auto wqsData = http::WorkerQueryStatusData::createFromJson(jsReq, replicationInstanceId, + replicationAuthKey, now); // For all queryId and czarId items, if the item can't be found, it is simply ignored. Anything that // is missed will eventually be picked up by other mechanisms, such as results being rejected @@ -265,8 +265,8 @@ json HttpWorkerCzarModule::_handleQueryStatus(std::string const& func) { auto const queriesAndChunks = foreman()->queriesAndChunks(); vector cancelledList; // Cancelled queries where we want to keep the files - lock_guard mapLg(wqsData->_mapMtx); - for (auto const& [dkQid, dkTm] : wqsData->_qIdDoneKeepFiles) { + lock_guard mapLg(wqsData->mapMtx); + for (auto const& [dkQid, dkTm] : wqsData->qIdDoneKeepFiles) { auto qStats = queriesAndChunks->addQueryId(dkQid); if (qStats != nullptr) { auto uqInfo = qStats->getUserQueryInfo(); @@ -279,7 +279,7 @@ json HttpWorkerCzarModule::_handleQueryStatus(std::string const& func) { } vector deleteFilesList; - for (auto const& [dkQid, dkTm] : wqsData->_qIdDoneDeleteFiles) { + for (auto const& [dkQid, dkTm] : wqsData->qIdDoneDeleteFiles) { auto qStats = queriesAndChunks->addQueryId(dkQid); if (qStats != nullptr) { auto uqInfo = qStats->getUserQueryInfo(); @@ -301,7 +301,7 @@ json HttpWorkerCzarModule::_handleQueryStatus(std::string const& func) { // UserQueryInfo will cancel the tasks in the uberjobs if they exist. // New UberJob Id's will be checked against the list, and immediately be // killed if they are on it. (see HttpWorkerCzarModule::_handleQueryJob) - for (auto const& [ujQid, ujIdMap] : wqsData->_qIdDeadUberJobs) { + for (auto const& [ujQid, ujIdMap] : wqsData->qIdDeadUberJobs) { auto qStats = queriesAndChunks->addQueryId(ujQid); if (qStats != nullptr) { auto uqInfo = qStats->getUserQueryInfo(); @@ -316,7 +316,7 @@ json HttpWorkerCzarModule::_handleQueryStatus(std::string const& func) { } // Delete files that should be deleted - CzarIdType czarId = wqsData->_czInfo->czId; + CzarIdType czarId = wqsData->getCzInfo()->czId; for (wbase::UserQueryInfo::Ptr uqiPtr : deleteFilesList) { if (uqiPtr == nullptr) continue; QueryId qId = uqiPtr->getQueryId(); @@ -326,7 +326,7 @@ json HttpWorkerCzarModule::_handleQueryStatus(std::string const& func) { // Syntax errors in the message would throw invalid_argument, which is handled elsewhere. // Return a message containing lists of the queries that were cancelled. - jsRet = wqsData->serializeResponseJson(); + jsRet = wqsData->serializeResponseJson(foreman()->getWorkerStartupTime()); return jsRet; } diff --git a/src/xrdsvc/SsiRequest.cc b/src/xrdsvc/SsiRequest.cc deleted file mode 100644 index 724c098f9..000000000 --- a/src/xrdsvc/SsiRequest.cc +++ /dev/null @@ -1,349 +0,0 @@ -// -*- LSST-C++ -*- -/* - * LSST Data Management System - * Copyright 2015-2016 LSST Corporation. 
- * - * This product includes software developed by the - * LSST Project (http://www.lsst.org/). - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the LSST License Statement and - * the GNU General Public License along with this program. If not, - * see <http://www.lsstcorp.org/LegalNotices/>. - */ - -// Class header -#include -#include -#include -#include -#include -#include - -// Third-party headers -#include "XrdSsi/XrdSsiRequest.hh" - -// LSST headers -#include "lsst/log/Log.h" - -// Qserv headers -#include "global/intTypes.h" -#include "global/LogContext.h" -#include "global/ResourceUnit.h" -#include "proto/FrameBuffer.h" -#include "proto/worker.pb.h" -#include "util/InstanceCount.h" -#include "util/HoldTrack.h" -#include "util/Timer.h" -#include "wbase/FileChannelShared.h" -#include "wbase/TaskState.h" -#include "wbase/Task.h" -#include "wconfig/WorkerConfig.h" -#include "wcontrol/Foreman.h" -#include "wcontrol/ResourceMonitor.h" -#include "wpublish/ChunkInventory.h" -#include "xrdsvc/ChannelStream.h" - -namespace proto = lsst::qserv::proto; -namespace wbase = lsst::qserv::wbase; - -namespace { - -LOG_LOGGER _log = LOG_GET("lsst.qserv.xrdsvc.SsiRequest"); - -} // namespace - -namespace lsst::qserv::xrdsvc { - -SsiRequest::Ptr SsiRequest::newSsiRequest(std::string const& rname, - std::shared_ptr<wcontrol::Foreman> const& foreman) { - auto req = SsiRequest::Ptr(new SsiRequest(rname, foreman)); - req->_selfKeepAlive = req; - return req; -} - -SsiRequest::SsiRequest(std::string const& rname, std::shared_ptr<wcontrol::Foreman> const& foreman) - : _validator(foreman->chunkInventory()->newValidator()), _foreman(foreman), _resourceName(rname) {} - -SsiRequest::~SsiRequest() { - LOGS(_log, LOG_LVL_DEBUG, "~SsiRequest()"); - UnBindRequest(); -} - -void SsiRequest::reportError(std::string const& errStr) { - LOGS(_log, LOG_LVL_WARN, errStr); - replyError(errStr, EINVAL); - ReleaseRequestBuffer(); -} - -uint64_t countLimiter = 0; // LockupDB - -// Step 4 -/// Called by XrdSsi to actually process a request. -void SsiRequest::execute(XrdSsiRequest& req) { - util::Timer t; - LOGS(_log, LOG_LVL_DEBUG, "Execute request, resource=" << _resourceName); - - // We bind this object to the request now. This allows us to respond at any - // time (much simpler). The manual neglects to mention that all pending - // events will be reflected on a different thread the moment we bind the - // request; that fact allows us to use a mutex to serialize the order of - // initialization and possible early cancellation. We protect this code - // with a mutex guard which will be released upon exit.
- // - std::lock_guard lock(_finMutex); - BindRequest(req); - - ResourceUnit ru(_resourceName); - - // Make sure the requested resource belongs to this worker - if (!(*_validator)(ru)) { - reportError("WARNING: request to the unowned resource detected:" + _resourceName); - return; - } - - auto const sendChannel = std::make_shared<wbase::SendChannel>(shared_from_this()); - - // Process the request - switch (ru.unitType()) { - case ResourceUnit::DBCHUNK: { // &&& delete - // Increment the counter of the database/chunk resources in use - _foreman->resourceMonitor()->increment( - _resourceName); // &&& TODO:UJ make sure this is implemented elsewhere. - - reportError("&&& DBCHUNK requests are no longer available resource db=" + ru.db() + - " chunkId=" + std::to_string(ru.chunk())); - throw util::Bug(ERR_LOC, "&&& ResourceUnit::DBCHUNK"); - break; - } - case ResourceUnit::QUERY: { // &&& delete - LOGS(_log, LOG_LVL_DEBUG, "Parsing request details for resource=" << _resourceName); - - reportError("&&& QUERY requests are no longer available"); - - /* &&&QM - proto::QueryManagement request; - try { - // reqData has the entire request, so we can unpack it without waiting for - // more data. - proto::FrameBufferView view(reqData, reqSize); - view.parse(request); - ReleaseRequestBuffer(); - } catch (proto::FrameBufferError const& ex) { - reportError("Failed to decode a query completion/cancellation command, error: " + - std::string(ex.what())); - break; - } - LOGS(_log, LOG_LVL_DEBUG, - "QueryManagement: op=" << proto::QueryManagement_Operation_Name(request.op()) - << " query_id=" << request.query_id()); - - switch (wconfig::WorkerConfig::instance()->resultDeliveryProtocol()) { - case wconfig::ConfigValResultDeliveryProtocol::XROOT: - case wconfig::ConfigValResultDeliveryProtocol::HTTP: - switch (request.op()) { - case proto::QueryManagement::CANCEL_AFTER_RESTART: - // TODO: locate and cancel the corresponding tasks, remove the tasks - // from the scheduler queues. - wbase::FileChannelShared::cleanUpResultsOnCzarRestart(request.czar_id(), - request.query_id()); - break; - case proto::QueryManagement::CANCEL: - // TODO: locate and cancel the corresponding tasks, remove the tasks - // from the scheduler queues. - wbase::FileChannelShared::cleanUpResults(request.czar_id(), request.query_id()); - break; - case proto::QueryManagement::COMPLETE: - wbase::FileChannelShared::cleanUpResults(request.czar_id(), request.query_id()); - break; - default: - reportError("QueryManagement: op=" + - proto::QueryManagement_Operation_Name(request.op()) + - " is not supported by the current implementation."); - return; - } - break; - default: - throw std::runtime_error("SsiRequest::" + std::string(__func__) + - " unsupported result delivery protocol"); - } - - // Send back the empty response since no info is expected by a caller - // for this type of request beyond the usual error notifications (if any). - this->reply((char const*)0, 0); - */ - break; - } - default: - reportError("Unexpected unit type '" + std::to_string(ru.unitType()) + - "', resource name: " + _resourceName); - } - // Note that upon exit the _finMutex will be unlocked allowing Finished() - // to actually do something once everything is actually set up. -} - -/// Called by SSI to free resources. -void SsiRequest::Finished(XrdSsiRequest& req, XrdSsiRespInfo const& rinfo, bool cancel) { // Step 8 - util::HoldTrack::Mark markA(ERR_LOC, "SsiRequest::Finished start"); - if (cancel) { - // Either the czar or xrootd has decided to cancel the Job.
- // Try to cancel all of the tasks, if there are any. - for (auto&& wTask : _tasks) { - auto task = wTask.lock(); - if (task != nullptr) { - task->cancel(); - } - } - } - - // This call is sync (blocking). - // client finished retrieving response, or cancelled. - // release response resources (e.g. buf) - // But first we must make sure that request setup completed (i.e. execute()) by - // locking _finMutex. - { - std::lock_guard finLock(_finMutex); - // Clean up _stream if it exists and don't add anything new to it either. - _reqFinished = true; - if (_stream != nullptr) { - _stream->clearMsgs(); - } - } - - // This will clear the cyclic dependency: - // FileChannelShared -> ChannelStream -> SsiRequest -> FileChannelShared - // - // TODO: Eliminate xrdsvc::ChannelStream since this class seems to be useless - // in the file-based result delivery protocol. - _channelShared.reset(); - - auto keepAlive = freeSelfKeepAlive(); - - // No buffers allocated, so don't need to free. - // We can release/unlink the file now - const char* type = ""; - switch (rinfo.rType) { - case XrdSsiRespInfo::isNone: - type = "type=isNone"; - break; - case XrdSsiRespInfo::isData: - type = "type=isData"; - break; - case XrdSsiRespInfo::isError: - type = "type=isError"; - break; - case XrdSsiRespInfo::isFile: - type = "type=isFile"; - break; - case XrdSsiRespInfo::isStream: - type = "type=isStream"; - break; - case XrdSsiRespInfo::isHandle: - type = "type=isHandle"; - break; - } - - // Decrement the counter of the database/chunk resources in use - ResourceUnit ru(_resourceName); - if (ru.unitType() == ResourceUnit::DBCHUNK) { - _foreman->resourceMonitor()->decrement(_resourceName); - } - - // We can't do much other than close the file. - // It should work (on linux) to unlink the file after we open it, though. - // With the optimizer on '-Og', there was a double free for a SsiRequest. - // The likely cause could be keepAlive being optimized out for being unused. - // The problem has not reoccurred since adding keepAlive to the following - // log message, but having code depend on a log line is ugly in its own way. - LOGS(_log, LOG_LVL_DEBUG, "RequestFinished " << type << " " << keepAlive.use_count()); -} - -bool SsiRequest::reply(char const* buf, int bufLen) { - Status s = SetResponse(buf, bufLen); - if (s != XrdSsiResponder::wasPosted) { - LOGS(_log, LOG_LVL_ERROR, "DANGER: Couldn't post response of length=" << bufLen); - return false; - } - return true; -} - -bool SsiRequest::replyError(std::string const& msg, int code) { - Status s = SetErrResponse(msg.c_str(), code); - if (s != XrdSsiResponder::wasPosted) { - LOGS(_log, LOG_LVL_ERROR, "DANGER: Couldn't post error response " << msg); - return false; - } - return true; -} - -bool SsiRequest::replyStream(StreamBuffer::Ptr const& sBuf, bool last) { - LOGS(_log, LOG_LVL_DEBUG, "replyStream, checking stream size=" << sBuf->getSize() << " last=" << last); - - // Normally, XrdSsi would call Recycle() when it is done with sBuf, but if this function - // returns false, then it must call Recycle() itself. Otherwise, the scheduler will likely - // wedge waiting for the buffer to be released. - std::lock_guard finLock(_finMutex); - if (_reqFinished) { - // Finished() was called, give up. - LOGS(_log, LOG_LVL_ERROR, "replyStream called after reqFinished."); - sBuf->Recycle(); - return false; - } - // Create a stream if needed.
- if (!_stream) { - _stream = std::make_shared<ChannelStream>(); - if (SetResponse(_stream.get()) != XrdSsiResponder::Status::wasPosted) { - LOGS(_log, LOG_LVL_WARN, "SetResponse stream failed, calling Recycle for sBuf"); - // SetResponse return value indicates XrdSsi won't call Recycle(). - sBuf->Recycle(); - return false; - } - } else if (_stream->closed()) { - // XrdSsi isn't going to call Recycle if we wind up here. - LOGS(_log, LOG_LVL_ERROR, "Logic error SsiRequest::replyStream called with stream closed."); - sBuf->Recycle(); - return false; - } - // XrdSsi or Finished() will call Recycle(). - LOGS(_log, LOG_LVL_INFO, "SsiRequest::replyStream seq=" << getSeq()); - _stream->append(sBuf, last); - return true; -} - -bool SsiRequest::sendMetadata(const char* buf, int blen) { - Status stat = SetMetadata(buf, blen); - switch (stat) { - case XrdSsiResponder::wasPosted: - return true; - case XrdSsiResponder::notActive: - LOGS(_log, LOG_LVL_ERROR, "failed to " << __func__ << " notActive"); - break; - case XrdSsiResponder::notPosted: - LOGS(_log, LOG_LVL_ERROR, "failed to " << __func__ << " notPosted blen=" << blen); - break; - default: - LOGS(_log, LOG_LVL_ERROR, "failed to " << __func__ << " unknown state blen=" << blen); - } - return false; -} - -SsiRequest::Ptr SsiRequest::freeSelfKeepAlive() { - Ptr keepAlive = std::move(_selfKeepAlive); - return keepAlive; -} - -uint64_t SsiRequest::getSeq() const { - if (_stream == nullptr) return 0; - return _stream->getSeq(); -} - -} // namespace lsst::qserv::xrdsvc diff --git a/src/xrdsvc/SsiRequest.h b/src/xrdsvc/SsiRequest.h deleted file mode 100644 index 29a600bd3..000000000 --- a/src/xrdsvc/SsiRequest.h +++ /dev/null @@ -1,129 +0,0 @@ -// -*- LSST-C++ -*- -/* - * LSST Data Management System - * Copyright 2015 LSST Corporation. - * - * This product includes software developed by the - * LSST Project (http://www.lsst.org/). - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the LSST License Statement and - * the GNU General Public License along with this program. If not, - * see <http://www.lsstcorp.org/LegalNotices/>. - */ -#ifndef LSST_QSERV_XRDSVC_SSIREQUEST_H -#define LSST_QSERV_XRDSVC_SSIREQUEST_H - -// System headers -#include -#include -#include -#include - -// Third-party headers -#include "XrdSsi/XrdSsiResponder.hh" - -// Qserv headers -#include "global/ResourceUnit.h" -#include "mysql/MySqlConfig.h" -#include "xrdsvc/StreamBuffer.h" - -// Forward declarations -class XrdSsiService; - -namespace lsst::qserv { -namespace wbase { -class FileChannelShared; -class SendChannel; -class Task; -} // namespace wbase -namespace wcontrol { -class Foreman; -} -} // namespace lsst::qserv - -namespace lsst::qserv::xrdsvc { - -class ChannelStream; -class StreamBuffer; - -/// An implementation of XrdSsiResponder that is used by SsiService to provide -/// qserv worker services. The SSI interface encourages such an approach, and -/// object lifetimes are explicitly stated in the documentation which we -/// adhere to using BindRequest() and UnBindRequest() responder methods.
-class SsiRequest : public XrdSsiResponder, - public std::enable_shared_from_this<SsiRequest> { // &&& delete if possible -public: - // Smart pointer definitions - - typedef std::shared_ptr<ResourceUnit::Checker> ValidatorPtr; - typedef std::shared_ptr<SsiRequest> Ptr; - - /// Use factory to ensure proper construction for enable_shared_from_this. - static SsiRequest::Ptr newSsiRequest(std::string const& rname, - std::shared_ptr<wcontrol::Foreman> const& processor); - - virtual ~SsiRequest(); - - void execute(XrdSsiRequest& req); - - /** - * Implements the virtual method defined in the base class - * @see XrdSsiResponder::Finished - */ - void Finished(XrdSsiRequest& req, XrdSsiRespInfo const& rinfo, bool cancel = false) override; - - bool isFinished() { return _reqFinished; } - - bool reply(char const* buf, int bufLen); - bool replyError(std::string const& msg, int code); - bool replyStream(StreamBuffer::Ptr const& sbuf, bool last); - - bool sendMetadata(const char* buf, int blen); - - /// Call this to allow object to die after it truly is no longer needed. - /// i.e. It is known Finish() will not be called. - /// NOTE: It is important that any non-static SsiRequest member - /// function make a local copy of the returned pointer so that - /// SsiRequest is guaranteed to live to the end of - /// the function call. - Ptr freeSelfKeepAlive(); - - uint64_t getSeq() const; - -private: - /// Constructor (called by the static factory method newSsiRequest) - SsiRequest(std::string const& rname, std::shared_ptr<wcontrol::Foreman> const& processor); - - /// For internal error reporting - void reportError(std::string const& errStr); - -private: - ValidatorPtr _validator; ///< validates request against what's available - std::shared_ptr<wcontrol::Foreman> const _foreman; ///< actual msg processor - - std::mutex _finMutex; ///< Protects execute() from Finish(), _finished, and _stream - std::atomic<bool> _reqFinished{false}; ///< set to true when Finished called - std::string _resourceName; ///< chunk identifier - - std::shared_ptr<ChannelStream> _stream; - std::shared_ptr<wbase::FileChannelShared> _channelShared; ///< Must live until Finished() gets called. - std::vector<std::weak_ptr<wbase::Task>> _tasks; ///< List of tasks for use in cancellation. - - /// Make sure this object exists until Finish() is called. - /// Make a local copy before calling reset() within any non-static member function. - Ptr _selfKeepAlive; -}; - -} // namespace lsst::qserv::xrdsvc - -#endif // LSST_QSERV_XRDSVC_SSIREQUEST_H diff --git a/src/xrdsvc/SsiService.cc b/src/xrdsvc/SsiService.cc index 473bd5f07..893f7c198 100644 --- a/src/xrdsvc/SsiService.cc +++ b/src/xrdsvc/SsiService.cc @@ -70,7 +70,6 @@ #include "wsched/GroupScheduler.h" #include "wsched/ScanScheduler.h" #include "xrdsvc/HttpSvc.h" -#include "xrdsvc/SsiRequest.h" #include "xrdsvc/XrdName.h" using namespace lsst::qserv; @@ -178,9 +177,6 @@ SsiService::SsiService(XrdSsiLogger* log) { throw wconfig::WorkerConfigError("Unrecognized memory manager."); } - int64_t bufferMaxTotalBytes = workerConfig->getBufferMaxTotalGB() * 1'000'000'000LL; - StreamBuffer::setMaxTotalBytes(bufferMaxTotalBytes); - // Set thread pool size.
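// [Editor's note] Two idioms in the deleted SsiRequest are worth calling out:
// _finMutex serializes request setup in execute() against an early Finished()
// arriving on another thread, and _selfKeepAlive makes the object own itself
// until the framework is done with it. Below is a hedged, self-contained
// sketch of the keep-alive idiom; SelfOwned is an illustrative stand-in,
// not a Qserv class.
#include <memory>
#include <mutex>

class SelfOwned {
public:
    static std::shared_ptr<SelfOwned> create() {
        std::shared_ptr<SelfOwned> p(new SelfOwned());
        p->_selfKeepAlive = p;  // the object now keeps itself alive
        return p;
    }

    // Called once by the framework when the request is finished.
    void finished() {
        std::lock_guard<std::mutex> lock(_finMutex);  // wait for setup to complete
        // Move rather than reset(): the local copy guarantees 'this' stays
        // alive to the end of this member function even if nobody else holds
        // a reference; deletion may happen when keepAlive goes out of scope.
        std::shared_ptr<SelfOwned> keepAlive = std::move(_selfKeepAlive);
        // ... release streams/buffers here ...
    }

private:
    SelfOwned() = default;
    std::mutex _finMutex;                       // serializes setup vs. finish
    std::shared_ptr<SelfOwned> _selfKeepAlive;  // self-ownership until finished()
};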
unsigned int poolSize = ranges::max({wsched::BlendScheduler::getMinPoolSize(), workerConfig->getThreadPoolSize(), thread::hardware_concurrency()}); @@ -240,8 +236,9 @@ SsiService::SsiService(XrdSsiLogger* log) { LOGS(_log, LOG_LVL_WARN, "config sqlConnMgr" << *sqlConnMgr); LOGS(_log, LOG_LVL_WARN, "maxPoolThreads=" << maxPoolThreads); - _foreman = make_shared(blendSched, poolSize, maxPoolThreads, mySqlConfig, queries, - ::makeChunkInventory(mySqlConfig), sqlConnMgr); + _foreman = wcontrol::Foreman::Ptr(new wcontrol::Foreman(blendSched, poolSize, maxPoolThreads, mySqlConfig, + queries, ::makeChunkInventory(mySqlConfig), + sqlConnMgr)); // Watch to see if the log configuration is changed. // If LSST_LOG_CONFIG is not defined, there's no good way to know what log @@ -282,6 +279,7 @@ SsiService::~SsiService() { } void SsiService::ProcessRequest(XrdSsiRequest& reqRef, XrdSsiResource& resRef) { +#if 0 //&&& LOGS(_log, LOG_LVL_DEBUG, "Got request call where rName is: " << resRef.rName); auto request = SsiRequest::newSsiRequest(resRef.rName, _foreman); @@ -289,6 +287,9 @@ void SsiService::ProcessRequest(XrdSsiRequest& reqRef, XrdSsiResource& resRef) { // Object deletes itself when finished is called. // request->execute(reqRef); +#else //&&& + LOGS(_log, LOG_LVL_ERROR, "SsiService::ProcessRequest got called"); +#endif //&&& } } // namespace lsst::qserv::xrdsvc diff --git a/src/xrdsvc/StreamBuffer.cc b/src/xrdsvc/StreamBuffer.cc deleted file mode 100644 index 2e9a9d3f2..000000000 --- a/src/xrdsvc/StreamBuffer.cc +++ /dev/null @@ -1,176 +0,0 @@ -// -*- LSST-C++ -*- -/* - * LSST Data Management System - * Copyright 2018 LSST Corporation. - * - * This product includes software developed by the - * LSST Project (http://www.lsst.org/). - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the LSST License Statement and - * the GNU General Public License along with this program. If not, - * see . 
- */ - -// Class header -#include "xrdsvc/StreamBuffer.h" - -// Third-party headers -#include "boost/utility.hpp" - -// LSST headers -#include "lsst/log/Log.h" - -// Qserv headers -#include "wbase/Task.h" -#include "wcontrol/WorkerStats.h" - -namespace { -LOG_LOGGER _log = LOG_GET("lsst.qserv.xrdsvc.StreamBuffer"); -} - -using namespace std; - -namespace lsst::qserv::xrdsvc { - -atomic StreamBuffer::_maxTotalBytes{40'000'000'000}; -atomic StreamBuffer::_totalBytes(0); -mutex StreamBuffer::_createMtx; -condition_variable StreamBuffer::_createCv; - -void StreamBuffer::setMaxTotalBytes(int64_t maxBytes) { - string const context = "StreamBuffer::" + string(__func__) + " "; - LOGS(_log, LOG_LVL_INFO, context << "maxBytes=" << maxBytes); - if (maxBytes < 0) { - throw invalid_argument(context + "negative " + to_string(maxBytes)); - } - if (maxBytes < 1'000'000'000LL) { - LOGS(_log, LOG_LVL_ERROR, "Very small value for " << context << maxBytes); - } - _maxTotalBytes = maxBytes; -} - -double StreamBuffer::percentOfMaxTotalBytesUsed() { - double percent = ((double)_totalBytes) / ((double)_maxTotalBytes); - if (percent < 0.0) percent = 0.0; - if (percent > 1.0) percent = 1.0; - return percent; -} - -// Factory function, because this should be able to delete itself when Recycle() is called. -StreamBuffer::Ptr StreamBuffer::createWithMove(std::string &input, std::shared_ptr const &task) { - unique_lock uLock(_createMtx); - if (_totalBytes >= _maxTotalBytes) { - LOGS(_log, LOG_LVL_WARN, "StreamBuffer at memory limit " << _totalBytes); - } - _createCv.wait(uLock, []() { return _totalBytes < _maxTotalBytes; }); - Ptr ptr(new StreamBuffer(input, task)); - ptr->_selfKeepAlive = ptr; - return ptr; -} - -StreamBuffer::StreamBuffer(std::string &input, wbase::Task::Ptr const &task) : _task(task) { - _dataStr = std::move(input); - // TODO: try to make 'data' a const char* in xrootd code. - // 'data' is not being changed after being passed, so hopefully not an issue. - //_dataStr will not be used again, but this is ugly. - data = (char *)(_dataStr.data()); - next = 0; - - auto now = CLOCK::now(); - _createdTime = now; - _startTime = now; - _endTime = now; - - _wStats = wcontrol::WorkerStats::get(); - if (_wStats != nullptr) { - _wStats->startQueryRespConcurrentQueued(_createdTime); - } - - _totalBytes += _dataStr.size(); - LOGS(_log, LOG_LVL_DEBUG, "StreamBuffer::_totalBytes=" << _totalBytes << " thisSize=" << _dataStr.size()); -} - -StreamBuffer::~StreamBuffer() { - _totalBytes -= _dataStr.size(); - LOGS(_log, LOG_LVL_DEBUG, "~StreamBuffer::_totalBytes=" << _totalBytes); -} - -void StreamBuffer::startTimer() { - auto now = CLOCK::now(); - _startTime = now; - _endTime = now; - - if (_wStats != nullptr) { - _wStats->endQueryRespConcurrentQueued(_createdTime, _startTime); // add time to queued time - } -} - -/// xrdssi calls this to recycle the buffer when finished. 
-void StreamBuffer::Recycle() { - { - std::lock_guard lg(_mtx); - _doneWithThis = true; - } - _cv.notify_all(); - - _endTime = CLOCK::now(); - if (_wStats != nullptr) { - _wStats->endQueryRespConcurrentXrootd(_startTime, _endTime); - } - - if (_task != nullptr) { - auto taskSched = _task->getTaskScheduler(); - if (taskSched != nullptr) { - std::chrono::duration secs = _endTime - _startTime; - taskSched->histTimeOfTransmittingTasks->addEntry(secs.count()); - LOGS(_log, LOG_LVL_TRACE, "Recycle " << taskSched->histTimeOfTransmittingTasks->getJson()); - } else { - LOGS(_log, LOG_LVL_WARN, "Recycle transmit taskSched == nullptr"); - } - } else { - LOGS(_log, LOG_LVL_DEBUG, "Recycle transmit _task == nullptr"); - } - // Effectively reset _selfKeepAlive, and if nobody else was - // referencing this, this object will delete itself when - // this function is done. - // std::move is used instead of reset() as reset() could - // result in _keepalive deleting itself while still in use. - Ptr keepAlive = std::move(_selfKeepAlive); -} - -void StreamBuffer::cancel() { - // Recycle may still need to be called by XrdSsi or there will be a memory - // leak. XrdSsi calling Recycle is beyond what can be controlled here, but - // better a possible leak than corrupted memory or a permanently wedged - // thread in a limited pool. - // In any case, this code having an effect should be extremely rare. - // FUTURE: It would be nice to eliminate this possible memory leak. - // Possible fix, atomic _recycleCalled, create thread - // to check if _recycleCalled == true. If true or 24 hours pass - // use `Ptr keepAlive = std::move(_selfKeepAlive);` to kill the object. - { - std::lock_guard lg(_mtx); - _doneWithThis = true; - _cancelled = true; - } - _cv.notify_all(); -} - -// Wait until recycle is called. -bool StreamBuffer::waitForDoneWithThis() { - std::unique_lock uLock(_mtx); - _cv.wait(uLock, [this]() { return _doneWithThis || _cancelled; }); - return !_cancelled; -} - -} // namespace lsst::qserv::xrdsvc diff --git a/src/xrdsvc/StreamBuffer.h b/src/xrdsvc/StreamBuffer.h deleted file mode 100644 index 07a63d1b5..000000000 --- a/src/xrdsvc/StreamBuffer.h +++ /dev/null @@ -1,128 +0,0 @@ -// -*- LSST-C++ -*- -/* - * LSST Data Management System - * Copyright 2014-2018 LSST Corporation. - * - * This product includes software developed by the - * LSST Project (http://www.lsst.org/). - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the LSST License Statement and - * the GNU General Public License along with this program. If not, - * see . 
- */ -#ifndef LSST_QSERV_XRDSVC_STREAMBUFFER_H -#define LSST_QSERV_XRDSVC_STREAMBUFFER_H - -// System headers -#include -#include -#include -#include -#include - -// qserv headers -#include "util/InstanceCount.h" - -// Third-party headers -#include "XrdSsi/XrdSsiErrInfo.hh" // required by XrdSsiStream -#include "XrdSsi/XrdSsiStream.hh" - -namespace lsst::qserv { -namespace wbase { -class Task; -} -namespace wcontrol { -class WorkerStats; -} -} // namespace lsst::qserv - -namespace lsst::qserv::xrdsvc { - -/// StreamBuffer is a single use buffer for transferring data packets -/// to XrdSsi. -/// Its notable feature is the Recycle() function, which XrdSsi will -/// promptly call when it no longer needs the buffer. -class StreamBuffer : public XrdSsiStream::Buffer { -public: - using Ptr = std::shared_ptr<StreamBuffer>; - - // Copying this would be very confusing for something waiting for Recycle(). - StreamBuffer() = delete; - StreamBuffer(StreamBuffer const &) = delete; - StreamBuffer &operator=(StreamBuffer const &) = delete; - - /// Factory function, because this should be able to delete itself when Recycle() is called. - /// The constructor uses move to avoid copying the string. - static StreamBuffer::Ptr createWithMove(std::string &input, - std::shared_ptr<wbase::Task> const &task = nullptr); - - /// Set the maximum number of bytes that can be used by all instances of this class. - static void setMaxTotalBytes(int64_t maxBytes); - - /// @return the percent of totalBytes used out of _maxTotalBytes. - static double percentOfMaxTotalBytesUsed(); - - size_t getSize() const { return _dataStr.size(); } - - /// @return total number of bytes used by ALL StreamBuffer objects. - static size_t getTotalBytes() { return _totalBytes; } - - /// Call to recycle the buffer when finished (normally called by XrdSsi). - void Recycle() override; - - /// Wait until Recycle() is called. - /// @return true if there is data in the buffer. - bool waitForDoneWithThis(); - - /// Start the timer that will be stopped when Recycle() is called. - void startTimer(); - - /// Unblock the condition variable on cancel. - void cancel(); - - ~StreamBuffer() override; - -private: - /// This constructor will invalidate 'input'. - explicit StreamBuffer(std::string &input, std::shared_ptr<wbase::Task> const &task); - - /// Pointer to the task for keeping statistics. - /// NOTE: This will be nullptr for many things, so check before using. - std::shared_ptr<wbase::Task> _task; - std::string _dataStr; - std::mutex _mtx; - std::condition_variable _cv; - bool _doneWithThis = false; - bool _cancelled = false; - Ptr _selfKeepAlive; ///< keep this object alive until after Recycle() is called. - // util::InstanceCount _ic{"StreamBuffer"}; ///< Useful as it indicates amount of waiting for czar. - - std::chrono::time_point _createdTime; ///< Time this instance was created. - std::chrono::time_point - _startTime; ///< Time this instance was handed to xrootd. - std::chrono::time_point - _endTime; ///< Time xrootd was finished with this instance. - /// Pointer for worker statistics. - /// NOTE: This will be nullptr for many things, so check before using. - std::shared_ptr<wcontrol::WorkerStats> _wStats; - - // Members associated with limiting memory use. - static std::atomic _totalBytes; ///< Total bytes currently in use by all StreamBuffer instances.
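// [Editor's note] A self-contained sketch of the admission gate implemented
// by createWithMove() and _createCv above: creation blocks until the running
// total of live buffer bytes drops under the cap, and each destruction lowers
// the total and wakes a waiter. Assumptions are mine: where the real
// StreamBuffer notifies _createCv is not visible in this diff, and CappedBuf
// and its members are illustrative stand-ins, not Qserv APIs.
#include <condition_variable>
#include <cstdint>
#include <memory>
#include <mutex>
#include <string>

class CappedBuf {
public:
    // Block until there is room under 'cap' bytes, then account for the new
    // buffer's size (the role of StreamBuffer::createWithMove()).
    static std::shared_ptr<CappedBuf> create(std::string data, int64_t cap) {
        std::unique_lock<std::mutex> lock(_mtx);
        _cv.wait(lock, [cap] { return _total < cap; });
        std::shared_ptr<CappedBuf> ptr(new CappedBuf(std::move(data)));
        _total += static_cast<int64_t>(ptr->_data.size());
        return ptr;
    }

    ~CappedBuf() {
        {
            std::lock_guard<std::mutex> lock(_mtx);
            _total -= static_cast<int64_t>(_data.size());
        }
        _cv.notify_one();  // a blocked producer may now proceed
    }

private:
    explicit CappedBuf(std::string data) : _data(std::move(data)) {}
    std::string _data;
    static inline std::mutex _mtx;              // guards _total
    static inline std::condition_variable _cv;  // signaled when bytes are freed
    static inline int64_t _total = 0;           // bytes held by all live buffers
};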
- static std::atomic _maxTotalBytes; - static std::mutex _createMtx; - static std::condition_variable _createCv; -}; - -} // namespace lsst::qserv::xrdsvc - -#endif // LSST_QSERV_XRDSVC_STREAMBUFFER_H From 43a894e640ab08edbf4c47e8a19d747b76e98663 Mon Sep 17 00:00:00 2001 From: John Gates Date: Thu, 19 Sep 2024 15:30:21 -0700 Subject: [PATCH 07/22] Added query retries. --- .../templates/proxy/etc/qserv-czar.cnf.jinja | 2 +- src/ccontrol/UserQuerySelect.cc | 9 +- src/czar/ActiveWorker.cc | 12 +- src/czar/ActiveWorker.h | 5 - src/czar/Czar.cc | 47 ++-- src/czar/Czar.h | 8 + src/czar/CzarRegistry.cc | 2 +- src/czar/CzarRegistry.h | 6 +- src/http/WorkerQueryStatusData.h | 6 +- src/qdisp/CMakeLists.txt | 1 - src/qdisp/CzarStats.cc | 6 +- src/qdisp/CzarStats.h | 11 +- src/qdisp/Executive.cc | 46 ++- src/qdisp/Executive.h | 15 +- src/qdisp/JobBase.h | 2 +- src/qdisp/JobQuery.cc | 2 +- src/qdisp/JobQuery.h | 6 +- src/qdisp/QdispPool.cc | 263 ------------------ src/qdisp/QdispPool.h | 205 -------------- src/qdisp/SharedResources.h | 16 +- src/qdisp/UberJob.cc | 7 +- src/qdisp/UberJob.h | 10 +- src/qdisp/testQDisp.cc | 22 +- src/util/CMakeLists.txt | 1 + src/util/xrootd.cc | 2 + src/util/xrootd.h | 2 + src/wbase/FileChannelShared.cc | 101 +------ src/wbase/FileChannelShared.h | 1 - src/wbase/Task.cc | 3 +- src/wbase/UberJobData.cc | 181 ++++++++---- src/wbase/UberJobData.h | 74 ++++- src/wbase/UserQueryInfo.cc | 16 +- src/wbase/UserQueryInfo.h | 15 +- src/wcontrol/CMakeLists.txt | 1 + src/wcontrol/Foreman.cc | 50 +++- src/wcontrol/Foreman.h | 29 +- src/wpublish/QueriesAndChunks.cc | 26 +- src/wpublish/QueriesAndChunks.h | 14 +- src/wpublish/QueryStatistics.cc | 4 +- src/wpublish/QueryStatistics.h | 7 +- src/xrdsvc/HttpWorkerCzarModule.cc | 28 +- src/xrdsvc/SsiService.cc | 3 + 42 files changed, 500 insertions(+), 767 deletions(-) delete mode 100644 src/qdisp/QdispPool.cc delete mode 100644 src/qdisp/QdispPool.h diff --git a/src/admin/templates/proxy/etc/qserv-czar.cnf.jinja b/src/admin/templates/proxy/etc/qserv-czar.cnf.jinja index 5ed3b4230..7991d0ab0 100644 --- a/src/admin/templates/proxy/etc/qserv-czar.cnf.jinja +++ b/src/admin/templates/proxy/etc/qserv-czar.cnf.jinja @@ -98,7 +98,7 @@ notifyWorkersOnCzarRestart = 1 #[debug] #chunkLimit = -1 -# Please see qdisp/QdispPool.h QdispPool::QdispPool for more information +# Please see util/QdispPool.h QdispPool::QdispPool for more information [qdisppool] #size of the pool poolSize = 50 diff --git a/src/ccontrol/UserQuerySelect.cc b/src/ccontrol/UserQuerySelect.cc index b432ddd15..6ed20d896 100644 --- a/src/ccontrol/UserQuerySelect.cc +++ b/src/ccontrol/UserQuerySelect.cc @@ -71,7 +71,6 @@ // Third-party headers #include -#include "qdisp/QdispPool.h" // LSST headers #include "lsst/log/Log.h" @@ -107,6 +106,7 @@ #include "sql/Schema.h" #include "util/Bug.h" #include "util/IterableFormatter.h" +#include "util/QdispPool.h" #include "util/ThreadPriority.h" #include "qdisp/UberJob.h" @@ -326,6 +326,7 @@ void UserQuerySelect::submit() { } void UserQuerySelect::buildAndSendUberJobs() { + // &&& NEED CODE - this function should check if the worker is DEAD. TODO:UJ string const funcN("UserQuerySelect::" + string(__func__) + " QID=" + to_string(_qMetaQueryId)); LOGS(_log, LOG_LVL_DEBUG, funcN << " start"); @@ -451,9 +452,9 @@ void UserQuerySelect::buildAndSendUberJobs() { LOGS(_log, LOG_LVL_ERROR, errStr); } - // Add worker contact info to UberJobs. 
- //&&& auto const wContactMap = czRegistry->getWorkerContactMap(); - auto const wContactMap = czRegistry->waitForWorkerContactMap(); //&&&Z + // Add worker contact info to UberJobs. The czar can't do anything without + // the contact map, so it will wait. This should only ever be an issue at startup. + auto const wContactMap = czRegistry->waitForWorkerContactMap(); LOGS(_log, LOG_LVL_DEBUG, funcN << " " << _executive->dumpUberJobCounts()); for (auto const& [wIdKey, ujVect] : workerJobMap) { auto iter = wContactMap->find(wIdKey); diff --git a/src/czar/ActiveWorker.cc b/src/czar/ActiveWorker.cc index a5a745c2e..604720cd0 100644 --- a/src/czar/ActiveWorker.cc +++ b/src/czar/ActiveWorker.cc @@ -31,6 +31,7 @@ #include "http/Client.h" #include "http/MetaModule.h" #include "util/common.h" +#include "util/QdispPool.h" // LSST headers #include "lsst/log/Log.h" @@ -154,14 +155,21 @@ void ActiveWorker::updateStateAndSendMessages(double timeoutAliveSecs, double ti // &&& Maybe only send the status message if the lists are not empty ??? // Start a thread to send the message. (Maybe these should go on the qdisppool? &&&) // put this in a different function and start the thread.&&&; - _sendStatusMsg(wInfo_, jsWorkerReqPtr); + //&&& _sendStatusMsg(wInfo_, jsWorkerReqPtr); + Ptr thisPtr = shared_from_this(); + auto sendStatusMsgFunc = [thisPtr, wInfo_, jsWorkerReqPtr](util::CmdData*) { + thisPtr->_sendStatusMsg(wInfo_, jsWorkerReqPtr); + }; + + auto cmd = util::PriorityCommand::Ptr(new util::PriorityCommand(sendStatusMsgFunc)); + auto qdisppool = czar::Czar::getCzar()->getQdispPool(); + qdisppool->queCmd(cmd, 1); } void ActiveWorker::_sendStatusMsg(http::WorkerContactInfo::Ptr const& wInf, std::shared_ptr const& jsWorkerReqPtr) { auto& jsWorkerReq = *jsWorkerReqPtr; auto const method = http::Method::POST; - //&&&auto const wInf = _wqsData->getWInfo(); if (wInf == nullptr) { LOGS(_log, LOG_LVL_ERROR, cName(__func__) << " wInfo was null."); return; diff --git a/src/czar/ActiveWorker.h b/src/czar/ActiveWorker.h index f02ca1a63..b376bb13a 100644 --- a/src/czar/ActiveWorker.h +++ b/src/czar/ActiveWorker.h @@ -163,11 +163,6 @@ class ActiveWorker : public std::enable_shared_from_this { std::atomic _conThreadCount{0}; int _maxConThreadCount{2}; - /* &&& - /// &&& doc - /// @throws std::invalid_argument - bool _parse(nlohmann::json const& jsWorkerReq); // &&& delete after basic testing - */ }; /// &&& doc diff --git a/src/czar/Czar.cc b/src/czar/Czar.cc index 260e59998..2b61d33d7 100644 --- a/src/czar/Czar.cc +++ b/src/czar/Czar.cc @@ -56,7 +56,6 @@ #include "proto/worker.pb.h" #include "qdisp/CzarStats.h" #include "qdisp/Executive.h" -#include "qdisp/QdispPool.h" #include "qdisp/SharedResources.h" #include "qproc/DatabaseModels.h" #include "rproc/InfileMerger.h" @@ -66,6 +65,7 @@ #include "util/common.h" #include "util/FileMonitor.h" #include "util/IterableFormatter.h" +#include "util/QdispPool.h" #include "util/String.h" using namespace lsst::qserv; @@ -97,12 +97,9 @@ void Czar::_monitor() { /// Check database for changes in worker chunk assignments and aliveness _czarFamilyMap->read(); - // old TODO:UJ DM-45470 If there were changes in `_czarFamilyMap`, - // see if any workers went down. If any did, `_unassign` all - // Jobs in UberJobs for the downed workers. The `_unassigned` - // Jobs should get reassigned in the next section `assignJobsToUberJobs`. - - // &&& Send appropriate messages to all ActiveWorkers + // Send appropriate messages to all ActiveWorkers. 
This will + // check if workers have died by timeout. The response + // from the worker includes lists of the queries it has cancelled. _czarRegistry->sendActiveWorkersMessages(); /// Create new UberJobs (if possible) for all jobs that are @@ -111,6 +108,7 @@ { // Make a copy of all valid Executives lock_guard execMapLock(_executiveMapMtx); + // Use an iterator so it's easy/quick to delete dead weak pointers. auto iter = _executiveMap.begin(); while (iter != _executiveMap.end()) { auto qIdKey = iter->first; @@ -129,23 +127,15 @@ } // TODO:UJ DM-45470 Maybe get missing results from workers. - // This would be files that workers sent messages to the czar to - // collect, but there was a communication problem and the czar didn't get the message - // or didn't collect the file. to retrieve complete files that haven't been - // collected. - // Basically, is there a reasonable way to check that all UberJobs are being handled - // and nothing has fallen through the cracks? - - // TODO:UJ Maybe send a list of cancelled and completed queries to the workers? - // How long should queryId's remain on this list? - // It's probably better to have the executive for a query to send out - // messages to worker that a user query was cancelled. If a worker sends - // the czar about a cancelled user query, or the executive for that - // query cannot be found, the worker should cancel all Tasks associated - // with that queryId. - // &&& Go through the ActiveWorkerMap. Each ActiveWorker instance has a list of QueryIds - // that have not yet been acknowledged by the worker, so send a message to each worker - // with that list. + // To prevent anything from slipping through the cracks: + // Workers will keep trying to transmit results until they think the czar is dead. + // If a worker thinks the czar died, it will cancel all related jobs that it has, + // and if the czar sends a status message to that worker, that worker will send back + // a separate message saying it killed everything that this czar gave it. Upon + // getting this message from a worker, this czar will reassign everything it had + // sent to that worker. + + // TODO:UJ How long should queryId's remain on this list? } } @@ -202,11 +192,12 @@ Czar::Czar(string const& configFilePath, string const& czarName) << vectRunSizesStr << " -> " << util::prettyCharList(vectRunSizes) << " vectMinRunningSizes=" << vectMinRunningSizesStr << " -> " << util::prettyCharList(vectMinRunningSizes)); - qdisp::QdispPool::Ptr qdispPool = - make_shared<qdisp::QdispPool>(qPoolSize, maxPriority, vectRunSizes, vectMinRunningSizes); - qdisp::CzarStats::setup(qdispPool); + _qdispPool = + make_shared<util::QdispPool>(qPoolSize, maxPriority, vectRunSizes, vectMinRunningSizes); + + qdisp::CzarStats::setup(_qdispPool); - _qdispSharedResources = qdisp::SharedResources::create(qdispPool); + _qdispSharedResources = qdisp::SharedResources::create(_qdispPool); int xrootdCBThreadsMax = _czarConfig->getXrootdCBThreadsMax(); int xrootdCBThreadsInit = _czarConfig->getXrootdCBThreadsInit(); diff --git a/src/czar/Czar.h b/src/czar/Czar.h index 35a1088d7..c8dc221c6 100644 --- a/src/czar/Czar.h +++ b/src/czar/Czar.h @@ -150,6 +150,8 @@ class Czar { /// &&& doc void killIncompleteUbjerJobsOn(std::string const& workerId); + std::shared_ptr<util::QdispPool> getQdispPool() const { return _qdispPool; } + /// Startup time of czar, sent to workers so they can detect that the czar /// was restarted when this value changes. static uint64_t const czarStartupTime; @@ -230,7 +232,7 @@ class Czar { /// Wait time between checks.
TODO:UJ set from config std::chrono::milliseconds _monitorSleepTime{15000}; + /// Keeps track of all workers (alive or otherwise) that this czar + /// may communicate with. Once created, the pointer never changes. std::shared_ptr _activeWorkerMap; + + /// A combined priority queue and thread pool to regulate czar communications + /// with workers. Once created, the pointer never changes. + std::shared_ptr _qdispPool; }; } // namespace lsst::qserv::czar diff --git a/src/czar/CzarRegistry.cc b/src/czar/CzarRegistry.cc index 5ef8748d6..6f9275b71 100644 --- a/src/czar/CzarRegistry.cc +++ b/src/czar/CzarRegistry.cc @@ -216,7 +216,7 @@ void CzarRegistry::sendActiveWorkersMessages() { _activeWorkerMap.sendActiveWorkersMessages(); } -void CzarRegistry::endUserQuery(QueryId qId, bool deleteWorkerResults) { +void CzarRegistry::endUserQueryOnWorkers(QueryId qId, bool deleteWorkerResults) { lock_guard lck(_mapMtx); // Add query id to the appropriate list. if (deleteWorkerResults) { diff --git a/src/czar/CzarRegistry.h b/src/czar/CzarRegistry.h index 302b5a3f0..076f7fd40 100644 --- a/src/czar/CzarRegistry.h +++ b/src/czar/CzarRegistry.h @@ -82,8 +82,10 @@ class CzarRegistry { /// &&& doc void sendActiveWorkersMessages(); - /// &&& doc - void endUserQuery(QueryId qId, bool deleteWorkerResults); + /// Add the query id to the list of queries to end on workers and + /// send the messages, deleting all result files if + /// `deleteWorkerResults` is true. + void endUserQueryOnWorkers(QueryId qId, bool deleteWorkerResults); private: CzarRegistry() = delete; diff --git a/src/http/WorkerQueryStatusData.h b/src/http/WorkerQueryStatusData.h index 21b3fe448..c56c148b0 100644 --- a/src/http/WorkerQueryStatusData.h +++ b/src/http/WorkerQueryStatusData.h @@ -106,11 +106,7 @@ class WorkerContactInfo { std::string cName(const char* fn) { return std::string("WorkerContactInfo::") + fn; } - // &&&QM &&&HERE should all of these be constant??? - std::string const wId; ///< key - //&&&std::string const wHost; ///< "host-addr" entry. - //&&&std::string const wManagementHost; ///< "management-host-name" entry. - //&&&int const wPort; ///< "management-port" entry. + std::string const wId; ///< key, this is the one thing that cannot change. 
std::string getWHost() const { std::lock_guard lg(_rMtx); diff --git a/src/qdisp/CMakeLists.txt b/src/qdisp/CMakeLists.txt index 2bc919dd4..fc3193ba4 100644 --- a/src/qdisp/CMakeLists.txt +++ b/src/qdisp/CMakeLists.txt @@ -8,7 +8,6 @@ target_sources(qdisp PRIVATE JobBase.cc JobDescription.cc JobQuery.cc - QdispPool.cc UberJob.cc ) diff --git a/src/qdisp/CzarStats.cc b/src/qdisp/CzarStats.cc index 0d39232c5..ca741e83c 100644 --- a/src/qdisp/CzarStats.cc +++ b/src/qdisp/CzarStats.cc @@ -29,8 +29,8 @@ // Qserv headers #include "cconfig/CzarConfig.h" -#include "qdisp/QdispPool.h" #include "util/Bug.h" +#include "util/QdispPool.h" #include "util/TimeUtils.h" // LSST headers @@ -48,7 +48,7 @@ namespace lsst::qserv::qdisp { CzarStats::Ptr CzarStats::_globalCzarStats; util::Mutex CzarStats::_globalMtx; -void CzarStats::setup(qdisp::QdispPool::Ptr const& qdispPool) { +void CzarStats::setup(util::QdispPool::Ptr const& qdispPool) { std::lock_guard lg(_globalMtx); if (_globalCzarStats != nullptr || qdispPool == nullptr) { throw util::Bug(ERR_LOC, "Error CzarStats::setup called after global pointer set or qdispPool=null."); @@ -56,7 +56,7 @@ void CzarStats::setup(qdisp::QdispPool::Ptr const& qdispPool) { _globalCzarStats = Ptr(new CzarStats(qdispPool)); } -CzarStats::CzarStats(qdisp::QdispPool::Ptr const& qdispPool) +CzarStats::CzarStats(util::QdispPool::Ptr const& qdispPool) : _qdispPool(qdispPool), _startTimeMs(util::TimeUtils::now()) { auto bucketValsRates = {128'000.0, 512'000.0, 1'024'000.0, 16'000'000.0, 128'000'000.0, 256'000'000.0, 512'000'000.0, 768'000'000.0, diff --git a/src/qdisp/CzarStats.h b/src/qdisp/CzarStats.h index 6a2c10ef2..db80be7ec 100644 --- a/src/qdisp/CzarStats.h +++ b/src/qdisp/CzarStats.h @@ -44,9 +44,12 @@ // Third party headers #include -namespace lsst::qserv::qdisp { +namespace lsst::qserv::util { class QdispPool; +} + +namespace lsst::qserv::qdisp { /// This class is used to track statistics for the czar. /// setup() needs to be called before get(). @@ -79,7 +82,7 @@ class CzarStats : std::enable_shared_from_this { /// Setup the global CzarStats instance /// @throws Bug if global has already been set or qdispPool is null. - static void setup(std::shared_ptr const& qdispPool); + static void setup(std::shared_ptr const& qdispPool); /// Return a pointer to the global CzarStats instance. /// @throws Bug if get() is called before setup() @@ -206,13 +209,13 @@ class CzarStats : std::enable_shared_from_this { nlohmann::json getTransmitStatsJson() const; private: - CzarStats(std::shared_ptr const& qdispPool); + CzarStats(std::shared_ptr const& qdispPool); static Ptr _globalCzarStats; ///< Pointer to the global instance. static util::Mutex _globalMtx; ///< Protects `_globalCzarStats` /// Connection to get information about the czar's pool of dispatch threads. - std::shared_ptr _qdispPool; + std::shared_ptr _qdispPool; /// The start up time (milliseconds since the UNIX EPOCH) of the status collector. 
uint64_t const _startTimeMs = 0; diff --git a/src/qdisp/Executive.cc b/src/qdisp/Executive.cc index b9a48145c..821673d25 100644 --- a/src/qdisp/Executive.cc +++ b/src/qdisp/Executive.cc @@ -74,6 +74,7 @@ #include "util/AsyncTimer.h" #include "util/Bug.h" #include "util/EventThread.h" +#include "util/QdispPool.h" using namespace std; @@ -226,7 +227,7 @@ JobQuery::Ptr Executive::add(JobDescription::Ptr const& jobDesc) { return jobQuery; } -void Executive::queueJobStart(PriorityCommand::Ptr const& cmd) { +void Executive::queueJobStart(util::PriorityCommand::Ptr const& cmd) { _jobStartCmdList.push_back(cmd); if (_scanInteractive) { _qdispPool->queCmd(cmd, 0); @@ -235,7 +236,7 @@ void Executive::queueJobStart(PriorityCommand::Ptr const& cmd) { } } -void Executive::queueFileCollect(PriorityCommand::Ptr const& cmd) { +void Executive::queueFileCollect(util::PriorityCommand::Ptr const& cmd) { // &&& put file collect in the pool ??? if (_scanInteractive) { _qdispPool->queCmd(cmd, 3); } else { @@ -244,20 +245,15 @@ void Executive::queueFileCollect(PriorityCommand::Ptr const& cmd) { } void Executive::runUberJob(std::shared_ptr const& uberJob) { - /// TODO:UJ delete useqdisppool, only set to false if problems during testing - bool const useqdisppool = true; - if (useqdisppool) { - auto runUberJobFunc = [uberJob](util::CmdData*) { uberJob->runUberJob(); }; - - auto cmd = qdisp::PriorityCommand::Ptr(new qdisp::PriorityCommand(runUberJobFunc)); - _jobStartCmdList.push_back(cmd); - if (_scanInteractive) { - _qdispPool->queCmd(cmd, 0); - } else { - _qdispPool->queCmd(cmd, 1); - } + + auto runUberJobFunc = [uberJob](util::CmdData*) { uberJob->runUberJob(); }; + + auto cmd = util::PriorityCommand::Ptr(new util::PriorityCommand(runUberJobFunc)); + _jobStartCmdList.push_back(cmd); + if (_scanInteractive) { + _qdispPool->queCmd(cmd, 0); } else { - uberJob->runUberJob(); + _qdispPool->queCmd(cmd, 1); } } @@ -476,7 +472,7 @@ void Executive::squash() { // Any message to this czar about this query should result in an error sent back to // the worker as soon it can't locate an executive or the executive says cancelled. bool const deleteResults = true; - sendWorkerCancelMsg(deleteResults); + sendWorkersEndMsg(deleteResults); LOGS(_log, LOG_LVL_DEBUG, "Executive::squash done"); } @@ -506,24 +502,14 @@ void Executive::_squashSuperfluous() { } bool const keepResults = false; - sendWorkerCancelMsg(keepResults); + sendWorkersEndMsg(keepResults); LOGS(_log, LOG_LVL_DEBUG, "Executive::squashSuperfluous done"); } -void Executive::sendWorkerCancelMsg(bool deleteResults) { // &&&QM rename sendEndMsgs - // TODO:UJ need to send a message to the worker that the query is cancelled and all result files - // should be delete - // &&&QM - // TODO:UJ &&& worker needs to monitor registry to see if czar dies - // &&& - worker will need to kill related queries/uberjobs and store info to send to the - // &&& dead czar in case it comes back to life. - LOGS(_log, LOG_LVL_ERROR, - "TODO:UJ NEED CODE Executive::sendWorkerCancelMsg to send messages to workers to cancel this czarId " - "+ " - "queryId. 
" +void Executive::sendWorkersEndMsg(bool deleteResults) { + LOGS(_log, LOG_LVL_INFO, cName(__func__) << " terminating this query deleteResults=" << deleteResults); - - czar::Czar::getCzar()->getCzarRegistry()->endUserQuery(_id, deleteResults); // &&&QM + czar::Czar::getCzar()->getCzarRegistry()->endUserQueryOnWorkers(_id, deleteResults); } int Executive::getNumInflight() const { diff --git a/src/qdisp/Executive.h b/src/qdisp/Executive.h index b954c3313..c60261238 100644 --- a/src/qdisp/Executive.h +++ b/src/qdisp/Executive.h @@ -42,7 +42,6 @@ #include "qdisp/JobDescription.h" #include "qdisp/ResponseHandler.h" #include "qdisp/SharedResources.h" -#include "qdisp/QdispPool.h" #include "qdisp/UberJob.h" #include "qmeta/JobStatus.h" #include "util/EventThread.h" @@ -80,6 +79,8 @@ class InfileMerger; namespace util { class AsyncTimer; +class PriorityCommand; +class QdispPool; } namespace qdisp { @@ -133,10 +134,10 @@ class Executive : public std::enable_shared_from_this { void runUberJob(std::shared_ptr const& uberJob); /// Queue a job to be sent to a worker so it can be started. - void queueJobStart(PriorityCommand::Ptr const& cmd); + void queueJobStart(std::shared_ptr const& cmd); // &&& delete ??? /// Queue `cmd`, using the QDispPool, so it can be used to collect the result file. - void queueFileCollect(PriorityCommand::Ptr const& cmd); + void queueFileCollect(std::shared_ptr const& cmd); // &&& delete ??? /// Waits for all jobs on _jobStartCmdList to start. This should not be called /// before ALL jobs have been added to the pool. @@ -174,7 +175,7 @@ class Executive : public std::enable_shared_from_this { /// @return true if cancelled bool getCancelled() { return _cancelled; } - std::shared_ptr getQdispPool() { return _qdispPool; } + std::shared_ptr getQdispPool() { return _qdispPool; } /// Add 'rowCount' to the total number of rows in the result table. void addResultRows(int64_t rowCount); @@ -222,7 +223,7 @@ class Executive : public std::enable_shared_from_this { /// Send a message to all workers to cancel this query. /// @param deleteResults - If true, delete all result files for this query on the workers. - void sendWorkerCancelMsg(bool deleteResults); + void sendWorkersEndMsg(bool deleteResults); /// &&& doc void killIncompleteUberJobsOn(std::string const& restartedWorkerId); @@ -264,9 +265,9 @@ class Executive : public std::enable_shared_from_this { /// How many jobs are used in this query. 1 avoids possible 0 of 0 jobs completed race condition. /// The correct value is set when it is available. std::atomic _totalJobs{1}; - QdispPool::Ptr _qdispPool; ///< Shared thread pool for handling commands to and from workers. + std::shared_ptr _qdispPool; ///< Shared thread pool for handling commands to and from workers. - std::deque _jobStartCmdList; ///< list of jobs to start. + std::deque> _jobStartCmdList; ///< list of jobs to start. 
/** Execution errors */ util::MultiError _multiError; diff --git a/src/qdisp/JobBase.h b/src/qdisp/JobBase.h index 88ac1fa98..b6b18d325 100644 --- a/src/qdisp/JobBase.h +++ b/src/qdisp/JobBase.h @@ -58,7 +58,7 @@ class JobBase : public std::enable_shared_from_this { virtual QueryId getQueryId() const = 0; virtual UberJobId getJobId() const = 0; virtual std::string const& getIdStr() const = 0; - virtual std::shared_ptr getQdispPool() = 0; + //&&&virtual std::shared_ptr getQdispPool() = 0; //&&& virtual std::string const& getPayload() const = 0; ///< const& in return type is essential for // xrootd virtual std::shared_ptr getRespHandler() = 0; diff --git a/src/qdisp/JobQuery.cc b/src/qdisp/JobQuery.cc index ad28f5c7e..85c2b4efc 100644 --- a/src/qdisp/JobQuery.cc +++ b/src/qdisp/JobQuery.cc @@ -53,7 +53,7 @@ JobQuery::JobQuery(Executive::Ptr const& executive, JobDescription::Ptr const& j _jobStatus(jobStatus), _qid(qid), _idStr(QueryIdHelper::makeIdStr(qid, getJobId())) { - _qdispPool = executive->getQdispPool(); + //&&&_qdispPool = executive->getQdispPool(); LOGS(_log, LOG_LVL_TRACE, "JobQuery desc=" << _jobDescription); } diff --git a/src/qdisp/JobQuery.h b/src/qdisp/JobQuery.h index 802cc44fc..c6fcc0829 100644 --- a/src/qdisp/JobQuery.h +++ b/src/qdisp/JobQuery.h @@ -40,7 +40,7 @@ namespace lsst::qserv::qdisp { -class QdispPool; +//&&&class QdispPool; class QueryRequest; /// This class is used to describe, monitor, and control a single query to a worker. @@ -78,7 +78,7 @@ class JobQuery : public JobBase { std::shared_ptr getExecutive() override { return _executive.lock(); } - std::shared_ptr getQdispPool() override { return _qdispPool; } + //&&&std::shared_ptr getQdispPool() override { return _qdispPool; } std::ostream& dumpOS(std::ostream& os) const override; @@ -148,7 +148,7 @@ class JobQuery : public JobBase { // Cancellation std::atomic _cancelled{false}; ///< Lock to make sure cancel() is only called once. - std::shared_ptr _qdispPool; + //&&& std::shared_ptr _qdispPool; /// The UberJobId that this job is assigned to. Values less than zero /// indicate this job is unassigned. To prevent race conditions, diff --git a/src/qdisp/QdispPool.cc b/src/qdisp/QdispPool.cc deleted file mode 100644 index 137e59a34..000000000 --- a/src/qdisp/QdispPool.cc +++ /dev/null @@ -1,263 +0,0 @@ -// -*- LSST-C++ -*- -/* - * LSST Data Management System - * Copyright 2018 AURA/LSST. - * - * This product includes software developed by the - * LSST Project (http://www.lsst.org/). - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the LSST License Statement and - * the GNU General Public License along with this program. If not, - * see . - */ - -// Class header -#include "qdisp/QdispPool.h" - -// LSST headers -#include "lsst/log/Log.h" - -// Qserv headers -#include "util/Bug.h" -#include "util/common.h" - -namespace { -LOG_LOGGER _log = LOG_GET("lsst.qserv.qdisp.QdispPool"); -} - -namespace lsst::qserv::qdisp { - -///< @Return true if the queue could be added. 
-bool PriorityQueue::addPriQueue(int priority, int minRunning, int maxRunning) { - std::lock_guard lock(_mtx); - auto q = std::make_shared(priority, minRunning, maxRunning); - // std::pair item(priority, q); - auto item = std::make_pair(priority, q); - auto ret = _queues.insert(item); - if (!ret.second) { - LOGS(_log, LOG_LVL_ERROR, "Failed addPriQueue priority=" << priority << " minRunning=" << minRunning); - } - return ret.second; -} - -/// The pool needs to be able to place commands in this queue for shutdown. -void PriorityQueue::queCmd(util::Command::Ptr const& cmd) { - { - std::lock_guard lock(_mtx); - auto iter = _queues.find(_defaultPriority); - if (iter == _queues.end()) { - throw util::Bug(ERR_LOC, "PriorityQueue default priority queue not found a!"); - } - iter->second->queCmd(cmd); - _changed = true; - } - _cv.notify_one(); -} - -void PriorityQueue::queCmd(PriorityCommand::Ptr const& cmd, int priority) { - { - std::lock_guard lock(_mtx); - auto iter = _queues.find(priority); - if (iter == _queues.end()) { - // give it the default priority - LOGS(_log, LOG_LVL_WARN, - "queCmd invalid priority=" << priority << " using default priority=" << _defaultPriority); - iter = _queues.find(_defaultPriority); - if (iter == _queues.end()) { - throw util::Bug(ERR_LOC, "PriorityQueue default priority queue not found b!"); - } - } - cmd->_priority = priority; - iter->second->queCmd(cmd); - LOGS(_log, LOG_LVL_DEBUG, "priQue p=" << priority << _statsStr()); - _changed = true; - } - _cv.notify_one(); -} - -std::atomic localLogLimiter(0); - -util::Command::Ptr PriorityQueue::getCmd(bool wait) { - util::Command::Ptr ptr; - std::unique_lock uLock(_mtx); - while (true) { - _changed = false; - ++localLogLimiter; - // Log this every once in while to INFO so there's some idea of system - // load without generating crushing amounts of log messages. - if (localLogLimiter % 500 == 0) { - LOGS(_log, LOG_LVL_INFO, "priQueGet " << _statsStr()); - } else { - LOGS(_log, LOG_LVL_DEBUG, "priQueGet " << _statsStr()); - } - - /// Make sure minimum number of jobs running per priority. - if (!_shuttingDown) { - // If shutting down, this could prevent all jobs from completing. - // Goes from highest to lowest priority queue - for (auto const& elem : _queues) { - PriQ::Ptr const& que = elem.second; - if (que->running < que->getMinRunning()) { - ptr = que->getCmd(false); // no wait - if (ptr != nullptr) { - return ptr; - } - } - } - } - - // Since all the minimums are met, just run the first command found. - for (auto const& elem : _queues) { - PriQ::Ptr const& que = elem.second; - // If this queue has no running threads, or - if (que->running < que->getMaxRunning()) { - ptr = que->getCmd(false); // no wait - if (ptr != nullptr) { - _changed = true; - _cv.notify_one(); - return ptr; - } - } - } - - // If nothing was found, wait or return nullptr. 
- if (wait) { - LOGS(_log, LOG_LVL_DEBUG, "getCmd wait " << _statsStr()); - _cv.wait(uLock, [this]() { return _changed; }); - } else { - return ptr; - } - } -} - -void PriorityQueue::prepareShutdown() { - std::lock_guard lock(_mtx); - _shuttingDown = true; -} - -void PriorityQueue::_incrDecrRunningCount(util::Command::Ptr const& cmd, int incrDecr) { - std::lock_guard lock(_mtx); - PriorityCommand::Ptr priCmd = std::dynamic_pointer_cast(cmd); - if (priCmd != nullptr) { - int priority = priCmd->_priority; - auto iter = _queues.find(priority); - if (iter != _queues.end()) { - iter->second->running += incrDecr; - return; - } - } else if (cmd != nullptr) { - // Non-PriorityCommands go on the default queue. - auto iter = _queues.find(_defaultPriority); - if (iter != _queues.end()) { - iter->second->running += incrDecr; - } - } -} - -void PriorityQueue::commandStart(util::Command::Ptr const& cmd) { - // Increase running count by 1 - _incrDecrRunningCount(cmd, 1); -} - -void PriorityQueue::commandFinish(util::Command::Ptr const& cmd) { - // Reduce running count by 1 - _incrDecrRunningCount(cmd, -1); -} - -std::vector PriorityQueue::stats() const { - std::lock_guard const lock(_mtx); - return _stats(); -} - -std::vector PriorityQueue::_stats() const { - std::vector result; - for (auto const& elem : _queues) { - PriQ::Ptr const& queue = elem.second; - result.push_back(queue->stats()); - } - return result; -} - -std::string PriorityQueue::_statsStr() const { - std::stringstream os; - for (auto const& queueStats : _stats()) { - os << "(pr=" << queueStats.priority << ":sz=" << queueStats.size << ":r=" << queueStats.running - << ")"; - } - return os.str(); -} - -nlohmann::json PriorityQueue::getJson() const { - std::lock_guard const lock(_mtx); - nlohmann::json jsArray = nlohmann::json::array(); - for (auto const& queueStats : _stats()) { - nlohmann::json js; - js["priority"] = queueStats.priority; - js["size"] = queueStats.size; - js["running"] = queueStats.running; - jsArray.push_back(js); - } - return jsArray; -} - -QdispPool::QdispPool(int poolSize, int largestPriority, std::vector const& maxRunSizes, - std::vector const& minRunningSizes) { - std::stringstream os; - os << "poolSize(max " << maxPoolSize() << ")=" << poolSize << " maxPriority(1 to " - << defaultPriority() - 2 << ")=" << largestPriority - << " maxRunSizes=" << util::prettyCharList(maxRunSizes) - << " minRunningSizes=" << util::prettyCharList(minRunningSizes); - if (poolSize < 1 || poolSize > maxPoolSize() || largestPriority < 0 || - maxRunSizes.size() < static_cast(largestPriority) + 1 || - largestPriority > defaultPriority() - 2) { - LOGS(_log, LOG_LVL_ERROR, "QdispPool invalid paramater " << os.str()); - throw std::invalid_argument(os.str()); - } - - LOGS(_log, LOG_LVL_INFO, "QdispPool creating " << os.str()); - _prQueue = std::make_shared(defaultPriority(), 1, 1); // default (lowest) priority. - for (unsigned int pri = 0; pri <= static_cast(largestPriority); ++pri) { - size_t const minRun = minRunningSizes.size() > pri ? minRunningSizes[pri] : 1; - size_t const maxRun = maxRunSizes.size() > pri ? maxRunSizes[pri] : 1; - LOGS(_log, LOG_LVL_INFO, "creating priQ pri=" << pri << " min=" << minRun << " max=" << maxRun); - _prQueue->addPriQueue(pri, minRun, maxRun); - } - // This pool does not kick threads out when they take time (but little CPU) to process, - // so maxPoolThreads is just slightly larger than poolSize. 
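The two-pass policy of PriorityQueue::getCmd above (now moved to util) can be summarized with a small sketch. Stand-in types; Cmd and Queue are illustrative, not the deleted classes: pass 1 guarantees each priority its minimum number of running commands, pass 2 fills spare capacity starting from the highest priority (lowest key), and a nullptr return means the caller should wait on the condition variable.

    #include <deque>
    #include <map>

    struct Cmd {};
    struct Queue {
        int running = 0;
        int minRunning = 0;
        int maxRunning = 1;
        std::deque<Cmd*> cmds;
    };

    Cmd* pick(std::map<int, Queue>& queues) {
        for (auto& [pri, q] : queues) {  // pass 1: meet the minimums
            if (q.running < q.minRunning && !q.cmds.empty()) return q.cmds.front();
        }
        for (auto& [pri, q] : queues) {  // pass 2: fill up to the maximums
            if (q.running < q.maxRunning && !q.cmds.empty()) return q.cmds.front();
        }
        return nullptr;  // nothing runnable; caller waits for a change
    }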
- _pool = util::ThreadPool::newThreadPool(poolSize, _prQueue); -} - -QdispPool::QdispPool(bool unitTest) { - if (not unitTest) { - std::string msg( - "QdispPool::QdispPool(bool unitTest) " - "This constructor is only meant for use with unit tests."); - LOGS(_log, LOG_LVL_ERROR, - "QdispPool::QdispPool(bool unitTest) This constructor is only meant for use with unit tests."); - throw std::invalid_argument(msg); - } else { - _prQueue = std::make_shared(100, 1, 1); // default (lowest) priority. - unsigned int poolSize = 50; - _pool = util::ThreadPool::newThreadPool(poolSize, _prQueue); - _prQueue->addPriQueue(0, 1, 3); // Highest priority - interactive queries - _prQueue->addPriQueue(1, 1, 3); // Outgoing shared scan queries. - _prQueue->addPriQueue(2, 1, 3); // FAST queries (Object table) - _prQueue->addPriQueue(3, 1, 3); // MEDIUM queries (Source table) - _prQueue->addPriQueue(4, 1, 3); // SLOW queries (Object Extra table) - _prQueue->addPriQueue(5, 1, 3); // FAST large results - _prQueue->addPriQueue(6, 1, 3); // MEDIUM large results - _prQueue->addPriQueue(7, 1, 3); // Everything else (slow things) - } -} - -} // namespace lsst::qserv::qdisp diff --git a/src/qdisp/QdispPool.h b/src/qdisp/QdispPool.h deleted file mode 100644 index d3e1af774..000000000 --- a/src/qdisp/QdispPool.h +++ /dev/null @@ -1,205 +0,0 @@ -// -*- LSST-C++ -*- -/* - * LSST Data Management System - * Copyright 2018 LSST Corporation. - * - * This product includes software developed by the - * LSST Project (http://www.lsst.org/). - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the LSST License Statement and - * the GNU General Public License along with this program. If not, - * see . - */ -#ifndef LSST_QSERV_QDISP_QDISPPOOL_H -#define LSST_QSERV_QDISP_QDISPPOOL_H - -// System headers -#include -#include - -// Third-party headers -#include - -// Qserv headers -#include "util/ThreadPool.h" - -namespace lsst::qserv::qdisp { - -class PriorityQueue; - -class PriorityCommand : public util::CommandTracked { -public: - using Ptr = std::shared_ptr; - PriorityCommand() = default; - explicit PriorityCommand(std::function func) : CommandTracked(func) {} - ~PriorityCommand() override = default; - friend PriorityQueue; - -private: - int _priority{0}; // Need to know what queue this was placed on. -}; - -/// FIFO priority queue. Elements with the same priority are handled in -/// a FIFO manner. Lower integer values are higher priority. -/// Higher priority queues get asked first when a thread becomes available -/// but the system reserves room so that each priority has at least -/// a minimum number of threads running. -class PriorityQueue : public util::CommandQueue { -public: - using Ptr = std::shared_ptr; - - /// A queue for handling all messages of a given priority. - class PriQ : public util::CommandQueue { - public: - using Ptr = std::shared_ptr; - - /// A snapshot status of the queue for logging or monitoring purposes. 
- struct Stats { - Stats(int priority_, size_t size_, int running_) - : priority(priority_), size(size_), running(running_) {} - int priority; - size_t size; - int running; - }; - - explicit PriQ(int priority, int minRunning, int maxRunning) - : _priority(priority), _minRunning(minRunning), _maxRunning(maxRunning) {} - ~PriQ() override = default; - int getPriority() const { return _priority; } - int getMinRunning() const { return _minRunning; } - int getMaxRunning() const { return _maxRunning; } - - Stats stats() const { return Stats(_priority, const_cast(this)->size(), running); } - - std::atomic running{0}; ///< number of jobs of this priority currently running. - private: - int const _priority; ///< priority value of this queue - int const _minRunning; ///< minimum number of threads (unless nothing on this queue to run) - int const _maxRunning; ///< maximum number of threads for this PriQ to use. - }; - - PriorityQueue() = delete; - PriorityQueue(PriorityQueue const&) = delete; - PriorityQueue& operator=(PriorityQueue const&) = delete; - - PriorityQueue(int defaultPriority, int minRunning, int maxRunning) : _defaultPriority(defaultPriority) { - _queues[_defaultPriority] = std::make_shared(_defaultPriority, minRunning, maxRunning); - } - - ///< @Return true if the queue could be added. - bool addPriQueue(int priority, int minRunning, int spareThreads); - - /// The pool needs to be able to place commands in this queue for shutdown. - void queCmd(util::Command::Ptr const& cmd) override; - - void queCmd(PriorityCommand::Ptr const& cmd, int priority); - - util::Command::Ptr getCmd(bool wait = true) override; - void prepareShutdown(); - - void commandStart(util::Command::Ptr const& cmd) override; - void commandFinish(util::Command::Ptr const& cmd) override; - - /// @return a snapshot of statistics for all queues (one element per queue) - std::vector stats() const; - - /// @return a json object with queue information. - nlohmann::json getJson() const; - -private: - /// @note a lock on _mtx must be held before calling the method - /// @return a snapshot of statistics for all queues (one element per queue) - std::vector _stats() const; - - /// @note a lock on _mtx must be held before calling the method - /// @return the stringified representation of the statistics for all queues - std::string _statsStr() const; - - void _incrDecrRunningCount(util::Command::Ptr const& cmd, int incrDecr); - - mutable std::mutex _mtx; - std::condition_variable _cv; - bool _shuttingDown{false}; - bool _changed{false}; - - std::map _queues; - int _defaultPriority{1}; -}; - -/// This class is used to provide a pool of threads for handling out going -/// and incoming messages from xrootd as well as a system for prioritizing -/// the messages. -/// This has not worked entirely as intended. Reducing the number of threads -/// had negative impacts on xrootd, but other changes have been made such that -/// reducing the size of the thread pools can be tried again. -/// What it does do is prioritize out going messages (typically jobs going to -/// workers), allow interactive queries to be handled quickly, even under -/// substantial loads, and it gives a good idea of how busy the czar really -/// is. Large numbers of queued items in any of the scan queries, or large -/// results would be good indicators to avoid giving a particular czar more -/// user queries. -/// -class QdispPool { -public: - typedef std::shared_ptr Ptr; - - /// Default priority, the lowest possible priority. 
- static int defaultPriority() { return 100; } - /// This should be more than enough. - static int maxPoolSize() { return 20000; } - - /// poolSize - total number of threads in the pool - /// largestPriority - highest priority is 0, lowest possible priority is - /// 100 and is reserved for default priority. largestPriority=4 would - /// result in PriorityQueues's being created for - /// priorities 0, 1, 2, 3, 4, and 100 - /// runSizes - Each entry represents the maximum number of concurrent running - /// commands for a priority given by the position in the array. - /// If a position is undefined, the default value is 1. - /// ex. 5, 10, 10, 3, 3 would apply to the priorities above as - /// priority 0 can have up to 5 concurrent running commands - /// priorities 1 and 2 can have up to 10 - /// priorities 3 and 4 can have up to 3 - /// minRunningSizes - Each entry represents the minimum number of threads - /// to be running (defaults to 0). Non-zero values can keep - /// lower priorities from being completely stared and/or - /// reduce deadlocks from high priorities depending on lower - /// priorities. - QdispPool(int poolSize, int largestPriority, std::vector const& maxRunSizes, - std::vector const& minRunningSizes); - QdispPool() = delete; - explicit QdispPool(bool unitTest); - QdispPool(QdispPool const&) = delete; - QdispPool& operator=(QdispPool const&) = delete; - - /// Lower priority numbers are higher priority. - /// Invalid priorities get the lowest priority (high priority number). - void queCmd(PriorityCommand::Ptr const& cmd, int priority) { _prQueue->queCmd(cmd, priority); } - - /// Commands on queue's with priority lower than default may not be run. - void shutdownPool() { - _prQueue->prepareShutdown(); - _pool->shutdownPool(); - } - - /// @return a json object with queue information. - nlohmann::json getJson() const { return _prQueue->getJson(); } - -private: - PriorityQueue::Ptr _prQueue; - util::ThreadPool::Ptr _pool; -}; - -} // namespace lsst::qserv::qdisp - -#endif /* LSST_QSERV_QDISP_QDISPPOOL_H_ */ diff --git a/src/qdisp/SharedResources.h b/src/qdisp/SharedResources.h index 37d06f701..6ca6eb8a3 100644 --- a/src/qdisp/SharedResources.h +++ b/src/qdisp/SharedResources.h @@ -25,19 +25,23 @@ // System headers #include -namespace lsst::qserv::qdisp { - +namespace lsst::qserv::util { // &&& delete class QdispPool; +} + +namespace lsst::qserv::qdisp { /// Put resources that all Executives need to share in one class to reduce /// the number of arguments passed. /// This class should be kept simple so it can easily be included in headers /// without undue compiler performances problems. + // &&& there's nothing in here but qdisppool!? Try to delete, but there + // &&& will probably be unit test issues. class SharedResources { public: using Ptr = std::shared_ptr; - static Ptr create(std::shared_ptr const& qdispPool) { + static Ptr create(std::shared_ptr const& qdispPool) { return Ptr(new SharedResources(qdispPool)); } @@ -46,13 +50,13 @@ class SharedResources { SharedResources& operator=(SharedResources const&) = delete; ~SharedResources() = default; - std::shared_ptr getQdispPool() { return _qdispPool; } + std::shared_ptr getQdispPool() { return _qdispPool; } //&&& delete private: - SharedResources(std::shared_ptr const& qdispPool) : _qdispPool(qdispPool) {} + SharedResources(std::shared_ptr const& qdispPool) : _qdispPool(qdispPool) {} /// Thread pool for handling Responses from XrdSsi. 
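The constructor contract documented above translates into a call like the following. The maxRunSizes values follow the "5, 10, 10, 3, 3" example in the comment; poolSize and the minimums are illustrative, not recommended settings:

    #include <memory>
    #include <vector>

    std::vector<int> const maxRunSizes = {5, 10, 10, 3, 3};   // priorities 0..4
    std::vector<int> const minRunningSizes = {1, 1, 1, 0, 0}; // keep 0..2 from starving
    // Priority 100 (the reserved default) is created automatically.
    // auto pool = std::make_shared<QdispPool>(/*poolSize=*/50,
    //                                         /*largestPriority=*/4,
    //                                         maxRunSizes, minRunningSizes);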
-    std::shared_ptr<QdispPool> _qdispPool;
+    std::shared_ptr<util::QdispPool> _qdispPool;
 };

 }  // namespace lsst::qserv::qdisp

diff --git a/src/qdisp/UberJob.cc b/src/qdisp/UberJob.cc
index 7da2079b0..002d48085 100644
--- a/src/qdisp/UberJob.cc
+++ b/src/qdisp/UberJob.cc
@@ -41,6 +41,7 @@
 #include "qmeta/JobStatus.h"
 #include "util/Bug.h"
 #include "util/common.h"
+#include "util/QdispPool.h"

 // LSST headers
 #include "lsst/log/Log.h"
@@ -52,7 +53,7 @@ namespace {
 LOG_LOGGER _log = LOG_GET("lsst.qserv.qdisp.UberJob");
 }

-namespace lsst { namespace qserv { namespace qdisp {
+namespace lsst::qserv::qdisp {

 UberJob::Ptr UberJob::create(Executive::Ptr const& executive,
                              std::shared_ptr<ResponseHandler> const& respHandler, int queryId, int uberJobId,
@@ -335,7 +336,7 @@ json UberJob::importResultFile(string const& fileUrl, uint64_t rowCount, uint64_
         ujPtr->_importResultFinish(resultRows);
     };

-    auto cmd = qdisp::PriorityCommand::Ptr(new qdisp::PriorityCommand(fileCollectFunc));
+    auto cmd = util::PriorityCommand::Ptr(new util::PriorityCommand(fileCollectFunc));
     exec->queueFileCollect(cmd);

     // If the query meets the row-limit completion criteria, it will start
@@ -508,4 +509,4 @@ std::ostream& UberJob::dumpOS(std::ostream& os) const {
     return os;
 }

-}}}  // namespace lsst::qserv::qdisp
+}  // namespace lsst::qserv::qdisp

diff --git a/src/qdisp/UberJob.h b/src/qdisp/UberJob.h
index 6dc56f0a5..3a599c5c8 100644
--- a/src/qdisp/UberJob.h
+++ b/src/qdisp/UberJob.h
@@ -30,7 +30,10 @@
 #include "qdisp/JobBase.h"
 #include "qmeta/JobStatus.h"

-// This header declarations
+namespace lsst::qserv::util {
+class QdispPool;
+}
+
 namespace lsst::qserv::qdisp {

 class JobQuery;
@@ -70,7 +73,8 @@ class UberJob : public JobBase {
         return _uberJobId;
     }  // TODO:UJ change name when JobBase no longer needed.
     std::string const& getIdStr() const override { return _idStr; }
-    std::shared_ptr<QdispPool> getQdispPool() override { return _qdispPool; }
+    //&&&std::shared_ptr<QdispPool> getQdispPool() override { return _qdispPool; }
+    //&&&std::shared_ptr<QdispPool> getQdispPool() { return _qdispPool; }
     std::shared_ptr<ResponseHandler> getRespHandler() override { return _respHandler; }
     std::shared_ptr<qmeta::JobStatus> getStatus() override {
         return _jobStatus;
@@ -157,7 +161,7 @@ class UberJob : public JobBase {
     qmeta::CzarId const _czarId;
     std::string const _idStr;

-    std::shared_ptr<QdispPool> _qdispPool;  // TODO:UJ remove when possible.
+    std::shared_ptr<util::QdispPool> _qdispPool;  // TODO:UJ remove when possible.
&&& delete // Map of workerData czar::CzarChunkMap::WorkerChunksData::Ptr _workerData; // TODO:UJ this may not be needed diff --git a/src/qdisp/testQDisp.cc b/src/qdisp/testQDisp.cc index 22934d587..d3d2fa9f6 100644 --- a/src/qdisp/testQDisp.cc +++ b/src/qdisp/testQDisp.cc @@ -44,6 +44,7 @@ #include "qmeta/MessageStore.h" #include "qproc/ChunkQuerySpec.h" #include "qproc/TaskMsgFactory.h" +#include "util/QdispPool.h" #include "util/threadSafe.h" namespace test = boost::test_tools; @@ -64,12 +65,6 @@ namespace lsst::qserv::qproc { class MockTaskMsgFactory : public TaskMsgFactory { public: MockTaskMsgFactory(std::string const& mockPayload_) : TaskMsgFactory(), mockPayload(mockPayload_) {} - /* &&& - void serializeMsg(ChunkQuerySpec const& s, std::string const& chunkResultName, QueryId queryId, int jobId, - int attemptCount, qmeta::CzarId czarId, std::ostream& os) override { - os << mockPayload; - } - */ std::shared_ptr makeMsgJson(ChunkQuerySpec const& s, std::string const& chunkResultName, QueryId queryId, int jobId, int attemptCount, @@ -161,7 +156,7 @@ class SetupTest { std::string str; qdisp::ExecutiveConfig::Ptr conf; std::shared_ptr ms; - qdisp::QdispPool::Ptr qdispPool; + util::QdispPool::Ptr qdispPool; qdisp::SharedResources::Ptr sharedResources; qdisp::Executive::Ptr ex; std::shared_ptr jqTest; // used only when needed @@ -169,11 +164,10 @@ class SetupTest { SetupTest(const char* request) { qrMsg = request; - //&&& qdisp::XrdSsiServiceMock::Reset(); str = qdisp::ExecutiveConfig::getMockStr(); conf = std::make_shared(str, 0); // No updating of QMeta. ms = std::make_shared(); - qdispPool = std::make_shared(true); + qdispPool = std::make_shared(true); sharedResources = qdisp::SharedResources::create(qdispPool); std::shared_ptr qStatus; // No updating QStatus, nullptr @@ -200,7 +194,6 @@ BOOST_AUTO_TEST_CASE(Executive) { int jobs = 0; _log.setLevel(LOG_LVL_DEBUG); // Ugly but boost test suite forces this std::thread timeoutT(&timeoutFunc, std::ref(done), millisInt); - //&&& qdisp::XrdSsiServiceMock::setRName("/chk/Mock/1234"); // Test single instance { @@ -234,19 +227,10 @@ BOOST_AUTO_TEST_CASE(Executive) { LOGS_DEBUG("Executive detect non-empty job queue test"); SetupTest tEnv("respdata"); SequentialInt sequence(0); - //&&&qdisp::XrdSsiServiceMock::setGo(false); executiveTest(tEnv.ex, sequence, chunkId, tEnv.qrMsg, 5); jobs += 5; - /* &&& - while (qdisp::XrdSsiServiceMock::getCount() < jobs) { - LOGS_DEBUG("waiting for _count(" << qdisp::XrdSsiServiceMock::getCount() << ") == jobs(" << jobs - << ")"); - usleep(10000); - } - */ BOOST_CHECK(tEnv.ex->getEmpty() == false); - //&&&qdisp::XrdSsiServiceMock::setGo(true); LOGS_DEBUG("ex->joining()"); tEnv.ex->join(); LOGS_DEBUG("ex->join() joined"); diff --git a/src/util/CMakeLists.txt b/src/util/CMakeLists.txt index 800a469c1..8b9997888 100644 --- a/src/util/CMakeLists.txt +++ b/src/util/CMakeLists.txt @@ -20,6 +20,7 @@ target_sources(util PRIVATE Issue.cc MultiError.cc Mutex.cc + QdispPool.cc ResultFileNameParser.cc SemaMgr.cc StringHash.cc diff --git a/src/util/xrootd.cc b/src/util/xrootd.cc index a4f967faa..bde271719 100644 --- a/src/util/xrootd.cc +++ b/src/util/xrootd.cc @@ -31,6 +31,8 @@ // Third-party headers #include "boost/format.hpp" +/// &&& file seems unused, delete if possible + namespace lsst::qserv::util { std::string makeUrl(char const* hostport, char const* typeStr, int chunk) { diff --git a/src/util/xrootd.h b/src/util/xrootd.h index bf3c00f8b..947db582b 100644 --- a/src/util/xrootd.h +++ b/src/util/xrootd.h @@ -31,6 +31,8 
@@ // Third-party headers #include +/// &&& file seems unused, delete if possible + namespace lsst::qserv::util { std::string makeUrl(char const* hostport, char const* typeStr, int chunk); diff --git a/src/wbase/FileChannelShared.cc b/src/wbase/FileChannelShared.cc index a7ede98fd..0319f7646 100644 --- a/src/wbase/FileChannelShared.cc +++ b/src/wbase/FileChannelShared.cc @@ -301,8 +301,7 @@ FileChannelShared::FileChannelShared(std::shared_ptr const& _czarPort(czarPort), _workerId(workerId), _protobufArena(make_unique()), - _scsId(scsSeqId++), - _useHttp(true) { + _scsId(scsSeqId++) { LOGS(_log, LOG_LVL_DEBUG, "FileChannelShared created scsId=" << _scsId << " ujId=" << _uberJobId); } @@ -315,14 +314,6 @@ FileChannelShared::~FileChannelShared() { if (isDead()) { _removeFile(lock_guard(_tMtx)); } - if (!_useHttp) { - if (_sendChannel != nullptr) { - _sendChannel->setDestroying(); - if (!_sendChannel->isDead()) { - _sendChannel->kill("~FileChannelShared()"); - } - } - } LOGS(_log, LOG_LVL_DEBUG, "~FileChannelShared end"); } @@ -341,12 +332,7 @@ bool FileChannelShared::kill(string const& note) { } bool FileChannelShared::isDead() { - if (!_useHttp) { - if (_sendChannel == nullptr) return true; - return _sendChannel->isDead(); - } else { - return _dead; - } + return _dead; } string FileChannelShared::makeIdStr(int qId, int jId) { @@ -357,18 +343,9 @@ string FileChannelShared::makeIdStr(int qId, int jId) { bool FileChannelShared::buildAndTransmitError(util::MultiError& multiErr, shared_ptr const& task, bool cancelled) { lock_guard const tMtxLock(_tMtx); - if (!_useHttp) { - if (!_sendResponse(tMtxLock, task, cancelled, multiErr)) { - LOGS(_log, LOG_LVL_ERROR, "Could not transmit the error message to Czar."); - return false; - } - return true; - } else { - // Delete the result file as nobody will come looking for it. - _kill(tMtxLock, " buildAndTransmitError"); - return _uberJobData->responseError(multiErr, task, cancelled); - } - return false; + // Delete the result file as nobody will come looking for it. + _kill(tMtxLock, " buildAndTransmitError"); + return _uberJobData->responseError(multiErr, task, cancelled); } bool FileChannelShared::buildAndTransmitResult(MYSQL_RES* mResult, shared_ptr const& task, @@ -471,16 +448,12 @@ bool FileChannelShared::buildAndTransmitResult(MYSQL_RES* mResult, shared_ptr const& streamMutexLock, string const& note) { LOGS(_log, LOG_LVL_DEBUG, "FileChannelShared::" << __func__ << " " << note); - if (!_useHttp) { - return _sendChannel->kill(note); - } else { - bool oldVal = _dead.exchange(true); - if (!oldVal) { - LOGS(_log, LOG_LVL_WARN, "FileChannelShared first kill call " << note); - } - _removeFile(streamMutexLock); - return oldVal; + bool oldVal = _dead.exchange(true); + if (!oldVal) { + LOGS(_log, LOG_LVL_WARN, "FileChannelShared first kill call " << note); } + _removeFile(streamMutexLock); + return oldVal; } bool FileChannelShared::_writeToFile(lock_guard const& tMtxLock, shared_ptr const& task, @@ -606,58 +579,8 @@ bool FileChannelShared::_sendResponse(lock_guard const& tMtxLock, shared_ // Prepare the response object and serialize in into a message that will // be sent to Czar. 
- if (!_useHttp) { -#if 0 //&&& - proto::ResponseSummary response; - response.set_wname(_workerId); - response.set_queryid(queryId); - response.set_jobid(jobId); - response.set_fileresource_xroot(task->resultFileXrootUrl()); - response.set_fileresource_http(task->resultFileHttpUrl()); - response.set_attemptcount(task->getAttemptCount()); - response.set_rowcount(_rowcount); - response.set_transmitsize(_transmitsize); - string errorMsg; - int errorCode = 0; - if (!multiErr.empty()) { - errorMsg = multiErr.toOneLineString(); - errorCode = multiErr.firstErrorCode(); - } else if (cancelled) { - errorMsg = "cancelled"; - errorCode = -1; - } - if (!errorMsg.empty() or (errorCode != 0)) { - errorMsg = "FileChannelShared::" + string(__func__) + " error(s) in result for chunk #" + - to_string(task->getChunkId()) + ": " + errorMsg; - response.set_errormsg(errorMsg); - response.set_errorcode(errorCode); - LOGS(_log, LOG_LVL_ERROR, errorMsg); - } - response.SerializeToString(&_responseBuf); - - LOGS(_log, LOG_LVL_DEBUG, - __func__ << " idStr=" << idStr << ", _responseBuf.size()=" << _responseBuf.size()); - - // Send the message sent out-of-band within the SSI metadata. - if (!_sendChannel->setMetadata(_responseBuf.data(), _responseBuf.size())) { - LOGS(_log, LOG_LVL_ERROR, __func__ << " failed in setMetadata " << idStr); - _kill(streamMutexLock, "setMetadata"); - return false; - } - - // Send back the empty object since no info is expected by a caller - // for this type of requests beyond the usual error notifications (if any). - // Note that this call is needed to initiate the transaction. - if (!_sendChannel->sendData((char const*)0, 0)) { - LOGS(_log, LOG_LVL_ERROR, __func__ << " failed in sendData " << idStr); - _kill(streamMutexLock, "sendData"); - return false; - } -#endif //&&& - } else { - string httpFileUrl = task->resultFileHttpUrl(); - _uberJobData->responseFileReady(httpFileUrl, _rowcount, _transmitsize, _headerCount); - } + string httpFileUrl = task->resultFileHttpUrl(); + _uberJobData->responseFileReady(httpFileUrl, _rowcount, _transmitsize, _headerCount); return true; } diff --git a/src/wbase/FileChannelShared.h b/src/wbase/FileChannelShared.h index 69e4268fe..fcffe4580 100644 --- a/src/wbase/FileChannelShared.h +++ b/src/wbase/FileChannelShared.h @@ -291,7 +291,6 @@ class FileChannelShared { uint64_t _transmitsize = 0; ///< The total amount of data (bytes) in all result sets of a query. uint64_t _headerCount = 0; ///< Count of headers received. - bool const _useHttp = false; ///< to be eliminated when xrootd is no longer used. std::atomic _dead{false}; ///< Set to true when the contents of the file are no longer useful. 
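The _kill logic above reduces to an "exchange once" idiom: the first caller of kill() sees false, logs, and removes the result file; every later caller sees true and does nothing. A minimal stand-alone sketch (Channel is illustrative, not FileChannelShared):

    #include <atomic>

    struct Channel {
        std::atomic<bool> dead{false};

        // Returns the previous value, mirroring FileChannelShared::_kill:
        // false means this call performed the actual kill.
        bool kill() {
            bool const old = dead.exchange(true);
            if (!old) {
                // first (and only effective) kill: clean up the result file
            }
            return old;
        }
    };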
}; diff --git a/src/wbase/Task.cc b/src/wbase/Task.cc index 6acfd97f6..0389632cc 100644 --- a/src/wbase/Task.cc +++ b/src/wbase/Task.cc @@ -218,9 +218,10 @@ std::vector Task::createTasksForChunk( std::shared_ptr const& queriesAndChunks, uint16_t resultsHttpPort) { QueryId qId = ujData->getQueryId(); UberJobId ujId = ujData->getUberJobId(); + CzarIdType czId = ujData->getCzarId(); //&&&UserQueryInfo::Ptr userQueryInfo = UserQueryInfo::uqMapInsert(qId); - wpublish::QueryStatistics::Ptr queryStats = queriesAndChunks->addQueryId(qId); + wpublish::QueryStatistics::Ptr queryStats = queriesAndChunks->addQueryId(qId, czId); UserQueryInfo::Ptr userQueryInfo = queryStats->getUserQueryInfo(); string funcN(__func__); diff --git a/src/wbase/UberJobData.cc b/src/wbase/UberJobData.cc index ac828fa4d..8a53810a5 100644 --- a/src/wbase/UberJobData.cc +++ b/src/wbase/UberJobData.cc @@ -22,6 +22,7 @@ // Class header #include "wbase/UberJobData.h" +#include "../wcontrol/WCzarInfoMap.h" // System headers // Third party headers @@ -76,9 +77,9 @@ void UberJobData::setFileChannelShared(std::shared_ptr const& void UberJobData::responseFileReady(string const& httpFileUrl, uint64_t rowCount, uint64_t fileSize, uint64_t headerCount) { - string const funcN = cName(__func__); + //&&&string const funcN = cName(__func__); LOGS(_log, LOG_LVL_TRACE, - funcN << " httpFileUrl=" << httpFileUrl << " rows=" << rowCount << " fSize=" << fileSize + cName(__func__) << " httpFileUrl=" << httpFileUrl << " rows=" << rowCount << " fSize=" << fileSize << " headerCount=" << headerCount); string workerIdStr; @@ -86,7 +87,7 @@ void UberJobData::responseFileReady(string const& httpFileUrl, uint64_t rowCount workerIdStr = _foreman->chunkInventory()->id(); } else { workerIdStr = "dummyWorkerIdStr"; - LOGS(_log, LOG_LVL_INFO, funcN << " _foreman was null, which should only happen in unit tests"); + LOGS(_log, LOG_LVL_INFO, cName(__func__) << " _foreman was null, which should only happen in unit tests"); } json request = {{"version", http::MetaModule::version}, @@ -105,36 +106,15 @@ void UberJobData::responseFileReady(string const& httpFileUrl, uint64_t rowCount vector const headers = {"Content-Type: application/json"}; string const url = "http://" + _czarHost + ":" + to_string(_czarPort) + "/queryjob-ready"; string const requestContext = "Worker: '" + http::method2string(method) + "' request to '" + url + "'"; - http::Client client(method, url, request.dump(), headers); - - int maxTries = 2; // TODO:UJ set from config - bool transmitSuccess = false; - for (int j = 0; (!transmitSuccess && j < maxTries); ++j) { - try { - json const response = client.readAsJson(); - if (0 != response.at("success").get()) { - transmitSuccess = true; - } else { - LOGS(_log, LOG_LVL_WARN, funcN << "Transmit success == 0"); - j = maxTries; /// There's no point in resending as the czar got the message and didn't like - /// it. - } - } catch (exception const& ex) { - LOGS(_log, LOG_LVL_WARN, funcN + " " + requestContext + " failed, ex: " + ex.what()); - } - } - - if (!transmitSuccess) { - LOGS(_log, LOG_LVL_ERROR, - funcN << "TODO:UJ NEED CODE Let czar find out through polling worker status??? 
Just throw the " - "result away???"); - } + string const requestStr = request.dump(); + _queueUJResponse(method, headers, url, requestContext, requestStr); + LOGS(_log, LOG_LVL_WARN, cName(__func__) << " &&& end"); } + bool UberJobData::responseError(util::MultiError& multiErr, std::shared_ptr const& task, bool cancelled) { - string const funcN = cName(__func__); - LOGS(_log, LOG_LVL_INFO, funcN); + LOGS(_log, LOG_LVL_INFO, cName(__func__)); string errorMsg; int errorCode = 0; if (!multiErr.empty()) { @@ -146,7 +126,7 @@ bool UberJobData::responseError(util::MultiError& multiErr, std::shared_ptrgetChunkId()) + ": " + errorMsg; + cName(__func__) + " error(s) in result for chunk #" + to_string(task->getChunkId()) + ": " + errorMsg; LOGS(_log, LOG_LVL_ERROR, errorMsg); } @@ -164,33 +144,136 @@ bool UberJobData::responseError(util::MultiError& multiErr, std::shared_ptr const headers = {"Content-Type: application/json"}; string const url = "http://" + _czarHost + ":" + to_string(_czarPort) + "/queryjob-error"; string const requestContext = "Worker: '" + http::method2string(method) + "' request to '" + url + "'"; - http::Client client(method, url, request.dump(), headers); + string const requestStr = request.dump(); + _queueUJResponse(method, headers, url, requestContext, requestStr); + return true; +} - int maxTries = 2; // TODO:UJ set from config - bool transmitSuccess = false; - for (int j = 0; !transmitSuccess && j < maxTries; ++j) { - try { - json const response = client.readAsJson(); - if (0 != response.at("success").get()) { - transmitSuccess = true; - } else { - LOGS(_log, LOG_LVL_WARN, funcN << " transmit success == 0"); - j = maxTries; /// There's no point in resending as the czar got the message and didn't like - /// it. - } - } catch (exception const& ex) { - LOGS(_log, LOG_LVL_WARN, funcN + " " + requestContext + " failed, ex: " + ex.what()); +void UberJobData::_queueUJResponse(http::Method method_, std::vector const& headers_, std::string const& url_, std::string const& requestContext_, std::string const& requestStr_) { + util::QdispPool::Ptr wPool; + if (_foreman != nullptr) { + wPool = _foreman->getWPool(); + } + + LOGS(_log, LOG_LVL_WARN, cName(__func__) << " &&& creating UJTransmitCmd wPool=" << wPool); + auto cmdTransmit = UJTransmitCmd::create(_foreman, shared_from_this(), method_, headers_, url_, requestContext_, requestStr_); + LOGS(_log, LOG_LVL_WARN, cName(__func__) << " &&& created UJTransmitCmd wPool=" << wPool); + if (wPool == nullptr) { + // No thread pool. Run the command now. This should only happen in unit tests. 
+        LOGS(_log, LOG_LVL_WARN, cName(__func__) << " &&& creating UJTransmitCmd direct run action");
+        cmdTransmit->action(nullptr);
+    } else {
+        LOGS(_log, LOG_LVL_WARN, cName(__func__) << " &&& creating UJTransmitCmd queue transmit");
+        if (_scanInteractive) {
+            LOGS(_log, LOG_LVL_WARN, cName(__func__) << " &&& creating UJTransmitCmd queue transmit_0");
+            wPool->queCmd(cmdTransmit, 0);
+        } else {
+            LOGS(_log, LOG_LVL_WARN, cName(__func__) << " &&& creating UJTransmitCmd queue transmit_1");
+            wPool->queCmd(cmdTransmit, 1);
+        }
+    }
-    }
 }

 void UberJobData::cancelAllTasks() {
     LOGS(_log, LOG_LVL_INFO, cName(__func__));
-    lock_guard lg(_ujTasksMtx);
-    for (auto const& task : _ujTasks) {
-        task->cancel();
+    if (_cancelled.exchange(true) == false) {
+        lock_guard lg(_ujTasksMtx);
+        for (auto const& task : _ujTasks) {
+            task->cancel();
+        }
     }
 }

+string UJTransmitCmd::cName(const char* funcN) const {
+    stringstream os;
+    os << "UJTransmitCmd::" << funcN << " czId=" << _czarId << " qId=" << _queryId << " ujId=" << _uberJobId;
+    return os.str();
+}
+
+void UJTransmitCmd::action(util::CmdData* data) {
+    _attemptCount++;
+    LOGS(_log, LOG_LVL_WARN, cName(__func__) << " &&& start attempt=" << _attemptCount);
+    auto ujPtr = _ujData.lock();
+    if (ujPtr == nullptr || ujPtr->getCancelled()) {
+        LOGS(_log, LOG_LVL_WARN, cName(__func__) << " UberJob was cancelled " << _attemptCount);
+        _selfPtr.reset();  // nothing left to transmit, release the self reference
+        return;
+    }
+    http::Client client(_method, _url, _requestStr, _headers);
+    bool transmitSuccess = false;
+    try {
+        json const response = client.readAsJson();
+        if (0 != response.at("success").get<int>()) {
+            transmitSuccess = true;
+            _selfPtr.reset();  // clear so this can be deleted.
+        } else {
+            LOGS(_log, LOG_LVL_WARN, cName(__func__) << " Transmit success == 0");
+            // There's no point in re-sending as the czar got the message and didn't like
+            // it.
+            // &&& maybe add this czId+ujId to a list of failed uberjobs that can be put
+            // &&& in the status return??? Probably overkill.
+            _selfPtr.reset();  // clear so this can be deleted.
+        }
+    } catch (exception const& ex) {
+        LOGS(_log, LOG_LVL_WARN, cName(__func__) << " &&& start d except");
+        LOGS(_log, LOG_LVL_WARN, cName(__func__) + " " + _requestContext + " failed, ex: " + ex.what());
+    }
+
+    if (!transmitSuccess) {
+        auto sPtr = _selfPtr;
+        if (_foreman != nullptr && sPtr != nullptr) {
+            // Do not reset _selfPtr as re-queuing may be needed several times.
+            LOGS(_log, LOG_LVL_WARN, cName(__func__) << " no response for transmit, putting on failed transmit queue.");
+            auto wCzInfo = _foreman->getWCzarInfoMap()->getWCzarInfo(_czarId);
+            // This will check if the czar is believed to be alive and, if so, queue the command
+            // to be tried again at a lower priority. If it thinks the czar is dead, it will
+            // throw the command away.
+            // TODO:UJ &&& I have my doubts about this as a reconnected czar may go down in flames
+            // &&& as it is hit with thousands of these.
+            // &&& Alternate plan, set a flag in the status message response (WorkerQueryStatusData)
+            // &&& indicates some messages failed. When the czar sees the flag, it'll request a
+            // &&& message from the worker that contains all of the failed transmit data and handle
+            // &&& that. All of these failed transmits should fit in a single message.
+            if (wCzInfo->checkAlive(CLOCK::now())) {
+                auto wPool = _foreman->getWPool();
+                if (wPool != nullptr) {
+                    Ptr replacement = duplicate();
+                    _selfPtr.reset();
+                    if (replacement != nullptr) {
+                        wPool->queCmd(replacement, 2);
+                    } else {
+                        LOGS(_log, LOG_LVL_ERROR, cName(__func__) << " replacement was null");
+                    }
+                } else {
+                    // No thread pool, should only be possible in unit tests.
+                    LOGS(_log, LOG_LVL_ERROR, cName(__func__) << " no wPool");
+                    _selfPtr.reset();
+                    return;
+                }
+            }
+        } else {
+            LOGS(_log, LOG_LVL_ERROR, cName(__func__) << " _selfPtr was null, assuming job killed.");
+            _selfPtr.reset();  // In case _foreman is null.
+        }
+    }
+    LOGS(_log, LOG_LVL_WARN, cName(__func__) << " &&& start end");
+}
+
+void UJTransmitCmd::kill() {
+    string const funcN("UJTransmitCmd::kill");
+    LOGS(_log, LOG_LVL_WARN, funcN);
+    auto sPtr = _selfPtr;
+    _selfPtr.reset();
+    if (sPtr == nullptr) { return; }
+    // &&& TODO:UJ Is there anything that should be done here???
+}
+
+UJTransmitCmd::Ptr UJTransmitCmd::duplicate() {
+    auto ujD = _ujData.lock();
+    if (ujD == nullptr) {
+        return nullptr;
+    }
+    Ptr newPtr = create(_foreman, ujD, _method, _headers, _url, _requestContext, _requestStr);
+    newPtr->_attemptCount = _attemptCount;
+    return newPtr;
+}

 }  // namespace lsst::qserv::wbase

diff --git a/src/wbase/UberJobData.h b/src/wbase/UberJobData.h
index 03813979e..1af32e511 100644
--- a/src/wbase/UberJobData.h
+++ b/src/wbase/UberJobData.h
@@ -34,8 +34,11 @@

 // Qserv headers
 #include "global/intTypes.h"
+#include "http/Method.h"
 #include "qmeta/types.h"
+#include "util/QdispPool.h"
 #include "wbase/SendChannel.h"
+#include "util/InstanceCount.h"

 namespace lsst::qserv {

@@ -55,7 +58,7 @@ class Task;

 /// This class tracks all Tasks associated with the UberJob on the worker
 /// and reports status to the czar.
-class UberJobData {
+class UberJobData : public std::enable_shared_from_this<UberJobData> {
 public:
     using Ptr = std::shared_ptr<UberJobData>;

@@ -72,6 +75,10 @@ class UberJobData {
     /// Set file channel for this UberJob
     void setFileChannelShared(std::shared_ptr<FileChannelShared> const& fileChannelShared);

+    void setScanInteractive(bool scanInteractive) {
+        _scanInteractive = scanInteractive;
+    }
+
     UberJobId getUberJobId() const { return _uberJobId; }
     qmeta::CzarId getCzarId() const { return _czarId; }
     std::string getCzarHost() const { return _czarHost; }
@@ -95,6 +102,8 @@ class UberJobData {
     std::string getIdStr() const { return _idStr; }
     std::string cName(std::string const& funcName) { return "UberJobData::" + funcName + " " + getIdStr(); }

+    bool getCancelled() const { return _cancelled; }
+
     /// Cancel all Tasks associated with this UberJob; only the first call has any effect.
     void cancelAllTasks();

@@ -103,6 +112,10 @@ class UberJobData {
                 int czarPort, uint64_t queryId, std::string const& workerId,
                 std::shared_ptr<wcontrol::Foreman> const& foreman, std::string const& authKey);

+    /// Queue `requestStr_` to be transmitted to the originating czar via the worker's pool,
+    /// or run the transmit directly when no pool is available (unit tests).
+    void _queueUJResponse(http::Method method_, std::vector<std::string> const& headers_,
+                          std::string const& url_, std::string const& requestContext_,
+                          std::string const& requestStr_);
+
     UberJobId const _uberJobId;
     std::string const _czarName;
     qmeta::CzarId const _czarId;
@@ -120,6 +133,65 @@ class UberJobData {
     std::mutex _ujTasksMtx;  ///< Protects _ujTasks.

     std::string const _idStr;
+
+    std::atomic<bool> _scanInteractive;  ///< True if the query is interactive, which raises transmit priority.
+
+    std::atomic<bool> _cancelled{false};  ///< Set to true if this was cancelled.
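UJTransmitCmd::action above gates the transmit on a std::weak_ptr: if the owning UberJobData has been destroyed or cancelled, the work is skipped. In miniature (shouldTransmit is a hypothetical helper, assuming an Owner type with getCancelled()):

    #include <memory>

    template <typename Owner>
    bool shouldTransmit(std::weak_ptr<Owner> const& owner) {
        auto p = owner.lock();  // promotes to shared_ptr only if still alive
        return p != nullptr && !p->getCancelled();
    }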
+};
+
+/// A priority command that transmits an UberJob response to the czar, re-queuing
+/// itself at lower priority on failure while the czar is believed to be alive.
+class UJTransmitCmd : public util::PriorityCommand {
+public:
+    using Ptr = std::shared_ptr<UJTransmitCmd>;
+
+    UJTransmitCmd() = delete;
+    ~UJTransmitCmd() override = default;
+
+    std::string cName(const char* funcN) const;
+
+    /* &&&
+    static Ptr create(std::shared_ptr<wcontrol::Foreman> const& foreman_, CzarIdType czarId_,
+                      QueryId queryId_, UberJobId uberJobId_, http::Method method_,
+                      std::vector<std::string> const& headers_, std::string const& url_,
+                      std::string const& requestContext_, std::string const& requestStr_) {
+        auto ptr = Ptr(new UJTransmitCmd(foreman_, czarId_, queryId_, uberJobId_, method_, headers_,
+                                         url_, requestContext_, requestStr_));
+        ptr->_selfPtr = ptr;
+        return ptr;
+    }
+    */
+    static Ptr create(std::shared_ptr<wcontrol::Foreman> const& foreman_, UberJobData::Ptr const& ujData_,
+                      http::Method method_, std::vector<std::string> const& headers_, std::string const& url_,
+                      std::string const& requestContext_, std::string const& requestStr_) {
+        auto ptr = Ptr(new UJTransmitCmd(foreman_, ujData_, method_, headers_, url_, requestContext_,
+                                         requestStr_));
+        ptr->_selfPtr = ptr;
+        return ptr;
+    }
+
+    /// This is the function that will be run when the queue gets to this command.
+    void action(util::CmdData* data) override;
+
+    /// Reset the self pointer so this object can be killed.
+    void kill();
+
+    /// Create a copy of this command, preserving the attempt count, so it can be re-queued.
+    Ptr duplicate();
+
+private:
+    /* &&&
+    UJTransmitCmd(std::shared_ptr<wcontrol::Foreman> const& foreman_, CzarIdType czarId_, QueryId queryId_,
+                  UberJobId uberJobId_, http::Method method_, std::vector<std::string> const& headers_,
+                  std::string const& url_, std::string const& requestContext_, std::string const& requestStr_)
+            : PriorityCommand(), _foreman(foreman_), _czarId(czarId_), _queryId(queryId_),
+              _uberJobId(uberJobId_), _method(method_), _headers(headers_), _url(url_),
+              _requestContext(requestContext_), _requestStr(requestStr_) {}
+    */
+    UJTransmitCmd(std::shared_ptr<wcontrol::Foreman> const& foreman_, UberJobData::Ptr const& ujData_,
+                  http::Method method_, std::vector<std::string> const& headers_, std::string const& url_,
+                  std::string const& requestContext_, std::string const& requestStr_)
+            : PriorityCommand(), _foreman(foreman_), _ujData(ujData_), _czarId(ujData_->getCzarId()),
+              _queryId(ujData_->getQueryId()), _uberJobId(ujData_->getUberJobId()), _method(method_),
+              _headers(headers_), _url(url_), _requestContext(requestContext_), _requestStr(requestStr_) {}
+
+    Ptr _selfPtr;  ///< So this object can put itself back on the queue and keep itself alive.
+    std::shared_ptr<wcontrol::Foreman> const _foreman;
+    std::weak_ptr<UberJobData> const _ujData;
+    CzarIdType const _czarId;
+    QueryId const _queryId;
+    UberJobId const _uberJobId;
+    http::Method const _method;
+    std::vector<std::string> const _headers;
+    std::string const _url;
+    std::string const _requestContext;
+    std::string const _requestStr;
+    int _attemptCount = 0;  ///< How many attempts have been made to transmit this.
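The _selfPtr member above implements a keep-alive: a queued command holds a shared_ptr to itself so it cannot be destroyed while it waits on the queue, and it drops that pointer once it finishes (or is kill()ed). A minimal sketch of the idiom (SelfOwningCmd is illustrative):

    #include <memory>

    struct SelfOwningCmd : std::enable_shared_from_this<SelfOwningCmd> {
        static std::shared_ptr<SelfOwningCmd> create() {
            auto p = std::make_shared<SelfOwningCmd>();
            p->_self = p;  // the command now keeps itself alive while queued
            return p;
        }
        void finish() { _self.reset(); }  // last owner may now release the object

    private:
        std::shared_ptr<SelfOwningCmd> _self;
    };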
+    util::InstanceCount _ic{cName("&&&")};
 };

 }  // namespace lsst::qserv::wbase

diff --git a/src/wbase/UserQueryInfo.cc b/src/wbase/UserQueryInfo.cc
index 72d148060..ca1bacbc9 100644
--- a/src/wbase/UserQueryInfo.cc
+++ b/src/wbase/UserQueryInfo.cc
@@ -37,7 +37,7 @@ LOG_LOGGER _log = LOG_GET("lsst.qserv.wbase.UserQueryInfo");

 namespace lsst::qserv::wbase {

-UserQueryInfo::UserQueryInfo(QueryId qId) : _qId(qId) {}
+UserQueryInfo::UserQueryInfo(QueryId qId, CzarIdType czarId) : _qId(qId), _czarId(czarId) {}

 size_t UserQueryInfo::addTemplate(std::string const& templateStr) {
     size_t j = 0;
@@ -69,7 +69,7 @@ void UserQueryInfo::addUberJob(std::shared_ptr<UberJobData> const& ujData) {
     _uberJobMap[ujId] = ujData;
 }

-/// &&& doc
+
 void UserQueryInfo::cancelFromCzar() {
     if (_cancelledByCzar.exchange(true)) {
         LOGS(_log, LOG_LVL_DEBUG, cName(__func__) << " already cancelledByCzar");
@@ -85,7 +85,6 @@
     }
 }

-/// &&& doc
 void UserQueryInfo::cancelUberJob(UberJobId ujId) {
     LOGS(_log, LOG_LVL_INFO, cName(__func__) << " cancelling ujId=" << ujId);
     lock_guard lockUq(_uberJobMapMtx);
@@ -100,6 +99,17 @@
     }
 }

+void UserQueryInfo::cancelAllUberJobs() {
+    lock_guard lockUq(_uberJobMapMtx);
+    for (auto const& [ujKey, weakUjPtr] : _uberJobMap) {
+        _deadUberJobSet.insert(ujKey);
+        auto ujPtr = weakUjPtr.lock();
+        if (ujPtr != nullptr) {
+            ujPtr->cancelAllTasks();
+        }
+    }
+}
+
 bool UserQueryInfo::isUberJobDead(UberJobId ujId) const {
     lock_guard lockUq(_uberJobMapMtx);
     auto iter = _deadUberJobSet.find(ujId);

diff --git a/src/wbase/UserQueryInfo.h b/src/wbase/UserQueryInfo.h
index 4694d8834..1266ebe05 100644
--- a/src/wbase/UserQueryInfo.h
+++ b/src/wbase/UserQueryInfo.h
@@ -49,7 +49,7 @@ class UserQueryInfo {
     UserQueryInfo(UserQueryInfo const&) = delete;
     UserQueryInfo& operator=(UserQueryInfo const&) = delete;

-    static Ptr create(QueryId qId) { return std::shared_ptr<UserQueryInfo>(new UserQueryInfo(qId)); }
+    static Ptr create(QueryId qId, CzarIdType czarId) { return std::shared_ptr<UserQueryInfo>(new UserQueryInfo(qId, czarId)); }

     ~UserQueryInfo() = default;

@@ -70,9 +70,15 @@
     /// @return true if the czar has cancelled this user query.
     bool getCancelledByCzar() const { return _cancelledByCzar; }

-    /// &&& doc
+    /// The czar has cancelled this user query; all tasks need to
+    /// be killed, but there's no need to track UberJob ids anymore.
     void cancelFromCzar();

+    /// Cancel all associated tasks and track the killed UberJob ids.
+    /// The user query itself may still be alive, so the czar may need
+    /// information about which UberJobs are dead.
+    void cancelAllUberJobs();
+
    /// Cancel all Tasks for the UberJob `ujId` and add it to the dead UberJob set.
    void cancelUberJob(UberJobId ujId);

@@ -80,10 +86,13 @@
     QueryId getQueryId() const { return _qId; }

+    CzarIdType getCzarId() const { return _czarId; }
+
 private:
-    UserQueryInfo(QueryId qId);
+    UserQueryInfo(QueryId qId, CzarIdType czId);

     QueryId const _qId;  ///< The User Query Id number.
+    CzarIdType const _czarId;

     /// List of template strings. This is expected to be short, 1 or 2 entries.
     /// This must be a vector.
New entries are always added to the end so as not diff --git a/src/wcontrol/CMakeLists.txt b/src/wcontrol/CMakeLists.txt index 92890a8c6..3a27ccd35 100644 --- a/src/wcontrol/CMakeLists.txt +++ b/src/wcontrol/CMakeLists.txt @@ -6,6 +6,7 @@ target_sources(wcontrol PRIVATE ResourceMonitor.cc SqlConnMgr.cc WorkerStats.cc + WCzarInfoMap.cc ) target_include_directories(wcontrol PRIVATE diff --git a/src/wcontrol/Foreman.cc b/src/wcontrol/Foreman.cc index 653c40be3..ef56ba618 100644 --- a/src/wcontrol/Foreman.cc +++ b/src/wcontrol/Foreman.cc @@ -39,9 +39,13 @@ #include "qhttp/Response.h" #include "qhttp/Server.h" #include "qhttp/Status.h" +#include "util/common.h" +#include "util/QdispPool.h" +#include "util/String.h" #include "wconfig/WorkerConfig.h" #include "wcontrol/ResourceMonitor.h" #include "wcontrol/SqlConnMgr.h" +#include "wcontrol/WCzarInfoMap.h" #include "wcontrol/WorkerStats.h" #include "wdb/ChunkResource.h" #include "wdb/SQLBackend.h" @@ -77,6 +81,25 @@ qhttp::Status removeResultFile(std::string const& fileName) { namespace lsst::qserv::wcontrol { +Foreman::Ptr Foreman::_globalForeman; + + +Foreman::Ptr Foreman::create(Scheduler::Ptr const& scheduler, unsigned int poolSize, unsigned int maxPoolThreads, + mysql::MySqlConfig const& mySqlConfig, wpublish::QueriesAndChunks::Ptr const& queries, + std::shared_ptr const& chunkInventory, + std::shared_ptr const& sqlConnMgr) { + // Latch + static std::atomic globalForemanSet{false}; + if (globalForemanSet.exchange(true) == true) { + throw util::Bug(ERR_LOC, "Foreman::create already an existing global Foreman."); + } + + Ptr fm = Ptr(new Foreman(scheduler, poolSize, maxPoolThreads, + mySqlConfig, queries, chunkInventory, sqlConnMgr)); + _globalForeman = fm; + return _globalForeman; +} + Foreman::Foreman(Scheduler::Ptr const& scheduler, unsigned int poolSize, unsigned int maxPoolThreads, mysql::MySqlConfig const& mySqlConfig, wpublish::QueriesAndChunks::Ptr const& queries, std::shared_ptr const& chunkInventory, @@ -88,7 +111,8 @@ Foreman::Foreman(Scheduler::Ptr const& scheduler, unsigned int poolSize, unsigne _sqlConnMgr(sqlConnMgr), _resourceMonitor(make_shared()), _io_service(), - _httpServer(qhttp::Server::create(_io_service, 0 /* grab the first available port */)) { + _httpServer(qhttp::Server::create(_io_service, 0 /* grab the first available port */)), + _wCzarInfoMap(WCzarInfoMap::create()) { // Make the chunk resource mgr // Creating backend makes a connection to the database for making temporary tables. // It will delete temporary tables that it can identify as being created by a worker. 
@@ -108,6 +132,28 @@ Foreman::Foreman(Scheduler::Ptr const& scheduler, unsigned int poolSize, unsigne _mark = make_shared(ERR_LOC, "Forman Test Msg"); + /* &&& + int qPoolSize = _czarConfig->getQdispPoolSize(); + int maxPriority = std::max(0, _czarConfig->getQdispMaxPriority()); + string vectRunSizesStr = _czarConfig->getQdispVectRunSizes(); + vector vectRunSizes = util::String::parseToVectInt(vectRunSizesStr, ":", 1); + string vectMinRunningSizesStr = _czarConfig->getQdispVectMinRunningSizes(); + vector vectMinRunningSizes = util::String::parseToVectInt(vectMinRunningSizesStr, ":", 0); + */ + int qPoolSize = 50; // &&& TODO:UJ put in config + int maxPriority = 2; // &&& TODO:UJ put in config + string vectRunSizesStr = "10:10:10:10"; // &&& TODO:UJ put in config + vector vectRunSizes = util::String::parseToVectInt(vectRunSizesStr, ":", 1); + string vectMinRunningSizesStr = "0:1:3:3"; // &&& TODO:UJ put in config + vector vectMinRunningSizes = util::String::parseToVectInt(vectMinRunningSizesStr, ":", 0); + LOGS(_log, LOG_LVL_INFO, + "INFO wPool config qPoolSize=" << qPoolSize << " maxPriority=" << maxPriority << " vectRunSizes=" + << vectRunSizesStr << " -> " << util::prettyCharList(vectRunSizes) + << " vectMinRunningSizes=" << vectMinRunningSizesStr << " -> " + << util::prettyCharList(vectMinRunningSizes)); + _wPool = + make_shared(qPoolSize, maxPriority, vectRunSizes, vectMinRunningSizes); + // Read-only access to the result files via the HTTP protocol's method "GET" // // NOTE: The following config doesn't seem to work due to multiple instances @@ -145,7 +191,7 @@ Foreman::~Foreman() { _httpServer->stop(); } -wpublish::QueryStatistics::Ptr Foreman::addQueryId(QueryId qId) { return _queries->addQueryId(qId); } +//&&& wpublish::QueryStatistics::Ptr Foreman::addQueryId(QueryId qId) { return _queries->addQueryId(qId); } void Foreman::processTasks(vector const& tasks) { std::vector cmds; diff --git a/src/wcontrol/Foreman.h b/src/wcontrol/Foreman.h index 7ba1c47e3..93775dc0b 100644 --- a/src/wcontrol/Foreman.h +++ b/src/wcontrol/Foreman.h @@ -39,6 +39,7 @@ #include "mysql/MySqlConfig.h" #include "util/EventThread.h" #include "util/HoldTrack.h" +#include "util/QdispPool.h" #include "wbase/Base.h" #include "wbase/Task.h" @@ -49,6 +50,7 @@ struct TaskSelector; } // namespace lsst::qserv::wbase namespace lsst::qserv::wcontrol { +class WCzarInfoMap; class ResourceMonitor; class SqlConnMgr; } // namespace lsst::qserv::wcontrol @@ -98,6 +100,9 @@ class Scheduler : public wbase::TaskScheduler, public util::CommandQueue { class Foreman { public: using Ptr = std::shared_ptr; + + static Ptr getForeman() { return _globalForeman; } + /** * @param scheduler - pointer to the scheduler * @param poolSize - size of the thread pool @@ -106,12 +111,11 @@ class Foreman { * @param chunkInventory - a collection of the SSI resources published by the worker * @param sqlConnMgr - for limiting the number of MySQL connections used for tasks */ - Foreman(Scheduler::Ptr const& scheduler, unsigned int poolSize, unsigned int maxPoolThreads, + static Ptr create(Scheduler::Ptr const& scheduler, unsigned int poolSize, unsigned int maxPoolThreads, mysql::MySqlConfig const& mySqlConfig, std::shared_ptr const& queries, std::shared_ptr const& chunkInventory, std::shared_ptr const& sqlConnMgr); - //&&& virtual ~Foreman() override; ~Foreman(); // This class doesn't have the default construction or copy semantics @@ -131,15 +135,28 @@ class Foreman { /// Process a group of query processing tasks. 
 void processTasks(std::vector> const& tasks);

+    /* &&&
     /// &&& doc
     std::shared_ptr addQueryId(QueryId qId);
+    */

     /// Implement the corresponding method of the base class
     nlohmann::json statusToJson(wbase::TaskSelector const& taskSelector);

     uint64_t getWorkerStartupTime() const { return _workerStartupTime; }

+    std::shared_ptr getWPool() const { return _wPool; }
+
+    std::shared_ptr getWCzarInfoMap() const { return _wCzarInfoMap; }
+
+    std::shared_ptr getQueriesAndChunks() const { return _queries; }
+
 private:
+    Foreman(Scheduler::Ptr const& scheduler, unsigned int poolSize, unsigned int maxPoolThreads,
+            mysql::MySqlConfig const& mySqlConfig, std::shared_ptr const& queries,
+            std::shared_ptr const& chunkInventory,
+            std::shared_ptr const& sqlConnMgr);
+
     /// Startup time of worker, sent to czars so they can detect that the worker
     /// was restarted when this value changes.
     uint64_t const _workerStartupTime = millisecSinceEpoch(CLOCK::now());
@@ -170,6 +187,14 @@ class Foreman {

     /// The HTTP server for serving/managing result files
     std::shared_ptr const _httpServer;
+
+    /// Combined priority queue and thread pool for communicating with czars.
+    std::shared_ptr _wPool;
+
+    /// Map of czar information for all czars that have contacted this worker.
+    std::shared_ptr const _wCzarInfoMap;
+
+    static Ptr _globalForeman;  ///< Pointer to the global instance.
 };

 } // namespace lsst::qserv::wcontrol
diff --git a/src/wpublish/QueriesAndChunks.cc b/src/wpublish/QueriesAndChunks.cc
index 940be6698..d492168f5 100644
--- a/src/wpublish/QueriesAndChunks.cc
+++ b/src/wpublish/QueriesAndChunks.cc
@@ -119,12 +119,12 @@ void QueriesAndChunks::setBlendScheduler(shared_ptr cons

 void QueriesAndChunks::setRequiredTasksCompleted(unsigned int value) { _requiredTasksCompleted = value; }

-QueryStatistics::Ptr QueriesAndChunks::addQueryId(QueryId qId) {
+QueryStatistics::Ptr QueriesAndChunks::addQueryId(QueryId qId, CzarIdType czarId) {
     unique_lock guardStats(_queryStatsMapMtx);
     auto itr = _queryStatsMap.find(qId);
     QueryStatistics::Ptr stats;
     if (_queryStatsMap.end() == itr) {
-        stats = QueryStatistics::create(qId);
+        stats = QueryStatistics::create(qId, czarId);
         _queryStatsMap[qId] = stats;
     } else {
         stats = itr->second;
@@ -135,6 +135,7 @@ QueryStatistics::Ptr QueriesAndChunks::addQueryId(QueryId qId) {

 /// Add statistics for the Task, creating a QueryStatistics object if needed.
 void QueriesAndChunks::addTask(wbase::Task::Ptr const& task) {
     auto qid = task->getQueryId();
+    auto czId = task->getCzarId();
 #if 0  // &&& delete upper block
     unique_lock guardStats(_queryStatsMapMtx);
     auto itr = _queryStatsMap.find(qid);
@@ -148,7 +149,7 @@ void QueriesAndChunks::addTask(wbase::Task::Ptr const& task) {
     }
     guardStats.unlock();
 #else  // &&&
-    auto stats = addQueryId(qid);
+    auto stats = addQueryId(qid, czId);
 #endif  // &&&
     stats->addTask(task);
     //&&&task->setQueryStatistics(stats);
@@ -275,7 +276,7 @@ void QueriesAndChunks::removeDead(QueryStatistics::Ptr const& queryStats) {
     _queryStatsMap.erase(qId);
 }

-QueryStatistics::Ptr QueriesAndChunks::getStats(QueryId const& qId) const {
+QueryStatistics::Ptr QueriesAndChunks::getStats(QueryId qId) const {
     lock_guard lockG(_queryStatsMapMtx);
     return _getStats(qId);
 }
@@ -690,6 +691,23 @@ vector QueriesAndChunks::removeQueryFrom(QueryId const& qId,
     return removedList;
 }

+void QueriesAndChunks::killAllQueriesFromCzar(CzarIdType czarId) {
+    std::map qsMap;
+    {
+        lock_guard lgQsm(_queryStatsMapMtx);
+        qsMap = _queryStatsMap;
+    }
+
+    for (auto const& [qsKey, qsPtr] : qsMap) {
+        if (qsPtr != nullptr) {
+            auto uqInfo = qsPtr->getUserQueryInfo();
+            if (uqInfo != nullptr && uqInfo->getCzarId() == czarId) {
+                uqInfo->cancelAllUberJobs();
+            }
+        }
+    }
+}
+
 ostream& operator<<(ostream& os, QueriesAndChunks const& qc) {
     lock_guard g(qc._chunkMtx);
     os << "Chunks(";
diff --git a/src/wpublish/QueriesAndChunks.h b/src/wpublish/QueriesAndChunks.h
index 83bcddf36..e5f1814cd 100644
--- a/src/wpublish/QueriesAndChunks.h
+++ b/src/wpublish/QueriesAndChunks.h
@@ -193,13 +193,18 @@ class QueriesAndChunks {
     void removeDead();
     void removeDead(QueryStatistics::Ptr const& queryStats);

-    /// Return the statistics for a user query, may be nullptr
+    /// Return the statistics for a user query; may be nullptr.
+    /// In many cases addQueryId() is preferable if
+    /// new information is being added to the returned object.
     /// @see addQueryId()
-    QueryStatistics::Ptr getStats(QueryId const& qId) const;
+    QueryStatistics::Ptr getStats(QueryId qId) const;

     /// Return the statistics for a user query, creating if needed.
+    /// Since it is possible to get messages out of order, there
+    /// are several cases where something like a cancellation
+    /// message arrives before any tasks have been created.
/// @see getStats() - QueryStatistics::Ptr addQueryId(QueryId qId); + QueryStatistics::Ptr addQueryId(QueryId qId, CzarIdType czarId); void addTask(wbase::Task::Ptr const& task); void queuedTask(wbase::Task::Ptr const& task); @@ -239,6 +244,9 @@ class QueriesAndChunks { }; using ScanTableSumsMap = std::map; + /// &&& doc + void killAllQueriesFromCzar(CzarIdType czarId); + friend std::ostream& operator<<(std::ostream& os, QueriesAndChunks const& qc); private: diff --git a/src/wpublish/QueryStatistics.cc b/src/wpublish/QueryStatistics.cc index 607288658..8aacf1c8d 100644 --- a/src/wpublish/QueryStatistics.cc +++ b/src/wpublish/QueryStatistics.cc @@ -50,8 +50,8 @@ LOG_LOGGER _log = LOG_GET("lsst.qserv.wpublish.QueriesAndChunks"); namespace lsst::qserv::wpublish { -QueryStatistics::QueryStatistics(QueryId const& qId_) - : creationTime(CLOCK::now()), queryId(qId_), _userQueryInfo(wbase::UserQueryInfo::create(qId_)) { +QueryStatistics::QueryStatistics(QueryId qId_, CzarIdType czarId_) + : creationTime(CLOCK::now()), queryId(qId_), _userQueryInfo(wbase::UserQueryInfo::create(qId_, czarId_)) { /// For all of the histograms, all entries should be kept at least until the work is finished. string qidStr = to_string(queryId); _histSizePerTask = util::Histogram::Ptr(new util::Histogram( diff --git a/src/wpublish/QueryStatistics.h b/src/wpublish/QueryStatistics.h index dbacd5d53..c15e8e9f6 100644 --- a/src/wpublish/QueryStatistics.h +++ b/src/wpublish/QueryStatistics.h @@ -58,8 +58,8 @@ class QueryStatistics { using Ptr = std::shared_ptr; /// Force shared_ptr creation for data integrity. - static Ptr create(QueryId const& queryId) { - return std::shared_ptr(new QueryStatistics(queryId)); + static Ptr create(QueryId queryId_, CzarIdType czarId_) { + return std::shared_ptr(new QueryStatistics(queryId_, czarId_)); } QueryStatistics() = delete; @@ -172,7 +172,7 @@ class QueryStatistics { friend std::ostream& operator<<(std::ostream& os, QueryStatistics const& q); private: - explicit QueryStatistics(QueryId const& queryId); + explicit QueryStatistics(QueryId queryId, CzarIdType czarId); bool _isMostlyDead() const; mutable std::mutex _qStatsMtx; @@ -201,7 +201,6 @@ class QueryStatistics { SchedTasksInfoMap _taskSchedInfoMap; ///< Map of task information ordered by scheduler name. std::shared_ptr const _userQueryInfo; ///< &&& doc - util::InstanceCount _ic{"QueryStatiscs_&&&"}; }; } // namespace lsst::qserv::wpublish diff --git a/src/xrdsvc/HttpWorkerCzarModule.cc b/src/xrdsvc/HttpWorkerCzarModule.cc index 97cb34b54..b5956cd7b 100644 --- a/src/xrdsvc/HttpWorkerCzarModule.cc +++ b/src/xrdsvc/HttpWorkerCzarModule.cc @@ -45,8 +45,9 @@ #include "wbase/Task.h" #include "wbase/UberJobData.h" #include "wbase/UserQueryInfo.h" -#include "wconfig/WorkerConfig.h" #include "wcontrol/Foreman.h" +#include "wcontrol/WCzarInfoMap.h" +#include "wconfig/WorkerConfig.h" #include "wcontrol/ResourceMonitor.h" #include "wpublish/ChunkInventory.h" #include "wpublish/QueriesAndChunks.h" @@ -130,7 +131,8 @@ json HttpWorkerCzarModule::_handleQueryJob(string const& func) { __func__ << " uj qid=" << ujQueryId << " ujid=" << ujId << " czid=" << ujCzarId); // Get or create QueryStatistics and UserQueryInfo instances. 
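// Review note (not part of the patch): the hunk below switches _handleQueryJob to
// the czar-aware addQueryId(). addQueryId() is used rather than getStats() because
// of the get-or-create idiom documented above: status or cancel messages can arrive
// before any task has been registered. A minimal sketch of that idiom (hypothetical
// types):
#include <map>
#include <memory>
#include <mutex>

struct QueryStats {
    int qId;
    int czarId;
};

class StatsRegistry {
public:
    // Return the entry for qId, creating it on first contact so that
    // out-of-order messages (e.g. a cancellation) still have a place to land.
    std::shared_ptr<QueryStats> addQueryId(int qId, int czarId) {
        std::lock_guard<std::mutex> lock(_mtx);
        auto itr = _map.find(qId);
        if (itr != _map.end()) return itr->second;  // existing entry wins
        auto stats = std::make_shared<QueryStats>(QueryStats{qId, czarId});
        _map[qId] = stats;
        return stats;
    }

private:
    std::mutex _mtx;
    std::map<int, std::shared_ptr<QueryStats>> _map;
};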
- auto queryStats = foreman()->addQueryId(ujQueryId); + //&&&auto queryStats = foreman()->addQueryId(ujQueryId); + auto queryStats = foreman()->getQueriesAndChunks()->addQueryId(ujQueryId, ujCzarId); auto userQueryInfo = queryStats->getUserQueryInfo(); if (userQueryInfo->getCancelledByCzar()) { @@ -197,6 +199,8 @@ json HttpWorkerCzarModule::_handleQueryJob(string const& func) { scanInfo.scanRating = jdScanPriority; } + ujData->setScanInteractive(jdScanInteractive); + // create tasks and add them to ujData auto chunkTasks = wbase::Task::createTasksForChunk( ujData, ujJobs, channelShared, scanInfo, jdScanInteractive, jdMaxTableSize, @@ -207,6 +211,12 @@ json HttpWorkerCzarModule::_handleQueryJob(string const& func) { channelShared->setTaskCount(ujTasks.size()); ujData->addTasks(ujTasks); + // At this point, it looks like the message was sent successfully, update + // czar touched time. + wcontrol::WCzarInfoMap::Ptr wCzarMap = foreman()->getWCzarInfoMap(); + wcontrol::WCzarInfo::Ptr wCzarInfo = wCzarMap->getWCzarInfo(czarId); + wCzarInfo->czarMsgReceived(CLOCK::now()); + util::Timer timer; timer.start(); foreman()->processTasks(ujTasks); // Queues tasks to be run later. @@ -246,11 +256,17 @@ json HttpWorkerCzarModule::_handleQueryStatus(std::string const& func) { auto wqsData = http::WorkerQueryStatusData::createFromJson(jsReq, replicationInstanceId, replicationAuthKey, now); + auto const czInfo = wqsData->getCzInfo(); + CzarIdType czId = czInfo->czId; + wcontrol::WCzarInfoMap::Ptr wCzarMap = foreman()->getWCzarInfoMap(); + wcontrol::WCzarInfo::Ptr wCzarInfo = wCzarMap->getWCzarInfo(czId); + wCzarInfo->czarMsgReceived(CLOCK::now()); + // For all queryId and czarId items, if the item can't be found, it is simply ignored. Anything that // is missed will eventually be picked up by other mechanisms, such as results being rejected // by the czar. - // If a czar was restarted, cancel and/or delete the abandoned items. + // If a czar was restarted, cancel and delete the abandoned items. if (wqsData->isCzarRestart()) { auto restartCzarId = wqsData->getCzarRestartCzarId(); auto restartQId = wqsData->getCzarRestartQueryId(); @@ -267,7 +283,7 @@ json HttpWorkerCzarModule::_handleQueryStatus(std::string const& func) { // Cancelled queries where we want to keep the files lock_guard mapLg(wqsData->mapMtx); for (auto const& [dkQid, dkTm] : wqsData->qIdDoneKeepFiles) { - auto qStats = queriesAndChunks->addQueryId(dkQid); + auto qStats = queriesAndChunks->addQueryId(dkQid, czId); if (qStats != nullptr) { auto uqInfo = qStats->getUserQueryInfo(); if (uqInfo != nullptr) { @@ -280,7 +296,7 @@ json HttpWorkerCzarModule::_handleQueryStatus(std::string const& func) { vector deleteFilesList; for (auto const& [dkQid, dkTm] : wqsData->qIdDoneDeleteFiles) { - auto qStats = queriesAndChunks->addQueryId(dkQid); + auto qStats = queriesAndChunks->addQueryId(dkQid, czId); if (qStats != nullptr) { auto uqInfo = qStats->getUserQueryInfo(); if (uqInfo != nullptr) { @@ -302,7 +318,7 @@ json HttpWorkerCzarModule::_handleQueryStatus(std::string const& func) { // New UberJob Id's will be checked against the list, and immediately be // killed if they are on it. 
(see HttpWorkerCzarModule::_handleQueryJob) for (auto const& [ujQid, ujIdMap] : wqsData->qIdDeadUberJobs) { - auto qStats = queriesAndChunks->addQueryId(ujQid); + auto qStats = queriesAndChunks->addQueryId(ujQid, czId); if (qStats != nullptr) { auto uqInfo = qStats->getUserQueryInfo(); if (uqInfo != nullptr) { diff --git a/src/xrdsvc/SsiService.cc b/src/xrdsvc/SsiService.cc index 893f7c198..aa707551b 100644 --- a/src/xrdsvc/SsiService.cc +++ b/src/xrdsvc/SsiService.cc @@ -236,9 +236,12 @@ SsiService::SsiService(XrdSsiLogger* log) { LOGS(_log, LOG_LVL_WARN, "config sqlConnMgr" << *sqlConnMgr); LOGS(_log, LOG_LVL_WARN, "maxPoolThreads=" << maxPoolThreads); + /* &&& _foreman = wcontrol::Foreman::Ptr(new wcontrol::Foreman(blendSched, poolSize, maxPoolThreads, mySqlConfig, queries, ::makeChunkInventory(mySqlConfig), sqlConnMgr)); + */ + _foreman = wcontrol::Foreman::create(blendSched, poolSize, maxPoolThreads, mySqlConfig, queries, ::makeChunkInventory(mySqlConfig), sqlConnMgr); // Watch to see if the log configuration is changed. // If LSST_LOG_CONFIG is not defined, there's no good way to know what log From 61f1a9b24854a47d2e5b5ae3a6b89260b5d8cb7d Mon Sep 17 00:00:00 2001 From: John Gates Date: Tue, 1 Oct 2024 16:26:34 -0700 Subject: [PATCH 08/22] Added worker believed czar was dead handling. --- src/czar/ActiveWorker.cc | 7 +- src/czar/ActiveWorker.h | 84 ++++++++++++---------- src/czar/Czar.cc | 13 +++- src/czar/Czar.h | 4 +- src/czar/HttpCzarWorkerModule.cc | 45 ++++++++++++ src/czar/HttpCzarWorkerModule.h | 6 ++ src/czar/HttpSvc.cc | 5 ++ src/http/WorkerQueryStatusData.cc | 77 +++++++++++++++++++- src/http/WorkerQueryStatusData.h | 108 ++++++++++++++++++++++++----- src/http/testStatusData.cc | 50 ++++++++++++- src/qdisp/CzarStats.h | 1 - src/qdisp/Executive.cc | 36 ++++++++-- src/qdisp/Executive.h | 12 ++-- src/qdisp/SharedResources.h | 8 +-- src/qdisp/UberJob.cc | 3 +- src/qdisp/UberJob.h | 4 +- src/wbase/FileChannelShared.cc | 4 +- src/wbase/UberJobData.cc | 52 ++++++++------ src/wbase/UberJobData.h | 61 ++++++++-------- src/wbase/UserQueryInfo.cc | 1 - src/wbase/UserQueryInfo.h | 4 +- src/wcontrol/Foreman.cc | 31 ++++----- src/wcontrol/Foreman.h | 14 ++-- src/wpublish/QueriesAndChunks.cc | 17 +---- src/wpublish/QueryStatistics.cc | 4 +- src/xrdsvc/HttpWorkerCzarModule.cc | 12 ++++ src/xrdsvc/SsiService.cc | 8 +-- 27 files changed, 484 insertions(+), 187 deletions(-) diff --git a/src/czar/ActiveWorker.cc b/src/czar/ActiveWorker.cc index 604720cd0..ed0e445c5 100644 --- a/src/czar/ActiveWorker.cc +++ b/src/czar/ActiveWorker.cc @@ -96,7 +96,6 @@ void ActiveWorker::updateStateAndSendMessages(double timeoutAliveSecs, double ti case ALIVE: { if (secsSinceUpdate >= timeoutAliveSecs) { _changeStateTo(QUESTIONABLE, secsSinceUpdate, cName(__func__)); - // &&& Anything else that should be done here? } break; } @@ -152,10 +151,8 @@ void ActiveWorker::updateStateAndSendMessages(double timeoutAliveSecs, double ti jsWorkerReqPtr = _wqsData->serializeJson(maxLifetime); } - // &&& Maybe only send the status message if the lists are not empty ??? - // Start a thread to send the message. (Maybe these should go on the qdisppool? &&&) - // put this in a different function and start the thread.&&&; - //&&& _sendStatusMsg(wInfo_, jsWorkerReqPtr); + // Always send the message as it's a way to inform the worker that this + // czar is functioning and capable of receiving requests. 
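// Review note (not part of the patch): updateStateAndSendMessages() above drives a
// small liveness state machine keyed on seconds since the worker was last heard
// from. A distilled, runnable sketch of those transitions (hypothetical names; the
// revival path QUESTIONABLE -> ALIVE is an assumption here, and the real code also
// queues the status message and consults the registry):
#include <iostream>

enum class WorkerState { ALIVE, QUESTIONABLE, DEAD };

// Advance the state given seconds since the last contact from the worker.
WorkerState advance(WorkerState state, double secsSinceUpdate, double timeoutAliveSecs,
                    double timeoutDeadSecs) {
    switch (state) {
        case WorkerState::ALIVE:
            if (secsSinceUpdate >= timeoutAliveSecs) return WorkerState::QUESTIONABLE;
            break;
        case WorkerState::QUESTIONABLE:
            if (secsSinceUpdate < timeoutAliveSecs) return WorkerState::ALIVE;  // assumed revival
            if (secsSinceUpdate >= timeoutDeadSecs) return WorkerState::DEAD;
            break;
        case WorkerState::DEAD:
            break;  // DEAD workers only come back via fresh registry contact info
    }
    return state;
}

int main() {
    WorkerState s = advance(WorkerState::ALIVE, 20.0, 15.0, 600.0);
    std::cout << (s == WorkerState::QUESTIONABLE) << "\n";  // prints 1
}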
        Ptr thisPtr = shared_from_this();
        auto sendStatusMsgFunc = [thisPtr, wInfo_, jsWorkerReqPtr](util::CmdData*) {
            thisPtr->_sendStatusMsg(wInfo_, jsWorkerReqPtr);
diff --git a/src/czar/ActiveWorker.h b/src/czar/ActiveWorker.h
index b376bb13a..0ddc1f9d5 100644
--- a/src/czar/ActiveWorker.h
+++ b/src/czar/ActiveWorker.h
@@ -38,38 +38,35 @@
 // This header declarations
 namespace lsst::qserv::czar {

-/// &&& doc - maintain list of done/cancelled queries for an active worker, and send
-/// that list to the worker. Once the worker has accepted the list, remove
-/// all of those queryId's from the list.
-/// - maintain a list of killed UberJobs. If an UberJob is killed, nothing
-/// will every look for its files, so they should be deleted, and the
-/// worker should avoid working on Tasks for that UberJob.
-/// The only UberJob deaths that need to be sent to a worker is when
-/// the czar kills an UberJob because the worker died/vanished, and
-/// the only time this would be sent is when a worker came back from
-/// the dead.
-/// The reason this only applies to died/vanished workers is that all
-/// other workers know their UberJobs are dead because the worker killed
-/// them. If the worker isn't told, it will continue working on
-/// the UberJob until it finishes, and then find out the UberJob was killed
-/// when it tries to return results to the czar (worker should delete files
-/// for said UberJob at that point).
-/// So, this should be very rare, only results in extra load, and therefore
-/// is a low priority.
+/// - maintain list of done/cancelled queries for an active worker, and send
+///   that list to the worker. Once the worker has accepted the list, remove
+///   all of those queryId's from the list.
+/// - maintain a list of killed UberJobs. If an UberJob is killed, nothing
+///   will ever look for its files, so they should be deleted, and the
+///   worker should avoid working on Tasks for that UberJob.
+///   The only UberJob deaths that need to be sent to a worker are when
+///   the czar kills an UberJob because the worker died/vanished, and
+///   the only time this would be sent is when a worker came back from
+///   the dead.
+///   The reason this only applies to died/vanished workers is that all
+///   other workers know their UberJobs are dead because the worker killed
+///   them. If the worker isn't told, it will continue working on
+///   the UberJob until it finishes, and then find out the UberJob was killed
+///   when it tries to return results to the czar (worker should delete files
+///   for said UberJob at that point).
+///   So, this should be very rare, only results in extra load, and therefore
+///   is a low priority.
 ///
-/// If a worker goes missing from the registry, it is considered DEAD and will be
-/// removed after a period of time.
-/// If a worker hasn't been heard from in (timeout period), it is considered QUESIONABLE.
-/// When switching to QUESTIONABLE, a message will be sent to the worker asking
-/// for an update.
-/// If a QUESTIONABLE worker hasn't been heard from in (timeout period), its state is changed
-/// to LOST_CONTACT and a message is sent to the worker asking for an update.
-/// If a LOST_CONTACT worker hasn't been heard from in (timeout period), it becomes DEAD.
+/// If a worker goes missing from the registry, it is considered DEAD and may be
+/// removed after a period of time.
+/// If a worker hasn't been heard from in (timeout period), it is considered QUESTIONABLE.
+/// If a QUESTIONABLE worker hasn't been heard from in (timeout period), its state is changed
+/// to DEAD.
 ///
-/// When a worker becomes DEAD: (this should probably all happen in _monitor).
-/// - Affected UberJobs are killed.
-/// - maps are remade without the dead workers
-/// - uberjobs built to handle unassigned jobs.
+/// When a worker becomes DEAD: (see Czar::_monitor).
+///   - Affected UberJobs are killed.
+///   - maps are remade without the dead workers
+///   - uberjobs are built to handle unassigned jobs.
 ///
 class ActiveWorker : public std::enable_shared_from_this {
 public:
@@ -107,24 +104,38 @@ class ActiveWorker : public std::enable_shared_from_this {

     ~ActiveWorker() = default;

-    /// &&& doc
+    /// Return true if there were differences in worker id, host, or port values.
     bool compareContactInfo(http::WorkerContactInfo const& wcInfo) const;

     void setWorkerContactInfo(http::WorkerContactInfo::Ptr const& wcInfo);

-    /// &&& doc
+    /// Check this worker's state (by looking at contact information) and queue
+    /// the WorkerQueryStatusData message `_wqsData` to be sent if this worker
+    /// isn't DEAD.
     void updateStateAndSendMessages(double timeoutAliveSecs, double timeoutDeadSecs, double maxLifetime);

-    /// &&& doc
+    /// Add `qId` to the list of QueryId's that the worker can discard all tasks and
+    /// result files for. This `qId` will be removed from the list once the worker
+    /// has responded to the `_wqsData` message with this `qId` in the appropriate
+    /// list.
+    /// It is expected that all completed or cancelled queries on this worker will
+    /// be added to this list.
     void addToDoneDeleteFiles(QueryId qId);

-    /// &&& doc
+    /// Add `qId` to the list of QueryId's for which the worker must hold
+    /// onto result files but may eliminate tasks. This `qId` will be removed
+    /// from the list once the worker has responded to the `_wqsData` message with
+    /// this `qId` in the appropriate list.
     void addToDoneKeepFiles(QueryId qId);

-    /// &&&doc
+    /// Add the UberJob to the list of dead UberJobs. This `qId` will be removed
+    /// from the list once the worker has responded to the `_wqsData` message with
+    /// this `qId` in the appropriate list, or when the `qId` appears in a
+    /// removeDeadUberJobsFor() call.
     void addDeadUberJob(QueryId qId, UberJobId ujId);

-    /// &&& doc
+    /// If a query is completed or cancelled, there's no reason to track the
+    /// individual UberJobs anymore, so this function will get rid of them.
     void removeDeadUberJobsFor(QueryId qId);

     std::string dump() const;
@@ -162,7 +173,6 @@ class ActiveWorker : public std::enable_shared_from_this {

     /// The number of communication threads currently in use by this class instance.
std::atomic _conThreadCount{0}; int _maxConThreadCount{2}; - }; /// &&& doc diff --git a/src/czar/Czar.cc b/src/czar/Czar.cc index 2b61d33d7..c2df9e545 100644 --- a/src/czar/Czar.cc +++ b/src/czar/Czar.cc @@ -192,8 +192,7 @@ Czar::Czar(string const& configFilePath, string const& czarName) << vectRunSizesStr << " -> " << util::prettyCharList(vectRunSizes) << " vectMinRunningSizes=" << vectMinRunningSizesStr << " -> " << util::prettyCharList(vectMinRunningSizes)); - _qdispPool = - make_shared(qPoolSize, maxPriority, vectRunSizes, vectMinRunningSizes); + _qdispPool = make_shared(qPoolSize, maxPriority, vectRunSizes, vectMinRunningSizes); qdisp::CzarStats::setup(_qdispPool); @@ -686,6 +685,16 @@ std::shared_ptr Czar::getExecutiveFromMap(QueryId qId) { return exec; } +std::map> Czar::getExecMapCopy() const { + // Copy list of executives so the mutex isn't held forever. + std::map> execMap; + { + lock_guard lgMap(_executiveMapMtx); + execMap = _executiveMap; + } + return execMap; +} + void Czar::killIncompleteUbjerJobsOn(std::string const& restartedWorkerId) { // Copy list of executives so the mutex isn't held forever. std::map> execMap; diff --git a/src/czar/Czar.h b/src/czar/Czar.h index c8dc221c6..b913a8fbf 100644 --- a/src/czar/Czar.h +++ b/src/czar/Czar.h @@ -147,6 +147,8 @@ class Czar { std::shared_ptr getActiveWorkerMap() const { return _activeWorkerMap; } + std::map> getExecMapCopy() const; + /// &&& doc void killIncompleteUbjerJobsOn(std::string const& workerId); @@ -220,7 +222,7 @@ class Czar { /// Connection to the registry to register the czar and get worker contact information. std::shared_ptr _czarRegistry; - std::mutex _executiveMapMtx; ///< protects _executiveMap + mutable std::mutex _executiveMapMtx; ///< protects _executiveMap std::map> _executiveMap; ///< Map of executives for queries in progress. diff --git a/src/czar/HttpCzarWorkerModule.cc b/src/czar/HttpCzarWorkerModule.cc index 471bacee2..5f82eb2be 100644 --- a/src/czar/HttpCzarWorkerModule.cc +++ b/src/czar/HttpCzarWorkerModule.cc @@ -68,6 +68,8 @@ json HttpCzarWorkerModule::executeImpl(string const& subModuleName) { return _queryJobError(); else if (subModuleName == "QUERYJOB-READY") return _queryJobReady(); + else if (subModuleName == "WORKERCZARCOMISSUE") + return _workerCzarComIssue(); throw invalid_argument(context() + func + " unsupported sub-module"); } @@ -87,6 +89,14 @@ json HttpCzarWorkerModule::_queryJobReady() { return ret; } +json HttpCzarWorkerModule::_workerCzarComIssue() { + debug(__func__); + checkApiVersion(__func__, 34); + LOGS(_log, LOG_LVL_DEBUG, __func__ << " workerczarcomissue json=" << body().objJson); + auto ret = _handleWorkerCzarComIssue(__func__); + return ret; +} + json HttpCzarWorkerModule::_handleJobError(string const& func) { // Metadata-only responses for the file-based protocol should not have any data @@ -166,4 +176,39 @@ json HttpCzarWorkerModule::_handleJobReady(string const& func) { return jsRet; } +json HttpCzarWorkerModule::_handleWorkerCzarComIssue(string const& func) { + // Parse and verify the json message and then deal with the problems. 
+ json jsRet = {{"success", 1}, {"errortype", "unknown"}, {"note", "initialized"}}; + try { + string const replicationInstanceId = cconfig::CzarConfig::instance()->replicationInstanceId(); + string const replicationAuthKey = cconfig::CzarConfig::instance()->replicationAuthKey(); + auto const& jsReq = body().objJson; + auto wccIssue = + http::WorkerCzarComIssue::createFromJson(jsReq, replicationInstanceId, replicationAuthKey); + + auto wId = wccIssue->getWorkerInfo()->wId; + if (wccIssue->getThoughtCzarWasDead()) { + LOGS(_log, LOG_LVL_WARN, + "HttpCzarWorkerModule::_handleWorkerCzarComIssue worker=" + << wId << " thought czar was dead and killed related uberjobs."); + + // Find all incomplete UberJobs with this workerId and re-assign them. + // Use a copy to avoid mutex issues. + auto execMap = czar::Czar::getCzar()->getExecMapCopy(); + for (auto const& [exKey, execWeak] : execMap) { + auto execPtr = execWeak.lock(); + if (execPtr == nullptr) continue; + execPtr->killIncompleteUberJobsOnWorker(wId); + } + } + + } catch (std::invalid_argument const& iaEx) { + LOGS(_log, LOG_LVL_ERROR, + "HttpCzarWorkerModule::_handleWorkerCzarComIssue received " << iaEx.what() + << " js=" << body().objJson); + jsRet = {{"success", 0}, {"errortype", "parse"}, {"note", iaEx.what()}}; + } + return jsRet; +} + } // namespace lsst::qserv::czar diff --git a/src/czar/HttpCzarWorkerModule.h b/src/czar/HttpCzarWorkerModule.h index 69f4a3fef..a6d21536c 100644 --- a/src/czar/HttpCzarWorkerModule.h +++ b/src/czar/HttpCzarWorkerModule.h @@ -70,11 +70,17 @@ class HttpCzarWorkerModule : public QhttpModule { /// Called to indicate an UberJob is ready with data that needs to be collected. nlohmann::json _queryJobReady(); + /// Called to indicate there were problems with the worker trying to reach this czar. + nlohmann::json _workerCzarComIssue(); + /// Translates the message and calls the Czar to collect the data. nlohmann::json _handleJobReady(std::string const& func); /// Translates the error and calls the Czar to take action. nlohmann::json _handleJobError(std::string const& func); + + /// Translates the issues and calls the Czar to take action. + nlohmann::json _handleWorkerCzarComIssue(std::string const& func); }; } // namespace lsst::qserv::czar diff --git a/src/czar/HttpSvc.cc b/src/czar/HttpSvc.cc index b67330e27..3d953cdab 100644 --- a/src/czar/HttpSvc.cc +++ b/src/czar/HttpSvc.cc @@ -101,6 +101,11 @@ uint16_t HttpSvc::start() { [self](shared_ptr const& req, shared_ptr const& resp) { HttpCzarWorkerModule::process(::serviceName, req, resp, "QUERYJOB-READY"); }}}); + _httpServerPtr->addHandlers( + {{"POST", "/workerczarcomissue", + [self](shared_ptr const& req, shared_ptr const& resp) { + HttpCzarWorkerModule::process(::serviceName, req, resp, "WORKERCZARCOMISSUE"); + }}}); _httpServerPtr->start(); // Initialize the I/O context and start the service threads. 
At this point
diff --git a/src/http/WorkerQueryStatusData.cc b/src/http/WorkerQueryStatusData.cc
index e9524a26e..247efa04b 100644
--- a/src/http/WorkerQueryStatusData.cc
+++ b/src/http/WorkerQueryStatusData.cc
@@ -71,7 +71,7 @@ CzarContactInfo::Ptr CzarContactInfo::createFromJson(nlohmann::json const& czJso
 std::string CzarContactInfo::dump() const {
     stringstream os;
     //&&& os << "czName=" << czName << " czId=" << czId << " czPort=" << czPort << " czHostName=" <<
-    //czHostName;
+    // czHostName;
     os << "czName=" << czName << " czId=" << czId << " czPort=" << czPort << " czHostName=" << czHostName
        << " czStartupTime=" << czStartupTime;
     return os.str();
@@ -434,4 +434,79 @@ string WorkerQueryStatusData::_dump() const {
     return os.str();
 }

+shared_ptr WorkerCzarComIssue::serializeJson() {
+    shared_ptr jsCzarReqPtr = make_shared();
+    json& jsCzarR = *jsCzarReqPtr;
+    lock_guard _lgWciMtx(_wciMtx);
+    if (_wInfo == nullptr || _czInfo == nullptr) {
+        LOGS(_log, LOG_LVL_ERROR, cName(__func__) << " _wInfo or _czInfo was null");
+        return jsCzarReqPtr;
+    }
+    //&&&auto now = CLOCK::now();
+    jsCzarR["version"] = http::MetaModule::version;
+    jsCzarR["instance_id"] = _replicationInstanceId;
+    jsCzarR["auth_key"] = _replicationAuthKey;
+    jsCzarR["czar"] = _czInfo->serializeJson();
+    jsCzarR["worker"] = _wInfo->serializeJson();
+
+    jsCzarR["thoughtczarwasdead"] = _thoughtCzarWasDead;
+
+    // &&& add list of failed transmits
+
+    return jsCzarReqPtr;
+}
+
+WorkerCzarComIssue::Ptr WorkerCzarComIssue::createFromJson(nlohmann::json const& jsCzarReq,
+                                                           std::string const& replicationInstanceId_,
+                                                           std::string const& replicationAuthKey_) {
+    string const fName("WorkerCzarComIssue::createFromJson");
+    LOGS(_log, LOG_LVL_WARN, fName << " &&& a");
+    try {
+        if (jsCzarReq["version"] != http::MetaModule::version) {
+            LOGS(_log, LOG_LVL_ERROR, fName << " bad version");
+            return nullptr;
+        }
+
+        LOGS(_log, LOG_LVL_ERROR, fName << " &&& b");
+        auto czInfo_ = CzarContactInfo::createFromJson(jsCzarReq["czar"]);
+        LOGS(_log, LOG_LVL_ERROR, fName << " &&& c");
+        auto now = CLOCK::now();
+        auto wInfo_ = WorkerContactInfo::createFromJsonWorker(jsCzarReq["worker"], now);
+        LOGS(_log, LOG_LVL_ERROR, fName << " &&& d");
+        if (czInfo_ == nullptr || wInfo_ == nullptr) {
+            LOGS(_log, LOG_LVL_ERROR, fName << " czar or worker info could not be parsed in " << jsCzarReq);
+        }
+        //&&&auto wccIssue = create(wInfo_, czInfo_, replicationInstanceId_, replicationAuthKey_);
+        auto wccIssue = create(replicationInstanceId_, replicationAuthKey_);
+        wccIssue->setContactInfo(wInfo_, czInfo_);
+        LOGS(_log, LOG_LVL_ERROR, fName << " &&& e");
+        wccIssue->_thoughtCzarWasDead = RequestBodyJSON::required(jsCzarReq, "thoughtczarwasdead");
+        LOGS(_log, LOG_LVL_ERROR, fName << " &&& end");
+        return wccIssue;
+    } catch (invalid_argument const& exc) {
+        LOGS(_log, LOG_LVL_ERROR, string("WorkerCzarComIssue::createFromJson invalid ") << exc.what());
+    }
+    return nullptr;
+}
+
+json WorkerCzarComIssue::serializeResponseJson() {
+    json jsResp = {{"success", 1}, {"errortype", "none"}, {"note", ""}};
+
+    // TODO:UJ &&& add lists of uberjobs that are scheduled to have files collected because of this message.
+    return jsResp;
+}
+
+string WorkerCzarComIssue::dump() const {
+    lock_guard _lgWciMtx(_wciMtx);
+    return _dump();
+}
+
+string WorkerCzarComIssue::_dump() const {
+    stringstream os;
+    os << "WorkerCzarComIssue wInfo=" << ((_wInfo == nullptr) ? "?"
+                                                              : _wInfo->dump());
+    os << " czInfo=" << _czInfo->dump();
+    os << " thoughtCzarWasDead=" << _thoughtCzarWasDead;
+    return os.str();
+}
+
 } // namespace lsst::qserv::http
diff --git a/src/http/WorkerQueryStatusData.h b/src/http/WorkerQueryStatusData.h
index c56c148b0..63066fc21 100644
--- a/src/http/WorkerQueryStatusData.h
+++ b/src/http/WorkerQueryStatusData.h
@@ -39,7 +39,7 @@ namespace lsst::qserv::http {

 /// This class just contains the czar id and network contact information.
-class CzarContactInfo {
+class CzarContactInfo : public std::enable_shared_from_this {
 public:
     using Ptr = std::shared_ptr;
     std::string cName(const char* fnc) const { return std::string("CzarContactInfo") + fnc; }
@@ -164,19 +164,6 @@ class WorkerContactInfo {
         return _regUpdate;
     }

-    /* &&&
-    /// Sets _wStartupTime to startupTime, but only if _wStartupTime was 0.
-    /// @returns true if _wStartupTime was set.
-    bool setWStartupTime(uint64_t startupTime) { //&&& del if not used
-        std::lock_guard lg(_rMtx);
-        if (_wStartupTime == 0) {
-            _wStartupTime = startupTime;
-            return true;
-        }
-        return false;
-    }
-    */
-
     /// @return true if startupTime equals _wStartupTime or _wStartupTime was never set,
     /// if _wStartupTime was never set, it is set to startupTime.
     /// @return false indicates the worker was restarted and all associated jobs need
@@ -317,16 +304,11 @@ class WorkerQueryStatusData {
     void parseLists(nlohmann::json const& jsWR, TIMEPOINT updateTm);

     /// &&& doc
-    //&&&nlohmann::json serializeResponseJson();
     nlohmann::json serializeResponseJson(uint64_t workerStartupTime);

     /// &&& doc
-    //&&&bool handleResponseJson(nlohmann::json const& jsResp);
     std::pair handleResponseJson(nlohmann::json const& jsResp);

-    /// &&& doc
-    ///&&&void handleCzarRestart();
-
     /// &&& doc
     static void parseListsInto(nlohmann::json const& jsWR, TIMEPOINT updateTm,
                                std::map& doneKeepF,
@@ -366,6 +348,94 @@ class WorkerQueryStatusData {
     std::string _dump() const;
 };

+/// This class is used to send/receive a message from the worker to a specific
+/// czar when there has been a communication issue with the worker sending UberJob
+/// file ready messages. If there have been timeouts, the worker will send this
+/// message to the czar immediately after the worker receives a
+/// WorkerQueryStatusData message from the czar (indicating that communication
+/// is now possible).
+/// If communication with the czar has failed for a long time, the worker
+/// will set "_thoughtCzarWasDead" and delete all incomplete work associated
+/// with that czar. Result files will remain until garbage cleanup or the czar
+/// calls for their removal.
+/// TODO:UJ &&& UberJob complete messages that failed to be sent to the czar
+/// TODO:UJ &&& will be added to this message.
+/// Upon successful completion, the worker will clear all values set by
+/// the czar.
+/// This message is expected to be needed only rarely.
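// Review note (not part of the patch): a worker-side usage sketch for the class
// declared below. Hypothetical helper; in real code it would live after the
// declaration, and the surrounding worker logic would supply wInfo/czInfo and
// decide whether the czar looked dead.
#include <memory>
#include <string>
#include "nlohmann/json.hpp"

std::shared_ptr<nlohmann::json> buildComIssueMsgIfNeeded(std::string const& instanceId,
                                                         std::string const& authKey,
                                                         WorkerContactInfo::Ptr const& wInfo,
                                                         CzarContactInfo::Ptr const& czInfo,
                                                         bool czarLookedDead) {
    auto issue = WorkerCzarComIssue::create(instanceId, authKey);
    issue->setContactInfo(wInfo, czInfo);          // first non-null values win, later calls are no-ops
    issue->setThoughtCzarWasDead(czarLookedDead);  // worker already discarded that czar's work
    if (!issue->needToSend()) return nullptr;      // nothing worth reporting
    return issue->serializeJson();                 // body for POST to the czar's /workerczarcomissue
}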
+class WorkerCzarComIssue {
+public:
+    using Ptr = std::shared_ptr;
+
+    WorkerCzarComIssue() = delete;
+    ~WorkerCzarComIssue() = default;
+
+    std::string cName(const char* funcN) { return std::string("WorkerCzarComIssue") + funcN; }
+
+    static Ptr create(std::string const& replicationInstanceId_, std::string const& replicationAuthKey_) {
+        return Ptr(new WorkerCzarComIssue(replicationInstanceId_, replicationAuthKey_));
+    }
+
+    static Ptr createFromJson(nlohmann::json const& workerJson, std::string const& replicationInstanceId_,
+                              std::string const& replicationAuthKey_);
+
+    void setThoughtCzarWasDead(bool wasDead) {
+        std::lock_guard lg(_wciMtx);
+        _thoughtCzarWasDead = wasDead;
+    }
+
+    bool getThoughtCzarWasDead() const { return _thoughtCzarWasDead; }
+
+    /// Return true if this message contains anything that needs to be sent to the czar.
+    bool needToSend() const {
+        std::lock_guard lg(_wciMtx);
+        return _thoughtCzarWasDead;  // &&& or list of failed transmits not empty.
+    }
+
+    /// Set the worker and czar contact information, but only the values that
+    /// have not already been set.
+    void setContactInfo(WorkerContactInfo::Ptr const& wInfo_, CzarContactInfo::Ptr const& czInfo_) {
+        std::lock_guard lgWci(_wciMtx);
+        if (_wInfo == nullptr && wInfo_ != nullptr) _wInfo = wInfo_;
+        if (_czInfo == nullptr && czInfo_ != nullptr) _czInfo = czInfo_;
+    }
+
+    CzarContactInfo::Ptr getCzarInfo() const {
+        std::lock_guard lgWci(_wciMtx);
+        return _czInfo;
+    }
+
+    WorkerContactInfo::Ptr getWorkerInfo() const {
+        std::lock_guard lgWci(_wciMtx);
+        return _wInfo;
+    }
+
+    /// Serialize this message to json so it can be sent to the czar.
+    std::shared_ptr serializeJson();
+
+    /// Serialize the czar's json response to this message.
+    nlohmann::json serializeResponseJson();
+
+    std::string dump() const;
+
+private:
+    WorkerCzarComIssue(std::string const& replicationInstanceId_, std::string const& replicationAuthKey_)
+            : _replicationInstanceId(replicationInstanceId_), _replicationAuthKey(replicationAuthKey_) {}
+
+    std::string _dump() const;
+
+    WorkerContactInfo::Ptr _wInfo;
+    CzarContactInfo::Ptr _czInfo;
+    std::string const _replicationInstanceId;  ///< Id of this replication system instance, used to verify messages.
+    std::string const _replicationAuthKey;     ///< Authorization key for the replication system.
+
+    /// Set to true by the worker if the czar was considered dead, and reset to false
+    /// after the czar has acknowledged successful reception of this message.
+    bool _thoughtCzarWasDead = false;
+
+    mutable std::mutex _wciMtx;  ///< protects all members.
+}; + } // namespace lsst::qserv::http #endif // LSST_QSERV_HTTP_WORKERQUERYSTATUSDATA_H diff --git a/src/http/testStatusData.cc b/src/http/testStatusData.cc index d9f537711..2256de93a 100644 --- a/src/http/testStatusData.cc +++ b/src/http/testStatusData.cc @@ -43,7 +43,7 @@ using namespace lsst::qserv::http; BOOST_AUTO_TEST_SUITE(Suite) -BOOST_AUTO_TEST_CASE(CzarContactInfo) { +BOOST_AUTO_TEST_CASE(WorkerQueryStatusData) { string const replicationInstanceId = "repliInstId"; string const replicationAuthKey = "repliIAuthKey"; @@ -166,4 +166,52 @@ BOOST_AUTO_TEST_CASE(CzarContactInfo) { BOOST_REQUIRE(wqsdA->qIdDeadUberJobs.empty()); } +BOOST_AUTO_TEST_CASE(WorkerCzarComIssue) { + string const replicationInstanceId = "repliInstId"; + string const replicationAuthKey = "repliIAuthKey"; + + uint64_t cxrStartTime = lsst::qserv::millisecSinceEpoch(lsst::qserv::CLOCK::now() - 5s); + + string const czrName("czar_name"); + lsst::qserv::CzarIdType const czrId = 32; + int czrPort = 2022; + string const czrHost("cz_host"); + + auto czarA = lsst::qserv::http::CzarContactInfo::create(czrName, czrId, czrPort, czrHost, cxrStartTime); + LOGS_ERROR("&&&i a czarA=" << czarA->dump()); + auto czarAJs = czarA->serializeJson(); + LOGS_ERROR("&&&i b czarAJs=" << czarAJs); + + auto start = lsst::qserv::CLOCK::now(); + auto workerA = WorkerContactInfo::create("sd_workerA", "host_w1", "mgmhost_a", 3421, start); + LOGS_ERROR("&&&i d workerA=" << workerA->dump()); + auto jsWorkerA = workerA->serializeJson(); + LOGS_ERROR("&&&i e jsWorkerA=" << jsWorkerA); + + // WorkerCzarComIssue + //&&&auto wccIssueA = lsst::qserv::http::WorkerCzarComIssue::create(workerA, czarA, replicationInstanceId, + //replicationAuthKey); + auto wccIssueA = lsst::qserv::http::WorkerCzarComIssue::create(replicationInstanceId, replicationAuthKey); + wccIssueA->setContactInfo(workerA, czarA); + BOOST_REQUIRE(wccIssueA->needToSend() == false); + wccIssueA->setThoughtCzarWasDead(true); + BOOST_REQUIRE(wccIssueA->needToSend() == true); + + LOGS_ERROR("&&&i f wccIssue=" << wccIssueA->dump()); + + auto jsIssueA = wccIssueA->serializeJson(); + LOGS_ERROR("&&&i g jsIssue=" << *jsIssueA); + + auto wccIssueA1 = lsst::qserv::http::WorkerCzarComIssue::createFromJson(*jsIssueA, replicationInstanceId, + replicationAuthKey); + LOGS_ERROR("&&&i i wccIssueA1=" << wccIssueA1->dump()); + LOGS_ERROR("&&&i i wccIssueA=" << wccIssueA->dump()); + auto jsIssueA1 = wccIssueA1->serializeJson(); + LOGS_ERROR("&&&i i jsIssueA1=" << *jsIssueA1); + LOGS_ERROR("&&&i i jsIssueA=" << *jsIssueA); + BOOST_REQUIRE(*jsIssueA == *jsIssueA1); + + // &&& Test with items in lists. +} + BOOST_AUTO_TEST_SUITE_END() diff --git a/src/qdisp/CzarStats.h b/src/qdisp/CzarStats.h index db80be7ec..c22eaf3d9 100644 --- a/src/qdisp/CzarStats.h +++ b/src/qdisp/CzarStats.h @@ -44,7 +44,6 @@ // Third party headers #include - namespace lsst::qserv::util { class QdispPool; } diff --git a/src/qdisp/Executive.cc b/src/qdisp/Executive.cc index 821673d25..fa03d6c56 100644 --- a/src/qdisp/Executive.cc +++ b/src/qdisp/Executive.cc @@ -236,7 +236,7 @@ void Executive::queueJobStart(util::PriorityCommand::Ptr const& cmd) { } } -void Executive::queueFileCollect(util::PriorityCommand::Ptr const& cmd) { // &&& put file collect in the pool ??? 
+void Executive::queueFileCollect(util::PriorityCommand::Ptr const& cmd) {
     if (_scanInteractive) {
         _qdispPool->queCmd(cmd, 3);
     } else {
@@ -245,7 +245,6 @@ void Executive::queueFileCollect(util::PriorityCommand::Ptr const& cmd) { // &&&
 }

 void Executive::runUberJob(std::shared_ptr const& uberJob) {
-
     auto runUberJobFunc = [uberJob](util::CmdData*) { uberJob->runUberJob(); };

     auto cmd = util::PriorityCommand::Ptr(new util::PriorityCommand(runUberJobFunc));
@@ -507,11 +506,40 @@ void Executive::_squashSuperfluous() {
 }

 void Executive::sendWorkersEndMsg(bool deleteResults) {
-    LOGS(_log, LOG_LVL_INFO, cName(__func__) << " terminating this query deleteResults="
-        << deleteResults);
+    LOGS(_log, LOG_LVL_INFO, cName(__func__) << " terminating this query deleteResults=" << deleteResults);
     czar::Czar::getCzar()->getCzarRegistry()->endUserQueryOnWorkers(_id, deleteResults);
 }

+void Executive::killIncompleteUberJobsOnWorker(std::string const& workerId) {
+    if (_cancelled) {
+        LOGS(_log, LOG_LVL_INFO, cName(__func__) << " irrelevant as query already cancelled");
+        return;
+    }
+
+    LOGS(_log, LOG_LVL_INFO, cName(__func__) << " killing incomplete UberJobs on " << workerId);
+    deque ujToCancel;
+    {
+        lock_guard lockUJMap(_uberJobsMapMtx);
+        for (auto const& [ujKey, ujPtr] : _uberJobsMap) {
+            auto ujStatus = ujPtr->getStatus()->getState();
+            if (ujStatus != qmeta::JobStatus::RESPONSE_DONE && ujStatus != qmeta::JobStatus::COMPLETE) {
+                // RESPONSE_DONE indicates the result file has already been read by
+                // the czar; an UberJob that hasn't reached that point likely lost
+                // its data when the worker failed. COMPLETE indicates all jobs in
+                // the UberJob are complete.
+                if (ujPtr->getWorkerContactInfo()->wId == workerId) {
+                    ujToCancel.push_back(ujPtr);
+                }
+            }
+        }
+    }
+
+    for (auto const& uj : ujToCancel) {
+        uj->killUberJob();
+        uj->setStatusIfOk(qmeta::JobStatus::CANCEL, getIdStr() + " killIncomplete on worker=" + workerId);
+    }
+}
+
 int Executive::getNumInflight() const {
     unique_lock lock(_incompleteJobsMutex);
     return _incompleteJobs.size();
diff --git a/src/qdisp/Executive.h b/src/qdisp/Executive.h
index c60261238..db02a9c43 100644
--- a/src/qdisp/Executive.h
+++ b/src/qdisp/Executive.h
@@ -81,7 +81,7 @@ namespace util {
 class AsyncTimer;
 class PriorityCommand;
 class QdispPool;
-}
+}  // namespace util

 namespace qdisp {

@@ -134,10 +134,10 @@ class Executive : public std::enable_shared_from_this {
     void runUberJob(std::shared_ptr const& uberJob);

     /// Queue a job to be sent to a worker so it can be started.
-    void queueJobStart(std::shared_ptr const& cmd); // &&& delete ???
+    void queueJobStart(std::shared_ptr const& cmd);  // &&& delete ???

     /// Queue `cmd`, using the QDispPool, so it can be used to collect the result file.
-    void queueFileCollect(std::shared_ptr const& cmd); // &&& delete ???
+    void queueFileCollect(std::shared_ptr const& cmd);  // &&& delete ???

     /// Waits for all jobs on _jobStartCmdList to start. This should not be called
     /// before ALL jobs have been added to the pool.
@@ -153,6 +153,9 @@ class Executive : public std::enable_shared_from_this {
     /// Squash all the jobs.
     void squash();

+    /// Kill all UberJobs assigned to `workerId` that have not yet reached the
+    /// RESPONSE_DONE or COMPLETE state, so their Jobs can be reassigned.
+    void killIncompleteUberJobsOnWorker(std::string const& workerId);
+
     bool getEmpty() { return _empty; }

     /// These values cannot be set until information has been collected from
@@ -265,7 +268,8 @@ class Executive : public std::enable_shared_from_this {

     /// How many jobs are used in this query. 1 avoids possible 0 of 0 jobs completed race condition.
     /// The correct value is set when it is available.
     std::atomic _totalJobs{1};
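// Review note (not part of the patch): killIncompleteUberJobsOnWorker() above uses
// a common lock-discipline idiom: candidates are collected into a local container
// while holding the map mutex, and the potentially slow, lock-taking kill calls
// happen only after the lock is released. A minimal sketch (hypothetical types):
#include <deque>
#include <map>
#include <memory>
#include <mutex>

struct Job {
    bool incomplete = true;
    void cancel() { incomplete = false; }  // stands in for killUberJob() + setStatusIfOk()
};

class JobTable {
public:
    void cancelIncomplete() {
        std::deque<std::shared_ptr<Job>> toCancel;
        {
            std::lock_guard<std::mutex> lock(_mtx);  // hold only while scanning
            for (auto const& [id, job] : _jobs) {
                if (job->incomplete) toCancel.push_back(job);
            }
        }
        for (auto const& job : toCancel) job->cancel();  // no lock held here
    }

private:
    std::mutex _mtx;
    std::map<int, std::shared_ptr<Job>> _jobs;
};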
-    std::shared_ptr _qdispPool; ///< Shared thread pool for handling commands to and from workers.
+    std::shared_ptr
+            _qdispPool;  ///< Shared thread pool for handling commands to and from workers.

     std::deque> _jobStartCmdList;  ///< list of jobs to start.
diff --git a/src/qdisp/SharedResources.h b/src/qdisp/SharedResources.h
index 6ca6eb8a3..0bfadcebf 100644
--- a/src/qdisp/SharedResources.h
+++ b/src/qdisp/SharedResources.h
@@ -25,7 +25,7 @@
 // System headers
 #include

-namespace lsst::qserv::util { // &&& delete
+namespace lsst::qserv::util {  // &&& delete
 class QdispPool;
 }

@@ -35,8 +35,8 @@ namespace lsst::qserv::qdisp {
 /// the number of arguments passed.
 /// This class should be kept simple so it can easily be included in headers
 /// without undue compiler performance problems.
- // &&& there's nothing in here but qdisppool!? Try to delete, but there
- // &&& will probably be unit test issues.
+// &&& there's nothing in here but qdisppool!? Try to delete, but there
+// &&& will probably be unit test issues.
 class SharedResources {
 public:
     using Ptr = std::shared_ptr;
@@ -50,7 +50,7 @@ class SharedResources {
     SharedResources& operator=(SharedResources const&) = delete;
     ~SharedResources() = default;

-    std::shared_ptr getQdispPool() { return _qdispPool; } //&&& delete
+    std::shared_ptr getQdispPool() { return _qdispPool; }  //&&& delete

 private:
     SharedResources(std::shared_ptr const& qdispPool) : _qdispPool(qdispPool) {}
diff --git a/src/qdisp/UberJob.cc b/src/qdisp/UberJob.cc
index 002d48085..fae46550e 100644
--- a/src/qdisp/UberJob.cc
+++ b/src/qdisp/UberJob.cc
@@ -492,8 +492,7 @@ void UberJob::killUberJob() {
     _unassignJobs();
     // Let Czar::_monitor reassign jobs - other UberJobs are probably being killed
-    // so waiting probably gets a better distribution. If this is deemed to slow,
-    // then exec->assignJobsToUberJobs() could be called here.
+    // so waiting probably gets a better distribution.
     return;
 }
diff --git a/src/qdisp/UberJob.h b/src/qdisp/UberJob.h
index 3a599c5c8..8069b4d3a 100644
--- a/src/qdisp/UberJob.h
+++ b/src/qdisp/UberJob.h
@@ -73,8 +73,6 @@ class UberJob : public JobBase {
         return _uberJobId;
     }  // TODO:UJ change name when JobBase no longer needed.
     std::string const& getIdStr() const override { return _idStr; }
-    //&&&std::shared_ptr getQdispPool() override { return _qdispPool; }
-    //&&&std::shared_ptr getQdispPool() { return _qdispPool; }
     std::shared_ptr getRespHandler() override { return _respHandler; }
     std::shared_ptr getStatus() override {
         return _jobStatus;
@@ -131,7 +129,7 @@ class UberJob : public JobBase {
     bool _setStatusIfOk(qmeta::JobStatus::State newState, std::string const& msg);

     /// unassign all Jobs in this UberJob and set the Executive flag to indicate that Jobs need
-    /// reassignment.
+    /// reassignment. The list of _jobs is cleared, so multiple calls of this should be harmless.
     void _unassignJobs();

     /// Import an error from trying to collect results.
diff --git a/src/wbase/FileChannelShared.cc b/src/wbase/FileChannelShared.cc
index 0319f7646..56ca743ee 100644
--- a/src/wbase/FileChannelShared.cc
+++ b/src/wbase/FileChannelShared.cc
@@ -331,9 +331,7 @@ bool FileChannelShared::kill(string const& note) {
     return _kill(streamMutexLock, note);
 }

-bool FileChannelShared::isDead() {
-    return _dead;
-}
+bool FileChannelShared::isDead() { return _dead; }

 string FileChannelShared::makeIdStr(int qId, int jId) {
     string str("QID" + (qId == 0 ?
"" : to_string(qId) + "#" + to_string(jId))); diff --git a/src/wbase/UberJobData.cc b/src/wbase/UberJobData.cc index 8a53810a5..3098908eb 100644 --- a/src/wbase/UberJobData.cc +++ b/src/wbase/UberJobData.cc @@ -77,17 +77,17 @@ void UberJobData::setFileChannelShared(std::shared_ptr const& void UberJobData::responseFileReady(string const& httpFileUrl, uint64_t rowCount, uint64_t fileSize, uint64_t headerCount) { - //&&&string const funcN = cName(__func__); LOGS(_log, LOG_LVL_TRACE, - cName(__func__) << " httpFileUrl=" << httpFileUrl << " rows=" << rowCount << " fSize=" << fileSize - << " headerCount=" << headerCount); + cName(__func__) << " httpFileUrl=" << httpFileUrl << " rows=" << rowCount << " fSize=" << fileSize + << " headerCount=" << headerCount); string workerIdStr; if (_foreman != nullptr) { workerIdStr = _foreman->chunkInventory()->id(); } else { workerIdStr = "dummyWorkerIdStr"; - LOGS(_log, LOG_LVL_INFO, cName(__func__) << " _foreman was null, which should only happen in unit tests"); + LOGS(_log, LOG_LVL_INFO, + cName(__func__) << " _foreman was null, which should only happen in unit tests"); } json request = {{"version", http::MetaModule::version}, @@ -111,7 +111,6 @@ void UberJobData::responseFileReady(string const& httpFileUrl, uint64_t rowCount LOGS(_log, LOG_LVL_WARN, cName(__func__) << " &&& end"); } - bool UberJobData::responseError(util::MultiError& multiErr, std::shared_ptr const& task, bool cancelled) { LOGS(_log, LOG_LVL_INFO, cName(__func__)); @@ -125,8 +124,8 @@ bool UberJobData::responseError(util::MultiError& multiErr, std::shared_ptrgetChunkId()) + ": " + errorMsg; + errorMsg = cName(__func__) + " error(s) in result for chunk #" + to_string(task->getChunkId()) + + ": " + errorMsg; LOGS(_log, LOG_LVL_ERROR, errorMsg); } @@ -149,14 +148,17 @@ bool UberJobData::responseError(util::MultiError& multiErr, std::shared_ptr const& headers_, std::string const& url_, std::string const& requestContext_, std::string const& requestStr_) { +void UberJobData::_queueUJResponse(http::Method method_, std::vector const& headers_, + std::string const& url_, std::string const& requestContext_, + std::string const& requestStr_) { util::QdispPool::Ptr wPool; if (_foreman != nullptr) { wPool = _foreman->getWPool(); } LOGS(_log, LOG_LVL_WARN, cName(__func__) << " &&& creating UJTransmitCmd wPool=" << wPool); - auto cmdTransmit = UJTransmitCmd::create(_foreman, shared_from_this(), method_, headers_, url_, requestContext_, requestStr_); + auto cmdTransmit = UJTransmitCmd::create(_foreman, shared_from_this(), method_, headers_, url_, + requestContext_, requestStr_); LOGS(_log, LOG_LVL_WARN, cName(__func__) << " &&& created UJTransmitCmd wPool=" << wPool); if (wPool == nullptr) { // No thread pool. Run the command now. This should only happen in unit tests. @@ -167,7 +169,7 @@ void UberJobData::_queueUJResponse(http::Method method_, std::vectorqueCmd(cmdTransmit, 0); - }else { + } else { LOGS(_log, LOG_LVL_WARN, cName(__func__) << " &&& creating UJTransmitCmd queue transmit_1"); wPool->queCmd(cmdTransmit, 1); } @@ -191,11 +193,22 @@ string UJTransmitCmd::cName(const char* funcN) const { } void UJTransmitCmd::action(util::CmdData* data) { + // Make certain _selfPtr is reset before leaving this function. + // If a retry is needed, duplicate() is called. 
+    class ResetSelf {
+    public:
+        ResetSelf(UJTransmitCmd* ujtCmd) : _ujtCmd(ujtCmd) {}
+        ~ResetSelf() { _ujtCmd->_selfPtr.reset(); }
+        UJTransmitCmd* const _ujtCmd;
+    };
+    ResetSelf resetSelf(this);
+
     _attemptCount++;
     LOGS(_log, LOG_LVL_WARN, cName(__func__) << " &&& start attempt=" << _attemptCount);
     auto ujPtr = _ujData.lock();
     if (ujPtr == nullptr || ujPtr->getCancelled()) {
         LOGS(_log, LOG_LVL_WARN, cName(__func__) << " UberJob was cancelled " << _attemptCount);
+        return;
     }
     http::Client client(_method, _url, _requestStr, _headers);
     bool transmitSuccess = false;
@@ -203,14 +216,12 @@ void UJTransmitCmd::action(util::CmdData* data) {
         json const response = client.readAsJson();
         if (0 != response.at("success").get()) {
             transmitSuccess = true;
-            _selfPtr.reset();  // clear so this can be deleted.
         } else {
             LOGS(_log, LOG_LVL_WARN, cName(__func__) << " Transmit success == 0");
             // There's no point in re-sending as the czar got the message and didn't like
             // it.
             // &&& maybe add this czId+ujId to a list of failed uberjobs that can be put
             // &&& status return??? Probably overkill.
-            _selfPtr.reset();  // clear so this can be deleted.
         }
     } catch (exception const& ex) {
         LOGS(_log, LOG_LVL_WARN, cName(__func__) << " &&& start d except");
@@ -221,10 +232,11 @@ void UJTransmitCmd::action(util::CmdData* data) {
         auto sPtr = _selfPtr;
         if (_foreman != nullptr && sPtr != nullptr) {
             // Do not reset _selfPtr as re-queuing may be needed several times.
-            LOGS(_log, LOG_LVL_WARN, cName(__func__) << " no response for transmit, putting on failed transmit queue.");
+            LOGS(_log, LOG_LVL_WARN,
+                 cName(__func__) << " no response for transmit, putting on failed transmit queue.");
             auto wCzInfo = _foreman->getWCzarInfoMap()->getWCzarInfo(_czarId);
-            // This will check if the czar is believed to be alive and try the queue the query to be tried again
-            // at a lower priority. It it thinks the czar is dead, it will throw it away.
+            // This will check if the czar is believed to be alive and try to queue the query to be tried
+            // again at a lower priority. If it thinks the czar is dead, it will throw it away.
             // TODO:UJ &&& I have my doubts about this as a reconnected czar may go down in flames
             // &&& as it is hit with thousands of these.
             // &&& Alternate plan, set a flag in the status message response (WorkerQueryStatusData)
@@ -235,22 +247,19 @@ void UJTransmitCmd::action(util::CmdData* data) {
             auto wPool = _foreman->getWPool();
             if (wPool != nullptr) {
                 Ptr replacement = duplicate();
-                _selfPtr.reset();
                 if (replacement != nullptr) {
                     wPool->queCmd(replacement, 2);
                 } else {
                     LOGS(_log, LOG_LVL_ERROR, cName(__func__) << " replacement was null");
                 }
-            } else{
+            } else {
                 // No thread pool, should only be possible in unit tests.
                 LOGS(_log, LOG_LVL_ERROR, cName(__func__) << " no wPool");
-                _selfPtr.reset();
                 return;
             }
         }
     } else {
         LOGS(_log, LOG_LVL_ERROR, cName(__func__) << " _selfPtr was null, assuming job killed.");
-        _selfPtr.reset();  // In case _foreman is null.
     }
 }
     LOGS(_log, LOG_LVL_WARN, cName(__func__) << " &&& start end");
@@ -261,7 +270,9 @@ void UJTransmitCmd::kill() {
     LOGS(_log, LOG_LVL_WARN, funcN);
     auto sPtr = _selfPtr;
     _selfPtr.reset();
-    if (sPtr == nullptr) { return; }
+    if (sPtr == nullptr) {
+        return;
+    }
     // &&& TODO:UJ Is there anything that should be done here???
 }

@@ -273,7 +284,6 @@ UJTransmitCmd::Ptr UJTransmitCmd::duplicate() {
     Ptr newPtr = create(_foreman, ujD, _method, _headers, _url, _requestContext, _requestStr);
     newPtr->_attemptCount = _attemptCount;
     return newPtr;
-
 }

 } // namespace lsst::qserv::wbase
diff --git a/src/wbase/UberJobData.h b/src/wbase/UberJobData.h
index 1af32e511..0ccdf7c7e 100644
--- a/src/wbase/UberJobData.h
+++ b/src/wbase/UberJobData.h
@@ -58,7 +58,7 @@ class Task;

 /// This class tracks all Tasks associated with the UberJob on the worker
 /// and reports status to the czar.
-class UberJobData : public std::enable_shared_from_this{
+class UberJobData : public std::enable_shared_from_this {
 public:
     using Ptr = std::shared_ptr;

@@ -75,9 +75,7 @@ class UberJobData : public std::enable_shared_from_this{
     /// Set file channel for this UberJob
     void setFileChannelShared(std::shared_ptr const& fileChannelShared);

-    void setScanInteractive(bool scanInteractive) {
-        _scanInteractive = scanInteractive;
-    }
+    void setScanInteractive(bool scanInteractive) { _scanInteractive = scanInteractive; }

     UberJobId getUberJobId() const { return _uberJobId; }
     qmeta::CzarId getCzarId() const { return _czarId; }
@@ -113,8 +111,9 @@ class UberJobData : public std::enable_shared_from_this{
                    std::shared_ptr const& foreman, std::string const& authKey);

     /// Queue a command to send the UberJob response `requestStr_` to the czar.
-    void _queueUJResponse(http::Method method_, std::vector const& headers_, std::string const& url_, std::string const& requestContext_, std::string const& requestStr_);
-
+    void _queueUJResponse(http::Method method_, std::vector const& headers_,
+                          std::string const& url_, std::string const& requestContext_,
+                          std::string const& requestStr_);

     UberJobId const _uberJobId;
     std::string const _czarName;
@@ -134,9 +133,9 @@ class UberJobData : public std::enable_shared_from_this{

     std::string const _idStr;

-    std::atomic _scanInteractive; ///< &&& doc
+    std::atomic _scanInteractive;  ///< True if this UberJob's scan is interactive (high priority).

-    std::atomic _cancelled{false}; ///< Set to true if this was cancelled.
+    std::atomic _cancelled{false};  ///< Set to true if this was cancelled.
 };

 /// Command to transmit UberJob messages to the czar; it keeps itself alive via
 /// _selfPtr until the transmit succeeds or is abandoned.
 class UJTransmitCmd : public util::PriorityCommand {
 public:
     using Ptr = std::shared_ptr;

     std::string cName(const char* funcN) const;

-    /* &&&
-    static Ptr create(std::shared_ptr const& foreman_, CzarIdType czarId_, QueryId queryId_, UberJobId uberJobId_, http::Method method_, std::vector const& headers_, std::string const& url_, std::string const& requestContext_, std::string const& requestStr_) {
-        auto ptr = Ptr(new UJTransmitCmd(foreman_, czarId_, queryId_, uberJobId_, method_, headers_, url_, requestContext_, requestStr_));
-        ptr->_selfPtr = ptr;
-        return ptr;
-    }
-    */
-    static Ptr create(std::shared_ptr const& foreman_, UberJobData::Ptr const& ujData_, http::Method method_, std::vector const& headers_, std::string const& url_, std::string const& requestContext_, std::string const& requestStr_) {
-        auto ptr = Ptr(new UJTransmitCmd(foreman_, ujData_, method_, headers_, url_, requestContext_, requestStr_));
+    static Ptr create(std::shared_ptr const& foreman_, UberJobData::Ptr const& ujData_,
+                      http::Method method_, std::vector const& headers_, std::string const& url_,
+                      std::string const& requestContext_, std::string const& requestStr_) {
+        auto ptr = Ptr(
+                new UJTransmitCmd(foreman_, ujData_, method_, headers_, url_, requestContext_, requestStr_));
         ptr->_selfPtr = ptr;
         return ptr;
     }

-    /// This is the function that will be run when the queue gets to this command.
+ /// Send the UberJob file to the czar, this is the function that will be run when + /// the queue reaches this command. If this message is not received by the czar, + /// it will notify WCzarInfo and possibly send WorkerCzarComIssue. void action(util::CmdData* data) override; /// Reset the self pointer so this object can be killed. void kill(); - /// &&& + /// &&& doc Ptr duplicate(); private: - /* &&& - UJTransmitCmd(std::shared_ptr const& foreman_, CzarIdType czarId_, QueryId queryId_, UberJobId uberJobId_, http::Method method_, std::vector const& headers_, std::string const& url_, std::string const& requestContext_, std::string const& requestStr_) - : PriorityCommand(), _foreman(foreman_), _czarId(czarId_), _queryId(queryId_), _uberJobId(uberJobId_), _method(method_), _headers(headers_), _url(url_), _requestContext(requestContext_), _requestStr(requestStr_) {} - */ - UJTransmitCmd(std::shared_ptr const& foreman_, UberJobData::Ptr const& ujData_, http::Method method_, std::vector const& headers_, std::string const& url_, std::string const& requestContext_, std::string const& requestStr_) - : PriorityCommand(), _foreman(foreman_), _ujData(ujData_), _czarId(ujData_->getCzarId()), _queryId(ujData_->getQueryId()), _uberJobId(ujData_->getUberJobId()), _method(method_), _headers(headers_), _url(url_), _requestContext(requestContext_), _requestStr(requestStr_) {} - - Ptr _selfPtr; ///< So this object can put itself back on the queue and keep itself alive. + UJTransmitCmd(std::shared_ptr const& foreman_, UberJobData::Ptr const& ujData_, + http::Method method_, std::vector const& headers_, std::string const& url_, + std::string const& requestContext_, std::string const& requestStr_) + : PriorityCommand(), + _foreman(foreman_), + _ujData(ujData_), + _czarId(ujData_->getCzarId()), + _queryId(ujData_->getQueryId()), + _uberJobId(ujData_->getUberJobId()), + _method(method_), + _headers(headers_), + _url(url_), + _requestContext(requestContext_), + _requestStr(requestStr_) {} + + Ptr _selfPtr; ///< So this object can put itself back on the queue and keep itself alive. std::shared_ptr const _foreman; std::weak_ptr const _ujData; CzarIdType const _czarId; @@ -190,7 +195,7 @@ class UJTransmitCmd : public util::PriorityCommand { std::string const _url; std::string const _requestContext; std::string const _requestStr; - int _attemptCount = 0; ///< How many attempts have been made to transmit this. + int _attemptCount = 0; ///< How many attempts have been made to transmit this. 
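The hunks above settle UJTransmitCmd's ownership story: the command holds a shared_ptr to itself (_selfPtr) so it outlives any single pass through the queue, a ResetSelf RAII guard now drops that reference on every exit from action() instead of the old scattered _selfPtr.reset() calls, and a failed transmit re-queues a fresh duplicate() rather than the spent command. A minimal, self-contained sketch of that idiom; Cmd, RetryCmd, and the plain std::queue are illustrative stand-ins, not the Qserv classes:

#include <iostream>
#include <memory>
#include <queue>

struct Cmd {
    virtual ~Cmd() = default;
    virtual void action() = 0;
};

class RetryCmd : public Cmd {
public:
    using Ptr = std::shared_ptr<RetryCmd>;
    using Pool = std::queue<std::shared_ptr<Cmd>>;

    static Ptr create(Pool& pool) {
        Ptr p(new RetryCmd(pool));
        p->_selfPtr = p;  // keep the command alive independently of the pool
        return p;
    }

    void action() override {
        // RAII guard: the self-reference is dropped however action() exits.
        struct ResetSelf {
            RetryCmd* cmd;
            ~ResetSelf() { cmd->_selfPtr.reset(); }
        } guard{this};
        ++_attemptCount;
        if (!_transmit()) {
            Ptr replacement = create(_pool);    // fresh command, same payload
            replacement->_attemptCount = _attemptCount;
            _pool.push(replacement);            // re-queue the duplicate, not *this
        }
    }

    void kill() { _selfPtr.reset(); }

private:
    explicit RetryCmd(Pool& pool) : _pool(pool) {}
    bool _transmit() const { return _attemptCount >= 2; }  // pretend try 2 succeeds

    Ptr _selfPtr;  // so the object can keep itself alive, as in UJTransmitCmd
    Pool& _pool;
    int _attemptCount = 0;
};

int main() {
    RetryCmd::Pool pool;
    pool.push(RetryCmd::create(pool));
    while (!pool.empty()) {  // single-threaded stand-in for the worker's pool
        auto cmd = pool.front();
        pool.pop();
        cmd->action();
    }
    std::cout << "transmit acknowledged, queue drained\n";
}

Re-queuing a duplicate instead of this keeps every queued command single-shot, so a late kill() only ever has to clear one self-reference.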
util::InstanceCount _ic{cName("&&&")}; }; diff --git a/src/wbase/UserQueryInfo.cc b/src/wbase/UserQueryInfo.cc index ca1bacbc9..888180088 100644 --- a/src/wbase/UserQueryInfo.cc +++ b/src/wbase/UserQueryInfo.cc @@ -69,7 +69,6 @@ void UserQueryInfo::addUberJob(std::shared_ptr const& ujData) { _uberJobMap[ujId] = ujData; } - void UserQueryInfo::cancelFromCzar() { if (_cancelledByCzar.exchange(true)) { LOGS(_log, LOG_LVL_DEBUG, cName(__func__) << " already cancelledByCzar"); diff --git a/src/wbase/UserQueryInfo.h b/src/wbase/UserQueryInfo.h index 1266ebe05..0734f76c6 100644 --- a/src/wbase/UserQueryInfo.h +++ b/src/wbase/UserQueryInfo.h @@ -49,7 +49,9 @@ class UserQueryInfo { UserQueryInfo(UserQueryInfo const&) = delete; UserQueryInfo& operator=(UserQueryInfo const&) = delete; - static Ptr create(QueryId qId, CzarIdType czarId) { return std::shared_ptr(new UserQueryInfo(qId, czarId)); } + static Ptr create(QueryId qId, CzarIdType czarId) { + return std::shared_ptr(new UserQueryInfo(qId, czarId)); + } ~UserQueryInfo() = default; diff --git a/src/wcontrol/Foreman.cc b/src/wcontrol/Foreman.cc index ef56ba618..179022167 100644 --- a/src/wcontrol/Foreman.cc +++ b/src/wcontrol/Foreman.cc @@ -83,19 +83,19 @@ namespace lsst::qserv::wcontrol { Foreman::Ptr Foreman::_globalForeman; - -Foreman::Ptr Foreman::create(Scheduler::Ptr const& scheduler, unsigned int poolSize, unsigned int maxPoolThreads, - mysql::MySqlConfig const& mySqlConfig, wpublish::QueriesAndChunks::Ptr const& queries, - std::shared_ptr const& chunkInventory, - std::shared_ptr const& sqlConnMgr) { +Foreman::Ptr Foreman::create(Scheduler::Ptr const& scheduler, unsigned int poolSize, + unsigned int maxPoolThreads, mysql::MySqlConfig const& mySqlConfig, + wpublish::QueriesAndChunks::Ptr const& queries, + std::shared_ptr const& chunkInventory, + std::shared_ptr const& sqlConnMgr) { // Latch static std::atomic globalForemanSet{false}; if (globalForemanSet.exchange(true) == true) { throw util::Bug(ERR_LOC, "Foreman::create already an existing global Foreman."); } - Ptr fm = Ptr(new Foreman(scheduler, poolSize, maxPoolThreads, - mySqlConfig, queries, chunkInventory, sqlConnMgr)); + Ptr fm = Ptr(new Foreman(scheduler, poolSize, maxPoolThreads, mySqlConfig, queries, chunkInventory, + sqlConnMgr)); _globalForeman = fm; return _globalForeman; } @@ -140,19 +140,18 @@ Foreman::Foreman(Scheduler::Ptr const& scheduler, unsigned int poolSize, unsigne string vectMinRunningSizesStr = _czarConfig->getQdispVectMinRunningSizes(); vector vectMinRunningSizes = util::String::parseToVectInt(vectMinRunningSizesStr, ":", 0); */ - int qPoolSize = 50; // &&& TODO:UJ put in config - int maxPriority = 2; // &&& TODO:UJ put in config - string vectRunSizesStr = "10:10:10:10"; // &&& TODO:UJ put in config - vector vectRunSizes = util::String::parseToVectInt(vectRunSizesStr, ":", 1); - string vectMinRunningSizesStr = "0:1:3:3"; // &&& TODO:UJ put in config - vector vectMinRunningSizes = util::String::parseToVectInt(vectMinRunningSizesStr, ":", 0); + int qPoolSize = 50; // &&& TODO:UJ put in config + int maxPriority = 2; // &&& TODO:UJ put in config + string vectRunSizesStr = "10:10:10:10"; // &&& TODO:UJ put in config + vector vectRunSizes = util::String::parseToVectInt(vectRunSizesStr, ":", 1); + string vectMinRunningSizesStr = "0:1:3:3"; // &&& TODO:UJ put in config + vector vectMinRunningSizes = util::String::parseToVectInt(vectMinRunningSizesStr, ":", 0); LOGS(_log, LOG_LVL_INFO, "INFO wPool config qPoolSize=" << qPoolSize << " maxPriority=" << maxPriority << " 
vectRunSizes=" << vectRunSizesStr << " -> " << util::prettyCharList(vectRunSizes) << " vectMinRunningSizes=" << vectMinRunningSizesStr << " -> " << util::prettyCharList(vectMinRunningSizes)); - _wPool = - make_shared(qPoolSize, maxPriority, vectRunSizes, vectMinRunningSizes); + _wPool = make_shared(qPoolSize, maxPriority, vectRunSizes, vectMinRunningSizes); // Read-only access to the result files via the HTTP protocol's method "GET" // @@ -191,8 +190,6 @@ Foreman::~Foreman() { _httpServer->stop(); } -//&&& wpublish::QueryStatistics::Ptr Foreman::addQueryId(QueryId qId) { return _queries->addQueryId(qId); } - void Foreman::processTasks(vector const& tasks) { std::vector cmds; for (auto const& task : tasks) { diff --git a/src/wcontrol/Foreman.h b/src/wcontrol/Foreman.h index 93775dc0b..d00eed2a6 100644 --- a/src/wcontrol/Foreman.h +++ b/src/wcontrol/Foreman.h @@ -112,9 +112,10 @@ class Foreman { * @param sqlConnMgr - for limiting the number of MySQL connections used for tasks */ static Ptr create(Scheduler::Ptr const& scheduler, unsigned int poolSize, unsigned int maxPoolThreads, - mysql::MySqlConfig const& mySqlConfig, std::shared_ptr const& queries, - std::shared_ptr const& chunkInventory, - std::shared_ptr const& sqlConnMgr); + mysql::MySqlConfig const& mySqlConfig, + std::shared_ptr const& queries, + std::shared_ptr const& chunkInventory, + std::shared_ptr const& sqlConnMgr); ~Foreman(); @@ -135,11 +136,6 @@ class Foreman { /// Process a group of query processing tasks. void processTasks(std::vector> const& tasks); - /* &&& - /// &&& doc - std::shared_ptr addQueryId(QueryId qId); - */ - /// Implement the corresponding method of the base class nlohmann::json statusToJson(wbase::TaskSelector const& taskSelector); @@ -194,7 +190,7 @@ class Foreman { /// Map of czar information for all czars that have contacted this worker. std::shared_ptr const _wCzarInfoMap; - static Ptr _globalForeman; ///< Pointer to the global instance. + static Ptr _globalForeman; ///< Pointer to the global instance. }; } // namespace lsst::qserv::wcontrol diff --git a/src/wpublish/QueriesAndChunks.cc b/src/wpublish/QueriesAndChunks.cc index d492168f5..c49a09aa1 100644 --- a/src/wpublish/QueriesAndChunks.cc +++ b/src/wpublish/QueriesAndChunks.cc @@ -136,23 +136,8 @@ QueryStatistics::Ptr QueriesAndChunks::addQueryId(QueryId qId, CzarIdType czarId void QueriesAndChunks::addTask(wbase::Task::Ptr const& task) { auto qid = task->getQueryId(); auto czId = task->getCzarId(); -#if 0 // &&& delete upper block - unique_lock guardStats(_queryStatsMapMtx); - auto itr = _queryStatsMap.find(qid); - QueryStatistics::Ptr stats; - if (_queryStatsMap.end() == itr) { - stats = QueryStatistics::create(qid); - _queryStatsMap[qid] = stats; - throw util::Bug(ERR_LOC, "&&& QueriesAndChunks::addTask entry should already be there"); // &&& replace with error message ??? - } else { - stats = itr->second; - } - guardStats.unlock(); -#else // &&& auto stats = addQueryId(qid, czId); -#endif // &&& stats->addTask(task); - //&&&task->setQueryStatistics(stats); } /// Update statistics for the Task that was just queued. 
@@ -694,7 +679,7 @@ vector QueriesAndChunks::removeQueryFrom(QueryId const& qId, void QueriesAndChunks::killAllQueriesFromCzar(CzarIdType czarId) { std::map qsMap; { - lock_guard lgQsm(_queryStatsMapMtx); + lock_guard lgQsm(_queryStatsMapMtx); qsMap = _queryStatsMap; } diff --git a/src/wpublish/QueryStatistics.cc b/src/wpublish/QueryStatistics.cc index 8aacf1c8d..bc06eea58 100644 --- a/src/wpublish/QueryStatistics.cc +++ b/src/wpublish/QueryStatistics.cc @@ -51,7 +51,9 @@ LOG_LOGGER _log = LOG_GET("lsst.qserv.wpublish.QueriesAndChunks"); namespace lsst::qserv::wpublish { QueryStatistics::QueryStatistics(QueryId qId_, CzarIdType czarId_) - : creationTime(CLOCK::now()), queryId(qId_), _userQueryInfo(wbase::UserQueryInfo::create(qId_, czarId_)) { + : creationTime(CLOCK::now()), + queryId(qId_), + _userQueryInfo(wbase::UserQueryInfo::create(qId_, czarId_)) { /// For all of the histograms, all entries should be kept at least until the work is finished. string qidStr = to_string(queryId); _histSizePerTask = util::Histogram::Ptr(new util::Histogram( diff --git a/src/xrdsvc/HttpWorkerCzarModule.cc b/src/xrdsvc/HttpWorkerCzarModule.cc index b5956cd7b..b4fb81da5 100644 --- a/src/xrdsvc/HttpWorkerCzarModule.cc +++ b/src/xrdsvc/HttpWorkerCzarModule.cc @@ -343,6 +343,18 @@ json HttpWorkerCzarModule::_handleQueryStatus(std::string const& func) { // Return a message containing lists of the queries that were cancelled. jsRet = wqsData->serializeResponseJson(foreman()->getWorkerStartupTime()); + + // &&& queue sending WorkerCzarComIssue if needed. + /* &&& + auto const wczComIssue = wCzarInfo->getWorkerCzarComIssue(); + if (wczComIssue != nullptr && wczComIssue->needToSend()) { + LOGS(_log, LOG_LVL_ERROR, "&&& NEED CODE to queue wczComIssue message, do not queue more than one at a + time."); + // Limit the sending to happening after czar sends status + } + */ + wCzarInfo->sendWorkerCzarComIssueIfNeeded(wqsData->getWInfo(), wqsData->getCzInfo()); + return jsRet; } diff --git a/src/xrdsvc/SsiService.cc b/src/xrdsvc/SsiService.cc index aa707551b..7aa2ef85b 100644 --- a/src/xrdsvc/SsiService.cc +++ b/src/xrdsvc/SsiService.cc @@ -236,12 +236,8 @@ SsiService::SsiService(XrdSsiLogger* log) { LOGS(_log, LOG_LVL_WARN, "config sqlConnMgr" << *sqlConnMgr); LOGS(_log, LOG_LVL_WARN, "maxPoolThreads=" << maxPoolThreads); - /* &&& - _foreman = wcontrol::Foreman::Ptr(new wcontrol::Foreman(blendSched, poolSize, maxPoolThreads, mySqlConfig, - queries, ::makeChunkInventory(mySqlConfig), - sqlConnMgr)); - */ - _foreman = wcontrol::Foreman::create(blendSched, poolSize, maxPoolThreads, mySqlConfig, queries, ::makeChunkInventory(mySqlConfig), sqlConnMgr); + _foreman = wcontrol::Foreman::create(blendSched, poolSize, maxPoolThreads, mySqlConfig, queries, + ::makeChunkInventory(mySqlConfig), sqlConnMgr); // Watch to see if the log configuration is changed. // If LSST_LOG_CONFIG is not defined, there's no good way to know what log From 0cff54a8d072a6144182a916451cdb022b678d0e Mon Sep 17 00:00:00 2001 From: John Gates Date: Fri, 4 Oct 2024 13:49:31 -0700 Subject: [PATCH 09/22] Added dead message handling. 
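The heart of this change is a per-worker liveness state machine on the czar: a worker counts as ALIVE while the registry has heard from it recently, degrades to QUESTIONABLE once the last update is older than timeoutAliveSecs, and is declared DEAD past timeoutDeadSecs, at which point its UberJobs get reassigned. A toy classifier over those three states; the threshold values below are invented for illustration:

#include <iostream>

enum class WState { ALIVE, QUESTIONABLE, DEAD };

// Decide a worker's state from the seconds since the registry last heard
// from it; the patch reads the two timeouts from elsewhere and applies
// them per worker inside ActiveWorker::updateStateAndSendMessages().
WState classify(double secsSinceUpdate, double aliveSecs, double deadSecs) {
    if (secsSinceUpdate < aliveSecs) return WState::ALIVE;
    if (secsSinceUpdate < deadSecs) return WState::QUESTIONABLE;
    return WState::DEAD;
}

int main() {
    std::cout << (classify(5.0, 60.0, 900.0) == WState::ALIVE) << "\n"
              << (classify(120.0, 60.0, 900.0) == WState::QUESTIONABLE) << "\n"
              << (classify(1200.0, 60.0, 900.0) == WState::DEAD) << "\n";
}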
--- src/cconfig/CzarConfig.cc | 7 +- src/cconfig/CzarConfig.h | 5 +- src/ccontrol/UserQuerySelect.cc | 67 ++++++++++--------- src/czar/ActiveWorker.cc | 28 ++++---- src/czar/ActiveWorker.h | 15 ++--- src/czar/Czar.cc | 28 ++++---- src/czar/CzarChunkMap.cc | 102 +++++++++++++++++++++-------- src/czar/CzarChunkMap.h | 22 ++++++- src/czar/CzarRegistry.cc | 27 ++++---- src/czar/CzarRegistry.h | 19 +++--- src/czar/HttpCzarWorkerModule.cc | 6 ++ src/http/WorkerQueryStatusData.cc | 32 +-------- src/http/WorkerQueryStatusData.h | 19 ++++-- src/http/testStatusData.cc | 40 ++--------- src/qdisp/Executive.cc | 12 +--- src/qdisp/Executive.h | 28 +------- src/qdisp/JobBase.h | 3 - src/qdisp/JobDescription.cc | 7 ++ src/qdisp/JobQuery.cc | 11 ++-- src/qdisp/JobQuery.h | 14 +--- src/wbase/Task.cc | 37 +---------- src/wbase/Task.h | 1 - src/wbase/UberJobData.cc | 13 ---- src/wbase/UberJobData.h | 2 +- src/wconfig/WorkerConfig.h | 9 --- src/wcontrol/Foreman.cc | 8 --- src/wdb/ChunkResource.cc | 1 - src/wdb/QueryRunner.cc | 13 ---- src/wdb/QueryRunner.h | 4 -- src/wpublish/QueryStatistics.cc | 7 -- src/wpublish/QueryStatistics.h | 2 - src/xrdsvc/HttpReplicaMgtModule.cc | 10 --- src/xrdsvc/HttpWorkerCzarModule.cc | 10 --- src/xrdsvc/SsiService.cc | 10 --- 34 files changed, 240 insertions(+), 379 deletions(-) diff --git a/src/cconfig/CzarConfig.cc b/src/cconfig/CzarConfig.cc index 68f24f092..5962af9e5 100644 --- a/src/cconfig/CzarConfig.cc +++ b/src/cconfig/CzarConfig.cc @@ -62,10 +62,9 @@ namespace lsst::qserv::cconfig { std::mutex CzarConfig::_mtxOnInstance; -std::shared_ptr CzarConfig::_instance; +CzarConfig::Ptr CzarConfig::_instance; -std::shared_ptr CzarConfig::create(std::string const& configFileName, - std::string const& czarName) { +CzarConfig::Ptr CzarConfig::create(std::string const& configFileName, std::string const& czarName) { std::lock_guard const lock(_mtxOnInstance); if (_instance == nullptr) { _instance = std::shared_ptr(new CzarConfig(util::ConfigStore(configFileName), czarName)); @@ -73,7 +72,7 @@ std::shared_ptr CzarConfig::create(std::string const& configFileName return _instance; } -std::shared_ptr CzarConfig::instance() { +CzarConfig::Ptr CzarConfig::instance() { std::lock_guard const lock(_mtxOnInstance); if (_instance == nullptr) { throw std::logic_error("CzarConfig::" + std::string(__func__) + ": instance has not been created."); diff --git a/src/cconfig/CzarConfig.h b/src/cconfig/CzarConfig.h index d55183177..6fd1ed0da 100644 --- a/src/cconfig/CzarConfig.h +++ b/src/cconfig/CzarConfig.h @@ -53,6 +53,7 @@ namespace lsst::qserv::cconfig { */ class CzarConfig { public: + using Ptr = std::shared_ptr; /** * Create an instance of CzarConfig and load parameters from the specifid file. * @note One has to call this method at least once before trying to obtain @@ -63,7 +64,7 @@ class CzarConfig { * @param czarName - the unique name of Czar. * @return the shared pointer to the configuration object */ - static std::shared_ptr create(std::string const& configFileName, std::string const& czarName); + static Ptr create(std::string const& configFileName, std::string const& czarName); /** * Get a pointer to an instance that was created by the last call to @@ -71,7 +72,7 @@ class CzarConfig { * @return the shared pointer to the configuration object * @throws std::logic_error when attempting to call the bethod before creating an instance. 
 */
-    static std::shared_ptr instance();
+    static Ptr instance();
 
     CzarConfig() = delete;
     CzarConfig(CzarConfig const&) = delete;
diff --git a/src/ccontrol/UserQuerySelect.cc b/src/ccontrol/UserQuerySelect.cc
index 6ed20d896..beef84f21 100644
--- a/src/ccontrol/UserQuerySelect.cc
+++ b/src/ccontrol/UserQuerySelect.cc
@@ -303,16 +303,15 @@ void UserQuerySelect::submit() {
     }
 
     /// At this point the executive has a map of all jobs with the chunkIds as the key.
-    if (uberJobsEnabled) {
-        // TODO:UJ _maxCHunksPerUberJob maybe put in config??? or set on command line??
-        //       Different queries may benefit from different values
-        //       Such as LIMIT=1 may work best with this at 1, where
-        //       100 would be better for others.
-        _maxChunksPerUberJob = 2;
-        // This is needed to prevent Czar::_monitor from starting things before they are ready.
-        _executive->setReadyToExecute();
-        buildAndSendUberJobs();
-    }
+    // TODO:UJ _maxChunksPerUberJob maybe put in config??? or set on command line??
+    //       Different queries may benefit from different values,
+    //       e.g. LIMIT=1 may work best with this at 1, while
+    //       100 would be better for others.
+    // &&&
+    _maxChunksPerUberJob = 2;
+    // This is needed to prevent Czar::_monitor from starting things before they are ready.
+    _executive->setReadyToExecute();
+    buildAndSendUberJobs();
     LOGS(_log, LOG_LVL_DEBUG, "total jobs in query=" << sequence);
     // TODO:UJ Waiting for all jobs to start may not be needed anymore?
@@ -326,7 +325,6 @@ void UserQuerySelect::submit() {
 }
 
 void UserQuerySelect::buildAndSendUberJobs() {
-    // &&& NEED CODE - this function should check if the worker is DEAD. TODO:UJ
     string const funcN("UserQuerySelect::" + string(__func__) + " QID=" + to_string(_qMetaQueryId));
     LOGS(_log, LOG_LVL_DEBUG, funcN << " start");
@@ -376,52 +374,59 @@ void UserQuerySelect::buildAndSendUberJobs() {
     //   - For failures - If a worker cannot be contacted, that's an uberjob failure.
     //     - uberjob failures (due to communications problems) will result in the uberjob
     //       being broken up into multiple UberJobs going to different workers.
-    //       - The best way to do this is probably to just kill the UberJob and mark all
-    //         Jobs that were in that UberJob as needing re-assignment, and re-running
-    //         the code here. The trick is going to be figuring out which workers are alive.
-    //         Maybe force a fresh lookup from the replicator Registry when an UberJob fails.
+    //       - If an UberJob fails, the UberJob is killed and all the Jobs it contained
+    //         are flagged as needing re-assignment and this function will be called
+    //         again to put those Jobs in new UberJobs. Correctly re-assigning the
+    //         Jobs requires accurate information from the registry about which workers
+    //         are alive or dead.
     map> workerJobMap;
     vector missingChunks;
 
     // unassignedChunksInQuery needs to be in numerical order so that UberJobs contain chunk numbers in
     // numerical order. The workers run shared scans in numerical order of chunk id numbers.
-    // This keeps the number of partially complete UberJobs running on a worker to a minimum,
+    // Numerical order keeps the number of partially complete UberJobs running on a worker to a minimum,
    // and should minimize the time for the first UberJob on the worker to complete.
     for (auto const& [chunkId, jqPtr] : unassignedChunksInQuery) {
+        // If too many workers are down, there will be a chunk that cannot be found.
+        // Just continuing should leave jobs `unassigned` with their attempt count
+        // increased.
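The comment above leans on a retry budget: every pass that leaves a job unassigned costs one attempt, and the query dies once any job exhausts its budget. MAX_JOB_ATTEMPTS is the real constant checked in the JobDescription hunk later in this patch; everything else in this sketch is invented:

#include <iostream>

constexpr int MAX_JOB_ATTEMPTS = 5;  // the real value lives in qdisp, not here

struct Job {
    int attemptCount = 0;
    // Returns false once the job has been retried too many times.
    bool incrAttemptCount() { return ++attemptCount < MAX_JOB_ATTEMPTS; }
};

int main() {
    Job j;
    while (j.incrAttemptCount()) {
        std::cout << "job left unassigned, attempt=" << j.attemptCount << "\n";
    }
    std::cout << "attempt cap reached, cancel the query\n";
}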
Either the chunk will be found and jobs assigned, or the jobs' + // attempt count will reach max and the query will be cancelled + auto lambdaMissingChunk = [&](string const& msg) { missingChunks.push_back(chunkId); bool const increaseAttemptCount = true; jqPtr->getDescription()->incrAttemptCountScrubResultsJson(_executive, increaseAttemptCount); - // Assign as many jobs as possible. Any chunks not found will be attempted later. + LOGS(_log, LOG_LVL_ERROR, msg); + }; + + auto iter = chunkMapPtr->find(chunkId); + if (iter == chunkMapPtr->end()) { + lambdaMissingChunk(funcN + " No chunkData for=" + to_string(chunkId)); continue; } czar::CzarChunkMap::ChunkData::Ptr chunkData = iter->second; auto targetWorker = chunkData->getPrimaryScanWorker().lock(); - // TODO:UJ maybe if (targetWorker == nullptr || this worker already tried for this chunk) { - if (targetWorker == nullptr) { - LOGS(_log, LOG_LVL_ERROR, funcN << " No primary scan worker for chunk=" << chunkData->dump()); + // TODO:UJ maybe if (targetWorker == nullptr || ... || this worker already tried for this chunk) { + if (targetWorker == nullptr || targetWorker->isDead()) { + LOGS(_log, LOG_LVL_WARN, + funcN << " No primary scan worker for chunk=" + chunkData->dump() + << ((targetWorker == nullptr) ? " targ was null" : " targ was dead")); // Try to assign a different worker to this job auto workerHasThisChunkMap = chunkData->getWorkerHasThisMapCopy(); bool found = false; for (auto wIter = workerHasThisChunkMap.begin(); wIter != workerHasThisChunkMap.end() && !found; ++wIter) { auto maybeTarg = wIter->second.lock(); - if (maybeTarg != nullptr) { + if (maybeTarg != nullptr && !maybeTarg->isDead()) { targetWorker = maybeTarg; found = true; LOGS(_log, LOG_LVL_WARN, - funcN << " Alternate worker found for chunk=" << chunkData->dump()); + funcN << " Alternate worker=" << targetWorker->getWorkerId() + << " found for chunk=" << chunkData->dump()); } } if (!found) { - // If too many workers are down, there will be a chunk that cannot be found. - // Just continuing should leave jobs `unassigned` with their attempt count - // increased. Either the chunk will be found and jobs assigned, or the jobs' - // attempt count will reach max and the query will be cancelled - // TODO:UJ Needs testing/verification - LOGS(_log, LOG_LVL_ERROR, - funcN << " No primary or alternate worker found for chunk=" << chunkData->dump()); + lambdaMissingChunk(funcN + + " No primary or alternate worker found for chunk=" + chunkData->dump()); continue; } } diff --git a/src/czar/ActiveWorker.cc b/src/czar/ActiveWorker.cc index ed0e445c5..921d678a7 100644 --- a/src/czar/ActiveWorker.cc +++ b/src/czar/ActiveWorker.cc @@ -115,7 +115,6 @@ void ActiveWorker::updateStateAndSendMessages(double timeoutAliveSecs, double ti _changeStateTo(ALIVE, secsSinceUpdate, cName(__func__)); } else { // Don't waste time on this worker until the registry has heard from it. - // &&& If it's been a really really long time, maybe delete this entry ??? return; } break; @@ -132,20 +131,6 @@ void ActiveWorker::updateStateAndSendMessages(double timeoutAliveSecs, double ti shared_ptr jsWorkerReqPtr; { - lock_guard lg(_aMtx); //&&& needed ??? 
- lock_guard mapLg(_wqsData->mapMtx); - // Check how many messages are currently being sent to the worker, if at the limit, return - if (_wqsData->qIdDoneKeepFiles.empty() && _wqsData->qIdDoneDeleteFiles.empty() && - _wqsData->qIdDeadUberJobs.empty()) { - return; - } - int tCount = _conThreadCount; - if (tCount > _maxConThreadCount) { - LOGS(_log, LOG_LVL_DEBUG, - cName(__func__) << " not sending message since at max threads " << tCount); - return; - } - // Go through the _qIdDoneKeepFiles, _qIdDoneDeleteFiles, and _qIdDeadUberJobs lists to build a // message to send to the worker. jsWorkerReqPtr = _wqsData->serializeJson(maxLifetime); @@ -217,6 +202,17 @@ void ActiveWorker::addDeadUberJob(QueryId qId, UberJobId ujId) { _wqsData->addDeadUberJob(qId, ujId, now); } +http::WorkerContactInfo::Ptr ActiveWorker::getWInfo() const { + std::lock_guard lg(_aMtx); + if (_wqsData == nullptr) return nullptr; + return _wqsData->getWInfo(); +} + +ActiveWorker::State ActiveWorker::getState() const { + std::lock_guard lg(_aMtx); + return _state; +} + string ActiveWorker::dump() const { lock_guard lg(_aMtx); return _dump(); @@ -238,6 +234,7 @@ void ActiveWorkerMap::updateMap(http::WorkerContactInfo::WCMap const& wcMap, auto iter = _awMap.find(wcKey); if (iter == _awMap.end()) { auto newAW = ActiveWorker::create(wcVal, czInfo, replicationInstanceId, replicationAuthKey); + LOGS(_log, LOG_LVL_INFO, cName(__func__) << " AciveWorker created for " << wcKey); _awMap[wcKey] = newAW; if (_czarCancelAfterRestart) { newAW->setCzarCancelAfterRestart(_czarCancelAfterRestartCzId, _czarCancelAfterRestartQId); @@ -252,6 +249,7 @@ void ActiveWorkerMap::updateMap(http::WorkerContactInfo::WCMap const& wcMap, // If there is existing information, only host and port values will change. aWorker->setWorkerContactInfo(wcVal); } + aWorker->getWInfo()->setRegUpdateTime(wcVal->getRegUpdateTime()); } } } diff --git a/src/czar/ActiveWorker.h b/src/czar/ActiveWorker.h index 0ddc1f9d5..630a10eae 100644 --- a/src/czar/ActiveWorker.h +++ b/src/czar/ActiveWorker.h @@ -79,7 +79,8 @@ class ActiveWorker : public std::enable_shared_from_this { ActiveWorker& operator=(ActiveWorker const&) = delete; std::string cName(const char* fName) { - return std::string("ActiveWorker::") + fName + " " + ((_wqsData == nullptr) ? "?" : _wqsData->dump()); + auto wqsd = _wqsData; + return std::string("ActiveWorker::") + fName + " " + ((wqsd == nullptr) ? "?" : wqsd->dump()); } static std::string getStateStr(State st); @@ -97,10 +98,7 @@ class ActiveWorker : public std::enable_shared_from_this { _wqsData->setCzarCancelAfterRestart(czId, lastQId); } - http::WorkerContactInfo::Ptr getWInfo() const { - if (_wqsData == nullptr) return nullptr; - return _wqsData->getWInfo(); - } + http::WorkerContactInfo::Ptr getWInfo() const; ~ActiveWorker() = default; @@ -138,6 +136,8 @@ class ActiveWorker : public std::enable_shared_from_this { /// individual UberJobs anymore, so this function will get rid of them. void removeDeadUberJobsFor(QueryId qId); + State getState() const; + std::string dump() const; private: @@ -169,10 +169,6 @@ class ActiveWorker : public std::enable_shared_from_this { State _state{QUESTIONABLE}; ///< current state of this worker. mutable std::mutex _aMtx; ///< protects _wInfo, _state, _qIdDoneKeepFiles, _qIdDoneDeleteFiles - - /// The number of communication threads currently in use by this class instance. 
- std::atomic _conThreadCount{0}; - int _maxConThreadCount{2}; }; /// &&& doc @@ -182,6 +178,7 @@ class ActiveWorker : public std::enable_shared_from_this { /// come back from the dead. class ActiveWorkerMap { public: + using Ptr = std::shared_ptr; ActiveWorkerMap() = default; ActiveWorkerMap(ActiveWorkerMap const&) = delete; ActiveWorkerMap operator=(ActiveWorkerMap const&) = delete; diff --git a/src/czar/Czar.cc b/src/czar/Czar.cc index c2df9e545..c05400ab7 100644 --- a/src/czar/Czar.cc +++ b/src/czar/Czar.cc @@ -95,10 +95,17 @@ void Czar::_monitor() { LOGS(_log, LOG_LVL_DEBUG, funcN << " start0"); /// Check database for changes in worker chunk assignments and aliveness - _czarFamilyMap->read(); + try { + _czarFamilyMap->read(); + } catch (ChunkMapException const& cmex) { + // There are probably chunks that don't exist on any alive worker, + // continue on in hopes that workers will show up with the missing chunks + // later. + LOGS(_log, LOG_LVL_ERROR, funcN << " family map read problems " << cmex.what()); + } // Send appropriate messages to all ActiveWorkers. This will - // check if workers have died by timeout. The reponse + // check if workers have died by timeout. The response // from the worker include _czarRegistry->sendActiveWorkersMessages(); @@ -126,14 +133,13 @@ void Czar::_monitor() { execVal->assignJobsToUberJobs(); } - // TODO:UJ DM-45470 Maybe get missing results from workers. - // To prevent anything from slipping through the cracks: - // Workers will keep trying to transmit results until they think the czar is dead. - // If a worker thinks the czar died, it will cancel all related jobs that it has, - // and if the czar sends a status message to that worker, that worker will send back - // a separate message saying it killed everything that this czar gave it. Upon - // getting this message from a worker, this czar will reassign everything it had - // sent to that worker. + // To prevent anything from slipping through the cracks: + // Workers will keep trying to transmit results until they think the czar is dead. + // If a worker thinks the czar died, it will cancel all related jobs that it has, + // and if the czar sends a status message to that worker, that worker will send back + // a separate message (see WorkerCzarComIssue) saying it killed everything that this + // czar gave it. Upon getting this message from a worker, this czar will reassign + // everything it had sent to that worker. // TODO:UJ How long should queryId's remain on this list? 
} @@ -229,7 +235,7 @@ Czar::Czar(string const& configFilePath, string const& czarName) auto const port = _controlHttpSvc->start(); _czarConfig->setReplicationHttpPort(port); - _czarRegistry = CzarRegistry::create(_czarConfig); + _czarRegistry = CzarRegistry::create(_czarConfig, _activeWorkerMap); // Start the monitor thread thread monitorThrd(&Czar::_monitor, this); diff --git a/src/czar/CzarChunkMap.cc b/src/czar/CzarChunkMap.cc index 7dd1e407a..11f7865d1 100644 --- a/src/czar/CzarChunkMap.cc +++ b/src/czar/CzarChunkMap.cc @@ -84,20 +84,22 @@ void CzarChunkMap::verify() { for (auto const& [chunkId, chunkDataPtr] : chunkMap) { if (chunkDataPtr == nullptr) { - LOGS(_log, LOG_LVL_ERROR, " chunkId=" << chunkId << " had nullptr"); + LOGS(_log, LOG_LVL_ERROR, cName(__func__) << " chunkId=" << chunkId << " had nullptr"); ++errorCount; continue; } auto primeScanWkr = chunkDataPtr->_primaryScanWorker.lock(); if (primeScanWkr == nullptr) { - LOGS(_log, LOG_LVL_ERROR, " chunkId=" << chunkId << " missing primaryScanWorker"); + LOGS(_log, LOG_LVL_ERROR, + cName(__func__) << " chunkId=" << chunkId << " missing primaryScanWorker"); ++errorCount; continue; } if (primeScanWkr->_sharedScanChunkMap.find(chunkId) == primeScanWkr->_sharedScanChunkMap.end()) { LOGS(_log, LOG_LVL_ERROR, - " chunkId=" << chunkId << " should have been (and was not) in the sharedScanChunkMap for " - << primeScanWkr->_workerId); + cName(__func__) << " chunkId=" << chunkId + << " should have been (and was not) in the sharedScanChunkMap for " + << primeScanWkr->_workerId); ++errorCount; continue; } @@ -105,7 +107,8 @@ void CzarChunkMap::verify() { if (iter != allChunkIds.end()) { allChunkIds.erase(iter); } else { - LOGS(_log, LOG_LVL_ERROR, " chunkId=" << chunkId << " chunkId was not in allChunks list"); + LOGS(_log, LOG_LVL_ERROR, + cName(__func__) << " chunkId=" << chunkId << " chunkId was not in allChunks list"); ++errorCount; continue; } @@ -118,14 +121,14 @@ void CzarChunkMap::verify() { allMissingIds += to_string(cId) + ","; } LOGS(_log, LOG_LVL_ERROR, - " There were " << missing << " missing chunks from the scan list " << allMissingIds); + cName(__func__) << " There were " << missing << " missing chunks from the scan list " + << allMissingIds); ++errorCount; } if (errorCount > 0) { - // TODO:UJ There may be an argument to keep the new maps even if there are problems - // with them. For current testing, it's probably best to leave it how it is so that - // it's easier to isolate problems. + // Original creation of the family map will keep re-reading until there are no problems. + // _monitor will log this and keep using the old maps. 
        throw ChunkMapException(ERR_LOC, "verification failed with " + to_string(errorCount) + " errors");
    }
 }
@@ -161,20 +164,21 @@ void CzarChunkMap::ChunkData::_calcTotalBytes() {
 
 void CzarChunkMap::ChunkData::addToWorkerHasThis(std::shared_ptr const& worker) {
     if (worker == nullptr) {
-        throw ChunkMapException(ERR_LOC, string(__func__) + " worker was null");
+        throw ChunkMapException(ERR_LOC, cName(__func__) + " worker was null");
     }
     _workerHasThisMap[worker->_workerId] = worker;
 }
 
-std::map>
-CzarChunkMap::ChunkData::getWorkerHasThisMapCopy() const {
-    std::map> newMap = _workerHasThisMap;
+map> CzarChunkMap::ChunkData::getWorkerHasThisMapCopy()
+        const {
+    map> newMap = _workerHasThisMap;
     return newMap;
 }
 
-void CzarChunkMap::organize() {
+shared_ptr CzarChunkMap::organize() {
     auto chunksSortedBySize = make_shared();
+    auto missingChunks = make_shared();
 
     calcChunkMap(*_chunkMap, *chunksSortedBySize);
@@ -191,27 +195,31 @@
         for (auto&& [wkrId, wkrDataWeak] : chunkData->_workerHasThisMap) {
             auto wkrData = wkrDataWeak.lock();
             if (wkrData == nullptr) {
-                LOGS(_log, LOG_LVL_ERROR, __func__ << " unexpected null weak ptr for " << wkrId);
+                LOGS(_log, LOG_LVL_ERROR, cName(__func__) << " unexpected null weak ptr for " << wkrId);
                 continue;  // maybe the next one will be okay.
             }
+
             LOGS(_log, LOG_LVL_DEBUG,
-                 __func__ << " wkrId=" << wkrData << " tsz=" << wkrData->_sharedScanTotalSize
-                          << " smallest=" << smallest);
+                 cName(__func__) << " wkrId=" << wkrData << " tsz=" << wkrData->_sharedScanTotalSize
+                                 << " smallest=" << smallest);
             if (wkrData->_sharedScanTotalSize < smallest) {
                 smallestWkr = wkrData;
                 smallest = smallestWkr->_sharedScanTotalSize;
             }
         }
         if (smallestWkr == nullptr) {
-            throw ChunkMapException(ERR_LOC, string(__func__) + " no smallesWkr found for chunk=" +
-                                                     to_string(chunkData->_chunkId));
+            LOGS(_log, LOG_LVL_ERROR,
+                 cName(__func__) + " no smallestWkr found for chunk=" + to_string(chunkData->_chunkId));
+            missingChunks->push_back(chunkData);
+        } else {
+            smallestWkr->_sharedScanChunkMap[chunkData->_chunkId] = chunkData;
+            smallestWkr->_sharedScanTotalSize += chunkData->_totalBytes;
+            chunkData->_primaryScanWorker = smallestWkr;
+            LOGS(_log, LOG_LVL_DEBUG,
+                 " chunk=" << chunkData->_chunkId << " assigned to scan on " << smallestWkr->_workerId);
         }
-        smallestWkr->_sharedScanChunkMap[chunkData->_chunkId] = chunkData;
-        smallestWkr->_sharedScanTotalSize += chunkData->_totalBytes;
-        chunkData->_primaryScanWorker = smallestWkr;
-        LOGS(_log, LOG_LVL_DEBUG,
-             " chunk=" << chunkData->_chunkId << " assigned to scan on " << smallestWkr->_workerId);
     }
+    return missingChunks;
 }
 
 string CzarChunkMap::ChunkData::dump() const {
@@ -231,6 +239,34 @@
     return os.str();
 }
 
+bool CzarChunkMap::WorkerChunksData::isDead() {
+    if (_activeWorker == nullptr) {
+        // At startup, these may not be available
+        auto czarPtr = Czar::getCzar();
+        if (czarPtr == nullptr) {
+            LOGS(_log, LOG_LVL_ERROR,
+                 cName(__func__) << " czarPtr is null, this should only happen in unit tests.");
+            return false;
+        }
+        auto awMap = Czar::getCzar()->getActiveWorkerMap();
+        if (awMap == nullptr) {
+            LOGS(_log, LOG_LVL_ERROR, cName(__func__) << " awMap is null.");
+            return true;
+        }
+        _activeWorker = awMap->getActiveWorker(_workerId);
+        if (_activeWorker == nullptr) {
+            LOGS(_log, LOG_LVL_ERROR, cName(__func__) << " activeWorker not found.");
+            return true;
+        }
+    }
+    auto wState = _activeWorker->getState();
+    bool res = wState == ActiveWorker::DEAD;
+    if (res) {
LOGS(_log, LOG_LVL_DEBUG, cName(__func__) << " is dead"); + } + return res; +} + string CzarChunkMap::WorkerChunksData::dump() const { stringstream os; os << "{WorkerChunksData id=" << _workerId << " scanTotalSize=" << _sharedScanTotalSize; @@ -300,9 +336,6 @@ bool CzarFamilyMap::_read() { return false; } - // &&& TODO:UJ Before makeNewMaps(), get a list of workers considered to be alive by - // czar::_activeWorkerMap - // Make the new maps. shared_ptr familyMapPtr = makeNewMaps(qChunkMap); @@ -356,7 +389,20 @@ std::shared_ptr CzarFamilyMap::makeNewMaps( // this needs to be done for each CzarChunkMap in the family map. for (auto&& [familyName, chunkMapPtr] : *newFamilyMap) { LOGS(_log, LOG_LVL_DEBUG, cName(__func__) << " working on " << familyName); - chunkMapPtr->organize(); + auto missing = chunkMapPtr->organize(); + if (missing != nullptr && !missing->empty()) { + // TODO:UJ Some element of the dashboard should be made aware of this. Also, + // TODO:UJ maybe this should check all families before throwing. + // TODO:UJ There are implications that maybe the replicator should not + // TODO:UJ tell the czar about families/databases that do not have + // TODO:UJ at least one copy of each chunk with data loaded on a worker. + string chunkIdStr; + for (auto const& chunkData : *missing) { + chunkIdStr += to_string(chunkData->getChunkId()) + " "; + } + throw ChunkMapException( + ERR_LOC, cName(__func__) + " family=" + familyName + " is missing chunks " + chunkIdStr); + } } return newFamilyMap; diff --git a/src/czar/CzarChunkMap.h b/src/czar/CzarChunkMap.h index f0b85a1d3..97e864855 100644 --- a/src/czar/CzarChunkMap.h +++ b/src/czar/CzarChunkMap.h @@ -43,6 +43,7 @@ struct QMetaChunkMap; namespace lsst::qserv::czar { +class ActiveWorker; class CzarFamilyMap; class ChunkMapException : public util::Issue { @@ -71,10 +72,11 @@ class CzarChunkMap { using Ptr = std::shared_ptr; using SizeT = uint64_t; + std::string cName(const char* func) { return std::string("CzarChunkMap::") + func; } + CzarChunkMap(CzarChunkMap const&) = delete; CzarChunkMap& operator=(CzarChunkMap const&) = delete; - // static Ptr create(std::shared_ptr const& qmeta) { return Ptr(new CzarChunkMap(qmeta)); } static Ptr create() { return Ptr(new CzarChunkMap()); } ~CzarChunkMap(); @@ -88,8 +90,10 @@ class CzarChunkMap { using Ptr = std::shared_ptr; ChunkData(int chunkId_) : _chunkId(chunkId_) {} + std::string cName(const char* func) { + return std::string("ChunkData::") + func + " " + std::to_string(_chunkId); + } int64_t getChunkId() const { return _chunkId; } - SizeT getTotalBytes() const { return _totalBytes; } std::weak_ptr getPrimaryScanWorker() const { return _primaryScanWorker; } @@ -127,6 +131,10 @@ class CzarChunkMap { using Ptr = std::shared_ptr; WorkerChunksData(std::string const& workerId) : _workerId(workerId) {} + std::string cName(const char* func) { + return std::string("WorkerChunksData::") + func + " " + _workerId; + } + /// Return the worker's id string. std::string const& getWorkerId() const { return _workerId; } @@ -134,6 +142,9 @@ class CzarChunkMap { /// accessed in a full table scan on this worker. SizeT getSharedScanTotalSize() const { return _sharedScanTotalSize; } + /// &&& doc + bool isDead(); + /// Return a reference to `_sharedScanChunkMap`. A copy of the pointer /// to this class (or the containing map) should be held to ensure the reference. 
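The reworked organize() above is a greedy least-loaded placement: each chunk's primary scan worker becomes the candidate currently carrying the smallest shared-scan byte total, and chunks with no usable candidate are returned to the caller instead of aborting the whole map build. A toy version of that loop, with made-up sizes and worker ids:

#include <iostream>
#include <map>
#include <string>
#include <vector>

struct ChunkInfo {
    int id;
    long bytes;
    std::vector<std::string> candidates;  // workers holding a replica
};

int main() {
    std::map<std::string, long> scanTotal{{"w1", 0}, {"w2", 0}};
    std::vector<ChunkInfo> chunks{
            {1, 100, {"w1", "w2"}}, {2, 60, {"w1", "w2"}}, {3, 50, {"w2"}}, {7, 10, {}}};
    std::vector<int> missing;
    for (auto const& c : chunks) {
        std::string best;
        for (auto const& w : c.candidates) {
            if (best.empty() || scanTotal[w] < scanTotal[best]) best = w;
        }
        if (best.empty()) {
            missing.push_back(c.id);  // reported to the caller, as in the new organize()
            continue;
        }
        scanTotal[best] += c.bytes;  // greedy: next chunk sees the updated totals
        std::cout << "chunk " << c.id << " -> " << best << "\n";
    }
    std::cout << missing.size() << " chunk(s) had no live worker\n";
}

Collecting the misses rather than throwing lets makeNewMaps() decide whether missing chunks are fatal, which is what the CzarFamilyMap hunk above now does.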
std::map const& getSharedScanChunkMap() const { return _sharedScanChunkMap; } @@ -159,6 +170,10 @@ class CzarChunkMap { /// The total size (in bytes) of all chunks on this worker that /// are to be used in shared scans. SizeT _sharedScanTotalSize = 0; + + /// Used to determine if this worker is alive and set + /// when the test is made. + std::shared_ptr _activeWorker; }; using WorkerChunkMap = std::map; @@ -192,7 +207,8 @@ class CzarChunkMap { /// Use the information from the registry to `organize` `_chunkMap` and `_workerChunkMap` /// into their expected formats. - void organize(); + /// @return a vector of ChunkData::Ptr of chunks where no worker was found. + std::shared_ptr organize(); private: CzarChunkMap(); diff --git a/src/czar/CzarRegistry.cc b/src/czar/CzarRegistry.cc index 6f9275b71..72b845001 100644 --- a/src/czar/CzarRegistry.cc +++ b/src/czar/CzarRegistry.cc @@ -48,7 +48,9 @@ LOG_LOGGER _log = LOG_GET("lsst.qserv.czar.CzarRegistry"); namespace lsst::qserv::czar { -CzarRegistry::CzarRegistry(std::shared_ptr const& czarConfig) : _czarConfig(czarConfig) { +CzarRegistry::CzarRegistry(cconfig::CzarConfig::Ptr const& czarConfig, + ActiveWorkerMap::Ptr const& activeWorkerMap) + : _czarConfig(czarConfig), _activeWorkerMap(activeWorkerMap) { // Begin periodically updating worker's status in the Replication System's registry. // This will continue until the application gets terminated. thread registryUpdateThread(&CzarRegistry::_registryUpdateLoop, this); @@ -68,6 +70,11 @@ CzarRegistry::~CzarRegistry() { } } +http::WorkerContactInfo::WCMapPtr CzarRegistry::getWorkerContactMap() const { + std::lock_guard lockG(_cmapMtx); + return _contactMap; +} + void CzarRegistry::_registryUpdateLoop() { auto const method = http::Method::POST; string const url = "http://" + _czarConfig->replicationRegistryHost() + ":" + @@ -129,12 +136,12 @@ void CzarRegistry::_registryWorkerInfoLoop() { auto czInfo = http::CzarContactInfo::create(_czarConfig->name(), _czarConfig->id(), _czarConfig->replicationHttpPort(), util::get_current_host_fqdn(), czarStartTime); - lock_guard lck(_mapMtx); + lock_guard lck(_cmapMtx); if (wMap != nullptr && !_compareMapContactInfo(*wMap)) { _contactMap = wMap; _latestMapUpdate = CLOCK::now(); - _activeWorkerMap.updateMap(*_contactMap, czInfo, replicationInstanceId, - replicationAuthKey); + _activeWorkerMap->updateMap(*_contactMap, czInfo, replicationInstanceId, + replicationAuthKey); } } } @@ -198,7 +205,7 @@ http::WorkerContactInfo::WCMapPtr CzarRegistry::waitForWorkerContactMap() const http::WorkerContactInfo::WCMapPtr contMap = nullptr; while (contMap == nullptr) { { - std::lock_guard lockG(_mapMtx); + std::lock_guard lockG(_cmapMtx); contMap = _contactMap; } if (contMap == nullptr) { @@ -212,21 +219,19 @@ http::WorkerContactInfo::WCMapPtr CzarRegistry::waitForWorkerContactMap() const void CzarRegistry::sendActiveWorkersMessages() { // Send messages to each active worker as needed - lock_guard lck(_mapMtx); - _activeWorkerMap.sendActiveWorkersMessages(); + _activeWorkerMap->sendActiveWorkersMessages(); } void CzarRegistry::endUserQueryOnWorkers(QueryId qId, bool deleteWorkerResults) { - lock_guard lck(_mapMtx); // Add query id to the appropriate list. if (deleteWorkerResults) { - _activeWorkerMap.addToDoneDeleteFiles(qId); + _activeWorkerMap->addToDoneDeleteFiles(qId); } else { - _activeWorkerMap.addToDoneKeepFiles(qId); + _activeWorkerMap->addToDoneKeepFiles(qId); } // With lists updated, send out messages. 
- _activeWorkerMap.sendActiveWorkersMessages(); + _activeWorkerMap->sendActiveWorkersMessages(); } } // namespace lsst::qserv::czar diff --git a/src/czar/CzarRegistry.h b/src/czar/CzarRegistry.h index 076f7fd40..bc8b6dc6d 100644 --- a/src/czar/CzarRegistry.h +++ b/src/czar/CzarRegistry.h @@ -61,18 +61,16 @@ class CzarRegistry { using Ptr = std::shared_ptr; /// Return a pointer to a new CzarRegistry object. - static Ptr create(std::shared_ptr const& czarConfig) { - return Ptr(new CzarRegistry(czarConfig)); + static Ptr create(std::shared_ptr const& czarConfig, + std::shared_ptr const& activeWorkerMap) { + return Ptr(new CzarRegistry(czarConfig, activeWorkerMap)); } ~CzarRegistry(); /// Return _contactMap, the object that the returned pointer points to is /// constant and no attempts should be made to change it. - http::WorkerContactInfo::WCMapPtr getWorkerContactMap() const { - std::lock_guard lockG(_mapMtx); - return _contactMap; - } + http::WorkerContactInfo::WCMapPtr getWorkerContactMap() const; /// Return _contactMap, the object that the returned pointer points to is /// constant and no attempts should be made to change it. This @@ -89,7 +87,8 @@ class CzarRegistry { private: CzarRegistry() = delete; - CzarRegistry(std::shared_ptr const& czarConfig); + CzarRegistry(std::shared_ptr const& czarConfig, + std::shared_ptr const& activeWorkerMap); /// This function will keep periodically updating Czar's info in the Replication System's Registry /// until _loop is set to false. @@ -105,6 +104,7 @@ class CzarRegistry { http::WorkerContactInfo::WCMapPtr _buildMapFromJson(nlohmann::json const& response); /// Return true if maps are the same size and all of the elements have the same contact info. + /// NOTE: _cmapMtx must be held when calling. bool _compareMapContactInfo(http::WorkerContactInfo::WCMap const& other) const; std::shared_ptr const _czarConfig; ///< Pointer to the CzarConfig. @@ -118,9 +118,10 @@ class CzarRegistry { TIMEPOINT _latestMapUpdate; ///< The last time the _contactMap was updated, unrelated to ///< WorkerContactInfo update. // &&& review how this _mapMtx is used, probably locks for too long a period. - mutable std::mutex _mapMtx; /// Protects _contactMap, _latestUpdate, _activeWorkerMap + mutable std::mutex _cmapMtx; /// Protects _contactMap, _latestUpdate - ActiveWorkerMap _activeWorkerMap; ///< Map of workers czar considers active. + /// Map for tracking worker aliveness, it has its own internal mutex. + std::shared_ptr const _activeWorkerMap; }; } // namespace lsst::qserv::czar diff --git a/src/czar/HttpCzarWorkerModule.cc b/src/czar/HttpCzarWorkerModule.cc index 5f82eb2be..75ccbdd6d 100644 --- a/src/czar/HttpCzarWorkerModule.cc +++ b/src/czar/HttpCzarWorkerModule.cc @@ -98,6 +98,7 @@ json HttpCzarWorkerModule::_workerCzarComIssue() { } json HttpCzarWorkerModule::_handleJobError(string const& func) { + LOGS(_log, LOG_LVL_DEBUG, "HttpCzarWorkerModule::_handleJobError start"); // Metadata-only responses for the file-based protocol should not have any data // Parse and verify the json message and then kill the UberJob. 
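The CzarRegistry changes above shrink _cmapMtx down to guarding a single pointer: the contact map itself is treated as an immutable snapshot, so getWorkerContactMap() copies the shared_ptr under the lock and callers then read the snapshot with no lock held at all. Reduced to its essentials, with Registry and its Map as stand-ins:

#include <map>
#include <memory>
#include <mutex>
#include <string>

class Registry {
public:
    using Map = std::map<std::string, int>;
    using MapPtr = std::shared_ptr<Map const>;

    MapPtr getMap() const {
        std::lock_guard<std::mutex> lg(_mtx);
        return _map;  // cheap pointer copy; the snapshot itself is never mutated
    }
    void publish(Map m) {
        auto fresh = std::make_shared<Map const>(std::move(m));
        std::lock_guard<std::mutex> lg(_mtx);
        _map = fresh;  // swap in a whole new snapshot, old readers keep theirs
    }

private:
    mutable std::mutex _mtx;  // guards only the pointer, as _cmapMtx now does
    MapPtr _map;
};

int main() {
    Registry r;
    r.publish({{"worker-1", 3421}});
    auto snap = r.getMap();
    return snap->count("worker-1") == 1 ? 0 : 1;
}

Moving ActiveWorkerMap behind its own internal mutex is the other half of the same change: no caller needs to hold _cmapMtx across sendActiveWorkersMessages() anymore.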
@@ -133,10 +134,12 @@ json HttpCzarWorkerModule::_handleJobError(string const& func) { "HttpCzarWorkerModule::_handleJobError received " << iaEx.what() << " js=" << body().objJson); jsRet = {{"success", 0}, {"errortype", "parse"}, {"note", iaEx.what()}}; } + LOGS(_log, LOG_LVL_DEBUG, "HttpCzarWorkerModule::_handleJobError end"); return jsRet; } json HttpCzarWorkerModule::_handleJobReady(string const& func) { + LOGS(_log, LOG_LVL_DEBUG, "HttpCzarWorkerModule::_handleJobReady start"); // Metadata-only responses for the file-based protocol should not have any data // Parse and verify the json message and then have the uberjob import the file. @@ -173,10 +176,12 @@ json HttpCzarWorkerModule::_handleJobReady(string const& func) { "HttpCzarWorkerModule::_handleJobReady received " << iaEx.what() << " js=" << body().objJson); jsRet = {{"success", 0}, {"errortype", "parse"}, {"note", iaEx.what()}}; } + LOGS(_log, LOG_LVL_DEBUG, "HttpCzarWorkerModule::_handleJobReady end"); return jsRet; } json HttpCzarWorkerModule::_handleWorkerCzarComIssue(string const& func) { + LOGS(_log, LOG_LVL_DEBUG, "HttpCzarWorkerModule::_handleWorkerCzarComIssue start"); // Parse and verify the json message and then deal with the problems. json jsRet = {{"success", 1}, {"errortype", "unknown"}, {"note", "initialized"}}; try { @@ -208,6 +213,7 @@ json HttpCzarWorkerModule::_handleWorkerCzarComIssue(string const& func) { << " js=" << body().objJson); jsRet = {{"success", 0}, {"errortype", "parse"}, {"note", iaEx.what()}}; } + LOGS(_log, LOG_LVL_DEBUG, "HttpCzarWorkerModule::_handleWorkerCzarComIssue end"); return jsRet; } diff --git a/src/http/WorkerQueryStatusData.cc b/src/http/WorkerQueryStatusData.cc index 247efa04b..58a12773d 100644 --- a/src/http/WorkerQueryStatusData.cc +++ b/src/http/WorkerQueryStatusData.cc @@ -61,7 +61,6 @@ CzarContactInfo::Ptr CzarContactInfo::createFromJson(nlohmann::json const& czJso auto czHostName_ = RequestBodyJSON::required(czJson, "management-host-name"); auto czStartupTime_ = RequestBodyJSON::required(czJson, "czar-startup-time"); return create(czName_, czId_, czPort_, czHostName_, czStartupTime_); - //&&& return create(czName_, czId_, czPort_, czHostName_); } catch (invalid_argument const& exc) { LOGS(_log, LOG_LVL_ERROR, string("CzarContactInfo::createJson invalid ") << exc.what()); } @@ -70,8 +69,6 @@ CzarContactInfo::Ptr CzarContactInfo::createFromJson(nlohmann::json const& czJso std::string CzarContactInfo::dump() const { stringstream os; - //&&& os << "czName=" << czName << " czId=" << czId << " czPort=" << czPort << " czHostName=" << - // czHostName; os << "czName=" << czName << " czId=" << czId << " czPort=" << czPort << " czHostName=" << czHostName << " czStartupTime=" << czStartupTime; return os.str(); @@ -253,18 +250,14 @@ WorkerQueryStatusData::Ptr WorkerQueryStatusData::createFromJson(nlohmann::json std::string const& replicationInstanceId_, std::string const& replicationAuthKey_, TIMEPOINT updateTm) { - LOGS(_log, LOG_LVL_ERROR, "WorkerQueryStatusData::createJson &&& a"); try { if (jsWorkerReq["version"] != http::MetaModule::version) { LOGS(_log, LOG_LVL_ERROR, "WorkerQueryStatusData::createJson bad version"); return nullptr; } - LOGS(_log, LOG_LVL_ERROR, "WorkerQueryStatusData::createJson &&& b"); auto czInfo_ = CzarContactInfo::createFromJson(jsWorkerReq["czar"]); - LOGS(_log, LOG_LVL_ERROR, "WorkerQueryStatusData::createJson &&& c"); auto wInfo_ = WorkerContactInfo::createFromJsonWorker(jsWorkerReq["worker"], updateTm); - LOGS(_log, LOG_LVL_ERROR, 
"WorkerQueryStatusData::createJson &&& d"); if (czInfo_ == nullptr || wInfo_ == nullptr) { LOGS(_log, LOG_LVL_ERROR, "WorkerQueryStatusData::createJson czar or worker info could not be parsed in " @@ -272,9 +265,8 @@ WorkerQueryStatusData::Ptr WorkerQueryStatusData::createFromJson(nlohmann::json } auto wqsData = WorkerQueryStatusData::create(wInfo_, czInfo_, replicationInstanceId_, replicationAuthKey_); - LOGS(_log, LOG_LVL_ERROR, "WorkerQueryStatusData::createJson &&& e"); wqsData->parseLists(jsWorkerReq, updateTm); - LOGS(_log, LOG_LVL_ERROR, "WorkerQueryStatusData::createJson &&& end"); + bool czarRestart = RequestBodyJSON::required(jsWorkerReq, "czarrestart"); if (czarRestart) { auto restartCzarId = RequestBodyJSON::required(jsWorkerReq, "czarrestartcancelczid"); @@ -297,39 +289,26 @@ void WorkerQueryStatusData::parseListsInto(nlohmann::json const& jsWR, TIMEPOINT std::map& doneKeepF, std::map& doneDeleteF, std::map>& deadUberJobs) { - LOGS(_log, LOG_LVL_ERROR, "WorkerQueryStatusData::parseListsInto &&& a"); auto& jsQIdDoneKeepFiles = jsWR["qiddonekeepfiles"]; - LOGS(_log, LOG_LVL_ERROR, "WorkerQueryStatusData::parseListsInto &&& b"); for (auto const& qidKeep : jsQIdDoneKeepFiles) { - LOGS(_log, LOG_LVL_ERROR, "WorkerQueryStatusData::parseListsInto &&& b1"); doneKeepF[qidKeep] = updateTm; } - LOGS(_log, LOG_LVL_ERROR, "WorkerQueryStatusData::parseListsInto &&& c"); auto& jsQIdDoneDeleteFiles = jsWR["qiddonedeletefiles"]; - LOGS(_log, LOG_LVL_ERROR, "WorkerQueryStatusData::parseListsInto &&& d"); for (auto const& qidDelete : jsQIdDoneDeleteFiles) { - LOGS(_log, LOG_LVL_ERROR, "WorkerQueryStatusData::parseListsInto &&& d1"); doneDeleteF[qidDelete] = updateTm; } - LOGS(_log, LOG_LVL_ERROR, "WorkerQueryStatusData::parseListsInto &&& e"); auto& jsQIdDeadUberJobs = jsWR["qiddeaduberjobs"]; - LOGS(_log, LOG_LVL_ERROR, - "WorkerQueryStatusData::parseListsInto &&& f jsQIdDeadUberJobs=" << jsQIdDeadUberJobs); // Interestingly, !jsQIdDeadUberJobs.empty() doesn't work, but .size() > 0 does. // Not having the size() check causes issues with the for loop trying to read the // first element of an empty list, which goes badly. 
if (jsQIdDeadUberJobs.size() > 0) { - LOGS(_log, LOG_LVL_ERROR, "WorkerQueryStatusData::parseListsInto &&& f1"); for (auto const& qDeadUjs : jsQIdDeadUberJobs) { - LOGS(_log, LOG_LVL_ERROR, "WorkerQueryStatusData::parseListsInto &&& f1a qDeadUjs=" << qDeadUjs); QueryId qId = qDeadUjs["qid"]; auto const& ujIds = qDeadUjs["ujids"]; auto& mapOfUj = deadUberJobs[qId]; for (auto const& ujId : ujIds) { - LOGS(_log, LOG_LVL_ERROR, - "WorkerQueryStatusData::parseListsInto &&& f1d1 qId=" << qId << " ujId=" << ujId); mapOfUj[ujId] = updateTm; } } @@ -442,7 +421,7 @@ shared_ptr WorkerCzarComIssue::serializeJson() { LOGS(_log, LOG_LVL_ERROR, cName(__func__) << " _wInfo or _czInfo was null"); return jsCzarReqPtr; } - //&&&auto now = CLOCK::now(); + jsCzarR["version"] = http::MetaModule::version; jsCzarR["instance_id"] = _replicationInstanceId; jsCzarR["auth_key"] = _replicationAuthKey; @@ -460,28 +439,21 @@ WorkerCzarComIssue::Ptr WorkerCzarComIssue::createFromJson(nlohmann::json const& std::string const& replicationInstanceId_, std::string const& replicationAuthKey_) { string const fName("WorkerCzarComIssue::createFromJson"); - LOGS(_log, LOG_LVL_WARN, fName << " &&& a"); try { if (jsCzarReq["version"] != http::MetaModule::version) { LOGS(_log, LOG_LVL_ERROR, fName << " bad version"); return nullptr; } - LOGS(_log, LOG_LVL_ERROR, fName << " &&& b"); auto czInfo_ = CzarContactInfo::createFromJson(jsCzarReq["czar"]); - LOGS(_log, LOG_LVL_ERROR, fName << " &&& c"); auto now = CLOCK::now(); auto wInfo_ = WorkerContactInfo::createFromJsonWorker(jsCzarReq["worker"], now); - LOGS(_log, LOG_LVL_ERROR, fName << " && d"); if (czInfo_ == nullptr || wInfo_ == nullptr) { LOGS(_log, LOG_LVL_ERROR, fName << " or worker info could not be parsed in " << jsCzarReq); } - //&&&auto wccIssue = create(wInfo_, czInfo_, replicationInstanceId_, replicationAuthKey_); auto wccIssue = create(replicationInstanceId_, replicationAuthKey_); wccIssue->setContactInfo(wInfo_, czInfo_); - LOGS(_log, LOG_LVL_ERROR, fName << " &&& e"); wccIssue->_thoughtCzarWasDead = RequestBodyJSON::required(jsCzarReq, "thoughtczarwasdead"); - LOGS(_log, LOG_LVL_ERROR, fName << " &&& end"); return wccIssue; } catch (invalid_argument const& exc) { LOGS(_log, LOG_LVL_ERROR, string("WorkerQueryStatusData::createJson invalid ") << exc.what()); diff --git a/src/http/WorkerQueryStatusData.h b/src/http/WorkerQueryStatusData.h index 63066fc21..7b1ad0a56 100644 --- a/src/http/WorkerQueryStatusData.h +++ b/src/http/WorkerQueryStatusData.h @@ -148,20 +148,25 @@ class WorkerContactInfo { return (wId == oWId && _wHost == oWHost && _wManagementHost == oWManagementHost && _wPort == oWPort); } - void regUpdateTime(TIMEPOINT updateTime) { + void setRegUpdateTime(TIMEPOINT updateTime) { std::lock_guard lg(_rMtx); - _regUpdate = updateTime; + _regUpdateTime = updateTime; + } + + TIMEPOINT getRegUpdateTime(TIMEPOINT updateTime) { + std::lock_guard lg(_rMtx); + return _regUpdateTime; } double timeSinceRegUpdateSeconds() const { std::lock_guard lg(_rMtx); - double secs = std::chrono::duration(CLOCK::now() - _regUpdate).count(); + double secs = std::chrono::duration(CLOCK::now() - _regUpdateTime).count(); return secs; } - TIMEPOINT getRegUpdate() const { + TIMEPOINT getRegUpdateTime() const { std::lock_guard lg(_rMtx); - return _regUpdate; + return _regUpdateTime; } /// @return true if startupTime equals _wStartupTime or _wStartupTime was never set, @@ -192,7 +197,7 @@ class WorkerContactInfo { WorkerContactInfo(std::string const& wId_, std::string const& wHost_, std::string 
const& wManagementHost_, int wPort_, TIMEPOINT updateTime_) : wId(wId_), _wHost(wHost_), _wManagementHost(wManagementHost_), _wPort(wPort_) { - regUpdateTime(updateTime_); + setRegUpdateTime(updateTime_); } // _rMtx must be locked before calling @@ -208,7 +213,7 @@ class WorkerContactInfo { /// Last time the registry heard from this worker. The ActiveWorker class /// will use this to determine the worker's state. /// &&& Store in seconds since epoch to make atomic? - TIMEPOINT _regUpdate; + TIMEPOINT _regUpdateTime; /// "w-startup-time", it's value is set to zero until the real value is /// received from the worker. Once it is non-zero, any change indicates diff --git a/src/http/testStatusData.cc b/src/http/testStatusData.cc index 2256de93a..54e0d49d4 100644 --- a/src/http/testStatusData.cc +++ b/src/http/testStatusData.cc @@ -55,18 +55,13 @@ BOOST_AUTO_TEST_CASE(WorkerQueryStatusData) { int czrPort = 2022; string const czrHost("cz_host"); - //&&&auto czarA = lsst::qserv::http::CzarContactInfo::create(czrName, czrId, czrPort, czrHost); auto czarA = lsst::qserv::http::CzarContactInfo::create(czrName, czrId, czrPort, czrHost, cxrStartTime); - LOGS_ERROR("&&& a czarA=" << czarA->dump()); auto czarAJs = czarA->serializeJson(); - LOGS_ERROR("&&& b czarAJs=" << czarAJs); auto czarB = lsst::qserv::http::CzarContactInfo::createFromJson(czarAJs); - LOGS_ERROR("&&& c czarB=" << czarB); BOOST_REQUIRE(czarA->compare(*czarB)); - //&&&auto czarC = lsst::qserv::http::CzarContactInfo::create("different", czrId, czrPort, czrHost); auto czarC = lsst::qserv::http::CzarContactInfo::create("different", czrId, czrPort, czrHost, cxrStartTime); BOOST_REQUIRE(!czarA->compare(*czarC)); @@ -77,34 +72,22 @@ BOOST_AUTO_TEST_CASE(WorkerQueryStatusData) { auto workerB = WorkerContactInfo::create("sd_workerB", "host_w2", "mgmhost_a", 3421, start); auto workerC = WorkerContactInfo::create("sd_workerC", "host_w3", "mgmhost_b", 3422, start); - LOGS_ERROR("&&& d workerA=" << workerA->dump()); - auto jsWorkerA = workerA->serializeJson(); - LOGS_ERROR("&&& e jsWorkerA=" << jsWorkerA); auto start1Sec = start + 1s; auto workerA1 = WorkerContactInfo::createFromJsonWorker(jsWorkerA, start1Sec); - LOGS_ERROR("&&& f workerA1=" << workerA1->dump()); BOOST_REQUIRE(workerA->isSameContactInfo(*workerA1)); // WorkerQueryStatusData auto wqsdA = lsst::qserv::http::WorkerQueryStatusData::create(workerA, czarA, replicationInstanceId, replicationAuthKey); - LOGS_ERROR("&&& g wqsdA=" << wqsdA->dump()); - //&&&double timeoutAliveSecs = 100.0; - //&&&double timeoutDeadSecs = 2*timeoutAliveSecs; double maxLifetime = 300.0; auto jsDataA = wqsdA->serializeJson(maxLifetime); - LOGS_ERROR("&&& h jsDataA=" << *jsDataA); // Check that empty lists work. 
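The renamed accessors above reduce to a small reusable shape: one mutable mutex guards the registry-update timestamp, writes go through setRegUpdateTime(), and the age is computed on demand instead of being stored. In isolation, with CLOCK standing in for the project's clock alias:

#include <chrono>
#include <iostream>
#include <mutex>

using CLOCK = std::chrono::system_clock;

class ContactInfo {
public:
    void setRegUpdateTime(CLOCK::time_point t) {
        std::lock_guard<std::mutex> lg(_mtx);
        _regUpdateTime = t;
    }
    // Age in seconds, computed on demand; matches the duration<double> idiom above.
    double timeSinceRegUpdateSeconds() const {
        std::lock_guard<std::mutex> lg(_mtx);
        return std::chrono::duration<double>(CLOCK::now() - _regUpdateTime).count();
    }

private:
    mutable std::mutex _mtx;
    CLOCK::time_point _regUpdateTime;
};

int main() {
    ContactInfo ci;
    ci.setRegUpdateTime(CLOCK::now());
    std::cout << "age=" << ci.timeSinceRegUpdateSeconds() << "s\n";
}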
auto wqsdA1 = lsst::qserv::http::WorkerQueryStatusData::createFromJson(*jsDataA, replicationInstanceId, replicationAuthKey, start1Sec); - LOGS_ERROR("&&& i wqsdA1=" << wqsdA1->dump()); - LOGS_ERROR("&&& i wqsdA=" << wqsdA->dump()); auto jsDataA1 = wqsdA1->serializeJson(maxLifetime); - LOGS_ERROR("&&& i jsDataA1=" << *jsDataA1); - LOGS_ERROR("&&& i jsDataA=" << *jsDataA); BOOST_REQUIRE(*jsDataA == *jsDataA1); vector qIdsDelFiles = {7, 8, 9, 15, 25, 26, 27, 30}; @@ -114,7 +97,6 @@ BOOST_AUTO_TEST_CASE(WorkerQueryStatusData) { } jsDataA = wqsdA->serializeJson(maxLifetime); - LOGS_ERROR("&&& j jsDataA=" << jsDataA); BOOST_REQUIRE(*jsDataA != *jsDataA1); for (auto const qIdKF : qIdsKeepFiles) { @@ -123,10 +105,7 @@ BOOST_AUTO_TEST_CASE(WorkerQueryStatusData) { wqsdA->addDeadUberJobs(12, {1, 3}, start); - LOGS_ERROR("&&& i wqsdA=" << wqsdA->dump()); - jsDataA = wqsdA->serializeJson(maxLifetime); - LOGS_ERROR("&&& j jsDataA=" << *jsDataA); auto start5Sec = start + 5s; auto workerAFromJson = lsst::qserv::http::WorkerQueryStatusData::createFromJson( @@ -139,13 +118,11 @@ BOOST_AUTO_TEST_CASE(WorkerQueryStatusData) { wqsdA->addDeadUberJobs(1059, {1, 4, 6, 7, 8, 10, 3, 22, 93}, start5Sec); jsDataA = wqsdA->serializeJson(maxLifetime); - LOGS_ERROR("&&& k jsDataA=" << *jsDataA); BOOST_REQUIRE(*jsDataA != *jsWorkerAFromJson); workerAFromJson = lsst::qserv::http::WorkerQueryStatusData::createFromJson( *jsDataA, replicationInstanceId, replicationAuthKey, start5Sec); jsWorkerAFromJson = workerAFromJson->serializeJson(maxLifetime); - LOGS_ERROR("&&& l jsWorkerAFromJson=" << *jsWorkerAFromJson); BOOST_REQUIRE(*jsDataA == *jsWorkerAFromJson); // Make the response, which contains lists of the items handled by the workers. @@ -178,37 +155,28 @@ BOOST_AUTO_TEST_CASE(WorkerCzarComIssue) { string const czrHost("cz_host"); auto czarA = lsst::qserv::http::CzarContactInfo::create(czrName, czrId, czrPort, czrHost, cxrStartTime); - LOGS_ERROR("&&&i a czarA=" << czarA->dump()); auto czarAJs = czarA->serializeJson(); - LOGS_ERROR("&&&i b czarAJs=" << czarAJs); auto start = lsst::qserv::CLOCK::now(); auto workerA = WorkerContactInfo::create("sd_workerA", "host_w1", "mgmhost_a", 3421, start); - LOGS_ERROR("&&&i d workerA=" << workerA->dump()); auto jsWorkerA = workerA->serializeJson(); - LOGS_ERROR("&&&i e jsWorkerA=" << jsWorkerA); // WorkerCzarComIssue - //&&&auto wccIssueA = lsst::qserv::http::WorkerCzarComIssue::create(workerA, czarA, replicationInstanceId, - //replicationAuthKey); auto wccIssueA = lsst::qserv::http::WorkerCzarComIssue::create(replicationInstanceId, replicationAuthKey); wccIssueA->setContactInfo(workerA, czarA); BOOST_REQUIRE(wccIssueA->needToSend() == false); wccIssueA->setThoughtCzarWasDead(true); BOOST_REQUIRE(wccIssueA->needToSend() == true); - LOGS_ERROR("&&&i f wccIssue=" << wccIssueA->dump()); - auto jsIssueA = wccIssueA->serializeJson(); - LOGS_ERROR("&&&i g jsIssue=" << *jsIssueA); auto wccIssueA1 = lsst::qserv::http::WorkerCzarComIssue::createFromJson(*jsIssueA, replicationInstanceId, replicationAuthKey); - LOGS_ERROR("&&&i i wccIssueA1=" << wccIssueA1->dump()); - LOGS_ERROR("&&&i i wccIssueA=" << wccIssueA->dump()); + LOGS_ERROR("&&& wccIssueA1=" << wccIssueA1->dump()); + LOGS_ERROR("&&& wccIssueA=" << wccIssueA->dump()); auto jsIssueA1 = wccIssueA1->serializeJson(); - LOGS_ERROR("&&&i i jsIssueA1=" << *jsIssueA1); - LOGS_ERROR("&&&i i jsIssueA=" << *jsIssueA); + LOGS_ERROR("&&& jsIssueA1=" << *jsIssueA1); + LOGS_ERROR("&&& jsIssueA=" << *jsIssueA); BOOST_REQUIRE(*jsIssueA == *jsIssueA1); // 
&&& Test with items in lists. diff --git a/src/qdisp/Executive.cc b/src/qdisp/Executive.cc index fa03d6c56..45dc6c031 100644 --- a/src/qdisp/Executive.cc +++ b/src/qdisp/Executive.cc @@ -190,8 +190,7 @@ JobQuery::Ptr Executive::add(JobDescription::Ptr const& jobDesc) { // Create the JobQuery and put it in the map. auto jobStatus = make_shared(); Ptr thisPtr = shared_from_this(); - MarkCompleteFunc::Ptr mcf = make_shared(thisPtr, jobDesc->id()); - jobQuery = JobQuery::create(thisPtr, jobDesc, jobStatus, mcf, _id); + jobQuery = JobQuery::create(thisPtr, jobDesc, jobStatus, _id); QSERV_LOGCONTEXT_QUERY_JOB(jobQuery->getQueryId(), jobQuery->getJobId()); @@ -227,15 +226,6 @@ JobQuery::Ptr Executive::add(JobDescription::Ptr const& jobDesc) { return jobQuery; } -void Executive::queueJobStart(util::PriorityCommand::Ptr const& cmd) { - _jobStartCmdList.push_back(cmd); - if (_scanInteractive) { - _qdispPool->queCmd(cmd, 0); - } else { - _qdispPool->queCmd(cmd, 1); - } -} - void Executive::queueFileCollect(util::PriorityCommand::Ptr const& cmd) { if (_scanInteractive) { _qdispPool->queCmd(cmd, 3); diff --git a/src/qdisp/Executive.h b/src/qdisp/Executive.h index db02a9c43..8d603fdf5 100644 --- a/src/qdisp/Executive.h +++ b/src/qdisp/Executive.h @@ -50,9 +50,6 @@ #include "util/threadSafe.h" #include "util/ThreadPool.h" -// TODO:UJ replace with better enable/disable feature, or just use only UberJobs -#define uberJobsEnabled 1 // &&& delete - namespace lsst::qserv { namespace ccontrol { @@ -133,11 +130,8 @@ class Executive : public std::enable_shared_from_this { // Queue `uberJob` to be run using the QDispPool. void runUberJob(std::shared_ptr const& uberJob); - /// Queue a job to be sent to a worker so it can be started. - void queueJobStart(std::shared_ptr const& cmd); // &&& delete ??? - /// Queue `cmd`, using the QDispPool, so it can be used to collect the result file. - void queueFileCollect(std::shared_ptr const& cmd); // &&& delete ??? + void queueFileCollect(std::shared_ptr const& cmd); /// Waits for all jobs on _jobStartCmdList to start. This should not be called /// before ALL jobs have been added to the pool. @@ -340,26 +334,6 @@ class Executive : public std::enable_shared_from_this { std::atomic _readyToExecute{false}; }; -/// TODO:UJ delete - MarkCompleteFunc is not needed with uberjobs. 
//&&&QM -class MarkCompleteFunc { -public: - typedef std::shared_ptr Ptr; - - MarkCompleteFunc(Executive::Ptr const& e, JobId jobId) : _executive(e), _jobId(jobId) {} - virtual ~MarkCompleteFunc() {} - - virtual void operator()(bool success) { - auto exec = _executive.lock(); - if (exec != nullptr) { - exec->markCompleted(_jobId, success); - } - } - -private: - std::weak_ptr _executive; - JobId _jobId; -}; - } // namespace qdisp } // namespace lsst::qserv diff --git a/src/qdisp/JobBase.h b/src/qdisp/JobBase.h index b6b18d325..a030d1612 100644 --- a/src/qdisp/JobBase.h +++ b/src/qdisp/JobBase.h @@ -58,9 +58,6 @@ class JobBase : public std::enable_shared_from_this { virtual QueryId getQueryId() const = 0; virtual UberJobId getJobId() const = 0; virtual std::string const& getIdStr() const = 0; - //&&&virtual std::shared_ptr getQdispPool() = 0; - //&&& virtual std::string const& getPayload() const = 0; ///< const& in return type is essential for - // xrootd virtual std::shared_ptr getRespHandler() = 0; virtual std::shared_ptr getStatus() = 0; virtual bool getScanInteractive() const = 0; diff --git a/src/qdisp/JobDescription.cc b/src/qdisp/JobDescription.cc index ab4234545..353ce1c18 100644 --- a/src/qdisp/JobDescription.cc +++ b/src/qdisp/JobDescription.cc @@ -65,9 +65,16 @@ JobDescription::JobDescription(qmeta::CzarId czarId, QueryId qId, JobId jobId, R _mock(mock) {} bool JobDescription::incrAttemptCountScrubResultsJson(std::shared_ptr const& exec, bool increase) { + LOGS(_log, LOG_LVL_ERROR, + "JobDescription::incrAttemptCountScrubResultsJson &&&a qId=" << _queryId << " jId=" << _jobId + << " attempt=" << _attemptCount); + if (increase) { ++_attemptCount; } + LOGS(_log, LOG_LVL_ERROR, + "JobDescription::incrAttemptCountScrubResultsJson &&&b qId=" << _queryId << " jId=" << _jobId + << " attempt=" << _attemptCount); if (_attemptCount >= MAX_JOB_ATTEMPTS) { LOGS(_log, LOG_LVL_ERROR, "attemptCount greater than maximum number of retries " << _attemptCount); return false; diff --git a/src/qdisp/JobQuery.cc b/src/qdisp/JobQuery.cc index 85c2b4efc..114d3efef 100644 --- a/src/qdisp/JobQuery.cc +++ b/src/qdisp/JobQuery.cc @@ -44,16 +44,13 @@ using namespace std; namespace lsst::qserv::qdisp { JobQuery::JobQuery(Executive::Ptr const& executive, JobDescription::Ptr const& jobDescription, - qmeta::JobStatus::Ptr const& jobStatus, - shared_ptr const& markCompleteFunc, QueryId qid) + qmeta::JobStatus::Ptr const& jobStatus, QueryId qid) : JobBase(), _executive(executive), _jobDescription(jobDescription), - _markCompleteFunc(markCompleteFunc), _jobStatus(jobStatus), _qid(qid), _idStr(QueryIdHelper::makeIdStr(qid, getJobId())) { - //&&&_qdispPool = executive->getQdispPool(); LOGS(_log, LOG_LVL_TRACE, "JobQuery desc=" << _jobDescription); } @@ -145,9 +142,9 @@ int JobQuery::getAttemptCount() const { return _jobDescription->getAttemptCount(); } -//&&&string const& JobQuery::getPayload() const { return _jobDescription->payload(); } - -void JobQuery::callMarkCompleteFunc(bool success) { _markCompleteFunc->operator()(success); } +void JobQuery::callMarkCompleteFunc(bool success) { + throw util::Bug(ERR_LOC, "&&& JobQuery::callMarkCompleteFunc should not be called, ever"); +} ostream& JobQuery::dumpOS(ostream& os) const { return os << "{" << getIdStr() << _jobDescription << " " << _jobStatus << "}"; diff --git a/src/qdisp/JobQuery.h b/src/qdisp/JobQuery.h index c6fcc0829..9a8e13962 100644 --- a/src/qdisp/JobQuery.h +++ b/src/qdisp/JobQuery.h @@ -40,7 +40,6 @@ namespace lsst::qserv::qdisp { -//&&&class QdispPool; 
class QueryRequest; /// This class is used to describe, monitor, and control a single query to a worker. @@ -53,9 +52,8 @@ class JobQuery : public JobBase { /// Factory function to make certain a shared_ptr is used and _setup is called. static JobQuery::Ptr create(Executive::Ptr const& executive, JobDescription::Ptr const& jobDescription, - qmeta::JobStatus::Ptr const& jobStatus, - std::shared_ptr const& markCompleteFunc, QueryId qid) { - Ptr jq = Ptr(new JobQuery(executive, jobDescription, jobStatus, markCompleteFunc, qid)); + qmeta::JobStatus::Ptr const& jobStatus, QueryId qid) { + Ptr jq = Ptr(new JobQuery(executive, jobDescription, jobStatus, qid)); jq->_setup(); return jq; } @@ -78,15 +76,12 @@ class JobQuery : public JobBase { std::shared_ptr getExecutive() override { return _executive.lock(); } - //&&&std::shared_ptr getQdispPool() override { return _qdispPool; } - std::ostream& dumpOS(std::ostream& os) const override; /// Make a copy of the job description. JobQuery::_setup() must be called after creation. /// Do not call this directly, use create. JobQuery(Executive::Ptr const& executive, JobDescription::Ptr const& jobDescription, - qmeta::JobStatus::Ptr const& jobStatus, - std::shared_ptr const& markCompleteFunc, QueryId qid); + qmeta::JobStatus::Ptr const& jobStatus, QueryId qid); /// If the UberJob is unassigned, change the _uberJobId to ujId. bool setUberJobId(UberJobId ujId) { @@ -131,7 +126,6 @@ class JobQuery : public JobBase { std::weak_ptr _executive; /// The job description needs to survive until the task is complete. JobDescription::Ptr _jobDescription; - std::shared_ptr _markCompleteFunc; // JobStatus has its own mutex. qmeta::JobStatus::Ptr _jobStatus; ///< Points at status in Executive::_statusMap @@ -148,8 +142,6 @@ class JobQuery : public JobBase { // Cancellation std::atomic _cancelled{false}; ///< Lock to make sure cancel() is only called once. - //&&& std::shared_ptr _qdispPool; - /// The UberJobId that this job is assigned to. Values less than zero /// indicate this job is unassigned. To prevent race conditions, /// an UberJob may only unassign a job if it has the same ID as diff --git a/src/wbase/Task.cc b/src/wbase/Task.cc index 0389632cc..c581229d0 100644 --- a/src/wbase/Task.cc +++ b/src/wbase/Task.cc @@ -120,14 +120,6 @@ atomic taskSequence{0}; ///< Unique identifier source for Task. /// available to define the action to take when this task is run, so /// Command::setFunc() is used set the action later. This is why /// the util::CommandThreadPool is not called here. 
-/* &&& -Task::Task(UberJobData::Ptr const& ujData, int jobId, int attemptCount, int chunkId, int fragmentNumber, - shared_ptr const& userQueryInfo, size_t templateId, bool hasSubchunks, - int subchunkId, string const& db, proto::ScanInfo const& scanInfo, bool scanInteractive, - int maxTableSize, vector const& fragSubTables, vector const& fragSubchunkIds, - shared_ptr const& sc, std::shared_ptr const& -queryStats_, uint16_t resultsHttpPort) : _userQueryInfo(userQueryInfo), -*/ Task::Task(UberJobData::Ptr const& ujData, int jobId, int attemptCount, int chunkId, int fragmentNumber, size_t templateId, bool hasSubchunks, int subchunkId, string const& db, proto::ScanInfo const& scanInfo, bool scanInteractive, int maxTableSize, @@ -199,15 +191,7 @@ Task::Task(UberJobData::Ptr const& ujData, int jobId, int attemptCount, int chun _dbTblsAndSubchunks = make_unique(dbTbls_, subchunksVect_); } -Task::~Task() { - /* &&& - _userQueryInfo.reset(); - UserQueryInfo::uqMapErase(_qId); - if (UserQueryInfo::uqMapGet(_qId) == nullptr) { - LOGS(_log, LOG_LVL_TRACE, "~Task Cleared uqMap entry for _qId=" << _qId); - } - */ -} +Task::~Task() {} std::vector Task::createTasksForChunk( std::shared_ptr const& ujData, nlohmann::json const& jsJobs, @@ -220,7 +204,6 @@ std::vector Task::createTasksForChunk( UberJobId ujId = ujData->getUberJobId(); CzarIdType czId = ujData->getCzarId(); - //&&&UserQueryInfo::Ptr userQueryInfo = UserQueryInfo::uqMapInsert(qId); wpublish::QueryStatistics::Ptr queryStats = queriesAndChunks->addQueryId(qId, czId); UserQueryInfo::Ptr userQueryInfo = queryStats->getUserQueryInfo(); @@ -287,13 +270,6 @@ std::vector Task::createTasksForChunk( if (fragSubchunkIds.empty()) { bool const noSubchunks = false; int const subchunkId = -1; - /* &&& - auto task = Task::Ptr(new Task( - ujData, jdJobId, jdAttemptCount, jdChunkId, fragmentNumber, userQueryInfo, - templateId, noSubchunks, subchunkId, jdQuerySpecDb, scanInfo, scanInteractive, - maxTableSizeMb, fragSubTables, fragSubchunkIds, sendChannel, queryStats, - resultsHttpPort)); - */ auto task = Task::Ptr(new Task( ujData, jdJobId, jdAttemptCount, jdChunkId, fragmentNumber, templateId, noSubchunks, subchunkId, jdQuerySpecDb, scanInfo, scanInteractive, maxTableSizeMb, @@ -308,14 +284,6 @@ std::vector Task::createTasksForChunk( jdQuerySpecDb, scanInfo, scanInteractive, maxTableSizeMb, fragSubTables, fragSubchunkIds, sendChannel, queryStats, resultsHttpPort)); - /* &&& - auto task = Task::Ptr(new Task(ujData, jdJobId, jdAttemptCount, jdChunkId, - fragmentNumber, userQueryInfo, templateId, - hasSubchunks, subchunkId, jdQuerySpecDb, scanInfo, - scanInteractive, maxTableSizeMb, fragSubTables, - fragSubchunkIds, sendChannel, queryStats, - resultsHttpPort)); - */ vect.push_back(task); } } @@ -377,10 +345,9 @@ string Task::getQueryString() const { auto uQInfo = qStats->getUserQueryInfo(); string qs = uQInfo->getTemplate(_templateId); - LOGS(_log, LOG_LVL_WARN, cName(__func__) << " &&& a qs=" << qs); boost::algorithm::replace_all(qs, CHUNK_TAG, to_string(_chunkId)); boost::algorithm::replace_all(qs, SUBCHUNK_TAG, to_string(_subchunkId)); - LOGS(_log, LOG_LVL_WARN, cName(__func__) << " &&& b qs=" << qs); + LOGS(_log, LOG_LVL_TRACE, cName(__func__) << " qs=" << qs); return qs; } diff --git a/src/wbase/Task.h b/src/wbase/Task.h index 9580040d8..0a5932b9d 100644 --- a/src/wbase/Task.h +++ b/src/wbase/Task.h @@ -309,7 +309,6 @@ class Task : public util::CommandForThreadPool { } private: - //&&&std::weak_ptr _userQueryInfo; ///< Details common to Tasks in this 
UserQuery. std::shared_ptr _sendChannel; ///< Send channel. uint64_t const _tSeq = 0; ///< identifier for the specific task diff --git a/src/wbase/UberJobData.cc b/src/wbase/UberJobData.cc index 3098908eb..d4cb0c734 100644 --- a/src/wbase/UberJobData.cc +++ b/src/wbase/UberJobData.cc @@ -108,7 +108,6 @@ void UberJobData::responseFileReady(string const& httpFileUrl, uint64_t rowCount string const requestContext = "Worker: '" + http::method2string(method) + "' request to '" + url + "'"; string const requestStr = request.dump(); _queueUJResponse(method, headers, url, requestContext, requestStr); - LOGS(_log, LOG_LVL_WARN, cName(__func__) << " &&& end"); } bool UberJobData::responseError(util::MultiError& multiErr, std::shared_ptr const& task, @@ -156,21 +155,15 @@ void UberJobData::_queueUJResponse(http::Method method_, std::vectorgetWPool(); } - LOGS(_log, LOG_LVL_WARN, cName(__func__) << " &&& creating UJTransmitCmd wPool=" << wPool); auto cmdTransmit = UJTransmitCmd::create(_foreman, shared_from_this(), method_, headers_, url_, requestContext_, requestStr_); - LOGS(_log, LOG_LVL_WARN, cName(__func__) << " &&& created UJTransmitCmd wPool=" << wPool); if (wPool == nullptr) { // No thread pool. Run the command now. This should only happen in unit tests. - LOGS(_log, LOG_LVL_WARN, cName(__func__) << " &&& creating UJTransmitCmd direct run action"); cmdTransmit->action(nullptr); } else { - LOGS(_log, LOG_LVL_WARN, cName(__func__) << " &&& creating UJTransmitCmd queue transmit"); if (_scanInteractive) { - LOGS(_log, LOG_LVL_WARN, cName(__func__) << " &&& creating UJTransmitCmd queue transmit_0"); wPool->queCmd(cmdTransmit, 0); } else { - LOGS(_log, LOG_LVL_WARN, cName(__func__) << " &&& creating UJTransmitCmd queue transmit_1"); wPool->queCmd(cmdTransmit, 1); } } @@ -204,7 +197,6 @@ void UJTransmitCmd::action(util::CmdData* data) { ResetSelf resetSelf(this); _attemptCount++; - LOGS(_log, LOG_LVL_WARN, cName(__func__) << " &&& start attempt=" << _attemptCount); auto ujPtr = _ujData.lock(); if (ujPtr == nullptr || ujPtr->getCancelled()) { LOGS(_log, LOG_LVL_WARN, cName(__func__) << " UberJob was cancelled " << _attemptCount); @@ -220,11 +212,8 @@ void UJTransmitCmd::action(util::CmdData* data) { LOGS(_log, LOG_LVL_WARN, cName(__func__) << " Transmit success == 0"); // There's no point in re-sending as the czar got the message and didn't like // it. - // &&& maybe add this czId+ujId to a list of failed uberjobs that can be put - // &&& status return??? Probably overkill. } } catch (exception const& ex) { - LOGS(_log, LOG_LVL_WARN, cName(__func__) << " &&& start d except"); LOGS(_log, LOG_LVL_WARN, cName(__func__) + " " + _requestContext + " failed, ex: " + ex.what()); } @@ -262,7 +251,6 @@ void UJTransmitCmd::action(util::CmdData* data) { LOGS(_log, LOG_LVL_ERROR, cName(__func__) << " _selfPtr was null, assuming job killed."); } } - LOGS(_log, LOG_LVL_WARN, cName(__func__) << " &&& start end"); } void UJTransmitCmd::kill() { @@ -273,7 +261,6 @@ void UJTransmitCmd::kill() { if (sPtr == nullptr) { return; } - // &&& TODO:UJ Is there anything that should be done here??? } UJTransmitCmd::Ptr UJTransmitCmd::duplicate() { diff --git a/src/wbase/UberJobData.h b/src/wbase/UberJobData.h index 0ccdf7c7e..76d335411 100644 --- a/src/wbase/UberJobData.h +++ b/src/wbase/UberJobData.h @@ -196,7 +196,7 @@ class UJTransmitCmd : public util::PriorityCommand { std::string const _requestContext; std::string const _requestStr; int _attemptCount = 0; ///< How many attempts have been made to transmit this. 
- util::InstanceCount _ic{cName("&&&")}; + util::InstanceCount _ic{cName("UJTransmitCmd&&&")}; }; } // namespace lsst::qserv::wbase diff --git a/src/wconfig/WorkerConfig.h b/src/wconfig/WorkerConfig.h index 9b6d682b5..584aa3209 100644 --- a/src/wconfig/WorkerConfig.h +++ b/src/wconfig/WorkerConfig.h @@ -210,11 +210,6 @@ class WorkerConfig { return _ReservedInteractiveSqlConnections->getVal(); } - /* &&& - /// @return the maximum number of gigabytes that can be used by StreamBuffers - unsigned int getBufferMaxTotalGB() const { return _bufferMaxTotalGB->getVal(); } //&&& delete - */ - /// @return the maximum number of concurrent transmits to a czar unsigned int getMaxTransmits() const { return _maxTransmits->getVal(); } @@ -364,10 +359,6 @@ class WorkerConfig { util::ConfigValTUInt::create(_configValMap, "sqlconnections", "maxsqlconn", notReq, 800); CVTUIntPtr _ReservedInteractiveSqlConnections = util::ConfigValTUInt::create( _configValMap, "sqlconnections", "reservedinteractivesqlconn", notReq, 50); - /* &&& - CVTUIntPtr _bufferMaxTotalGB = - util::ConfigValTUInt::create(_configValMap, "transmit", "buffermaxtotalgb", notReq, 41); - */ CVTUIntPtr _maxTransmits = util::ConfigValTUInt::create(_configValMap, "transmit", "maxtransmits", notReq, 40); CVTIntPtr _maxPerQid = util::ConfigValTInt::create(_configValMap, "transmit", "maxperqid", notReq, 3); diff --git a/src/wcontrol/Foreman.cc b/src/wcontrol/Foreman.cc index 179022167..db4d7626f 100644 --- a/src/wcontrol/Foreman.cc +++ b/src/wcontrol/Foreman.cc @@ -132,14 +132,6 @@ Foreman::Foreman(Scheduler::Ptr const& scheduler, unsigned int poolSize, unsigne _mark = make_shared(ERR_LOC, "Forman Test Msg"); - /* &&& - int qPoolSize = _czarConfig->getQdispPoolSize(); - int maxPriority = std::max(0, _czarConfig->getQdispMaxPriority()); - string vectRunSizesStr = _czarConfig->getQdispVectRunSizes(); - vector vectRunSizes = util::String::parseToVectInt(vectRunSizesStr, ":", 1); - string vectMinRunningSizesStr = _czarConfig->getQdispVectMinRunningSizes(); - vector vectMinRunningSizes = util::String::parseToVectInt(vectMinRunningSizesStr, ":", 0); - */ int qPoolSize = 50; // &&& TODO:UJ put in config int maxPriority = 2; // &&& TODO:UJ put in config string vectRunSizesStr = "10:10:10:10"; // &&& TODO:UJ put in config diff --git a/src/wdb/ChunkResource.cc b/src/wdb/ChunkResource.cc index a6eb90ef4..a9fe100e5 100644 --- a/src/wdb/ChunkResource.cc +++ b/src/wdb/ChunkResource.cc @@ -48,7 +48,6 @@ #include "util/Bug.h" #include "util/IterableFormatter.h" #include "wbase/Base.h" -//&&&#include "wdb/QuerySql.h" namespace { diff --git a/src/wdb/QueryRunner.cc b/src/wdb/QueryRunner.cc index 35501c76f..7e3ab7b76 100644 --- a/src/wdb/QueryRunner.cc +++ b/src/wdb/QueryRunner.cc @@ -268,11 +268,8 @@ bool QueryRunner::_dispatchChannel() { if (taskSched != nullptr) { taskSched->histTimeOfRunningTasks->addEntry(primeT.getElapsed()); LOGS(_log, LOG_LVL_DEBUG, "QR " << taskSched->histTimeOfRunningTasks->getString("run")); - LOGS(_log, LOG_LVL_WARN, - "&&&DASH QR " << taskSched->histTimeOfRunningTasks->getString("run")); } else { LOGS(_log, LOG_LVL_ERROR, "QR runtaskSched == nullptr"); - LOGS(_log, LOG_LVL_ERROR, "&&&DASH QR runtaskSched == nullptr"); } double runTimeSeconds = primeT.getElapsed(); double subchunkRunTimeSeconds = subChunkT.getElapsed(); @@ -348,16 +345,6 @@ void QueryRunner::cancel() { break; } } - - /* &&& - auto streamB = _streamBuf.lock(); - if (streamB != nullptr) { - streamB->cancel(); - } - - // The send channel will die naturally on its own when xrootd 
stops talking to it - // or other tasks call _transmitCancelledError(). - */ } QueryRunner::~QueryRunner() {} diff --git a/src/wdb/QueryRunner.h b/src/wdb/QueryRunner.h index 785496772..a881075f0 100644 --- a/src/wdb/QueryRunner.h +++ b/src/wdb/QueryRunner.h @@ -75,8 +75,6 @@ class QueryRunner : public wbase::TaskQueryRunner, public std::enable_shared_fro /// by Task::cancel(), so if this needs to be cancelled elsewhere, /// call Task::cancel(). /// This should kill an in progress SQL command. - //&&&/// It also tries to unblock `_streamBuf` to keep the thread - //&&&/// from being blocked forever. void cancel() override; protected: @@ -93,8 +91,6 @@ class QueryRunner : public wbase::TaskQueryRunner, public std::enable_shared_fro bool _dispatchChannel(); MYSQL_RES* _primeResult(std::string const& query); ///< Obtain a result handle for a query. - //&&&static size_t _getDesiredLimit(); - wbase::Task::Ptr const _task; ///< Actual task qmeta::CzarId _czarId = 0; ///< To be replaced with the czarId of the requesting czar. diff --git a/src/wpublish/QueryStatistics.cc b/src/wpublish/QueryStatistics.cc index bc06eea58..2ca96d7f3 100644 --- a/src/wpublish/QueryStatistics.cc +++ b/src/wpublish/QueryStatistics.cc @@ -189,13 +189,6 @@ QueryStatistics::SchedTasksInfoMap QueryStatistics::getSchedulerTasksInfoMap() { return _taskSchedInfoMap; } -/* &&& -void QueryStatistics::touch(TIMEPOINT const now) { - lock_guard lock(_qStatsMtx); - _touched = now; -} -*/ - void QueryStatistics::addTask(TIMEPOINT const now) { lock_guard lock(_qStatsMtx); _touched = now; diff --git a/src/wpublish/QueryStatistics.h b/src/wpublish/QueryStatistics.h index c15e8e9f6..9d208e037 100644 --- a/src/wpublish/QueryStatistics.h +++ b/src/wpublish/QueryStatistics.h @@ -44,7 +44,6 @@ #include "util/InstanceCount.h" //&&& namespace lsst::qserv::wbase { -//&&&class Histogram; class UserQueryInfo; } // namespace lsst::qserv::wbase @@ -97,7 +96,6 @@ class QueryStatistics { void addTaskTransmit(double timeSeconds, int64_t bytesTransmitted, int64_t rowsTransmitted, double bufferFillSecs); - //&&&void touch(TIMEPOINT const now); void addTask(TIMEPOINT const now); void addTaskRunning(TIMEPOINT const now); bool addTaskCompleted(TIMEPOINT const now, double const taskDuration); diff --git a/src/xrdsvc/HttpReplicaMgtModule.cc b/src/xrdsvc/HttpReplicaMgtModule.cc index e7d61d95b..91692aa92 100644 --- a/src/xrdsvc/HttpReplicaMgtModule.cc +++ b/src/xrdsvc/HttpReplicaMgtModule.cc @@ -79,12 +79,6 @@ HttpReplicaMgtModule::HttpReplicaMgtModule(string const& context, shared_ptr const& req, shared_ptr const& resp) : HttpModule(context, foreman, req, resp) {} -/* &&& - : HttpModule(context, foreman, req, resp), - _providerServer(dynamic_cast(XrdSsiProviderLookup)), - _clusterManager(_providerServer->GetClusterManager()), - _dataContext(_clusterManager->DataContext()) {} - */ json HttpReplicaMgtModule::executeImpl(string const& subModuleName) { string const func = string(__func__) + "[sub-module='" + subModuleName + "']"; @@ -337,12 +331,8 @@ void HttpReplicaMgtModule::_modifyChunk(string const& func, int chunk, string co // copy of the inventory. After that modify both (persistent and // transient) inventories. 
if (Direction::ADD == direction) { - //&&&_clusterManager->Added(resource.data()); - //&&&if (_dataContext) _providerServer->GetChunkInventory().add(database, chunk); foreman()->chunkInventory()->add(database, chunk, foreman()->mySqlConfig()); } else { - //&&&_clusterManager->Removed(resource.data()); - //&&&if (_dataContext) _providerServer->GetChunkInventory().remove(database, chunk); foreman()->chunkInventory()->remove(database, chunk, foreman()->mySqlConfig()); } } catch (wpublish::InvalidParamError const& ex) { diff --git a/src/xrdsvc/HttpWorkerCzarModule.cc b/src/xrdsvc/HttpWorkerCzarModule.cc index b4fb81da5..224862ab3 100644 --- a/src/xrdsvc/HttpWorkerCzarModule.cc +++ b/src/xrdsvc/HttpWorkerCzarModule.cc @@ -131,7 +131,6 @@ json HttpWorkerCzarModule::_handleQueryJob(string const& func) { __func__ << " uj qid=" << ujQueryId << " ujid=" << ujId << " czid=" << ujCzarId); // Get or create QueryStatistics and UserQueryInfo instances. - //&&&auto queryStats = foreman()->addQueryId(ujQueryId); auto queryStats = foreman()->getQueriesAndChunks()->addQueryId(ujQueryId, ujCzarId); auto userQueryInfo = queryStats->getUserQueryInfo(); @@ -344,15 +343,6 @@ json HttpWorkerCzarModule::_handleQueryStatus(std::string const& func) { // Return a message containing lists of the queries that were cancelled. jsRet = wqsData->serializeResponseJson(foreman()->getWorkerStartupTime()); - // &&& queue sending WorkerCzarComIssue if needed. - /* &&& - auto const wczComIssue = wCzarInfo->getWorkerCzarComIssue(); - if (wczComIssue != nullptr && wczComIssue->needToSend()) { - LOGS(_log, LOG_LVL_ERROR, "&&& NEED CODE to queue wczComIssue message, do not queue more than one at a - time."); - // Limit the sending to happening after czar sends status - } - */ wCzarInfo->sendWorkerCzarComIssueIfNeeded(wqsData->getWInfo(), wqsData->getCzInfo()); return jsRet; diff --git a/src/xrdsvc/SsiService.cc b/src/xrdsvc/SsiService.cc index 7aa2ef85b..4ae7d6e76 100644 --- a/src/xrdsvc/SsiService.cc +++ b/src/xrdsvc/SsiService.cc @@ -278,17 +278,7 @@ SsiService::~SsiService() { } void SsiService::ProcessRequest(XrdSsiRequest& reqRef, XrdSsiResource& resRef) { -#if 0 //&&& - LOGS(_log, LOG_LVL_DEBUG, "Got request call where rName is: " << resRef.rName); - auto request = SsiRequest::newSsiRequest(resRef.rName, _foreman); - - // Continue execution in the session object as SSI gave us a new thread. - // Object deletes itself when finished is called. - // - request->execute(reqRef); -#else //&&& LOGS(_log, LOG_LVL_ERROR, "SsiService::ProcessRequest got called"); -#endif //&&& } } // namespace lsst::qserv::xrdsvc From 5e3642a60bdfc78ce5c72c0ce2697c3b1136ff7a Mon Sep 17 00:00:00 2001 From: John Gates Date: Tue, 15 Oct 2024 15:46:39 -0700 Subject: [PATCH 10/22] Fixed problems with rowlimit and WorkerCzarComIssue. 
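The worker-side LIMIT handling in this patch can be summarized with a small standalone sketch. This is illustrative only (ResultFile and appendRow are hypothetical names, not Qserv types): when the czar pushes a positive rowlimit down to a worker, the worker stops reading result rows once the cap is reached and flags the shared result file as complete.

    // Minimal sketch of worker-side LIMIT short-circuiting (hypothetical names).
    #include <iostream>

    struct ResultFile {
        long rowCount = 0;
        bool rowLimitComplete = false;
        // Append one row; return false once the cap is reached so the caller stops scanning.
        bool appendRow(int rowLimit) {
            ++rowCount;
            if (rowLimit > 0 && rowCount >= rowLimit) {
                rowLimitComplete = true;  // enough rows to satisfy the LIMIT clause
                return false;
            }
            return true;
        }
    };

    int main() {
        ResultFile file;
        int const rowLimit = 3;  // pushed down from the czar; <= 0 means no cap
        for (int row = 0; row < 1000; ++row) {
            if (!file.appendRow(rowLimit)) break;  // stop long before 1000 rows
        }
        std::cout << "rows=" << file.rowCount << " complete=" << file.rowLimitComplete << "\n";
    }

This mirrors the FileChannelShared changes below, where a query with LIMIT 1 over an UberJob covering 1000 chunks can be answered without scanning all 1000 chunks.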
--- src/czar/ActiveWorker.cc | 13 +++-- src/czar/CzarChunkMap.cc | 6 +- src/czar/HttpCzarWorkerModule.cc | 7 ++- src/http/Client.cc | 1 - src/http/WorkerQueryStatusData.cc | 23 ++++---- src/http/WorkerQueryStatusData.h | 5 +- src/http/testStatusData.cc | 6 +- src/proto/ScanTableInfo.h | 2 +- src/qdisp/Executive.cc | 11 +++- src/qdisp/Executive.h | 6 ++ src/qdisp/JobDescription.cc | 8 --- src/qdisp/JobDescription.h | 3 +- src/qdisp/UberJob.cc | 14 +++-- src/qdisp/UberJob.h | 4 +- src/util/CMakeLists.txt | 1 - src/util/ConfigValMap.h | 2 + src/util/InstanceCount.cc | 6 +- src/util/xrootd.cc | 91 ------------------------------ src/util/xrootd.h | 44 --------------- src/wbase/FileChannelShared.cc | 63 +++++++++++++++++---- src/wbase/FileChannelShared.h | 21 +++++-- src/wbase/SendChannel.h | 2 +- src/wbase/Task.cc | 17 +++--- src/wbase/Task.h | 10 ++++ src/wbase/UberJobData.cc | 9 ++- src/wbase/UberJobData.h | 17 ++++-- src/wdb/QueryRunner.cc | 3 +- src/wdb/testQueryRunner.cc | 28 +++------ src/wpublish/QueryStatistics.h | 1 - src/xrdsvc/HttpReplicaMgtModule.h | 2 +- src/xrdsvc/HttpWorkerCzarModule.cc | 58 +++++++++---------- 31 files changed, 209 insertions(+), 275 deletions(-) delete mode 100644 src/util/xrootd.cc delete mode 100644 src/util/xrootd.h diff --git a/src/czar/ActiveWorker.cc b/src/czar/ActiveWorker.cc index 921d678a7..6c33f616a 100644 --- a/src/czar/ActiveWorker.cc +++ b/src/czar/ActiveWorker.cc @@ -145,6 +145,7 @@ void ActiveWorker::updateStateAndSendMessages(double timeoutAliveSecs, double ti auto cmd = util::PriorityCommand::Ptr(new util::PriorityCommand(sendStatusMsgFunc)); auto qdisppool = czar::Czar::getCzar()->getQdispPool(); + LOGS(_log, LOG_LVL_DEBUG, cName(__func__) << " queuing message"); qdisppool->queCmd(cmd, 1); } @@ -169,8 +170,11 @@ void ActiveWorker::_sendStatusMsg(http::WorkerContactInfo::Ptr const& wInf, http::Client client(method, url, jsWorkerReq.dump(), headers); bool transmitSuccess = false; string exceptionWhat; + json response; try { - json const response = client.readAsJson(); + LOGS(_log, LOG_LVL_DEBUG, cName(__func__) << " read start"); + response = client.readAsJson(); + LOGS(_log, LOG_LVL_DEBUG, cName(__func__) << " read end"); if (0 != response.at("success").get()) { bool startupTimeChanged = false; tie(transmitSuccess, startupTimeChanged) = _wqsData->handleResponseJson(response); @@ -180,14 +184,15 @@ void ActiveWorker::_sendStatusMsg(http::WorkerContactInfo::Ptr const& wInf, czar::Czar::getCzar()->killIncompleteUbjerJobsOn(wInf->wId); } } else { - LOGS(_log, LOG_LVL_WARN, cName(__func__) << " response success=0"); + LOGS(_log, LOG_LVL_ERROR, cName(__func__) << " transmit failure response success=0 " << response); } } catch (exception const& ex) { - LOGS(_log, LOG_LVL_WARN, requestContext + " failed, ex: " + ex.what()); + LOGS(_log, LOG_LVL_ERROR, requestContext + " transmit failure, ex: " + ex.what()); exceptionWhat = ex.what(); } if (!transmitSuccess) { - LOGS(_log, LOG_LVL_ERROR, cName(__func__) << " transmit failure"); + LOGS(_log, LOG_LVL_ERROR, + cName(__func__) << " transmit failure " << jsWorkerReq.dump() << " resp=" << response); } } diff --git a/src/czar/CzarChunkMap.cc b/src/czar/CzarChunkMap.cc index 11f7865d1..5487f48ec 100644 --- a/src/czar/CzarChunkMap.cc +++ b/src/czar/CzarChunkMap.cc @@ -260,11 +260,11 @@ bool CzarChunkMap::WorkerChunksData::isDead() { } } auto wState = _activeWorker->getState(); - bool res = wState == ActiveWorker::DEAD; - if (!res) { + bool dead = wState == ActiveWorker::DEAD; + if (dead) { LOGS(_log, 
LOG_LVL_DEBUG, cName(__func__) << " is dead"); } - return res; + return dead; } string CzarChunkMap::WorkerChunksData::dump() const { diff --git a/src/czar/HttpCzarWorkerModule.cc b/src/czar/HttpCzarWorkerModule.cc index 75ccbdd6d..1c80e4c85 100644 --- a/src/czar/HttpCzarWorkerModule.cc +++ b/src/czar/HttpCzarWorkerModule.cc @@ -145,6 +145,8 @@ json HttpCzarWorkerModule::_handleJobReady(string const& func) { // Parse and verify the json message and then have the uberjob import the file. json jsRet = {{"success", 1}, {"errortype", "unknown"}, {"note", "initialized"}}; try { + // &&& TODO:UJ file response - move construction and parsing + // &&& TODO:UJ to a class so it can be added to WorkerCzarComIssue // See qdisp::UberJob::runUberJob() for json message construction. string const targetWorkerId = body().required("workerid"); string const czarName = body().required("czar"); @@ -183,7 +185,7 @@ json HttpCzarWorkerModule::_handleJobReady(string const& func) { json HttpCzarWorkerModule::_handleWorkerCzarComIssue(string const& func) { LOGS(_log, LOG_LVL_DEBUG, "HttpCzarWorkerModule::_handleWorkerCzarComIssue start"); // Parse and verify the json message and then deal with the problems. - json jsRet = {{"success", 1}, {"errortype", "unknown"}, {"note", "initialized"}}; + json jsRet = {{"success", 0}, {"errortype", "unknown"}, {"note", "initialized"}}; try { string const replicationInstanceId = cconfig::CzarConfig::instance()->replicationInstanceId(); string const replicationAuthKey = cconfig::CzarConfig::instance()->replicationAuthKey(); @@ -206,6 +208,8 @@ json HttpCzarWorkerModule::_handleWorkerCzarComIssue(string const& func) { execPtr->killIncompleteUberJobsOnWorker(wId); } } + jsRet = wccIssue->serializeResponseJson(); + LOGS(_log, LOG_LVL_TRACE, "HttpCzarWorkerModule::_handleWorkerCzarComIssue jsRet=" << jsRet.dump()); } catch (std::invalid_argument const& iaEx) { LOGS(_log, LOG_LVL_ERROR, @@ -213,7 +217,6 @@ json HttpCzarWorkerModule::_handleWorkerCzarComIssue(string const& func) { << " js=" << body().objJson); jsRet = {{"success", 0}, {"errortype", "parse"}, {"note", iaEx.what()}}; } - LOGS(_log, LOG_LVL_DEBUG, "HttpCzarWorkerModule::_handleWorkerCzarComIssue end"); return jsRet; } diff --git a/src/http/Client.cc b/src/http/Client.cc index 1f4e2c690..ae713f1f8 100644 --- a/src/http/Client.cc +++ b/src/http/Client.cc @@ -146,7 +146,6 @@ void Client::read(CallbackType const& onDataRead) { } _curlEasyErrorChecked("curl_easy_setopt(CURLOPT_HTTPHEADER)", curl_easy_setopt(_hcurl, CURLOPT_HTTPHEADER, _hlist)); - _curlEasyErrorChecked("curl_easy_setopt(CURLOPT_FAILONERROR)", curl_easy_setopt(_hcurl, CURLOPT_FAILONERROR, 1L)); _curlEasyErrorChecked("curl_easy_setopt(CURLOPT_WRITEFUNCTION)", diff --git a/src/http/WorkerQueryStatusData.cc b/src/http/WorkerQueryStatusData.cc index 58a12773d..ee251b048 100644 --- a/src/http/WorkerQueryStatusData.cc +++ b/src/http/WorkerQueryStatusData.cc @@ -141,11 +141,12 @@ shared_ptr WorkerQueryStatusData::serializeJson(double maxLifetime) { jsWorkerR["version"] = http::MetaModule::version; jsWorkerR["instance_id"] = _replicationInstanceId; jsWorkerR["auth_key"] = _replicationAuthKey; - jsWorkerR["czar"] = _czInfo->serializeJson(); + jsWorkerR["czarinfo"] = _czInfo->serializeJson(); { lock_guard lgI(_infoMtx); if (_wInfo != nullptr) { - jsWorkerR["worker"] = _wInfo->serializeJson(); + jsWorkerR["workerinfo"] = _wInfo->serializeJson(); + jsWorkerR["worker"] = _wInfo->wId; } else { LOGS(_log, LOG_LVL_ERROR, cName(__func__) << " wInfo is null"); } @@ -256,8 +257,8 
@@ WorkerQueryStatusData::Ptr WorkerQueryStatusData::createFromJson(nlohmann::json return nullptr; } - auto czInfo_ = CzarContactInfo::createFromJson(jsWorkerReq["czar"]); - auto wInfo_ = WorkerContactInfo::createFromJsonWorker(jsWorkerReq["worker"], updateTm); + auto czInfo_ = CzarContactInfo::createFromJson(jsWorkerReq["czarinfo"]); + auto wInfo_ = WorkerContactInfo::createFromJsonWorker(jsWorkerReq["workerinfo"], updateTm); if (czInfo_ == nullptr || wInfo_ == nullptr) { LOGS(_log, LOG_LVL_ERROR, "WorkerQueryStatusData::createJson czar or worker info could not be parsed in " @@ -425,12 +426,13 @@ shared_ptr WorkerCzarComIssue::serializeJson() { jsCzarR["version"] = http::MetaModule::version; jsCzarR["instance_id"] = _replicationInstanceId; jsCzarR["auth_key"] = _replicationAuthKey; - jsCzarR["czar"] = _czInfo->serializeJson(); - jsCzarR["worker"] = _wInfo->serializeJson(); + jsCzarR["czarinfo"] = _czInfo->serializeJson(); + jsCzarR["czar"] = _czInfo->czName; + jsCzarR["workerinfo"] = _wInfo->serializeJson(); jsCzarR["thoughtczarwasdead"] = _thoughtCzarWasDead; - // &&& add list of failed transmits + // TODO:UJ add list of failed transmits return jsCzarReqPtr; } @@ -439,15 +441,16 @@ WorkerCzarComIssue::Ptr WorkerCzarComIssue::createFromJson(nlohmann::json const& std::string const& replicationInstanceId_, std::string const& replicationAuthKey_) { string const fName("WorkerCzarComIssue::createFromJson"); + LOGS(_log, LOG_LVL_DEBUG, fName); try { if (jsCzarReq["version"] != http::MetaModule::version) { LOGS(_log, LOG_LVL_ERROR, fName << " bad version"); return nullptr; } - auto czInfo_ = CzarContactInfo::createFromJson(jsCzarReq["czar"]); + auto czInfo_ = CzarContactInfo::createFromJson(jsCzarReq["czarinfo"]); auto now = CLOCK::now(); - auto wInfo_ = WorkerContactInfo::createFromJsonWorker(jsCzarReq["worker"], now); + auto wInfo_ = WorkerContactInfo::createFromJsonWorker(jsCzarReq["workerinfo"], now); if (czInfo_ == nullptr || wInfo_ == nullptr) { LOGS(_log, LOG_LVL_ERROR, fName << " or worker info could not be parsed in " << jsCzarReq); } @@ -464,7 +467,7 @@ WorkerCzarComIssue::Ptr WorkerCzarComIssue::createFromJson(nlohmann::json const& json WorkerCzarComIssue::serializeResponseJson() { json jsResp = {{"success", 1}, {"errortype", "none"}, {"note", ""}}; - // TODO:UJ &&& add lists of uberjobs that are scheduled to have files collected because of this message. + // TODO:UJ add lists of uberjobs that are scheduled to have files collected because of this message. return jsResp; } diff --git a/src/http/WorkerQueryStatusData.h b/src/http/WorkerQueryStatusData.h index 7b1ad0a56..f128d6264 100644 --- a/src/http/WorkerQueryStatusData.h +++ b/src/http/WorkerQueryStatusData.h @@ -365,7 +365,7 @@ class WorkerQueryStatusData { /// with that czar. Result files will remain until garbage cleanup or the czar /// calls for their removal. /// TODO:UJ &&& UberJob complete messages that failed to be sent to the czar -/// TODO:UJ &&& will be added to this message. +/// TODO:UJ &&& will be added to this message. uber job file response /// Upon successful completion, the worker will clear all values set by the /// the czar. /// This message is expected to only be needed rarely. @@ -395,7 +395,8 @@ class WorkerCzarComIssue { /// &&& doc bool needToSend() const { std::lock_guard lg(_wciMtx); - return _thoughtCzarWasDead; // &&& or list of failed transmits not empty. + // TODO:UJ &&& or list of failed transmits not empty. 
+        return _thoughtCzarWasDead;
     }

     /// &&& doc
diff --git a/src/http/testStatusData.cc b/src/http/testStatusData.cc
index 54e0d49d4..1a03c1d90 100644
--- a/src/http/testStatusData.cc
+++ b/src/http/testStatusData.cc
@@ -172,14 +172,10 @@ BOOST_AUTO_TEST_CASE(WorkerCzarComIssue) {
     auto wccIssueA1 = lsst::qserv::http::WorkerCzarComIssue::createFromJson(*jsIssueA, replicationInstanceId,
                                                                             replicationAuthKey);
-    LOGS_ERROR("&&& wccIssueA1=" << wccIssueA1->dump());
-    LOGS_ERROR("&&& wccIssueA=" << wccIssueA->dump());
     auto jsIssueA1 = wccIssueA1->serializeJson();
-    LOGS_ERROR("&&& jsIssueA1=" << *jsIssueA1);
-    LOGS_ERROR("&&& jsIssueA=" << *jsIssueA);
     BOOST_REQUIRE(*jsIssueA == *jsIssueA1);

-    // &&& Test with items in lists.
+    // TODO:UJ Test with items in lists.
 }

 BOOST_AUTO_TEST_SUITE_END()
diff --git a/src/proto/ScanTableInfo.h b/src/proto/ScanTableInfo.h
index 76d03e5f4..d30e4d04d 100644
--- a/src/proto/ScanTableInfo.h
+++ b/src/proto/ScanTableInfo.h
@@ -35,7 +35,7 @@ namespace lsst::qserv::proto {

 /// Structure to store shared scan information for a single table.
 ///
-struct ScanTableInfo {  // &&& check if still useful
+struct ScanTableInfo {  // TODO:UJ check if still useful
     using ListOf = std::vector<ScanTableInfo>;

     ScanTableInfo() = default;
diff --git a/src/qdisp/Executive.cc b/src/qdisp/Executive.cc
index 45dc6c031..7653cc54e 100644
--- a/src/qdisp/Executive.cc
+++ b/src/qdisp/Executive.cc
@@ -228,9 +228,9 @@ JobQuery::Ptr Executive::add(JobDescription::Ptr const& jobDesc) {

 void Executive::queueFileCollect(util::PriorityCommand::Ptr const& cmd) {
     if (_scanInteractive) {
-        _qdispPool->queCmd(cmd, 3);
+        _qdispPool->queCmd(cmd, 2);
     } else {
-        _qdispPool->queCmd(cmd, 4);
+        _qdispPool->queCmd(cmd, 3);
     }
 }

@@ -732,6 +732,13 @@ void Executive::_setupLimit() {
     _limitSquashApplies = hasLimit && !(groupBy || orderBy || allChunksRequired);
 }

+int Executive::getUjRowLimit() const {
+    if (_limitSquashApplies) {
+        return _limit;
+    }
+    return 0;
+}
+
 void Executive::addResultRows(int64_t rowCount) { _totalResultRows += rowCount; }

 void Executive::checkLimitRowComplete() {
diff --git a/src/qdisp/Executive.h b/src/qdisp/Executive.h
index 8d603fdf5..1f0af6d26 100644
--- a/src/qdisp/Executive.h
+++ b/src/qdisp/Executive.h
@@ -183,6 +183,12 @@ class Executive : public std::enable_shared_from_this<Executive> {
     /// rows already read in.
     void checkLimitRowComplete();

+    //&&&int getRowLimit() const { return _limit; }
+
+    /// Returns the maximum number of rows the worker needs for the LIMIT clause, or
+    /// a value <= 0 if there's no limit that can be applied at the worker.
+    int getUjRowLimit() const;
+
     /// @return _limitRowComplete, which can only be meaningful if the
     /// user query has not been cancelled.
bool isLimitRowComplete() { return _limitRowComplete && !_cancelled; } diff --git a/src/qdisp/JobDescription.cc b/src/qdisp/JobDescription.cc index 353ce1c18..dca19f52c 100644 --- a/src/qdisp/JobDescription.cc +++ b/src/qdisp/JobDescription.cc @@ -65,16 +65,9 @@ JobDescription::JobDescription(qmeta::CzarId czarId, QueryId qId, JobId jobId, R _mock(mock) {} bool JobDescription::incrAttemptCountScrubResultsJson(std::shared_ptr const& exec, bool increase) { - LOGS(_log, LOG_LVL_ERROR, - "JobDescription::incrAttemptCountScrubResultsJson &&&a qId=" << _queryId << " jId=" << _jobId - << " attempt=" << _attemptCount); - if (increase) { ++_attemptCount; } - LOGS(_log, LOG_LVL_ERROR, - "JobDescription::incrAttemptCountScrubResultsJson &&&b qId=" << _queryId << " jId=" << _jobId - << " attempt=" << _attemptCount); if (_attemptCount >= MAX_JOB_ATTEMPTS) { LOGS(_log, LOG_LVL_ERROR, "attemptCount greater than maximum number of retries " << _attemptCount); return false; @@ -100,7 +93,6 @@ bool JobDescription::incrAttemptCountScrubResultsJson(std::shared_ptr _attemptCount, _czarId); LOGS(_log, LOG_LVL_DEBUG, "JobDescription::" << __func__ << " js=" << (*js)); _jsForWorker = js; - return true; } diff --git a/src/qdisp/JobDescription.h b/src/qdisp/JobDescription.h index fccc36d85..a3a208c1d 100644 --- a/src/qdisp/JobDescription.h +++ b/src/qdisp/JobDescription.h @@ -29,6 +29,7 @@ // System headers #include +#include #include // Third party headers @@ -91,7 +92,7 @@ class JobDescription { std::shared_ptr getJsForWorker() { return _jsForWorker; } - void resetJsForWorker() { _jsForWorker.reset(); } // TODO:UJ may need mutex for _jsForWorker //&&& + void resetJsForWorker() { _jsForWorker.reset(); } friend std::ostream& operator<<(std::ostream& os, JobDescription const& jd); diff --git a/src/qdisp/UberJob.cc b/src/qdisp/UberJob.cc index fae46550e..e91d11e48 100644 --- a/src/qdisp/UberJob.cc +++ b/src/qdisp/UberJob.cc @@ -59,13 +59,14 @@ UberJob::Ptr UberJob::create(Executive::Ptr const& executive, std::shared_ptr const& respHandler, int queryId, int uberJobId, qmeta::CzarId czarId, czar::CzarChunkMap::WorkerChunksData::Ptr const& workerData) { - UberJob::Ptr uJob(new UberJob(executive, respHandler, queryId, uberJobId, czarId, workerData)); + UberJob::Ptr uJob(new UberJob(executive, respHandler, queryId, uberJobId, czarId, + executive->getUjRowLimit(), workerData)); uJob->_setup(); return uJob; } UberJob::UberJob(Executive::Ptr const& executive, std::shared_ptr const& respHandler, - int queryId, int uberJobId, qmeta::CzarId czarId, + int queryId, int uberJobId, qmeta::CzarId czarId, int rowLimit, czar::CzarChunkMap::WorkerChunksData::Ptr const& workerData) : JobBase(), _executive(executive), @@ -73,8 +74,8 @@ UberJob::UberJob(Executive::Ptr const& executive, std::shared_ptrreplicationInstanceId()}, {"auth_key", czarConfig->replicationAuthKey()}, {"worker", ciwId}, - {"czar", + {"czarinfo", {{"name", czarConfig->name()}, {"id", czarConfig->id()}, {"management-port", czarConfig->replicationHttpPort()}, @@ -126,6 +127,7 @@ bool UberJob::runUberJob() { {{"queryid", _queryId}, {"uberjobid", _uberJobId}, {"czarid", _czarId}, + {"rowlimit", _rowLimit}, {"jobs", json::array()}}}}; auto& jsUberJob = request["uberjob"]; @@ -158,10 +160,10 @@ bool UberJob::runUberJob() { if (0 != response.at("success").get()) { transmitSuccess = true; } else { - LOGS(_log, LOG_LVL_WARN, cName(__func__) << " response success=0"); + LOGS(_log, LOG_LVL_WARN, cName(__func__) << " ujresponse success=0"); } } catch (exception const& ex) { - 
LOGS(_log, LOG_LVL_WARN, requestContext + " failed, ex: " + ex.what()); + LOGS(_log, LOG_LVL_WARN, requestContext + " ujresponse failed, ex: " + ex.what()); exceptionWhat = ex.what(); } if (!transmitSuccess) { diff --git a/src/qdisp/UberJob.h b/src/qdisp/UberJob.h index 8069b4d3a..0015a772a 100644 --- a/src/qdisp/UberJob.h +++ b/src/qdisp/UberJob.h @@ -118,7 +118,7 @@ class UberJob : public JobBase { private: UberJob(std::shared_ptr const& executive, std::shared_ptr const& respHandler, - int queryId, int uberJobId, qmeta::CzarId czarId, + int queryId, int uberJobId, qmeta::CzarId czarId, int rowLimit, czar::CzarChunkMap::WorkerChunksData::Ptr const& workerData); /// Used to setup elements that can't be done in the constructor. @@ -157,9 +157,9 @@ class UberJob : public JobBase { QueryId const _queryId; UberJobId const _uberJobId; qmeta::CzarId const _czarId; + int const _rowLimit; std::string const _idStr; - std::shared_ptr _qdispPool; // TODO:UJ remove when possible. &&& delete // Map of workerData czar::CzarChunkMap::WorkerChunksData::Ptr _workerData; // TODO:UJ this may not be needed diff --git a/src/util/CMakeLists.txt b/src/util/CMakeLists.txt index 8b9997888..1253da089 100644 --- a/src/util/CMakeLists.txt +++ b/src/util/CMakeLists.txt @@ -32,7 +32,6 @@ target_sources(util PRIVATE Timer.cc TimeUtils.cc WorkQueue.cc - xrootd.cc ) target_link_libraries(util PUBLIC diff --git a/src/util/ConfigValMap.h b/src/util/ConfigValMap.h index ef027b925..f962c35e6 100644 --- a/src/util/ConfigValMap.h +++ b/src/util/ConfigValMap.h @@ -50,6 +50,8 @@ class ConfigValMap; /// Base class for storing values, usually from configuration files, that have /// identifiers consisting of a `section` and a `name`. /// This class is meant to be used with ConfigValMap. +/// TODO:UJ a command line argument can be added to this and if the command +/// line argument is found, it will override the value in the file. class ConfigVal { public: using Ptr = std::shared_ptr; diff --git a/src/util/InstanceCount.cc b/src/util/InstanceCount.cc index af9f0f8dd..9940523f3 100644 --- a/src/util/InstanceCount.cc +++ b/src/util/InstanceCount.cc @@ -31,8 +31,7 @@ void InstanceCount::_increment(std::string const& source) { auto ret = _instances.insert(entry); auto iter = ret.first; iter->second += 1; - LOGS(_log, LOG_LVL_WARN, - "InstanceCount " << source << " " << iter->first << "=" << iter->second); // LockupDB INFO + LOGS(_log, LOG_LVL_DEBUG, "InstanceCount " << source << " " << iter->first << "=" << iter->second); } InstanceCount::~InstanceCount() { @@ -40,8 +39,7 @@ InstanceCount::~InstanceCount() { auto iter = _instances.find(_className); if (iter != _instances.end()) { iter->second -= 1; - LOGS(_log, LOG_LVL_WARN, - "~InstanceCount " << iter->first << "=" << iter->second << " : " << *this); // LockupDB INFO + LOGS(_log, LOG_LVL_DEBUG, "~InstanceCount " << iter->first << "=" << iter->second << " : " << *this); if (iter->second == 0) { _instances.erase(_className); } diff --git a/src/util/xrootd.cc b/src/util/xrootd.cc deleted file mode 100644 index bde271719..000000000 --- a/src/util/xrootd.cc +++ /dev/null @@ -1,91 +0,0 @@ -// -*- LSST-C++ -*- -/* - * LSST Data Management System - * Copyright 2009-2015 LSST Corporation. - * - * This product includes software developed by the - * LSST Project (http://www.lsst.org/). 
- * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the LSST License Statement and - * the GNU General Public License along with this program. If not, - * see . - */ -// xrootd.h -- Helper funcitons for xrootd-based dispatch - -#include "util/xrootd.h" - -// System headers -#include -#include - -// Third-party headers -#include "boost/format.hpp" - -/// &&& file seems unused, delete if possible - -namespace lsst::qserv::util { - -std::string makeUrl(char const* hostport, char const* typeStr, int chunk) { - std::stringstream s; - s << chunk; - // boost::format version is 5x slower. - // std::string s = (boost::format("%d") % chunk).str(); - return makeUrl(hostport, typeStr, s.str()); -} - -std::string makeUrl(char const* hostport, std::string const& path) { - return makeUrl(hostport, nullptr, path); -} - -std::string makeUrl(char const* hostport, char const* typeStr, std::string const& s, char mode) { - // typeStr is either "query" or "result" - if (!hostport) { - hostport = ::getenv("QSERV_XRD"); - if (!hostport) { - // use local host name if nothing is specified - hostport = "localhost:1094"; - } - } -#if 0 - char* user = "qsmaster"; - boost::format f("xroot://%s@%s//%s/%s"); - return (f % user % hostport % typeStr % s).str(); -#else - // This is ~8.5x faster than the boost::format version. - std::string pfx = "xroot://"; - std::string user("qsmaster"); - std::string tstr; - std::string ret; - if (typeStr) tstr = typeStr; - - if (mode != '\0') { - user += "."; - user += mode; - } - ret.reserve(pfx.size() + user.size() + 1 + 2 + 1 + tstr.size() + s.size()); - ret += pfx; - ret += user; - ret += "@"; - ret += hostport; - ret += "/"; - if (typeStr) { - ret += "/"; - ret += typeStr; - ret += "/"; - } // else: assume s contains leading "/" - ret += s; - return ret; -#endif -} - -} // namespace lsst::qserv::util diff --git a/src/util/xrootd.h b/src/util/xrootd.h deleted file mode 100644 index 947db582b..000000000 --- a/src/util/xrootd.h +++ /dev/null @@ -1,44 +0,0 @@ -// -*- LSST-C++ -*- - -/* - * LSST Data Management System - * Copyright 2008, 2009, 2010 LSST Corporation. - * - * This product includes software developed by the - * LSST Project (http://www.lsst.org/). - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the LSST License Statement and - * the GNU General Public License along with this program. If not, - * see . 
- */ - -#ifndef LSST_QSERV_UTIL_XROOTD_H -#define LSST_QSERV_UTIL_XROOTD_H - -// xrootd.h : consolidates xrootd/lower-level helper functions (i.e., -// dealing with xrootd URLs) - -// Third-party headers -#include - -/// &&& file seems unused, delete if possible - -namespace lsst::qserv::util { - -std::string makeUrl(char const* hostport, char const* typeStr, int chunk); -std::string makeUrl(char const* hostport, char const* typeStr, std::string const& s, char mode = 0); -std::string makeUrl(char const* hostport, std::string const& path); - -} // namespace lsst::qserv::util - -#endif // LSST_QSERV_UTIL_XROOTD_H diff --git a/src/wbase/FileChannelShared.cc b/src/wbase/FileChannelShared.cc index 56ca743ee..030163d60 100644 --- a/src/wbase/FileChannelShared.cc +++ b/src/wbase/FileChannelShared.cc @@ -311,7 +311,12 @@ FileChannelShared::~FileChannelShared() { // dead it means there was a problem to process a query or send back a response // to Czar. In either case, the file would be useless and it has to be deleted // in order to avoid leaving unclaimed result files within the results folder. - if (isDead()) { + // + // _rowLimitComplete confuses things as it can cause other Tasks using this + // file to be cancelled, but the file should not be deleted until collected. + // In any case, the WorkerQueryStatusData message from the czar will delete + // the file when the user query completes. + if (isDead() && !_rowLimitComplete) { _removeFile(lock_guard(_tMtx)); } LOGS(_log, LOG_LVL_DEBUG, "~FileChannelShared end"); @@ -319,9 +324,21 @@ FileChannelShared::~FileChannelShared() { void FileChannelShared::setTaskCount(int taskCount) { _taskCount = taskCount; } -bool FileChannelShared::transmitTaskLast() { +bool FileChannelShared::transmitTaskLast(bool rowLimitComplete) { lock_guard const streamMutexLock(_streamMutex); ++_lastCount; + if (rowLimitComplete) { + // There are enough rows in the file so other tasks can be ignored. + if (_rowLimitComplete.exchange(true) == false) { + // This is TaskLast. + return true; + } else { + // A different task set _rowLimitComplete before + // this one. Since there can be only one TaskLast, + // it is not this one. + return false; + } + } bool lastTaskDone = _lastCount >= _taskCount; return lastTaskDone; } @@ -331,16 +348,26 @@ bool FileChannelShared::kill(string const& note) { return _kill(streamMutexLock, note); } -bool FileChannelShared::isDead() { return _dead; } +bool FileChannelShared::isDead() const { return _dead; } string FileChannelShared::makeIdStr(int qId, int jId) { string str("QID" + (qId == 0 ? "" : to_string(qId) + "#" + to_string(jId))); return str; } +bool FileChannelShared::isRowLimitComplete() const { + lock_guard const tMtxLock(_tMtx); + return _rowLimitComplete; +} + bool FileChannelShared::buildAndTransmitError(util::MultiError& multiErr, shared_ptr const& task, bool cancelled) { lock_guard const tMtxLock(_tMtx); + if (_rowLimitComplete) { + LOGS(_log, LOG_LVL_WARN, + __func__ << " already enough rows, this call likely a side effect" << task->getIdStr()); + return false; + } // Delete the result file as nobody will come looking for it. _kill(tMtxLock, " buildAndTransmitError"); return _uberJobData->responseError(multiErr, task, cancelled); @@ -369,6 +396,11 @@ bool FileChannelShared::buildAndTransmitResult(MYSQL_RES* mResult, shared_ptr const tMtxLockA(_tMtx); + if (_rowLimitComplete) { + LOGS(_log, LOG_LVL_DEBUG, __func__ << " already enough rows, returning " << task->getIdStr()); + // Deleting the file now could be risky. 
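+            // (The czar's WorkerQueryStatusData message removes the result file once the user query completes.)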
+ return erred; + } util::Timer bufferFillT; bufferFillT.start(); @@ -391,7 +423,7 @@ bool FileChannelShared::buildAndTransmitResult(MYSQL_RES* mResult, shared_ptrgetMaxTableSize(); // Fail the operation if the amount of data in the result set exceeds the requested - // "large result" limit (in case if the one was specified). + // "large result" limit (in case one was specified). if (maxTableSize > 0 && bytesTransmitted > maxTableSize) { string const err = "The result set size " + to_string(bytesTransmitted) + " of a job exceeds the requested limit of " + to_string(maxTableSize) + @@ -402,18 +434,28 @@ bool FileChannelShared::buildAndTransmitResult(MYSQL_RES* mResult, shared_ptrgetRowLimit(); + bool rowLimitComplete = false; + if (ujRowLimit > 0 && _rowcount >= ujRowLimit) { + // There are enough rows to satisfy the query, so stop reading + hasMoreRows = false; + rowLimitComplete = true; + LOGS(_log, LOG_LVL_DEBUG, + __func__ << " enough rows for query rows=" << _rowcount << " " << task->getIdStr()); + } + // If no more rows are left in the task's result set then we need to check // if this is last task in a logical group of ones created for processing // the current request (note that certain classes of requests may require // more than one task for processing). - if (!hasMoreRows && transmitTaskLast()) { + if (!hasMoreRows && transmitTaskLast(rowLimitComplete)) { // Make sure the file is sync to disk before notifying Czar. _file.flush(); _file.close(); // Only the last ("summary") message, w/o any rows, is sent to the Czar to notify // it about the completion of the request. - if (!_sendResponse(tMtxLockA, task, cancelled, multiErr)) { + if (!_sendResponse(tMtxLockA, task, cancelled, multiErr, rowLimitComplete)) { LOGS(_log, LOG_LVL_ERROR, "Could not transmit the request completion message to Czar."); erred = true; break; @@ -437,7 +479,7 @@ bool FileChannelShared::buildAndTransmitResult(MYSQL_RES* mResult, shared_ptr const tMtxLockA(_tMtx); _removeFile(tMtxLockA); } @@ -450,7 +492,6 @@ bool FileChannelShared::_kill(lock_guard const& streamMutexLock, string c if (!oldVal) { LOGS(_log, LOG_LVL_WARN, "FileChannelShared first kill call " << note); } - _removeFile(streamMutexLock); return oldVal; } @@ -551,7 +592,7 @@ void FileChannelShared::_removeFile(lock_guard const& tMtxLock) { } bool FileChannelShared::_sendResponse(lock_guard const& tMtxLock, shared_ptr const& task, - bool cancelled, util::MultiError const& multiErr) { + bool cancelled, util::MultiError const& multiErr, bool mustSend) { auto const queryId = task->getQueryId(); auto const jobId = task->getJobId(); auto const idStr(makeIdStr(queryId, jobId)); @@ -570,13 +611,13 @@ bool FileChannelShared::_sendResponse(lock_guard const& tMtxLock, shared_ QSERV_LOGCONTEXT_QUERY_JOB(queryId, jobId); LOGS(_log, LOG_LVL_DEBUG, __func__); - if (isDead()) { + if (isDead() && !mustSend) { LOGS(_log, LOG_LVL_INFO, __func__ << ": aborting transmit since sendChannel is dead."); return false; } // Prepare the response object and serialize in into a message that will - // be sent to Czar. + // be sent to the Czar. 
    string httpFileUrl = task->resultFileHttpUrl();
    _uberJobData->responseFileReady(httpFileUrl, _rowcount, _transmitsize, _headerCount);
    return true;
diff --git a/src/wbase/FileChannelShared.h b/src/wbase/FileChannelShared.h
index fcffe4580..348eb3cb3 100644
--- a/src/wbase/FileChannelShared.h
+++ b/src/wbase/FileChannelShared.h
@@ -139,7 +139,9 @@ class FileChannelShared {
     int getTaskCount() const { return _taskCount; }

     /// @return true if this is the last task to call this
-    bool transmitTaskLast();
+    /// @param rowLimitComplete - true means enough rows for the result are
+    ///        already in the file, so other tasks can be ignored.
+    bool transmitTaskLast(bool rowLimitComplete);

     /// Return a normalized id string.
     static std::string makeIdStr(int qId, int jId);
@@ -169,7 +171,12 @@ class FileChannelShared {
     bool kill(std::string const& note);

     /// @see wbase::SendChannel::isDead
-    bool isDead();
+    bool isDead() const;
+
+    /// Return true if there are enough rows in this result file to satisfy the
+    /// LIMIT portion of the query.
+    /// @see _rowLimitComplete
+    bool isRowLimitComplete() const;

 private:
     /// TODO:UJ delete sendchannel version of constructor when possible.
@@ -233,10 +240,11 @@ class FileChannelShared {
      * @param task - a task that produced the result set
      * @param cancelled - request cancellaton flag (if any)
      * @param multiErr - a collector of any errors that were captured during result set processing
+     * @param mustSend - set to true if this message should be sent even if the query was cancelled.
      * @return 'true' if the operation was successfull
      */
     bool _sendResponse(std::lock_guard<std::mutex> const& tMtxLock, std::shared_ptr<Task> const& task,
-                       bool cancelled, util::MultiError const& multiErr);
+                       bool cancelled, util::MultiError const& multiErr, bool mustSend = false);

     mutable std::mutex _tMtx;  ///< Protects data recording and Czar notification
@@ -287,10 +295,15 @@ class FileChannelShared {
     // Counters reported to Czar in the only ("summary") message sent upon the completion
     // of all tasks of a query.
-    uint32_t _rowcount = 0;      ///< The total numnber of rows in all result sets of a query.
+    int64_t _rowcount = 0;       ///< The total number of rows in all result sets of a query.
     uint64_t _transmitsize = 0;  ///< The total amount of data (bytes) in all result sets of a query.
     uint64_t _headerCount = 0;   ///< Count of headers received.

+    /// _rowLimitComplete indicates that there is a LIMIT clause in the user query that
+    /// can be applied to the queries given to workers. It's important to apply it
+    /// when possible as an UberJob could have 1000 chunks and a LIMIT of 1, and it's
+    /// much faster to answer the query without scanning all 1000 chunks.
+    std::atomic<bool> _rowLimitComplete{false};
     std::atomic<bool> _dead{false};  ///< Set to true when the contents of the file are no longer useful.
 };
diff --git a/src/wbase/SendChannel.h b/src/wbase/SendChannel.h
index de4724955..8ba90ea4a 100644
--- a/src/wbase/SendChannel.h
+++ b/src/wbase/SendChannel.h
@@ -44,7 +44,7 @@ class SendChannel {

     /// The following methods are used to send responses back to a request.
     /// (see newNopChannel and newStringChannel).
- virtual bool send(char const* buf, int bufLen) = 0; //&&& delete + virtual bool send(char const* buf, int bufLen) = 0; // TODO:UJ remove + change unit tests /// Construct a new NopChannel that ignores everything it is asked to send static SendChannel::Ptr newNopChannel(); diff --git a/src/wbase/Task.cc b/src/wbase/Task.cc index c581229d0..10013d09b 100644 --- a/src/wbase/Task.cc +++ b/src/wbase/Task.cc @@ -142,7 +142,8 @@ Task::Task(UberJobData::Ptr const& ujData, int jobId, int attemptCount, int chun _scanInfo(scanInfo), _scanInteractive(scanInteractive), _queryStats(queryStats_), - _maxTableSize(maxTableSize * ::MB_SIZE_BYTES) { + _maxTableSize(maxTableSize * ::MB_SIZE_BYTES), + _rowLimit(ujData->getRowLimit()) { // These attributes will be passed back to Czar in the Protobuf response // to advice which result delivery channel to use. auto const workerConfig = wconfig::WorkerConfig::instance(); @@ -361,6 +362,7 @@ wpublish::QueryStatistics::Ptr Task::getQueryStats() const { /// Flag the Task as cancelled, try to stop the SQL query, and try to remove it from the schedule. void Task::cancel() { + // util::InstanceCount _ic{std::string("&&&icTask::cancel ") + getIdStr()}; if (_cancelled.exchange(true)) { // Was already cancelled. return; @@ -383,14 +385,11 @@ void Task::cancel() { } bool Task::checkCancelled() { - // A czar doesn't directly tell the worker the query is dead. - // A czar has XrdSsi kill the SsiRequest, which kills the - // sendChannel used by this task. sendChannel can be killed - // in other ways, however, without the sendChannel, this task - // has no way to return anything to the originating czar and - // may as well give up now. - if (_sendChannel == nullptr || _sendChannel->isDead()) { - // The sendChannel is dead, probably squashed by the czar. + // The czar does tell the worker a query id is cancelled. + // Returning true here indicates there's no point in doing + // any more processing for this Task. + if (_cancelled) return true; + if (_sendChannel == nullptr || _sendChannel->isDead() || _sendChannel->isRowLimitComplete()) { cancel(); } return _cancelled; diff --git a/src/wbase/Task.h b/src/wbase/Task.h index 0a5932b9d..745f39025 100644 --- a/src/wbase/Task.h +++ b/src/wbase/Task.h @@ -46,6 +46,7 @@ #include "wbase/TaskState.h" #include "util/Histogram.h" #include "util/ThreadPool.h" +#include "util/InstanceCount.h" //&&& // Forward declarations namespace lsst::qserv::mysql { @@ -308,6 +309,11 @@ class Task : public util::CommandForThreadPool { setFunc(func); } + /// Returns the LIMIT of rows for the query enforceable at the worker, where values <= 0 indicate + /// that there is no limit to the number of rows sent back by the worker. + /// @see UberJobData::getRowLimit() + int getRowLimit() { return _rowLimit; } + private: std::shared_ptr _sendChannel; ///< Send channel. @@ -372,7 +378,11 @@ class Task : public util::CommandForThreadPool { /// Time stamp for when `_booted` is set to true, otherwise meaningless. TIMEPOINT _bootedTime; + /// When > 0, indicates maximum number of rows needed for a result. 
+ int const _rowLimit; + bool _unitTest = false; ///< + // util::InstanceCount _ic{std::string("&&&icTask ") + getIdStr()}; }; } // namespace lsst::qserv::wbase diff --git a/src/wbase/UberJobData.cc b/src/wbase/UberJobData.cc index d4cb0c734..93570d657 100644 --- a/src/wbase/UberJobData.cc +++ b/src/wbase/UberJobData.cc @@ -55,14 +55,16 @@ LOG_LOGGER _log = LOG_GET("lsst.qserv.wbase.UberJobData"); namespace lsst::qserv::wbase { UberJobData::UberJobData(UberJobId uberJobId, std::string const& czarName, qmeta::CzarId czarId, - std::string czarHost, int czarPort, uint64_t queryId, std::string const& workerId, - std::shared_ptr const& foreman, std::string const& authKey) + std::string czarHost, int czarPort, uint64_t queryId, int rowLimit, + std::string const& workerId, std::shared_ptr const& foreman, + std::string const& authKey) : _uberJobId(uberJobId), _czarName(czarName), _czarId(czarId), _czarHost(czarHost), _czarPort(czarPort), _queryId(queryId), + _rowLimit(rowLimit), _workerId(workerId), _authKey(authKey), _foreman(foreman), @@ -90,6 +92,7 @@ void UberJobData::responseFileReady(string const& httpFileUrl, uint64_t rowCount cName(__func__) << " _foreman was null, which should only happen in unit tests"); } + // &&&UJFileResp TODO:UJ file response json request = {{"version", http::MetaModule::version}, {"workerid", workerIdStr}, {"auth_key", _authKey}, @@ -188,6 +191,7 @@ string UJTransmitCmd::cName(const char* funcN) const { void UJTransmitCmd::action(util::CmdData* data) { // Make certain _selfPtr is reset before leaving this function. // If a retry is needed, duplicate() is called. + util::InstanceCount ic_(cName(__func__) + " &&&ic " + _requestStr + " url=" + _url); class ResetSelf { public: ResetSelf(UJTransmitCmd* ujtCmd) : _ujtCmd(ujtCmd) {} @@ -207,6 +211,7 @@ void UJTransmitCmd::action(util::CmdData* data) { try { json const response = client.readAsJson(); if (0 != response.at("success").get()) { + LOGS(_log, LOG_LVL_WARN, cName(__func__) << " &&& success url=" << _url); transmitSuccess = true; } else { LOGS(_log, LOG_LVL_WARN, cName(__func__) << " Transmit success == 0"); diff --git a/src/wbase/UberJobData.h b/src/wbase/UberJobData.h index 76d335411..472a8c6a1 100644 --- a/src/wbase/UberJobData.h +++ b/src/wbase/UberJobData.h @@ -38,7 +38,6 @@ #include "qmeta/types.h" #include "util/QdispPool.h" #include "wbase/SendChannel.h" -#include "util/InstanceCount.h" namespace lsst::qserv { @@ -66,11 +65,11 @@ class UberJobData : public std::enable_shared_from_this { UberJobData(UberJobData const&) = delete; static Ptr create(UberJobId uberJobId, std::string const& czarName, qmeta::CzarId czarId, - std::string const& czarHost, int czarPort, uint64_t queryId, + std::string const& czarHost, int czarPort, uint64_t queryId, int rowLimit, std::string const& workerId, std::shared_ptr const& foreman, std::string const& authKey) { - return Ptr(new UberJobData(uberJobId, czarName, czarId, czarHost, czarPort, queryId, workerId, - foreman, authKey)); + return Ptr(new UberJobData(uberJobId, czarName, czarId, czarHost, czarPort, queryId, rowLimit, + workerId, foreman, authKey)); } /// Set file channel for this UberJob void setFileChannelShared(std::shared_ptr const& fileChannelShared); @@ -105,9 +104,15 @@ class UberJobData : public std::enable_shared_from_this { /// &&& doc void cancelAllTasks(); + /// Returns the LIMIT of rows for the query enforceable at the worker, where values <= 0 indicate + /// that there is no limit to the number of rows sent back by the worker. 
+ /// Workers can only safely limit rows for queries that have the LIMIT clause without other related + /// clauses like ORDER BY. + int getRowLimit() { return _rowLimit; } + private: UberJobData(UberJobId uberJobId, std::string const& czarName, qmeta::CzarId czarId, std::string czarHost, - int czarPort, uint64_t queryId, std::string const& workerId, + int czarPort, uint64_t queryId, int rowLimit, std::string const& workerId, std::shared_ptr const& foreman, std::string const& authKey); /// &&& doc @@ -121,6 +126,7 @@ class UberJobData : public std::enable_shared_from_this { std::string const _czarHost; int const _czarPort; QueryId const _queryId; + int const _rowLimit; ///< If > 0, only read this many rows before returning the results. std::string const _workerId; std::string const _authKey; @@ -196,7 +202,6 @@ class UJTransmitCmd : public util::PriorityCommand { std::string const _requestContext; std::string const _requestStr; int _attemptCount = 0; ///< How many attempts have been made to transmit this. - util::InstanceCount _ic{cName("UJTransmitCmd&&&")}; }; } // namespace lsst::qserv::wbase diff --git a/src/wdb/QueryRunner.cc b/src/wdb/QueryRunner.cc index 7e3ab7b76..8fdd5194c 100644 --- a/src/wdb/QueryRunner.cc +++ b/src/wdb/QueryRunner.cc @@ -134,7 +134,6 @@ void QueryRunner::_setDb() { util::TimerHistogram memWaitHisto("memWait Hist", {1, 5, 10, 20, 40}); bool QueryRunner::runQuery() { - util::InstanceCount ic(to_string(_task->getQueryId()) + "_rq_LDB"); // LockupDB util::HoldTrack::Mark runQueryMarkA(ERR_LOC, "runQuery " + to_string(_task->getQueryId())); QSERV_LOGCONTEXT_QUERY_JOB(_task->getQueryId(), _task->getJobId()); LOGS(_log, LOG_LVL_TRACE, @@ -257,7 +256,7 @@ bool QueryRunner::_dispatchChannel() { // Ideally, hold it until moving on to the next chunk. Try to clean up ChunkResource code.
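The stop conditions consolidated in Task::checkCancelled() above, and used by _dispatchChannel() just below, amount to the following sketch; ResultChannel is a hypothetical stand-in for wbase::FileChannelShared, not the real class.

    // A task gives up when it was cancelled, its send channel died, or the shared
    // result file already holds enough rows to satisfy the query's LIMIT.
    struct ResultChannel {
        bool dead = false;
        bool rowLimitComplete = false;
        bool isDead() const { return dead; }
        bool isRowLimitComplete() const { return rowLimitComplete; }
    };

    bool shouldStop(bool cancelled, ResultChannel const* channel) {
        if (cancelled) return true;
        return channel == nullptr || channel->isDead() || channel->isRowLimitComplete();
    }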
auto taskSched = _task->getTaskScheduler(); - if (!_cancelled && !_task->getSendChannel()->isDead()) { + if (!_cancelled && !_task->checkCancelled()) { string const& query = _task->getQueryString(); util::Timer primeT; primeT.start(); diff --git a/src/wdb/testQueryRunner.cc b/src/wdb/testQueryRunner.cc index c59182858..927109647 100644 --- a/src/wdb/testQueryRunner.cc +++ b/src/wdb/testQueryRunner.cc @@ -110,6 +110,7 @@ struct Fixture { bool const lockInMemory = false; string const resultName = "resName"; string const authKey = "noAuthKey"; + int const rowLimit = 0; }; shared_ptr newTaskJson(MsgInfo const& mInfo) { @@ -234,9 +235,9 @@ BOOST_AUTO_TEST_CASE(Simple) { shared_ptr crm = ChunkResourceMgr::newMgr(backend); SqlConnMgr::Ptr sqlConnMgr = make_shared(20, 15); auto const queries = queriesAndChunks(); - auto ujData = lsst::qserv::wbase::UberJobData::create(mInfo.uberJobId, mInfo.czarName, mInfo.czarId, - mInfo.czarHostName, mInfo.czarPort, mInfo.queryId, - mInfo.targWorkerId, mInfo.foreman, mInfo.authKey); + auto ujData = lsst::qserv::wbase::UberJobData::create( + mInfo.uberJobId, mInfo.czarName, mInfo.czarId, mInfo.czarHostName, mInfo.czarPort, mInfo.queryId, + mInfo.rowLimit, mInfo.targWorkerId, mInfo.foreman, mInfo.authKey); lsst::qserv::proto::ScanInfo scanInfo; scanInfo.scanRating = mInfo.scanRating; scanInfo.infoTables.emplace_back(mInfo.db, mInfo.table, mInfo.lockInMemory, mInfo.scanRating); @@ -249,21 +250,6 @@ BOOST_AUTO_TEST_CASE(Simple) { } BOOST_AUTO_TEST_CASE(Output) { - /* &&& - WorkerConfig::create(); - string out; - shared_ptr msg(newTaskMsg()); - shared_ptr sendC(SendChannel::newStringChannel(out)); - auto sc = FileChannelShared::create(sendC, msg->czarid()); - FakeBackend::Ptr backend = make_shared(); - shared_ptr crm = ChunkResourceMgr::newMgr(backend); - SqlConnMgr::Ptr sqlConnMgr = make_shared(20, 15); - auto const queries = queriesAndChunks(); - auto taskVect = Task::createTasks(msg, sc, crm, newMySqlConfig(), sqlConnMgr, queries); - Task::Ptr task = taskVect[0]; - QueryRunner::Ptr a(QueryRunner::newQueryRunner(task, crm, newMySqlConfig(), sqlConnMgr, queries)); - BOOST_CHECK(a->runQuery()); - */ WorkerConfig::create(); string out; MsgInfo mInfo; @@ -274,9 +260,9 @@ BOOST_AUTO_TEST_CASE(Output) { shared_ptr crm = ChunkResourceMgr::newMgr(backend); SqlConnMgr::Ptr sqlConnMgr = make_shared(20, 15); auto const queries = queriesAndChunks(); - auto ujData = lsst::qserv::wbase::UberJobData::create(mInfo.uberJobId, mInfo.czarName, mInfo.czarId, - mInfo.czarHostName, mInfo.czarPort, mInfo.queryId, - mInfo.targWorkerId, mInfo.foreman, mInfo.authKey); + auto ujData = lsst::qserv::wbase::UberJobData::create( + mInfo.uberJobId, mInfo.czarName, mInfo.czarId, mInfo.czarHostName, mInfo.czarPort, mInfo.queryId, + mInfo.rowLimit, mInfo.targWorkerId, mInfo.foreman, mInfo.authKey); lsst::qserv::proto::ScanInfo scanInfo; scanInfo.scanRating = mInfo.scanRating; scanInfo.infoTables.emplace_back(mInfo.db, mInfo.table, mInfo.lockInMemory, mInfo.scanRating); diff --git a/src/wpublish/QueryStatistics.h b/src/wpublish/QueryStatistics.h index 9d208e037..668b4b412 100644 --- a/src/wpublish/QueryStatistics.h +++ b/src/wpublish/QueryStatistics.h @@ -41,7 +41,6 @@ #include "global/intTypes.h" #include "wbase/Task.h" #include "wsched/SchedulerBase.h" -#include "util/InstanceCount.h" //&&& namespace lsst::qserv::wbase { class UserQueryInfo; diff --git a/src/xrdsvc/HttpReplicaMgtModule.h b/src/xrdsvc/HttpReplicaMgtModule.h index b089d069c..ac58a5828 100644 --- a/src/xrdsvc/HttpReplicaMgtModule.h 
+++ b/src/xrdsvc/HttpReplicaMgtModule.h @@ -184,7 +184,7 @@ class HttpReplicaMgtModule : public xrdsvc::HttpModule { */ void _modifyChunk(std::string const& func, int chunk, std::string const& database, Direction direction); - bool _dataContext = false; // &&& + bool _dataContext = false; }; } // namespace lsst::qserv::xrdsvc diff --git a/src/xrdsvc/HttpWorkerCzarModule.cc b/src/xrdsvc/HttpWorkerCzarModule.cc index 224862ab3..88848e238 100644 --- a/src/xrdsvc/HttpWorkerCzarModule.cc +++ b/src/xrdsvc/HttpWorkerCzarModule.cc @@ -113,7 +113,7 @@ json HttpWorkerCzarModule::_handleQueryJob(string const& func) { auto const& jsReq = body().objJson; string const targetWorkerId = body().required("worker"); - http::RequestBodyJSON rbCzar(body().required("czar")); + http::RequestBodyJSON rbCzar(body().required("czarinfo")); auto czarName = rbCzar.required("name"); auto czarId = rbCzar.required("id"); auto czarPort = rbCzar.required("management-port"); @@ -121,14 +121,15 @@ json HttpWorkerCzarModule::_handleQueryJob(string const& func) { LOGS(_log, LOG_LVL_TRACE, __func__ << " czar n=" << czarName << " id=" << czarId << " p=" << czarPort << " h=" << czarHostName); - http::RequestBodyJSON rbUberJob(body().required("uberjob")); auto ujQueryId = rbUberJob.required("queryid"); auto ujId = rbUberJob.required("uberjobid"); auto ujCzarId = rbUberJob.required("czarid"); + auto ujRowLimit = rbUberJob.required("rowlimit"); auto ujJobs = rbUberJob.required("jobs"); LOGS(_log, LOG_LVL_TRACE, - __func__ << " uj qid=" << ujQueryId << " ujid=" << ujId << " czid=" << ujCzarId); + __func__ << " uj qid=" << ujQueryId << " ujid=" << ujId << " czid=" << ujCzarId + << " rowlimit=" << ujRowLimit); // Get or create QueryStatistics and UserQueryInfo instances. auto queryStats = foreman()->getQueriesAndChunks()->addQueryId(ujQueryId, ujCzarId); @@ -144,7 +145,7 @@ json HttpWorkerCzarModule::_handleQueryJob(string const& func) { } auto ujData = wbase::UberJobData::create(ujId, czarName, czarId, czarHostName, czarPort, ujQueryId, - targetWorkerId, foreman(), authKey()); + ujRowLimit, targetWorkerId, foreman(), authKey()); // Find the entry for this queryId, creat a new one if needed. userQueryInfo->addUberJob(ujData); @@ -243,8 +244,6 @@ json HttpWorkerCzarModule::_queryStatus() { } json HttpWorkerCzarModule::_handleQueryStatus(std::string const& func) { - LOGS(_log, LOG_LVL_ERROR, "&&& HttpWorkerCzarModule::_handleQueryStatus"); - json jsRet; auto now = CLOCK::now(); auto const workerConfig = wconfig::WorkerConfig::instance(); @@ -256,6 +255,7 @@ json HttpWorkerCzarModule::_handleQueryStatus(std::string const& func) { replicationAuthKey, now); auto const czInfo = wqsData->getCzInfo(); + LOGS(_log, LOG_LVL_TRACE, " HttpWorkerCzarModule::_handleQueryStatus req=" << jsReq.dump()); CzarIdType czId = czInfo->czId; wcontrol::WCzarInfoMap::Ptr wCzarMap = foreman()->getWCzarInfoMap(); wcontrol::WCzarInfo::Ptr wCzarInfo = wCzarMap->getWCzarInfo(czId); @@ -263,7 +263,7 @@ json HttpWorkerCzarModule::_handleQueryStatus(std::string const& func) { // For all queryId and czarId items, if the item can't be found, it is simply ignored. Anything that // is missed will eventually be picked up by other mechanisms, such as results being rejected - // by the czar. + // by the czar. This almost never happens, but the system should respond gracefully. // If a czar was restarted, cancel and delete the abandoned items.
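For reference, the request parsed by _handleQueryJob above has roughly the shape below. The field names are the ones read by the handler; the values and the exact nesting of the "uberjob" envelope are illustrative assumptions, not a schema taken from the code.

    #include <nlohmann/json.hpp>

    nlohmann::json const exampleReq = {
            {"worker", "worker-123"},
            {"czarinfo",
             {{"name", "czar01"},
              {"id", 1},
              {"management-port", 4048},
              {"management-host-name", "czar-host.example"}}},
            {"uberjob",
             {{"queryid", 314},
              {"uberjobid", 7},
              {"czarid", 1},
              {"rowlimit", 0},
              {"jobs", nlohmann::json::array()}}}};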
if (wqsData->isCzarRestart()) { @@ -279,30 +279,31 @@ json HttpWorkerCzarModule::_handleQueryStatus(std::string const& func) { // appropriate queries and tasks as needed. auto const queriesAndChunks = foreman()->queriesAndChunks(); vector cancelledList; - // Cancelled queries where we want to keep the files - lock_guard mapLg(wqsData->mapMtx); - for (auto const& [dkQid, dkTm] : wqsData->qIdDoneKeepFiles) { - auto qStats = queriesAndChunks->addQueryId(dkQid, czId); - if (qStats != nullptr) { - auto uqInfo = qStats->getUserQueryInfo(); - if (uqInfo != nullptr) { - if (!uqInfo->getCancelledByCzar()) { - cancelledList.push_back(uqInfo); + vector deleteFilesList; + { + // Cancelled queries where we want to keep the files + lock_guard mapLg(wqsData->mapMtx); + for (auto const& [dkQid, dkTm] : wqsData->qIdDoneKeepFiles) { + auto qStats = queriesAndChunks->addQueryId(dkQid, czId); + if (qStats != nullptr) { + auto uqInfo = qStats->getUserQueryInfo(); + if (uqInfo != nullptr) { + if (!uqInfo->getCancelledByCzar()) { + cancelledList.push_back(uqInfo); + } } } } - } - - vector deleteFilesList; - for (auto const& [dkQid, dkTm] : wqsData->qIdDoneDeleteFiles) { - auto qStats = queriesAndChunks->addQueryId(dkQid, czId); - if (qStats != nullptr) { - auto uqInfo = qStats->getUserQueryInfo(); - if (uqInfo != nullptr) { - if (!uqInfo->getCancelledByCzar()) { - cancelledList.push_back(uqInfo); + for (auto const& [dkQid, dkTm] : wqsData->qIdDoneDeleteFiles) { + auto qStats = queriesAndChunks->addQueryId(dkQid, czId); + if (qStats != nullptr) { + auto uqInfo = qStats->getUserQueryInfo(); + if (uqInfo != nullptr) { + if (!uqInfo->getCancelledByCzar()) { + cancelledList.push_back(uqInfo); + } + deleteFilesList.push_back(uqInfo); } - deleteFilesList.push_back(uqInfo); } } } @@ -337,14 +338,11 @@ json HttpWorkerCzarModule::_handleQueryStatus(std::string const& func) { QueryId qId = uqiPtr->getQueryId(); wbase::FileChannelShared::cleanUpResults(czarId, qId); } - // Syntax errors in the message would throw invalid_argument, which is handled elsewhere. // Return a message containing lists of the queries that were cancelled. jsRet = wqsData->serializeResponseJson(foreman()->getWorkerStartupTime()); - wCzarInfo->sendWorkerCzarComIssueIfNeeded(wqsData->getWInfo(), wqsData->getCzInfo()); - return jsRet; } From 662e5aa4a1456a2e3cc5a5defd4abb188d56886c Mon Sep 17 00:00:00 2001 From: John Gates Date: Fri, 18 Oct 2024 13:34:52 -0700 Subject: [PATCH 11/22] Rebase. --- src/http/BaseModule.h | 2 ++ src/http/Module.h | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/http/BaseModule.h b/src/http/BaseModule.h index 21e1b75ad..6c678ce6e 100644 --- a/src/http/BaseModule.h +++ b/src/http/BaseModule.h @@ -221,6 +221,8 @@ class BaseModule { */ void sendData(nlohmann::json& result); + std::string authKey() const { return _authKey; } + private: // Input parameters std::string const _authKey; diff --git a/src/http/Module.h b/src/http/Module.h index e761afd7c..815c1dd1f 100644 --- a/src/http/Module.h +++ b/src/http/Module.h @@ -93,7 +93,7 @@ class Module : public BaseModule { */ virtual void sendResponse(std::string const& content, std::string const& contentType) = 0; - std::string authKey() const { return _authKey; } + //&&&std::string authKey() const { return _authKey; } private: /** From 82811de8bcc2e577956fcf724b6308614e09d896 Mon Sep 17 00:00:00 2001 From: John Gates Date: Mon, 21 Oct 2024 12:49:45 -0700 Subject: [PATCH 12/22] Added comments and removed dead code. 
--- src/ccontrol/UserQuerySelect.cc | 7 +- src/czar/ActiveWorker.cc | 3 +- src/czar/ActiveWorker.h | 46 +++++---- src/czar/Czar.h | 7 +- src/czar/CzarChunkMap.cc | 15 ++- src/czar/CzarChunkMap.h | 11 ++- src/czar/CzarRegistry.cc | 7 +- src/czar/CzarRegistry.h | 8 +- src/global/CMakeLists.txt | 9 -- src/global/ResourceUnit.cc | 127 +------------------------ src/global/ResourceUnit.h | 33 +------ src/global/testResourceUnit.cc | 91 ------------------ src/http/Module.h | 2 - src/http/WorkerQueryStatusData.cc | 35 +++---- src/http/WorkerQueryStatusData.h | 145 ++++++++++++++++++----------- src/http/testStatusData.cc | 3 +- src/qdisp/CzarStats.cc | 14 +-- src/qdisp/CzarStats.h | 4 +- src/qdisp/Executive.cc | 19 ++-- src/qdisp/Executive.h | 12 +-- src/qdisp/JobQuery.cc | 37 ++++---- src/qdisp/UberJob.cc | 8 +- src/qproc/TaskMsgFactory.h | 5 +- src/rproc/InfileMerger.cc | 4 +- src/util/Mutex.cc | 2 +- src/util/Mutex.h | 33 +++++-- src/util/testMutex.cc | 42 +++++---- src/wbase/Task.cc | 1 - src/wbase/Task.h | 2 - src/wbase/UberJobData.cc | 2 - src/wbase/UberJobData.h | 17 +++- src/wbase/UserQueryInfo.h | 4 +- src/wcontrol/WorkerStats.cc | 6 +- src/wcontrol/WorkerStats.h | 2 +- src/wdb/testQuerySql.cc | 101 -------------------- src/wpublish/ChunkInventory.cc | 20 ---- src/wpublish/QueriesAndChunks.h | 3 +- src/wpublish/QueryStatistics.h | 3 +- src/xrdsvc/HttpWorkerCzarModule.cc | 2 +- src/xrdsvc/HttpWorkerCzarModule.h | 6 +- src/xrdsvc/SsiProvider.cc | 51 +--------- 41 files changed, 306 insertions(+), 643 deletions(-) delete mode 100644 src/global/testResourceUnit.cc delete mode 100644 src/wdb/testQuerySql.cc diff --git a/src/ccontrol/UserQuerySelect.cc b/src/ccontrol/UserQuerySelect.cc index beef84f21..a22eb7b88 100644 --- a/src/ccontrol/UserQuerySelect.cc +++ b/src/ccontrol/UserQuerySelect.cc @@ -303,12 +303,7 @@ void UserQuerySelect::submit() { } /// At this point the executive has a map of all jobs with the chunkIds as the key. - // TODO:UJ _maxCHunksPerUberJob maybe put in config??? or set on command line?? - // Different queries may benefit from different values - // Such as LIMIT=1 may work best with this at 1, where - // 100 would be better for others. - // &&& - _maxChunksPerUberJob = 2; + _maxChunksPerUberJob = 2; // &&& set in config // This is needed to prevent Czar::_monitor from starting things before they are ready. _executive->setReadyToExecute(); buildAndSendUberJobs(); diff --git a/src/czar/ActiveWorker.cc b/src/czar/ActiveWorker.cc index 6c33f616a..ef6302767 100644 --- a/src/czar/ActiveWorker.cc +++ b/src/czar/ActiveWorker.cc @@ -177,7 +177,8 @@ void ActiveWorker::_sendStatusMsg(http::WorkerContactInfo::Ptr const& wInf, LOGS(_log, LOG_LVL_DEBUG, cName(__func__) << " read end"); if (0 != response.at("success").get()) { bool startupTimeChanged = false; - tie(transmitSuccess, startupTimeChanged) = _wqsData->handleResponseJson(response); + startupTimeChanged = _wqsData->handleResponseJson(response); + transmitSuccess = true; if (startupTimeChanged) { LOGS(_log, LOG_LVL_WARN, cName(__func__) << " worker startupTime changed, likely rebooted."); // kill all incomplete UberJobs on this worker. diff --git a/src/czar/ActiveWorker.h b/src/czar/ActiveWorker.h index 630a10eae..b2e1f8c6c 100644 --- a/src/czar/ActiveWorker.h +++ b/src/czar/ActiveWorker.h @@ -38,6 +38,10 @@ // This header declarations namespace lsst::qserv::czar { +/// This class is used to track information important to the czar and a +/// specific worker. 
+/// specific worker. Primarily the czar cares about the worker being alive +/// and informing the worker that various query IDs and UberJobs +/// have finished or need to be cancelled. /// - maintain list of done/cancelled queries for an active worker, and send /// that list to the worker. Once the worker has accepted the list, remove /// all of those queryId's from the list. @@ -52,8 +56,8 @@ namespace lsst::qserv::czar { /// other workers know their UberJobs are dead because the worker killed /// them. If the worker isn't told, it will continue working on /// the UberJob until it finishes, and then find out the UberJob was killed -/// when it tries to return results to the czar (worker should delete files -/// for said UberJob at that point). +/// when it tries to return results to the czar. The worker should delete +/// files for said UberJob at that point. /// So, this should be very rare, only results in extra load, and therefore /// is a low priority. /// @@ -65,8 +69,8 @@ namespace lsst::qserv::czar { /// /// When a worker becomes DEAD: (see Czar::_monitor). /// - Affected UberJobs are killed. -/// - maps are remade without the dead workers -/// - uberjobs built to handle unassigned jobs. +/// - UberJobs are built to handle unassigned jobs where dead workers are skipped and +/// the jobs are assigned to alternate workers. /// class ActiveWorker : public std::enable_shared_from_this { public: @@ -150,15 +154,16 @@ class ActiveWorker : public std::enable_shared_from_this { } } - /// &&& doc + /// Change the state to `newState` and log if it is different. /// _aMtx must be held before calling. void _changeStateTo(State newState, double secsSinceUpdate, std::string const& note); - /// &&& doc + /// Send the `jsWorkerReqPtr` json message to the worker referenced by `wInf` to + /// transmit the `_wqsData` state. void _sendStatusMsg(http::WorkerContactInfo::Ptr const& wInf, std::shared_ptr const& jsWorkerReqPtr); - /// &&& doc + /// Dump a log string for this object. /// _aMtx must be held before calling. std::string _dump() const; @@ -171,11 +176,10 @@ class ActiveWorker : public std::enable_shared_from_this { mutable std::mutex _aMtx; ///< protects _wInfo, _state, _qIdDoneKeepFiles, _qIdDoneDeleteFiles }; -/// &&& doc -/// Maintain a list of all workers, indicating which are considered active. Communication -/// problems with workers could cause interesting race conditions, so workers will remain -/// on the list for a very long time after they have disappeared in the off chance they -/// come back from the dead. +/// This class maintains a list of all workers, indicating which are considered active. +/// Communication problems with workers could cause interesting race conditions, so +/// workers will remain on the list for a very long time after they have disappeared +/// in case they come back from the dead. class ActiveWorkerMap { public: using Ptr = std::shared_ptr; @@ -186,7 +190,8 @@ class ActiveWorkerMap { std::string cName(const char* fName) { return std::string("ActiveWorkerMap::") + fName + " "; } - /// &&& doc + /// Use information gathered from the registry to update the map. The registry + /// contains last contact time (used for determining aliveness) and worker contact information. void updateMap(http::WorkerContactInfo::WCMap const& wcMap, http::CzarContactInfo::Ptr const& czInfo, std::string const& replicationInstanceId, std::string const& replicationAuthKey); @@ -195,16 +200,23 @@ class ActiveWorkerMap { /// should be cancelled.
void setCzarCancelAfterRestart(CzarIdType czId, QueryId lastQId); - /// &&& doc + /// Return a pointer to the `ActiveWorker` associated with `workerId`. ActiveWorker::Ptr getActiveWorker(std::string const& workerId) const; - // &&& doc + /// Call `updateStateAndSendMessages` for all workers in this map. void sendActiveWorkersMessages(); - /// &&& doc + /// Add `qId` to the list of query ids where the worker can throw away all related + /// Tasks and result files. This is used for all completed user queries and cancelled + /// user queries. void addToDoneDeleteFiles(QueryId qId); - /// &&& doc + /// Add `qId` to the list of query ids where the worker must hold onto result + /// files but all incomplete Tasks can be stopped. This is used for `rowLimitComplete` + /// where enough rows have been found to complete a user query with a LIMIT + /// clause. The czar may still need to collect the result files from the worker. + /// Once the czar has completed the user query, the `qId` will be added to + /// `addToDoneDeleteFiles` so the workers will delete the files. void addToDoneKeepFiles(QueryId qId); private: diff --git a/src/czar/Czar.h b/src/czar/Czar.h index b913a8fbf..6574f33b6 100644 --- a/src/czar/Czar.h +++ b/src/czar/Czar.h @@ -149,7 +149,12 @@ class Czar { std::map> getExecMapCopy() const; - /// &&& doc + /// This function kills incomplete UberJobs associated with `workerId`. + /// This is done when it is believed a worker has died. The executive + /// un-assigns the Jobs associated with the UberJobs and then + /// adds the ids to lists for the affected worker. If the worker + /// reconnects, it will stop work on those UberJobs when it gets the + /// list. void killIncompleteUbjerJobsOn(std::string const& workerId); std::shared_ptr getQdispPool() const { return _qdispPool; } diff --git a/src/czar/CzarChunkMap.cc b/src/czar/CzarChunkMap.cc index 5487f48ec..58c675262 100644 --- a/src/czar/CzarChunkMap.cc +++ b/src/czar/CzarChunkMap.cc @@ -186,12 +186,17 @@ shared_ptr CzarChunkMap::organize() { // - _workerChunkMap has a map of workerData by worker id with each worker having a map of ChunkData // - _chunkMap has a map of all chunkData by chunk id // - chunksSortedBySize a list of chunks sorted with largest first. - // From here need to assign shared scan chunk priority - // Go through the chunksSortedBySize list and assign each chunk to worker that has it with the smallest - // totalScanSize. + // From here need to assign shared scan chunk priority (i.e. the worker + // that will handle the chunk in shared scans, unless it is dead.) + // Go through the chunksSortedBySize list and assign each chunk to the worker that has both: + // - a copy of the chunk + // - the smallest totalScanSize at the time of assignment. + // When this is done, all workers should have lists of chunks with similar total sizes + // and missing chunks should be empty. for (auto&& chunkData : *chunksSortedBySize) { SizeT smallest = std::numeric_limits::max(); WorkerChunksData::Ptr smallestWkr = nullptr; + // Find worker with smallest total size.
for (auto&& [wkrId, wkrDataWeak] : chunkData->_workerHasThisMap) { auto wkrData = wkrDataWeak.lock(); if (wkrData == nullptr) { @@ -245,7 +250,7 @@ bool CzarChunkMap::WorkerChunksData::isDead() { auto czarPtr = Czar::getCzar(); if (czarPtr == nullptr) { LOGS(_log, LOG_LVL_ERROR, - cName(__func__) << " czarPtr is null, this should only hap[pen in unit test."); + cName(__func__) << " czarPtr is null, this should only happen in unit test."); return false; } auto awMap = Czar::getCzar()->getActiveWorkerMap(); @@ -386,7 +391,7 @@ std::shared_ptr CzarFamilyMap::makeNewMaps( } } - // this needs to be done for each CzarChunkMap in the family map. + // This needs to be done for each CzarChunkMap in the family map. for (auto&& [familyName, chunkMapPtr] : *newFamilyMap) { LOGS(_log, LOG_LVL_DEBUG, cName(__func__) << " working on " << familyName); auto missing = chunkMapPtr->organize(); diff --git a/src/czar/CzarChunkMap.h b/src/czar/CzarChunkMap.h index 97e864855..28bc02319 100644 --- a/src/czar/CzarChunkMap.h +++ b/src/czar/CzarChunkMap.h @@ -142,7 +142,7 @@ class CzarChunkMap { /// accessed in a full table scan on this worker. SizeT getSharedScanTotalSize() const { return _sharedScanTotalSize; } - /// &&& doc + /// Return true if this worker is dead, according to `ActiveWorkerMap`. bool isDead(); /// Return a reference to `_sharedScanChunkMap`. A copy of the pointer @@ -163,7 +163,7 @@ class CzarChunkMap { /// Map of chunks this worker will handle during shared scans. /// Since scans are done in order of chunk id numbers, it helps /// to have this in chunk id number order. - /// At some point, thus should be sent to workers so they + /// At some point, this should be sent to workers so they /// can make more accurate time estimates for chunk completion. std::map _sharedScanChunkMap; @@ -206,7 +206,9 @@ class CzarChunkMap { } /// Use the information from the registry to `organize` `_chunkMap` and `_workerChunkMap` - /// into their expected formats. + /// into their expected formats, which also should define where a chunk is always + /// run during shared scans. + /// This is a critical function for defining which workers will handle which jobs. /// @return a vector of ChunkData::Ptr of chunks where no worker was found. std::shared_ptr organize(); @@ -296,6 +298,9 @@ class CzarFamilyMap { /// Make a new FamilyMapType map including ChunkMap and WorkerChunkMap from the data /// in `qChunkMap`. Each family has its own ChunkMap and WorkerChunkMap. + /// + /// NOTE: This is likely an expensive operation and should probably only + /// be called if new workers have been added or chunks have been moved. std::shared_ptr makeNewMaps(qmeta::QMetaChunkMap const& qChunkMap); /// Insert the new element described by the parameters into the `newFamilyMap` as appropriate. 
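The assignment loop in CzarChunkMap::organize() above is essentially a greedy longest-processing-time placement. A minimal sketch, with hypothetical types standing in for ChunkData/WorkerChunksData:

    #include <algorithm>
    #include <cstdint>
    #include <limits>
    #include <map>
    #include <string>
    #include <vector>

    struct Chunk {
        int id;
        std::uint64_t size;
        std::vector<std::string> holders;  // workers holding a replica of this chunk
    };

    // Visit chunks largest-first; give each to the replica holder whose assigned
    // total is currently smallest, keeping per-worker shared-scan totals balanced.
    std::map<std::string, std::vector<int>> assignSharedScans(std::vector<Chunk> chunks) {
        std::sort(chunks.begin(), chunks.end(),
                  [](Chunk const& a, Chunk const& b) { return a.size > b.size; });
        std::map<std::string, std::uint64_t> totals;
        std::map<std::string, std::vector<int>> plan;
        for (auto const& chunk : chunks) {
            std::string const* best = nullptr;
            auto smallest = std::numeric_limits<std::uint64_t>::max();
            for (auto const& wkr : chunk.holders) {
                if (totals[wkr] < smallest) {
                    smallest = totals[wkr];
                    best = &wkr;
                }
            }
            if (best == nullptr) continue;  // no replica anywhere: chunk is "missing"
            totals[*best] += chunk.size;
            plan[*best].push_back(chunk.id);
        }
        return plan;
    }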
diff --git a/src/czar/CzarRegistry.cc b/src/czar/CzarRegistry.cc index 72b845001..432dfb2aa 100644 --- a/src/czar/CzarRegistry.cc +++ b/src/czar/CzarRegistry.cc @@ -71,7 +71,7 @@ CzarRegistry::~CzarRegistry() { } http::WorkerContactInfo::WCMapPtr CzarRegistry::getWorkerContactMap() const { - std::lock_guard lockG(_cmapMtx); + lock_guard lockG(_cmapMtx); return _contactMap; } @@ -136,7 +136,7 @@ void CzarRegistry::_registryWorkerInfoLoop() { auto czInfo = http::CzarContactInfo::create(_czarConfig->name(), _czarConfig->id(), _czarConfig->replicationHttpPort(), util::get_current_host_fqdn(), czarStartTime); - lock_guard lck(_cmapMtx); + lock_guard lck(_cmapMtx); if (wMap != nullptr && !_compareMapContactInfo(*wMap)) { _contactMap = wMap; _latestMapUpdate = CLOCK::now(); @@ -181,6 +181,7 @@ http::WorkerContactInfo::WCMapPtr CzarRegistry::_buildMapFromJson(nlohmann::json } bool CzarRegistry::_compareMapContactInfo(http::WorkerContactInfo::WCMap const& other) const { + VMUTEX_HELD(_cmapMtx); if (_contactMap == nullptr) { // If _contactMap is null, it needs to be replaced. return false; @@ -205,7 +206,7 @@ http::WorkerContactInfo::WCMapPtr CzarRegistry::waitForWorkerContactMap() const http::WorkerContactInfo::WCMapPtr contMap = nullptr; while (contMap == nullptr) { { - std::lock_guard lockG(_cmapMtx); + lock_guard lockG(_cmapMtx); contMap = _contactMap; } if (contMap == nullptr) { diff --git a/src/czar/CzarRegistry.h b/src/czar/CzarRegistry.h index bc8b6dc6d..aef90ea44 100644 --- a/src/czar/CzarRegistry.h +++ b/src/czar/CzarRegistry.h @@ -36,6 +36,7 @@ // Qserv headers #include "czar/ActiveWorker.h" #include "global/clock_defs.h" +#include "util/Mutex.h" namespace lsst::qserv::cconfig { class CzarConfig; @@ -77,7 +78,9 @@ class CzarRegistry { /// function will wait forever for a valid contact map to be ready. http::WorkerContactInfo::WCMapPtr waitForWorkerContactMap() const; - /// &&& doc + /// Send all live workers the `WorkerQueryStatusData` message for + /// that worker. This may result in the worker sending back the + /// `WorkerCzarComIssue` message if there were communication problems. void sendActiveWorkersMessages(); /// Add the query id to the list of queries to end on workers and @@ -117,8 +120,7 @@ class CzarRegistry { http::WorkerContactInfo::WCMapPtr _contactMap; TIMEPOINT _latestMapUpdate; ///< The last time the _contactMap was updated, unrelated to ///< WorkerContactInfo update. - // &&& review how this _mapMtx is used, probably locks for too long a period. - mutable std::mutex _cmapMtx; /// Protects _contactMap, _latestUpdate + mutable MUTEX _cmapMtx; /// Protects _contactMap, _latestUpdate /// Map for tracking worker aliveness, it has its own internal mutex. 
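VMUTEX_HELD(_cmapMtx) above asserts that the calling thread already owns the mutex. A sketch of the idea behind such a verifiable mutex (the real util::Mutex may differ in detail):

    #include <atomic>
    #include <cassert>
    #include <mutex>
    #include <thread>

    // A mutex that records its owner so callers can assert "I hold this lock".
    class VMutex {
    public:
        void lock() {
            _m.lock();
            _holder.store(std::this_thread::get_id());
        }
        void unlock() {
            _holder.store(std::thread::id());
            _m.unlock();
        }
        bool lockedByCaller() const { return _holder.load() == std::this_thread::get_id(); }

    private:
        std::mutex _m;
        std::atomic<std::thread::id> _holder{};
    };

    #define VMUTEX_HELD(mtx) assert((mtx).lockedByCaller())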
std::shared_ptr const _activeWorkerMap; diff --git a/src/global/CMakeLists.txt b/src/global/CMakeLists.txt index 96d7ff015..43d843047 100644 --- a/src/global/CMakeLists.txt +++ b/src/global/CMakeLists.txt @@ -11,12 +11,3 @@ target_sources(global PRIVATE target_link_libraries(global PUBLIC log ) - -add_executable(testResourceUnit testResourceUnit.cc) - -target_link_libraries(testResourceUnit - global - Boost::unit_test_framework -) - -add_test(NAME testResourceUnit COMMAND testResourceUnit) diff --git a/src/global/ResourceUnit.cc b/src/global/ResourceUnit.cc index 64144b843..816d469ff 100644 --- a/src/global/ResourceUnit.cc +++ b/src/global/ResourceUnit.cc @@ -31,43 +31,6 @@ namespace lsst::qserv { -////////////////////////////////////////////////////////////////////// -// lsst::qserv::ResourceUnit::Tokenizer -// A simple class to tokenize paths. -////////////////////////////////////////////////////////////////////// -class ResourceUnit::Tokenizer { -public: - Tokenizer(std::string const& s, char sep = '/') : _cursor(0), _next(0), _s(s), _sep(sep) { _seek(); } - - std::string token() { return _s.substr(_cursor, _next - _cursor); } - - int tokenAsInt() { - int num; - std::istringstream csm(token()); - csm >> num; - return num; - } - - void next() { - assert(!done()); - _cursor = _next + 1; - _seek(); - } - - bool done() { return _next == std::string::npos; } - -private: - void _seek() { _next = _s.find_first_of(_sep, _cursor); } - - std::string::size_type _cursor; - std::string::size_type _next; - std::string const _s; - char const _sep; -}; - -////////////////////////////////////////////////////////////////////// -ResourceUnit::ResourceUnit(std::string const& path) : _unitType(GARBAGE), _chunk(-1) { _setFromPath(path); } - std::string ResourceUnit::path() const { std::stringstream ss; ss << _pathSep << prefix(_unitType); @@ -90,13 +53,6 @@ std::string ResourceUnit::path() const { return ss.str(); } -std::string ResourceUnit::var(std::string const& key) const { - VarMap::const_iterator ci = _vars.find(key); - if (ci != _vars.end()) { - return ci->second; - } - return std::string(); -} std::string ResourceUnit::prefix(UnitType const& r) { switch (r) { @@ -116,94 +72,13 @@ std::string ResourceUnit::makePath(int chunk, std::string const& db) { return _pathSep + prefix(UnitType::DBCHUNK) + _pathSep + db + _pathSep + std::to_string(chunk); } + void ResourceUnit::setAsDbChunk(std::string const& db, int chunk) { _unitType = DBCHUNK; _db = db; _chunk = chunk; } -bool ResourceUnit::_markGarbageIfDone(Tokenizer& t) { - if (t.done()) { - _unitType = GARBAGE; - return true; - } - return false; -} - -void ResourceUnit::_setFromPath(std::string const& path) { - std::string rTypeString; - Tokenizer t(path, _pathSep); - if (!t.token().empty()) { // Expect leading separator (should start with /) - _unitType = UNKNOWN; - return; - } - if (_markGarbageIfDone(t)) { - return; - } // Consider using GOTO structure. 
- t.next(); - rTypeString = t.token(); - if (rTypeString == prefix(DBCHUNK)) { - // XrdSsi query - if (_markGarbageIfDone(t)) { - return; - } - _unitType = DBCHUNK; - t.next(); - _db = t.token(); - if (_db.empty()) { - _unitType = GARBAGE; - return; - } - if (_markGarbageIfDone(t)) { - return; - } - t.next(); - if (t.token().empty()) { - _unitType = GARBAGE; - return; - } - _chunk = t.tokenAsInt(); - _ingestLeafAndKeys(t.token()); - } else if (rTypeString == prefix(QUERY)) { - _unitType = QUERY; - if (!t.done()) { - _unitType = GARBAGE; - return; - } - } else { - _unitType = GARBAGE; - } -} - -/// Ingest key-value pairs from a string including the last portion of the path, -/// e.g., somenumber?key1=val1&key2=val2 -void ResourceUnit::_ingestLeafAndKeys(std::string const& leafPlusKeys) { - std::string::size_type start; - start = leafPlusKeys.find_first_of(_varSep, 0); - _vars.clear(); - - if (start == std::string::npos) { // No keys found - return; - } - ++start; - Tokenizer t(leafPlusKeys.substr(start), _varDelim); - for (std::string defn = t.token(); !defn.empty(); t.next()) { - _ingestKeyStr(defn); - } -} - -/// Ingest key-value pairs from a packed key-value representation. -/// e.g., key1=val1&key2=val2 -void ResourceUnit::_ingestKeyStr(std::string const& keyStr) { - std::string::size_type equalsPos; - equalsPos = keyStr.find_first_of('='); - if (equalsPos == std::string::npos) { // No = clause, value-less key. - _vars[keyStr] = std::string(); // empty insert. - } else { - _vars[keyStr.substr(0, equalsPos)] = keyStr.substr(equalsPos + 1); - } -} - std::ostream& operator<<(std::ostream& os, ResourceUnit const& ru) { return os << "Resource(" << ru.path() << ")"; } diff --git a/src/global/ResourceUnit.h b/src/global/ResourceUnit.h index c9f983740..50cd69b0e 100644 --- a/src/global/ResourceUnit.h +++ b/src/global/ResourceUnit.h @@ -33,22 +33,13 @@ namespace lsst::qserv { -/// ResourceUnit contains a name for an XrdSsi-resolvable resource unit. -//// -/// Not sure this belongs in global, but czar, worker both need it. -/// Other components may as well. -//// -/// Note that while key-value specifiers are parsed from the path string at -/// construction, the code for generating a path that includes the key-value -/// portion is not implemented. It is unclear whether we need the generation -/// capability, now that key-value pairs can be packed in protobufs messages. -class ResourceUnit { // TODO:UJ &&& delete if possible +/// This class is used to store the database and chunk id of a resource. +class ResourceUnit { public: class Checker; enum UnitType { GARBAGE, DBCHUNK, UNKNOWN, QUERY }; ResourceUnit() = default; - explicit ResourceUnit(std::string const& path); ResourceUnit(ResourceUnit const&) = default; ResourceUnit& operator=(ResourceUnit const&) = default; ~ResourceUnit() = default; @@ -62,9 +53,6 @@ class ResourceUnit { // TODO:UJ &&& delete if possible std::string const& db() const { return _db; } int chunk() const { return _chunk; } - /// Lookup extended path variables (?k=val syntax) - std::string var(std::string const& key) const; - /// @return the path prefix element for a given request type. 
static std::string prefix(UnitType const& r); @@ -75,32 +63,15 @@ class ResourceUnit { // TODO:UJ &&& delete if possible void setAsDbChunk(std::string const& db, int chunk = DUMMY_CHUNK); private: - class Tokenizer; - void _setFromPath(std::string const& path); - void _ingestLeafAndKeys(std::string const& leafPlusKeys); - void _ingestKeyStr(std::string const& keyStr); - bool _markGarbageIfDone(Tokenizer& t); - UnitType _unitType = UnitType::GARBAGE; //< Type of unit std::string _db; //< for DBCHUNK type int _chunk = -1; //< for DBCHUNK type - typedef std::map VarMap; - VarMap _vars; //< Key-value specifiers - static char const _pathSep = '/'; - static char const _varSep = '?'; - static char const _varDelim = '&'; friend std::ostream& operator<<(std::ostream& os, ResourceUnit const& ru); }; -class ResourceUnit::Checker { -public: - virtual ~Checker() {} - virtual bool operator()(ResourceUnit const& ru) = 0; -}; - } // namespace lsst::qserv #endif // LSST_QSERV_RESOURCEUNIT_H diff --git a/src/global/testResourceUnit.cc b/src/global/testResourceUnit.cc deleted file mode 100644 index dfde0e3c2..000000000 --- a/src/global/testResourceUnit.cc +++ /dev/null @@ -1,91 +0,0 @@ -// -*- LSST-C++ -*- -/* - * LSST Data Management System - * Copyright 2015 AURA/LSST. - * - * This product includes software developed by the - * LSST Project (http://www.lsst.org/). - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the LSST License Statement and - * the GNU General Public License along with this program. If not, - * see . 
- */ -/// testResourceUnit - -// Third-party headers - -// Qserv headers -#include "global/ResourceUnit.h" - -// Boost unit test header -#define BOOST_TEST_MODULE ResourceUnit_1 -#include - -namespace test = boost::test_tools; -using lsst::qserv::ResourceUnit; - -struct Fixture { - Fixture() : dummy(0) {} - - int dummy; - ~Fixture(void) {}; -}; -int const MAGIC_SIZE = 80; - -BOOST_FIXTURE_TEST_SUITE(Suite, Fixture) - -BOOST_AUTO_TEST_CASE(Garbage) { - char p[][MAGIC_SIZE] = {// Convert to std vector list init when available - // Missing chunk number - "/chk/qcase01", "/chk/abc/", - // Bad resource type - "/chk2/abc", "/abc/", "/abc/chk/g", - // Missing/bad params - "/q", "/q/", "/q/Hello", "/result", "/result/"}; - int const pSize = 10; - for (auto i = p, e = p + pSize; i != e; ++i) { - ResourceUnit r(*i); - BOOST_CHECK_MESSAGE(r.unitType() == ResourceUnit::GARBAGE, std::string("Expected garbage: ") + *i); - } -} - -BOOST_AUTO_TEST_CASE(DbChunk) { - char p[][MAGIC_SIZE] = { - "/chk/qcase01/123", - "/chk/abc/456", - }; - int const pSize = 2; - std::vector r; - for (auto i = p, e = p + pSize; i != e; ++i) { - r.push_back(ResourceUnit(*i)); - BOOST_CHECK_EQUAL(r.back().unitType(), ResourceUnit::DBCHUNK); - } - BOOST_CHECK_EQUAL(r[0].db(), "qcase01"); - BOOST_CHECK_EQUAL(r[1].db(), "abc"); - BOOST_CHECK_EQUAL(r[0].chunk(), 123); - BOOST_CHECK_EQUAL(r[1].chunk(), 456); - - r[0].setAsDbChunk("foo", 1111); - r[1].setAsDbChunk("bar", 968); - BOOST_CHECK_EQUAL(r[0].path(), "/chk/foo/1111"); - BOOST_CHECK_EQUAL(r[1].path(), "/chk/bar/968"); -} - -BOOST_AUTO_TEST_CASE(Query) { - ResourceUnit const res1("/query"); - BOOST_CHECK_EQUAL(res1.unitType(), ResourceUnit::QUERY); - ResourceUnit const res2("/query/abc"); - BOOST_CHECK_EQUAL(res2.unitType(), ResourceUnit::GARBAGE); -} - -BOOST_AUTO_TEST_SUITE_END() diff --git a/src/http/Module.h b/src/http/Module.h index 815c1dd1f..4d2f78a0b 100644 --- a/src/http/Module.h +++ b/src/http/Module.h @@ -93,8 +93,6 @@ class Module : public BaseModule { */ virtual void sendResponse(std::string const& content, std::string const& contentType) = 0; - //&&&std::string authKey() const { return _authKey; } - private: /** * Pull the raw request body and translate it into a JSON object. 
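The list exchange implemented in WorkerQueryStatusData below follows an acknowledge-then-erase pattern: the czar keeps per-worker maps keyed by QueryId, the worker echoes the ids it acted on, and the czar drops acknowledged entries so they are not resent. A reduced sketch (types simplified; the real maps store timestamps and nested UberJob ids):

    #include <cstdint>
    #include <map>
    #include <vector>

    using QueryId = std::int64_t;

    // Entries the worker confirmed are erased; anything missed stays queued and
    // is simply retransmitted with the next status message.
    void eraseAcknowledged(std::vector<QueryId> const& acked,
                           std::map<QueryId, std::uint64_t>& pending) {
        for (QueryId qid : acked) pending.erase(qid);
    }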
diff --git a/src/http/WorkerQueryStatusData.cc b/src/http/WorkerQueryStatusData.cc index ee251b048..2188920b5 100644 --- a/src/http/WorkerQueryStatusData.cc +++ b/src/http/WorkerQueryStatusData.cc @@ -75,7 +75,7 @@ std::string CzarContactInfo::dump() const { } json WorkerContactInfo::serializeJson() const { - lock_guard lg(_rMtx); + lock_guard lg(_rMtx); return _serializeJson(); } @@ -121,7 +121,7 @@ WorkerContactInfo::Ptr WorkerContactInfo::createFromJsonWorker(nlohmann::json co } string WorkerContactInfo::dump() const { - lock_guard lg(_rMtx); + lock_guard lg(_rMtx); return _dump(); } @@ -143,7 +143,7 @@ shared_ptr WorkerQueryStatusData::serializeJson(double maxLifetime) { jsWorkerR["auth_key"] = _replicationAuthKey; jsWorkerR["czarinfo"] = _czInfo->serializeJson(); { - lock_guard lgI(_infoMtx); + lock_guard lgI(_infoMtx); if (_wInfo != nullptr) { jsWorkerR["workerinfo"] = _wInfo->serializeJson(); jsWorkerR["worker"] = _wInfo->wId; @@ -157,7 +157,7 @@ shared_ptr WorkerQueryStatusData::serializeJson(double maxLifetime) { addListsToJson(jsWorkerR, now, maxLifetime); if (czarCancelAfterRestart) { jsWorkerR["czarrestart"] = true; - lock_guard mapLg(mapMtx); + lock_guard mapLg(mapMtx); jsWorkerR["czarrestartcancelczid"] = czarCancelAfterRestartCzId; jsWorkerR["czarrestartcancelqid"] = czarCancelAfterRestartQId; } else { @@ -171,7 +171,7 @@ void WorkerQueryStatusData::addListsToJson(json& jsWR, TIMEPOINT tmMark, double jsWR["qiddonekeepfiles"] = json::array(); jsWR["qiddonedeletefiles"] = json::array(); jsWR["qiddeaduberjobs"] = json::array(); - lock_guard mapLg(mapMtx); + lock_guard mapLg(mapMtx); { auto& jsDoneKeep = jsWR["qiddonekeepfiles"]; auto iterDoneKeep = qIdDoneKeepFiles.begin(); @@ -282,7 +282,7 @@ WorkerQueryStatusData::Ptr WorkerQueryStatusData::createFromJson(nlohmann::json } void WorkerQueryStatusData::parseLists(nlohmann::json const& jsWR, TIMEPOINT updateTm) { - lock_guard mapLg(mapMtx); + lock_guard mapLg(mapMtx); parseListsInto(jsWR, updateTm, qIdDoneKeepFiles, qIdDoneDeleteFiles, qIdDeadUberJobs); } @@ -317,7 +317,7 @@ void WorkerQueryStatusData::parseListsInto(nlohmann::json const& jsWR, TIMEPOINT } void WorkerQueryStatusData::addDeadUberJobs(QueryId qId, std::vector ujIds, TIMEPOINT tm) { - lock_guard mapLg(mapMtx); + lock_guard mapLg(mapMtx); auto& ujMap = qIdDeadUberJobs[qId]; for (auto const ujId : ujIds) { ujMap[ujId] = tm; @@ -325,23 +325,23 @@ void WorkerQueryStatusData::addDeadUberJobs(QueryId qId, std::vector } void WorkerQueryStatusData::addDeadUberJob(QueryId qId, UberJobId ujId, TIMEPOINT tm) { - lock_guard mapLg(mapMtx); + lock_guard mapLg(mapMtx); auto& ujMap = qIdDeadUberJobs[qId]; ujMap[ujId] = tm; } void WorkerQueryStatusData::addToDoneDeleteFiles(QueryId qId) { - lock_guard mapLg(mapMtx); + lock_guard mapLg(mapMtx); qIdDoneDeleteFiles[qId] = CLOCK::now(); } void WorkerQueryStatusData::addToDoneKeepFiles(QueryId qId) { - lock_guard mapLg(mapMtx); + lock_guard mapLg(mapMtx); qIdDoneKeepFiles[qId] = CLOCK::now(); } void WorkerQueryStatusData::removeDeadUberJobsFor(QueryId qId) { - lock_guard mapLg(mapMtx); + lock_guard mapLg(mapMtx); qIdDeadUberJobs.erase(qId); } @@ -359,14 +359,14 @@ json WorkerQueryStatusData::serializeResponseJson(uint64_t workerStartupTime) { return jsResp; } -std::pair WorkerQueryStatusData::handleResponseJson(nlohmann::json const& jsResp) { +bool WorkerQueryStatusData::handleResponseJson(nlohmann::json const& jsResp) { auto now = CLOCK::now(); std::map doneKeepF; std::map doneDeleteF; std::map> deadUberJobs; parseListsInto(jsResp, now, 
doneKeepF, doneDeleteF, deadUberJobs); - lock_guard mapLg(mapMtx); + lock_guard mapLg(mapMtx); // Remove entries from _qIdDoneKeepFiles for (auto const& [qId, tm] : doneKeepF) { qIdDoneKeepFiles.erase(qId); @@ -400,15 +400,16 @@ std::pair WorkerQueryStatusData::handleResponseJson(nlohmann::json c << " changed to=" << workerStartupTime << " Assuming worker restarted"); workerRestarted = true; } - return {true, workerRestarted}; + return workerRestarted; } string WorkerQueryStatusData::dump() const { - lock_guard lgI(_infoMtx); + lock_guard lgI(_infoMtx); return _dump(); } string WorkerQueryStatusData::_dump() const { + VMUTEX_HELD(_infoMtx); stringstream os; os << "ActiveWorker " << ((_wInfo == nullptr) ? "?" : _wInfo->dump()); return os.str(); @@ -417,7 +418,7 @@ string WorkerQueryStatusData::_dump() const { shared_ptr WorkerCzarComIssue::serializeJson() { shared_ptr jsCzarReqPtr = make_shared(); json& jsCzarR = *jsCzarReqPtr; - lock_guard _lgWciMtx(_wciMtx); + lock_guard _lgWciMtx(_wciMtx); if (_wInfo == nullptr || _czInfo == nullptr) { LOGS(_log, LOG_LVL_ERROR, cName(__func__) << " _wInfo or _czInfo was null"); return jsCzarReqPtr; @@ -472,7 +473,7 @@ json WorkerCzarComIssue::serializeResponseJson() { } string WorkerCzarComIssue::dump() const { - lock_guard _lgWciMtx(_wciMtx); + lock_guard _lgWciMtx(_wciMtx); return _dump(); } diff --git a/src/http/WorkerQueryStatusData.h b/src/http/WorkerQueryStatusData.h index f128d6264..79d1e04f2 100644 --- a/src/http/WorkerQueryStatusData.h +++ b/src/http/WorkerQueryStatusData.h @@ -34,6 +34,7 @@ // qserv headers #include "global/clock_defs.h" #include "global/intTypes.h" +#include "util/Mutex.h" // This header declarations namespace lsst::qserv::http { @@ -48,7 +49,7 @@ class CzarContactInfo : public std::enable_shared_from_this { CzarContactInfo(CzarContactInfo const&) = default; CzarContactInfo& operator=(CzarContactInfo const&) = default; - /// &&& doc + /// Return true if elements, other than czStartupTime, are the same. bool compare(CzarContactInfo const& other) { return (czName == other.czName && czId == other.czId && czPort == other.czPort && czHostName == other.czHostName); @@ -67,7 +68,7 @@ class CzarContactInfo : public std::enable_shared_from_this { std::string const czHostName; ///< czar "management-host-name" uint64_t const czStartupTime; ///< czar startup time - /// &&& doc + /// Return a json version of the contents of this class. nlohmann::json serializeJson() const; std::string dump() const; @@ -95,13 +96,14 @@ class WorkerContactInfo { return Ptr(new WorkerContactInfo(wId_, wHost_, wManagementHost_, wPort_, updateTime_)); } - /// &&& doc Used to create WorkerQueryStatusData object from a registry json message. + /// This function creates a WorkerQueryStatusData object from a registry json message, + /// which is provided by the system registry. static Ptr createFromJsonRegistry(std::string const& wId_, nlohmann::json const& regJson); - /// &&& doc Used to create WorkerQueryStatusData object from a worker json message. + /// This function creates a WorkerQueryStatusData object from a worker json message. static Ptr createFromJsonWorker(nlohmann::json const& workerJson, TIMEPOINT updateTime); - /// &&& doc + /// Return a json version of the contents of this object. nlohmann::json serializeJson() const; std::string cName(const char* fn) { return std::string("WorkerContactInfo::") + fn; } @@ -109,24 +111,24 @@ class WorkerContactInfo { std::string const wId; ///< key, this is the one thing that cannot change.
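The accessors that follow share one pattern: the key is immutable while the mutable contact fields are read and written only under the small _rMtx. A reduced illustration (hypothetical class, not the real WorkerContactInfo):

    #include <mutex>
    #include <string>

    class ContactInfo {
    public:
        explicit ContactInfo(std::string id) : wId(std::move(id)) {}

        std::string const wId;  // never changes, safe to read without the lock

        std::string getHost() const {
            std::lock_guard<std::mutex> lg(_mtx);
            return _host;
        }
        void setHost(std::string const& host) {
            std::lock_guard<std::mutex> lg(_mtx);
            _host = host;
        }

    private:
        mutable std::mutex _mtx;  // protects _host
        std::string _host;
    };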
std::string getWHost() const { - std::lock_guard lg(_rMtx); + std::lock_guard lg(_rMtx); return _wHost; } std::string getWManagementHost() const { - std::lock_guard lg(_rMtx); + std::lock_guard lg(_rMtx); return _wManagementHost; } int getWPort() const { - std::lock_guard lg(_rMtx); + std::lock_guard lg(_rMtx); return _wPort; } - /// &&doc + /// Change host and port info to those provided in `other`. void changeBaseInfo(WorkerContactInfo const& other) { auto [oWId, oWHost, oWManagementHost, oWPort] = other.getAll(); - std::lock_guard lg(_rMtx); + std::lock_guard lg(_rMtx); _wHost = oWHost; _wManagementHost = oWManagementHost; _wPort = oWPort; @@ -137,35 +139,35 @@ class WorkerContactInfo { /// @return _wManagementHost - management host /// @return _wPort - worker port std::tuple getAll() const { - std::lock_guard lg(_rMtx); + std::lock_guard lg(_rMtx); return {wId, _wHost, _wManagementHost, _wPort}; } /// Return true if communication related items are the same. bool isSameContactInfo(WorkerContactInfo const& other) const { auto [oWId, oWHost, oWManagementHost, oWPort] = other.getAll(); - std::lock_guard lg(_rMtx); + std::lock_guard lg(_rMtx); return (wId == oWId && _wHost == oWHost && _wManagementHost == oWManagementHost && _wPort == oWPort); } void setRegUpdateTime(TIMEPOINT updateTime) { - std::lock_guard lg(_rMtx); + std::lock_guard lg(_rMtx); _regUpdateTime = updateTime; } TIMEPOINT getRegUpdateTime(TIMEPOINT updateTime) { - std::lock_guard lg(_rMtx); + std::lock_guard lg(_rMtx); return _regUpdateTime; } double timeSinceRegUpdateSeconds() const { - std::lock_guard lg(_rMtx); + std::lock_guard lg(_rMtx); double secs = std::chrono::duration(CLOCK::now() - _regUpdateTime).count(); return secs; } TIMEPOINT getRegUpdateTime() const { - std::lock_guard lg(_rMtx); + std::lock_guard lg(_rMtx); return _regUpdateTime; } @@ -174,7 +176,7 @@ class WorkerContactInfo { /// @return false indicates the worker was restarted and all associated jobs need /// re-assignment. bool checkWStartupTime(uint64_t startupTime) { - std::lock_guard lg(_rMtx); + std::lock_guard lg(_rMtx); if (_wStartupTime == startupTime) { return true; } @@ -187,7 +189,7 @@ class WorkerContactInfo { } uint64_t getWStartupTime() const { - std::lock_guard lg(_rMtx); + std::lock_guard lg(_rMtx); return _wStartupTime; } @@ -211,8 +213,7 @@ class WorkerContactInfo { int _wPort; ///< "management-port" entry. /// Last time the registry heard from this worker. The ActiveWorker class - /// will use this to determine the worker's state. - /// &&& Store in seconds since epoch to make atomic? + /// will use this to determine the worker's state (alive/dead). TIMEPOINT _regUpdateTime; /// "w-startup-time", it's value is set to zero until the real value is @@ -222,7 +223,7 @@ class WorkerContactInfo { /// foreman()->getStartupTime(); uint64_t _wStartupTime = 0; - mutable std::mutex _rMtx; ///< protects _regUpdate + mutable MUTEX _rMtx; ///< protects _regUpdate }; /// This classes purpose is to be a structure to store and transfer information @@ -244,47 +245,55 @@ class WorkerQueryStatusData { return Ptr(new WorkerQueryStatusData(wInfo_, czInfo_, replicationInstanceId_, replicationAuthKey_)); } - /// &&& doc Used to create WorkerQueryStatusData object from a worker json message. + /// This function creates a WorkerQueryStatusData object from the worker json `czarJson`, the + /// other parameters are used to verify the json message. 
static Ptr createFromJson(nlohmann::json const& czarJson, std::string const& replicationInstanceId_, std::string const& replicationAuthKey_, TIMEPOINT updateTm); ~WorkerQueryStatusData() = default; void setWInfo(WorkerContactInfo::Ptr const& wInfo_) { - std::lock_guard lgI(_infoMtx); + std::lock_guard lgI(_infoMtx); if (_wInfo == nullptr) { _wInfo = wInfo_; return; } if (wInfo_ != nullptr) { - // This only change host and port values of _wInfo. + // This only changes host and port values of _wInfo. _wInfo->changeBaseInfo(*wInfo_); } } WorkerContactInfo::Ptr getWInfo() const { - std::lock_guard lgI(_infoMtx); + std::lock_guard lgI(_infoMtx); return _wInfo; } CzarContactInfo::Ptr getCzInfo() const { return _czInfo; } - /// doc &&& + /// `qId` and `ujId` identify a dead UberJob which is added to the list + /// of dead UberJobs for this worker. void addDeadUberJob(QueryId qId, UberJobId ujId, TIMEPOINT tm); - /// &&& doc + /// Add multiple UberJobIds for `qId` to the list of dead UberJobs for + /// this worker. void addDeadUberJobs(QueryId qId, std::vector ujIds, TIMEPOINT tm); - /// &&& doc + /// Add `qId` to the list of user queries where all Tasks can be stopped + /// and result files can be deleted. void addToDoneDeleteFiles(QueryId qId); - /// &&& doc + /// Add `qId` to the list of user queries where all Tasks can be stopped + /// but result files should be kept. void addToDoneKeepFiles(QueryId qId); - /// &&& doc + /// Remove all UberJobs from the list of dead UberJobs with QueryId `qId`. + /// There's no point in tracking individual UberJobs once the entire + /// user query is finished or cancelled as they will all be deleted by + /// `addToDoneDeleteFiles`. void removeDeadUberJobsFor(QueryId qId); void setCzarCancelAfterRestart(CzarIdType czId, QueryId lastQId) { - std::lock_guard mapLg(mapMtx); + std::lock_guard mapLg(mapMtx); czarCancelAfterRestart = true; czarCancelAfterRestartCzId = czId; czarCancelAfterRestartQId = lastQId; @@ -304,17 +313,30 @@ class WorkerQueryStatusData { /// than maxLifetime. void addListsToJson(nlohmann::json& jsWR, TIMEPOINT tmMark, double maxLifetime); - /// &&& doc + /// Parse the lists in `jsWR` to populate the lists for qIdDoneKeepFiles, + /// qIdDoneDeleteFiles, and qIdDeadUberJobs. /// @throws std::invalid_argument void parseLists(nlohmann::json const& jsWR, TIMEPOINT updateTm); - /// &&& doc + /// Return a json object indicating the status of the message for the + /// original requester. nlohmann::json serializeResponseJson(uint64_t workerStartupTime); - /// &&& doc - std::pair handleResponseJson(nlohmann::json const& jsResp); - - /// &&& doc + /// Use the worker's response, `jsResp`, to update the status of this object. + /// The worker's response contains lists indicating what the worker + /// received from the czar's json message created with `serializeJson`. + /// The czar can remove the ids from the lists once the worker has + /// verified them. + /// @return workerRestarted - true if `workerStartupTime` doesn't match, + /// indicating the worker has been restarted and the czar should + /// invalidate and re-assign all UberJobs associated with this + /// worker. + /// @throw invalid_argument if there are problems with json parsing. + bool handleResponseJson(nlohmann::json const& jsResp); + + /// Parse the contents of `jsWR` to fill the maps `doneKeepF`, `doneDeleteF`, + /// and `deadUberJobs`.
     static void parseListsInto(nlohmann::json const& jsWR, TIMEPOINT updateTm,
                                std::map<QueryId, TIMEPOINT>& doneKeepF,
                                std::map<QueryId, TIMEPOINT>& doneDeleteF,
@@ -324,15 +346,27 @@ class WorkerQueryStatusData {

     // Making these private requires member functions to be written
     // that cause issues with linking. All of the workarounds are ugly.
-    std::map<QueryId, TIMEPOINT> qIdDoneKeepFiles;    ///< &&& doc - limit reached
-    std::map<QueryId, TIMEPOINT> qIdDoneDeleteFiles;  ///< &&& doc -cancelled/finished
-    std::map<QueryId, std::map<UberJobId, TIMEPOINT>> qIdDeadUberJobs;  ///< &&& doc
+    /// Map of QueryIds where the LIMIT clause has been satisfied so
+    /// that Tasks can be stopped but result files need to be kept.
+    std::map<QueryId, TIMEPOINT> qIdDoneKeepFiles;
+
+    /// Map of QueryIds where Tasks can be stopped and files deleted, which is
+    /// used when user queries are cancelled or finished.
+    std::map<QueryId, TIMEPOINT> qIdDoneDeleteFiles;
+
+    /// Map used to indicate that specific UberJobs need to be killed.
+    std::map<QueryId, std::map<UberJobId, TIMEPOINT>> qIdDeadUberJobs;
+
+    /// If true, this indicates that this is a newly started czar and
+    /// the worker should stop all previous work associated with this
+    /// CzarId.
     std::atomic<bool> czarCancelAfterRestart = false;
     CzarIdType czarCancelAfterRestartCzId = 0;
     QueryId czarCancelAfterRestartQId = 0;

+    /// Protects qIdDoneKeepFiles, qIdDoneDeleteFiles, qIdDeadUberJobs,
     /// and czarCancelAfter variables.
-    mutable std::mutex mapMtx;
+    mutable MUTEX mapMtx;

 private:
     WorkerQueryStatusData(WorkerContactInfo::Ptr const& wInfo_, CzarContactInfo::Ptr const& czInfo_,
@@ -342,18 +376,18 @@ class WorkerQueryStatusData {
               _replicationInstanceId(replicationInstanceId_),
               _replicationAuthKey(replicationAuthKey_) {}

-    WorkerContactInfo::Ptr _wInfo;       ///< &&& doc
-    CzarContactInfo::Ptr const _czInfo;  //< &&& doc
-    mutable std::mutex _infoMtx;         ///< protects wInfo
+    WorkerContactInfo::Ptr _wInfo;       ///< Information needed to contact the worker.
+    CzarContactInfo::Ptr const _czInfo;  ///< Information needed to contact the czar.
+    mutable MUTEX _infoMtx;              ///< protects _wInfo

-    std::string const _replicationInstanceId;  ///< &&& doc
-    std::string const _replicationAuthKey;     ///< &&& doc
+    std::string const _replicationInstanceId;  ///< Used for message verification.
+    std::string const _replicationAuthKey;     ///< Used for message verification.

     /// _infoMtx must be locked before calling.
     std::string _dump() const;
 };

-/// &&& doc
+
 /// This class is used to send/receive a message from the worker to a specific
 /// czar when there has been a communication issue with the worker sending UberJob
 /// file ready messages. If there have been timeouts, the worker will send this
 /// TODO:UJ &&& uber job file response will be added to this message.
 /// Upon successful completion, the worker will clear all values set by the
 /// czar.
-/// This message is expected to only be needed rarely.
+/// Currently, this message is expected to be needed only rarely.
 class WorkerCzarComIssue {
 public:
     using Ptr = std::shared_ptr<WorkerCzarComIssue>;
@@ -392,14 +426,14 @@ class WorkerCzarComIssue {
     bool getThoughtCzarWasDead() const { return _thoughtCzarWasDead; }

-    /// &&& doc
+    /// Return true if there is a reason this WorkerCzarComIssue should be sent to this czar.
     bool needToSend() const {
         std::lock_guard<MUTEX> lg(_wciMtx);
         // TODO:UJ &&& or list of failed transmits not empty.
         return _thoughtCzarWasDead;
     }

-    /// &&& doc
+    /// Set the contact information for the appropriate czar and worker.
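+    /// Arguments that are nullptr are ignored, and a pointer that has already
+    /// been set is left unchanged.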
void setContactInfo(WorkerContactInfo::Ptr const& wInfo_, CzarContactInfo::Ptr const& czInfo_) { std::lock_guard lgWci(_wciMtx); if (_wInfo == nullptr && wInfo_ != nullptr) _wInfo = wInfo_; @@ -416,10 +450,11 @@ class WorkerCzarComIssue { return _wInfo; } - /// &&& doc + /// Return a json version of the contents of this class. std::shared_ptr serializeJson(); - /// &&& doc + /// Return a json object indicating the status of the message for the + /// original requester. nlohmann::json serializeResponseJson(); std::string dump() const; @@ -432,14 +467,14 @@ class WorkerCzarComIssue { WorkerContactInfo::Ptr _wInfo; CzarContactInfo::Ptr _czInfo; - std::string const _replicationInstanceId; ///< &&& doc - std::string const _replicationAuthKey; ///< &&& doc + std::string const _replicationInstanceId; ///< Used for message verification. + std::string const _replicationAuthKey; ///< Used for message verification. /// Set to by the worker true if the czar was considered dead, and reset to false /// after the czar has acknowledged successful reception of this message. bool _thoughtCzarWasDead = false; - mutable std::mutex _wciMtx; ///< protects all members. + mutable MUTEX _wciMtx; ///< protects all members. }; } // namespace lsst::qserv::http diff --git a/src/http/testStatusData.cc b/src/http/testStatusData.cc index 1a03c1d90..ba537d3ea 100644 --- a/src/http/testStatusData.cc +++ b/src/http/testStatusData.cc @@ -134,8 +134,7 @@ BOOST_AUTO_TEST_CASE(WorkerQueryStatusData) { BOOST_REQUIRE(!wqsdA->qIdDeadUberJobs.empty()); wqsdA->handleResponseJson(jsWorkerResp); - auto [respSuccess, workerRestarted] = wqsdA->handleResponseJson(jsWorkerResp); - BOOST_REQUIRE(respSuccess == true); + auto workerRestarted = wqsdA->handleResponseJson(jsWorkerResp); BOOST_REQUIRE(workerRestarted == false); BOOST_REQUIRE(wqsdA->qIdDoneDeleteFiles.empty()); diff --git a/src/qdisp/CzarStats.cc b/src/qdisp/CzarStats.cc index ca741e83c..5285e5be7 100644 --- a/src/qdisp/CzarStats.cc +++ b/src/qdisp/CzarStats.cc @@ -46,10 +46,10 @@ LOG_LOGGER _log = LOG_GET("lsst.qserv.czar.CzarStats"); namespace lsst::qserv::qdisp { CzarStats::Ptr CzarStats::_globalCzarStats; -util::Mutex CzarStats::_globalMtx; +MUTEX CzarStats::_globalMtx; void CzarStats::setup(util::QdispPool::Ptr const& qdispPool) { - std::lock_guard lg(_globalMtx); + std::lock_guard lg(_globalMtx); if (_globalCzarStats != nullptr || qdispPool == nullptr) { throw util::Bug(ERR_LOC, "Error CzarStats::setup called after global pointer set or qdispPool=null."); } @@ -77,7 +77,7 @@ CzarStats::CzarStats(util::QdispPool::Ptr const& qdispPool) } CzarStats::Ptr CzarStats::get() { - std::lock_guard lg(_globalMtx); + std::lock_guard lg(_globalMtx); if (_globalCzarStats == nullptr) { throw util::Bug(ERR_LOC, "Error CzarStats::get called before CzarStats::setup."); } @@ -124,7 +124,7 @@ void CzarStats::addFileReadRate(double bytesPerSec) { void CzarStats::trackQueryProgress(QueryId qid) { if (qid == 0) return; uint64_t const currentTimestampMs = util::TimeUtils::now(); - std::lock_guard const lock(_queryProgressMtx); + std::lock_guard const lock(_queryProgressMtx); if (auto itr = _queryNumIncompleteJobs.find(qid); itr != _queryNumIncompleteJobs.end()) return; _queryNumIncompleteJobs[qid].emplace_back(currentTimestampMs, 0); } @@ -132,7 +132,7 @@ void CzarStats::trackQueryProgress(QueryId qid) { void CzarStats::updateQueryProgress(QueryId qid, int numUnfinishedJobs) { if (qid == 0) return; uint64_t const currentTimestampMs = util::TimeUtils::now(); - std::lock_guard const 
lock(_queryProgressMtx); + std::lock_guard const lock(_queryProgressMtx); if (auto itr = _queryNumIncompleteJobs.find(qid); itr != _queryNumIncompleteJobs.end()) { auto&& history = itr->second; if (history.empty() || (history.back().numJobs != numUnfinishedJobs)) { @@ -147,7 +147,7 @@ void CzarStats::untrackQueryProgress(QueryId qid) { if (qid == 0) return; unsigned int const lastSeconds = cconfig::CzarConfig::instance()->czarStatsRetainPeriodSec(); uint64_t const minTimestampMs = util::TimeUtils::now() - 1000 * lastSeconds; - std::lock_guard const lock(_queryProgressMtx); + std::lock_guard const lock(_queryProgressMtx); if (lastSeconds == 0) { // The query gets removed instantaneously if archiving is not enabled. if (auto itr = _queryNumIncompleteJobs.find(qid); itr != _queryNumIncompleteJobs.end()) { @@ -170,7 +170,7 @@ void CzarStats::untrackQueryProgress(QueryId qid) { CzarStats::QueryProgress CzarStats::getQueryProgress(QueryId qid, unsigned int lastSeconds) const { uint64_t const minTimestampMs = util::TimeUtils::now() - 1000 * lastSeconds; - std::lock_guard const lock(_queryProgressMtx); + std::lock_guard const lock(_queryProgressMtx); QueryProgress result; if (qid == 0) { if (lastSeconds == 0) { diff --git a/src/qdisp/CzarStats.h b/src/qdisp/CzarStats.h index c22eaf3d9..123654ece 100644 --- a/src/qdisp/CzarStats.h +++ b/src/qdisp/CzarStats.h @@ -211,7 +211,7 @@ class CzarStats : std::enable_shared_from_this { CzarStats(std::shared_ptr const& qdispPool); static Ptr _globalCzarStats; ///< Pointer to the global instance. - static util::Mutex _globalMtx; ///< Protects `_globalCzarStats` + static MUTEX _globalMtx; ///< Protects `_globalCzarStats` /// Connection to get information about the czar's pool of dispatch threads. std::shared_ptr _qdispPool; @@ -251,7 +251,7 @@ class CzarStats : std::enable_shared_from_this { // Query progress stats are recorded along with timestamps when changes // in previously captured counters are detected. 
-    mutable util::Mutex _queryProgressMtx;  ///< Protects _queryNumIncompleteJobs
+    mutable MUTEX _queryProgressMtx;  ///< Protects _queryNumIncompleteJobs
     QueryProgress _queryNumIncompleteJobs;
 };

diff --git a/src/qdisp/Executive.cc b/src/qdisp/Executive.cc
index 7653cc54e..83bbaadca 100644
--- a/src/qdisp/Executive.cc
+++ b/src/qdisp/Executive.cc
@@ -369,7 +369,7 @@ bool Executive::join() {
     if (sCount == _requestCount) {
         LOGS(_log, LOG_LVL_INFO,
              "Query execution succeeded all: " << _requestCount << " jobs dispatched and completed.");
-    } else if (isLimitRowComplete()) {
+    } else if (isRowLimitComplete()) {
         LOGS(_log, LOG_LVL_INFO,
              "Query execution succeeded enough (LIMIT): " << sCount << " jobs out of " << _requestCount
                                                           << " completed.");
@@ -381,14 +381,14 @@ bool Executive::join() {
     _empty = (sCount == _requestCount);
     LOGS(_log, LOG_LVL_DEBUG,
          "Flag set to _empty=" << _empty << ", sCount=" << sCount << ", requestCount=" << _requestCount);
-    return _empty || isLimitRowComplete();
+    return _empty || isRowLimitComplete();
 }

 void Executive::markCompleted(JobId jobId, bool success) {
     ResponseHandler::Error err;
     string idStr = QueryIdHelper::makeIdStr(_id, jobId);
     LOGS(_log, LOG_LVL_DEBUG, "Executive::markCompleted " << success);
-    if (!success && !isLimitRowComplete()) {
+    if (!success && !isRowLimitComplete()) {
         {
             lock_guard<mutex> lock(_incompleteJobsMutex);
             auto iter = _incompleteJobs.find(jobId);
@@ -428,7 +428,7 @@ void Executive::markCompleted(JobId jobId, bool success) {
         }
     }
     _unTrack(jobId);
-    if (!success && !isLimitRowComplete()) {
+    if (!success && !isRowLimitComplete()) {
         LOGS(_log, LOG_LVL_ERROR,
              "Executive: requesting squash, cause: "
                      << " failed (code=" << err.getCode() << " " << err.getMsg() << ")");
@@ -456,10 +456,11 @@ void Executive::squash() {
         job->cancel();
     }

-    // TODO:UJ - Send a message to all workers saying this czarId + queryId is cancelled.
-    // The workers will just mark all associated tasks as cancelled, and that should be it.
-    // Any message to this czar about this query should result in an error sent back to
-    // the worker as soon it can't locate an executive or the executive says cancelled.
+    // Send a message to all workers saying this czarId + queryId is cancelled.
+    // The workers will just mark all associated tasks as cancelled, and that should be it.
+    // Any message to this czar about this query should result in an error sent back to
+    // the worker as soon as it can't locate an executive or the executive says it was
+    // cancelled.
     bool const deleteResults = true;
     sendWorkersEndMsg(deleteResults);
     LOGS(_log, LOG_LVL_DEBUG, "Executive::squash done");
@@ -603,7 +604,7 @@ void Executive::_unTrack(int jobId) {
             s = _getIncompleteJobsString(5);
         }
     }
-    bool logDebug = untracked || isLimitRowComplete();
+    bool logDebug = untracked || isRowLimitComplete();
     LOGS(_log, (logDebug ? LOG_LVL_DEBUG : LOG_LVL_WARN),
          "Executive UNTRACKING " << (untracked ? "success" : "failed") << "::" << s);
     // Every time a chunk completes, consider sending an update to QMeta.
diff --git a/src/qdisp/Executive.h b/src/qdisp/Executive.h
index 1f0af6d26..9c76dae11 100644
--- a/src/qdisp/Executive.h
+++ b/src/qdisp/Executive.h
@@ -183,15 +183,13 @@ class Executive : public std::enable_shared_from_this<Executive> {
     /// rows already read in.
     void checkLimitRowComplete();

-    //&&&int getRowLimit() const { return _limit; }
-
     /// Returns the maximum number of rows the worker needs for the LIMIT clause, or
     /// a value <= 0 if there's no limit that can be applied at the worker.
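     /// For example, a query ending in `LIMIT 1000` with no ORDER BY or GROUP BY
     /// clause would be expected to return 1000 here.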
int getUjRowLimit() const; - /// @return _limitRowComplete, which can only be meaningful if the + /// @return _rowLimitComplete, which can only be meaningful if the /// user query has not been cancelled. - bool isLimitRowComplete() { return _limitRowComplete && !_cancelled; } + bool isRowLimitComplete() { return _rowLimitComplete && !_cancelled; } /// @return the value of _dataIgnoredCount int incrDataIgnoredCount() { return ++_dataIgnoredCount; } @@ -247,10 +245,10 @@ class Executive : public std::enable_shared_from_this { void _squashSuperfluous(); - /// @return previous value of _limitRowComplete while setting it to true. + /// @return previous value of _rowLimitComplete while setting it to true. /// This indicates that enough rows have been read to complete the user query /// with a LIMIT clause, and no group by or order by clause. - bool _setLimitRowComplete() { return _limitRowComplete.exchange(true); } + bool _setLimitRowComplete() { return _rowLimitComplete.exchange(true); } // for debugging void _printState(std::ostream& os); @@ -318,7 +316,7 @@ class Executive : public std::enable_shared_from_this { /// True if enough rows were read to satisfy a LIMIT query with /// no ORDER BY or GROUP BY clauses. - std::atomic _limitRowComplete{false}; + std::atomic _rowLimitComplete{false}; std::atomic _totalResultRows{0}; std::weak_ptr _querySession; diff --git a/src/qdisp/JobQuery.cc b/src/qdisp/JobQuery.cc index 114d3efef..e0f8ecfba 100644 --- a/src/qdisp/JobQuery.cc +++ b/src/qdisp/JobQuery.cc @@ -60,29 +60,28 @@ JobQuery::~JobQuery() { } /// Cancel response handling. Return true if this is the first time cancel has been called. -bool JobQuery::cancel(bool superfluous) { /// &&& This can probably be simplified more +bool JobQuery::cancel(bool superfluous) { QSERV_LOGCONTEXT_QUERY_JOB(getQueryId(), getJobId()); LOGS(_log, LOG_LVL_DEBUG, "JobQuery::cancel()"); if (_cancelled.exchange(true) == false) { - lock_guard lock(_rmutex); - // If _inSsi is true then this query request has been passed to SSI and - // _queryRequestPtr cannot be a nullptr. Cancellation is complicated. 
- bool cancelled = false; - - if (!cancelled) { - ostringstream os; - os << _idStr << " cancel"; - LOGS(_log, LOG_LVL_DEBUG, os.str()); - if (!superfluous) { - getDescription()->respHandler()->errorFlush(os.str(), -1); - } - auto executive = _executive.lock(); - if (executive == nullptr) { - LOGS(_log, LOG_LVL_ERROR, " can't markComplete cancelled, executive == nullptr"); - return false; - } - executive->markCompleted(getJobId(), false); + lock_guard lock(_rmutex); + + //&&&bool cancelled = false; + + //&&&if (!cancelled) { + ostringstream os; + os << _idStr << " cancel"; + LOGS(_log, LOG_LVL_DEBUG, os.str()); + if (!superfluous) { + getDescription()->respHandler()->errorFlush(os.str(), -1); + } + auto executive = _executive.lock(); + if (executive == nullptr) { + LOGS(_log, LOG_LVL_ERROR, " can't markComplete cancelled, executive == nullptr"); + return false; } + executive->markCompleted(getJobId(), false); + //&&&} if (!superfluous) { _jobDescription->respHandler()->processCancel(); } diff --git a/src/qdisp/UberJob.cc b/src/qdisp/UberJob.cc index e91d11e48..d2c14181e 100644 --- a/src/qdisp/UberJob.cc +++ b/src/qdisp/UberJob.cc @@ -294,7 +294,7 @@ json UberJob::importResultFile(string const& fileUrl, uint64_t rowCount, uint64_ return _importResultError(true, "cancelled", "Query cancelled - no executive"); } - if (exec->isLimitRowComplete()) { + if (exec->isRowLimitComplete()) { int dataIgnored = exec->incrDataIgnoredCount(); if ((dataIgnored - 1) % 1000 == 0) { LOGS(_log, LOG_LVL_INFO, @@ -359,14 +359,14 @@ json UberJob::workerError(int errorCode, string const& errorMsg) { return _workerErrorFinish(deleteData, "cancelled"); } - if (exec->isLimitRowComplete()) { + if (exec->isRowLimitComplete()) { int dataIgnored = exec->incrDataIgnoredCount(); if ((dataIgnored - 1) % 1000 == 0) { LOGS(_log, LOG_LVL_INFO, cName(__func__) << " ignoring, enough rows already " << "dataIgnored=" << dataIgnored); } - return _workerErrorFinish(keepData, "none", "limitRowComplete"); + return _workerErrorFinish(keepData, "none", "rowLimitComplete"); } // Currently there are no detectable recoverable errors from workers. The only @@ -477,7 +477,7 @@ void UberJob::killUberJob() { return; } - if (exec->isLimitRowComplete()) { + if (exec->isRowLimitComplete()) { int dataIgnored = exec->incrDataIgnoredCount(); if ((dataIgnored - 1) % 1000 == 0) { LOGS(_log, LOG_LVL_INFO, cName(__func__) << " ignoring, enough rows already."); diff --git a/src/qproc/TaskMsgFactory.h b/src/qproc/TaskMsgFactory.h index 1e3bfd3be..fe1f921f8 100644 --- a/src/qproc/TaskMsgFactory.h +++ b/src/qproc/TaskMsgFactory.h @@ -48,9 +48,8 @@ namespace lsst::qserv::qproc { class ChunkQuerySpec; -/// TaskMsgFactory is a factory for TaskMsg (protobuf) objects. -/// All member variables must be thread safe. -/// &&& fix doc +/// TaskMsgFactory makes json messages for the jobs to be sent to the workers, where +/// they will be used to create Tasks. 
class TaskMsgFactory {
public:
    using Ptr = std::shared_ptr<TaskMsgFactory>;
diff --git a/src/rproc/InfileMerger.cc b/src/rproc/InfileMerger.cc
index 4d32d3ad9..bf0f88d7c 100644
--- a/src/rproc/InfileMerger.cc
+++ b/src/rproc/InfileMerger.cc
@@ -239,7 +239,7 @@ bool InfileMerger::merge(proto::ResponseSummary const& responseSummary,
         return true;
     }
     auto executive = jq->getExecutive();
-    if (executive == nullptr || executive->getCancelled() || executive->isLimitRowComplete()) {
+    if (executive == nullptr || executive->getCancelled() || executive->isRowLimitComplete()) {
         return true;
     }

@@ -349,7 +349,7 @@ bool InfileMerger::mergeHttp(qdisp::UberJob::Ptr const& uberJob, proto::Response
         return true;
     }
     auto executive = uberJob->getExecutive();
-    if (executive == nullptr || executive->getCancelled() || executive->isLimitRowComplete()) {
+    if (executive == nullptr || executive->getCancelled() || executive->isRowLimitComplete()) {
         return true;
     }

diff --git a/src/util/Mutex.cc b/src/util/Mutex.cc
index cd60e2b0e..d7e46c0c1 100644
--- a/src/util/Mutex.cc
+++ b/src/util/Mutex.cc
@@ -47,7 +47,7 @@ void Lock::_lock() {
                  _context << " LOCK[" << _mutex.id() << "]:1 "
                           << " LOCKED: " << util::printable(Mutex::lockedId(), "", "", " "));
     }
-    assert(!_mutex.lockedByCaller());
+    assert(!_mutex.lockedByThread());
     _mutex.lock();
     if (!_context.empty()) {
         LOGS(_log, LOG_LVL_TRACE,
diff --git a/src/util/Mutex.h b/src/util/Mutex.h
index 0353f733a..1d6c0b046 100644
--- a/src/util/Mutex.h
+++ b/src/util/Mutex.h
@@ -32,13 +32,33 @@

 #include "util/Bug.h"

+#define USING_VMUTEX 1  // &&& Should be replaced by variable in build.
+
+#ifdef MUTEX_UNITTEST
+#define USING_VMUTEX 1
+#endif
+
+#if USING_VMUTEX
+
+#define MUTEX util::Mutex
+
 /// Used to verify a mutex is locked before accessing a protected variable.
 #define VMUTEX_HELD(vmtx) \
-    if (!vmtx.lockedByCaller()) throw lsst::qserv::util::Bug(ERR_LOC, "mutex not locked!");
+    if (!vmtx.lockedByThread()) throw lsst::qserv::util::Bug(ERR_LOC, "mutex not locked!");

 /// Used to verify a mutex is not locked by this thread before locking a related mutex.
 #define VMUTEX_NOT_HELD(vmtx) \
-    if (vmtx.lockedByCaller()) throw lsst::qserv::util::Bug(ERR_LOC, "mutex not free!");
+    if (vmtx.lockedByThread()) throw lsst::qserv::util::Bug(ERR_LOC, "mutex already locked by this thread!");
+
+#else  // not USING_VMUTEX
+
+#define MUTEX std::mutex
+
+#define VMUTEX_HELD(vmtx) ;
+
+#define VMUTEX_NOT_HELD(vmtx) ;
+
+#endif  // USING_VMUTEX

 // This header declarations
 namespace lsst::qserv::util {
@@ -50,6 +70,8 @@ namespace lsst::qserv::util {
 /// Making VMutex a wrapper around std::mutex instead of a child causes lines
 /// like `std::lock_guard<std::mutex> lck(_vmutex);` to be flagged as errors,
 /// which is desirable.
+/// Unfortunately, VMutex won't work with std::condition_variable, as that
+/// explicitly expects std::mutex.
 class VMutex {
 public:
     explicit VMutex() {}
@@ -75,8 +97,7 @@ class VMutex {
     }

     /// @return true if the mutex is locked by this thread.
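     /// These checks are typically made via the VMUTEX_HELD/VMUTEX_NOT_HELD
     /// macros above; a sketch with illustrative member names:
     /// @code
     ///   void setValue(int v) {
     ///       VMUTEX_HELD(_vmtx);  // throws util::Bug unless _vmtx is held by this thread
     ///       _value = v;
     ///   }
     /// @endcode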
- /// TODO: Rename lockedByThread() - bool lockedByCaller() const { return _holder == std::this_thread::get_id(); } + bool lockedByThread() const { return _holder == std::this_thread::get_id(); } protected: std::atomic _holder; @@ -101,13 +122,13 @@ class Mutex : public VMutex { Mutex() : _id(nextId()) {} - /// Lock the mutext (replaces the corresponding method of the base class) + /// Lock the mutex (replaces the corresponding method of the base class) void lock() { VMutex::lock(); addCurrentId(); } - /// Release the mutext (replaces the corresponding method of the base class) + /// Release the mutex (replaces the corresponding method of the base class) void unlock() { removeCurrentId(); VMutex::unlock(); diff --git a/src/util/testMutex.cc b/src/util/testMutex.cc index 42220436e..6d22be4e7 100644 --- a/src/util/testMutex.cc +++ b/src/util/testMutex.cc @@ -33,6 +33,8 @@ // LSST headers #include "lsst/log/Log.h" +#define MUTEX_UNITTEST + // Qserv headers #include "util/BlockPost.h" #include "util/Mutex.h" @@ -62,12 +64,12 @@ BOOST_AUTO_TEST_CASE(MutexTest) { // The mutex won't be locked by anyone Mutex mtx1; - BOOST_CHECK(!mtx1.lockedByCaller()); + BOOST_CHECK(!mtx1.lockedByThread()); // The mutex will be locked by the current thread Mutex mtx2; lock_guard const lockGuard2(mtx2); - BOOST_CHECK(mtx2.lockedByCaller()); + BOOST_CHECK(mtx2.lockedByThread()); // Lock this mutex in each of two separate threads. Let each thread // to wait for a random period of time within some interval before @@ -85,18 +87,18 @@ BOOST_AUTO_TEST_CASE(MutexTest) { thread thr1([&mtx, &wasLockedBeforeBy1, &wasLockedAfterBy1]() { BlockPost blockPost(10, 20); blockPost.wait(); - wasLockedBeforeBy1 = mtx.lockedByCaller(); + wasLockedBeforeBy1 = mtx.lockedByThread(); lock_guard const lock(mtx); - wasLockedAfterBy1 = mtx.lockedByCaller(); + wasLockedAfterBy1 = mtx.lockedByThread(); }); bool wasLockedBeforeBy2 = false; bool wasLockedAfterBy2 = false; thread thr2([&mtx, &wasLockedBeforeBy2, &wasLockedAfterBy2]() { BlockPost blockPost(10, 20); blockPost.wait(); - wasLockedBeforeBy2 = mtx.lockedByCaller(); + wasLockedBeforeBy2 = mtx.lockedByThread(); lock_guard const lock(mtx); - wasLockedAfterBy2 = mtx.lockedByCaller(); + wasLockedAfterBy2 = mtx.lockedByThread(); }); thr1.join(); BOOST_CHECK(!wasLockedBeforeBy1); @@ -136,14 +138,14 @@ BOOST_AUTO_TEST_CASE(VMutexTest) { // The mutex won't be locked by anyone VMutex mtx1; - BOOST_CHECK(!mtx1.lockedByCaller()); + BOOST_CHECK(!mtx1.lockedByThread()); BOOST_CHECK_THROW(VMUTEX_HELD(mtx1), lsst::qserv::util::Bug); BOOST_REQUIRE_NO_THROW(VMUTEX_NOT_HELD(mtx1)); // The mutex will be locked by the current thread VMutex mtx2; lock_guard const lockGuard2(mtx2); - BOOST_CHECK(mtx2.lockedByCaller()); + BOOST_CHECK(mtx2.lockedByThread()); BOOST_REQUIRE_NO_THROW(VMUTEX_HELD(mtx2)); BOOST_CHECK_THROW(VMUTEX_NOT_HELD(mtx2), lsst::qserv::util::Bug); @@ -163,18 +165,18 @@ BOOST_AUTO_TEST_CASE(VMutexTest) { thread thr1([&mtx, &wasLockedBeforeBy1, &wasLockedAfterBy1]() { BlockPost blockPost(10, 20); blockPost.wait(); - wasLockedBeforeBy1 = mtx.lockedByCaller(); + wasLockedBeforeBy1 = mtx.lockedByThread(); lock_guard const lock(mtx); - wasLockedAfterBy1 = mtx.lockedByCaller(); + wasLockedAfterBy1 = mtx.lockedByThread(); }); bool wasLockedBeforeBy2 = false; bool wasLockedAfterBy2 = false; thread thr2([&mtx, &wasLockedBeforeBy2, &wasLockedAfterBy2]() { BlockPost blockPost(10, 20); blockPost.wait(); - wasLockedBeforeBy2 = mtx.lockedByCaller(); + wasLockedBeforeBy2 = mtx.lockedByThread(); lock_guard 
const lock(mtx); - wasLockedAfterBy2 = mtx.lockedByCaller(); + wasLockedAfterBy2 = mtx.lockedByThread(); }); thr1.join(); BOOST_CHECK(!wasLockedBeforeBy1); @@ -214,7 +216,7 @@ BOOST_AUTO_TEST_CASE(LockTest1) { // The mutex won't be locked by anyone Mutex mtx1; - BOOST_CHECK(not mtx1.lockedByCaller()); + BOOST_CHECK(not mtx1.lockedByThread()); // The mutex will be locked by the current thread Mutex mtx2; @@ -222,9 +224,9 @@ BOOST_AUTO_TEST_CASE(LockTest1) { // Do this in a nested block to ensure that lock object // gets destructed before the mutex. Lock const lock(mtx2, "LockTes1t: main thread"); - BOOST_CHECK(mtx2.lockedByCaller()); + BOOST_CHECK(mtx2.lockedByThread()); } - LOGS_DEBUG(!mtx2.lockedByCaller()); + LOGS_DEBUG(!mtx2.lockedByThread()); // Lock this mutex in each of two separate threads. Let each thread // to wait for a random period of time within some interval before @@ -247,7 +249,7 @@ BOOST_AUTO_TEST_CASE(LockTest1) { blockPost.wait(); Lock const lock(mtx, "LockTest1: thread 2"); }); - BOOST_CHECK(!mtx.lockedByCaller()); + BOOST_CHECK(!mtx.lockedByThread()); thr1.join(); thr2.join(); } @@ -284,7 +286,7 @@ BOOST_AUTO_TEST_CASE(LockTest2) { // The mutex won't be locked by anyone shared_ptr const mtx1 = make_shared(); - BOOST_CHECK(!mtx1->lockedByCaller()); + BOOST_CHECK(!mtx1->lockedByThread()); // The mutex will be locked by the current thread shared_ptr const mtx2 = make_shared(); @@ -292,9 +294,9 @@ BOOST_AUTO_TEST_CASE(LockTest2) { // Do this in a nested block to ensure that lock object // gets destructed before the mutex. Lock const lock(mtx2, "LockTes1t: main thread"); - BOOST_CHECK(mtx2->lockedByCaller()); + BOOST_CHECK(mtx2->lockedByThread()); } - BOOST_CHECK(!mtx2->lockedByCaller()); + BOOST_CHECK(!mtx2->lockedByThread()); // Lock this mutex in each of two separate threads. Let each thread // to wait for a random period of time within some interval before @@ -317,7 +319,7 @@ BOOST_AUTO_TEST_CASE(LockTest2) { blockPost.wait(); Lock const lock(mtx, "LockTest1: thread 2"); }); - BOOST_CHECK(!mtx->lockedByCaller()); + BOOST_CHECK(!mtx->lockedByThread()); thr1.join(); thr2.join(); } diff --git a/src/wbase/Task.cc b/src/wbase/Task.cc index 10013d09b..6653e22a2 100644 --- a/src/wbase/Task.cc +++ b/src/wbase/Task.cc @@ -362,7 +362,6 @@ wpublish::QueryStatistics::Ptr Task::getQueryStats() const { /// Flag the Task as cancelled, try to stop the SQL query, and try to remove it from the schedule. void Task::cancel() { - // util::InstanceCount _ic{std::string("&&&icTask::cancel ") + getIdStr()}; if (_cancelled.exchange(true)) { // Was already cancelled. 
         return;
diff --git a/src/wbase/Task.h b/src/wbase/Task.h
index 745f39025..e3ba8b336 100644
--- a/src/wbase/Task.h
+++ b/src/wbase/Task.h
@@ -46,7 +46,6 @@
 #include "wbase/TaskState.h"
 #include "util/Histogram.h"
 #include "util/ThreadPool.h"
-#include "util/InstanceCount.h"  //&&&

 // Forward declarations
 namespace lsst::qserv::mysql {
@@ -382,7 +381,6 @@ class Task : public util::CommandForThreadPool {
     int const _rowLimit;

     bool _unitTest = false;  ///< True when running unit tests.
-    // util::InstanceCount _ic{std::string("&&&icTask ") + getIdStr()};
 };

 }  // namespace lsst::qserv::wbase
diff --git a/src/wbase/UberJobData.cc b/src/wbase/UberJobData.cc
index 93570d657..08551bf75 100644
--- a/src/wbase/UberJobData.cc
+++ b/src/wbase/UberJobData.cc
@@ -92,7 +92,6 @@ void UberJobData::responseFileReady(string const& httpFileUrl, uint64_t rowCount
              cName(__func__) << " _foreman was null, which should only happen in unit tests");
     }

-    // &&&UJFileResp TODO:UJ file response
     json request = {{"version", http::MetaModule::version},
                     {"workerid", workerIdStr},
                     {"auth_key", _authKey},
@@ -191,7 +190,6 @@ string UJTransmitCmd::cName(const char* funcN) const {
 void UJTransmitCmd::action(util::CmdData* data) {
     // Make certain _selfPtr is reset before leaving this function.
     // If a retry is needed, duplicate() is called.
-    util::InstanceCount ic_(cName(__func__) + " &&&ic " + _requestStr + " url=" + _url);
     class ResetSelf {
     public:
         ResetSelf(UJTransmitCmd* ujtCmd) : _ujtCmd(ujtCmd) {}
diff --git a/src/wbase/UberJobData.h b/src/wbase/UberJobData.h
index 472a8c6a1..d4765fbbe 100644
--- a/src/wbase/UberJobData.h
+++ b/src/wbase/UberJobData.h
@@ -101,7 +101,7 @@ class UberJobData : public std::enable_shared_from_this<UberJobData> {

     bool getCancelled() const { return _cancelled; }

-    /// &&& doc
+    /// Cancel all Tasks in this UberJob.
     void cancelAllTasks();

     /// Returns the LIMIT of rows for the query enforceable at the worker, where values <= 0 indicate
@@ -115,7 +115,7 @@ class UberJobData : public std::enable_shared_from_this<UberJobData> {
                 int czarPort, uint64_t queryId, int rowLimit, std::string const& workerId,
                 std::shared_ptr<wcontrol::Foreman> const& foreman, std::string const& authKey);

-    /// &&& doc
+    /// Queue the response to be sent to the originating czar.
     void _queueUJResponse(http::Method method_, std::vector<std::string> const& headers_,
                          std::string const& url_, std::string const& requestContext_,
                          std::string const& requestStr_);
@@ -139,12 +139,15 @@ class UberJobData : public std::enable_shared_from_this<UberJobData> {

     std::string const _idStr;

-    std::atomic<bool> _scanInteractive;  ///< &&& doc
+    /// True if this is an interactive (aka high priority) user query.
+    std::atomic<bool> _scanInteractive;

     std::atomic<bool> _cancelled{false};  ///< Set to true if this was cancelled.
 };

-/// &&& doc
+/// This class puts the information about a locally finished UberJob into a command
+/// so it can be put on a queue and sent to the originating czar. The information
+/// being transmitted is usually the url for the result file or an error message.
 class UJTransmitCmd : public util::PriorityCommand {
 public:
     using Ptr = std::shared_ptr<UJTransmitCmd>;
@@ -171,7 +174,11 @@ class UJTransmitCmd : public util::PriorityCommand {
     /// Reset the self pointer so this object can be killed.
     void kill();

-    /// &&& doc
+    /// This function makes a duplicate of the required information for transmission to the
+    /// czar in a new object and then increments the attempt count, so it is not a true copy.
+    /// Priority commands cannot be resent as there's information in them about which queue
+    /// to modify, so a fresh object is needed to re-send. The message and target czar remain
+    /// unchanged except for the attempt count.
     Ptr duplicate();

 private:
diff --git a/src/wbase/UserQueryInfo.h b/src/wbase/UserQueryInfo.h
index 0734f76c6..eb15de708 100644
--- a/src/wbase/UserQueryInfo.h
+++ b/src/wbase/UserQueryInfo.h
@@ -69,7 +69,7 @@ class UserQueryInfo {
     /// Add an UberJobData object to the UserQueryInfo.
     void addUberJob(std::shared_ptr<UberJobData> const& ujData);

-    /// &&& doc
+    /// Return true if this user query was cancelled by its czar.
     bool getCancelledByCzar() const { return _cancelledByCzar; }

     /// The czar has cancelled this user query; all tasks need to
@@ -81,7 +81,7 @@ class UserQueryInfo {
     /// information about which UberJobs are dead.
     void cancelAllUberJobs();

-    /// &&& doc
+    /// Cancel a specific UberJob in this user query.
     void cancelUberJob(UberJobId ujId);

     bool isUberJobDead(UberJobId ujId) const;
diff --git a/src/wcontrol/WorkerStats.cc b/src/wcontrol/WorkerStats.cc
index 27055bd04..18a60b6a6 100644
--- a/src/wcontrol/WorkerStats.cc
+++ b/src/wcontrol/WorkerStats.cc
@@ -45,10 +45,10 @@ LOG_LOGGER _log = LOG_GET("lsst.qserv.wcontrol.WorkerStats");
 namespace lsst::qserv::wcontrol {

 WorkerStats::Ptr WorkerStats::_globalWorkerStats;
-util::Mutex WorkerStats::_globalMtx;
+MUTEX WorkerStats::_globalMtx;

 void WorkerStats::setup() {
-    lock_guard<util::Mutex> lg(_globalMtx);
+    lock_guard<MUTEX> lg(_globalMtx);
     if (_globalWorkerStats != nullptr) {
         throw util::Bug(ERR_LOC, "Error WorkerStats::setup called after global pointer set.");
     }
@@ -70,7 +70,7 @@ WorkerStats::WorkerStats() {
 }

 WorkerStats::Ptr WorkerStats::get() {
-    std::lock_guard<util::Mutex> lg(_globalMtx);
+    std::lock_guard<MUTEX> lg(_globalMtx);
     if (_globalWorkerStats == nullptr) {
         throw util::Bug(ERR_LOC, "Error WorkerStats::get called before WorkerStats::setup.");
     }
diff --git a/src/wcontrol/WorkerStats.h b/src/wcontrol/WorkerStats.h
index d61f45033..0acdaa2b3 100644
--- a/src/wcontrol/WorkerStats.h
+++ b/src/wcontrol/WorkerStats.h
@@ -77,7 +77,7 @@ class WorkerStats : std::enable_shared_from_this<WorkerStats> {
 private:
     WorkerStats();
     static Ptr _globalWorkerStats;  ///< Pointer to the global instance.
-    static util::Mutex _globalMtx;  ///< Protects `_globalWorkerStats`
+    static MUTEX _globalMtx;        ///< Protects `_globalWorkerStats`

     std::atomic<int> _queueCount{
             0};  ///< Number of buffers on queues (there are many queues, one per ChannelShared)
diff --git a/src/wdb/testQuerySql.cc b/src/wdb/testQuerySql.cc
deleted file mode 100644
index f28d733d6..000000000
--- a/src/wdb/testQuerySql.cc
+++ /dev/null
@@ -1,101 +0,0 @@
-// -*- LSST-C++ -*-
-/*
- * LSST Data Management System
- * Copyright 2013-2015 AURA/LSST.
- *
- * This product includes software developed by the
- * LSST Project (http://www.lsst.org/).
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the LSST License Statement and
- * the GNU General Public License along with this program.  If not,
- * see <http://www.lsstcorp.org/LegalNotices/>.
- */
-/**
- * @brief Simple testing for class QuerySql
- *
- * @author Daniel L.
Wang, SLAC - */ - -// Third-party headers - -// Qserv headers -#include "proto/worker.pb.h" -#include "wdb/QuerySql.h" -#include "wdb/QuerySql_Batch.h" - -// Boost unit test header -#define BOOST_TEST_MODULE QuerySql_1 -#include - -namespace test = boost::test_tools; - -//&&& delete file - -using lsst::qserv::proto::TaskMsg_Fragment; -using lsst::qserv::proto::TaskMsg_Subchunk; -using lsst::qserv::wdb::QuerySql; - -struct Fixture { - Fixture() { - defaultDb = "Winter"; - defaultResult = "myResult"; - } - ~Fixture() {} - - TaskMsg_Fragment makeFragment() { - TaskMsg_Fragment f; - // "Real" subchunk query text should include - // pre-substituted subchunk query text. - f.add_query("SELECT o1.*, o2.* FROM Object_1001 o1, Object_1001 o2;"); - f.set_resulttable("fragResult"); - TaskMsg_Subchunk sc; - sc.set_database("obsolete"); - lsst::qserv::proto::TaskMsg_Subchunk_DbTbl* dbTbl = sc.add_dbtbl(); - dbTbl->set_db(defaultDb); - dbTbl->set_tbl("Object"); - sc.add_id(1111); - sc.add_id(1222); - f.mutable_subchunks()->CopyFrom(sc); - return f; - } - - void printQsql(QuerySql const& q) { std::cout << "qsql=" << q << std::endl; } - std::string defaultDb; - std::string defaultResult; -}; - -BOOST_FIXTURE_TEST_SUITE(QuerySqlSuite, Fixture) - -BOOST_AUTO_TEST_CASE(Basic) { - std::shared_ptr qSql; - TaskMsg_Fragment frag = makeFragment(); - qSql = std::make_shared(defaultDb, 1001, frag, true, defaultResult); - BOOST_CHECK(qSql.get()); - printQsql(*qSql); -} - -BOOST_AUTO_TEST_CASE(QueryBatch) { - std::shared_ptr qSql; - TaskMsg_Fragment frag = makeFragment(); - qSql = std::make_shared(defaultDb, 1001, frag, true, defaultResult); - BOOST_CHECK(qSql.get()); - - QuerySql::Batch build("QueryBuildSub", qSql->buildList); - QuerySql::Batch& batch = build; - while (!batch.isDone()) { - std::string piece = batch.current(); - batch.next(); - } -} - -BOOST_AUTO_TEST_SUITE_END() diff --git a/src/wpublish/ChunkInventory.cc b/src/wpublish/ChunkInventory.cc index eb112303a..8bf735602 100644 --- a/src/wpublish/ChunkInventory.cc +++ b/src/wpublish/ChunkInventory.cc @@ -133,22 +133,6 @@ void fetchId(string const& instanceName, SqlConnection& sc, string& id) { LOGS(_log, LOG_LVL_WARN, "ChunkInventory couldn't find any a unique identifier of the worker"); } -class Validator : public lsst::qserv::ResourceUnit::Checker { -public: - Validator(lsst::qserv::wpublish::ChunkInventory& c) : chunkInventory(c) {} - virtual bool operator()(lsst::qserv::ResourceUnit const& ru) { - switch (ru.unitType()) { - case lsst::qserv::ResourceUnit::DBCHUNK: - return chunkInventory.has(ru.db(), ru.chunk()); - case lsst::qserv::ResourceUnit::QUERY: - return true; - default: - return false; - } - } - lsst::qserv::wpublish::ChunkInventory& chunkInventory; -}; - } // anonymous namespace namespace lsst::qserv::wpublish { @@ -284,10 +268,6 @@ bool ChunkInventory::has(string const& db, int chunk) const { return true; } -shared_ptr ChunkInventory::newValidator() { - return shared_ptr(new Validator(*this)); -} - void ChunkInventory::dbgPrint(ostream& os) const { lock_guard lock(_mtx); diff --git a/src/wpublish/QueriesAndChunks.h b/src/wpublish/QueriesAndChunks.h index e5f1814cd..b89458ba3 100644 --- a/src/wpublish/QueriesAndChunks.h +++ b/src/wpublish/QueriesAndChunks.h @@ -244,7 +244,8 @@ class QueriesAndChunks { }; using ScanTableSumsMap = std::map; - /// &&& doc + /// If the worker believes this czar has died, it calls this to stop + /// all Tasks associated with that czar. 
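+    /// This is expected to be driven by the worker-side czar liveness tracking
+    /// (see wcontrol/WCzarInfoMap, added later in this patch series).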
     void killAllQueriesFromCzar(CzarIdType czarId);

     friend std::ostream& operator<<(std::ostream& os, QueriesAndChunks const& qc);
diff --git a/src/wpublish/QueryStatistics.h b/src/wpublish/QueryStatistics.h
index 668b4b412..5fd24ff00 100644
--- a/src/wpublish/QueryStatistics.h
+++ b/src/wpublish/QueryStatistics.h
@@ -197,7 +197,8 @@ class QueryStatistics {

     SchedTasksInfoMap _taskSchedInfoMap;  ///< Map of task information ordered by scheduler name.

-    std::shared_ptr<UserQueryInfo> const _userQueryInfo;  ///< &&& doc
+    /// Contains information common to all Tasks in this user query.
+    std::shared_ptr<UserQueryInfo> const _userQueryInfo;
 };

 }  // namespace lsst::qserv::wpublish
diff --git a/src/xrdsvc/HttpWorkerCzarModule.cc b/src/xrdsvc/HttpWorkerCzarModule.cc
index 88848e238..594fcec5f 100644
--- a/src/xrdsvc/HttpWorkerCzarModule.cc
+++ b/src/xrdsvc/HttpWorkerCzarModule.cc
@@ -282,7 +282,7 @@ json HttpWorkerCzarModule::_handleQueryStatus(std::string const& func) {
     vector<QueryId> deleteFilesList;
     {
         // Cancelled queries where we want to keep the files
-        lock_guard<mutex> mapLg(wqsData->mapMtx);
+        lock_guard<MUTEX> mapLg(wqsData->mapMtx);
         for (auto const& [dkQid, dkTm] : wqsData->qIdDoneKeepFiles) {
             auto qStats = queriesAndChunks->addQueryId(dkQid, czId);
             if (qStats != nullptr) {
diff --git a/src/xrdsvc/HttpWorkerCzarModule.h b/src/xrdsvc/HttpWorkerCzarModule.h
index 94b7f934a..500c905e5 100644
--- a/src/xrdsvc/HttpWorkerCzarModule.h
+++ b/src/xrdsvc/HttpWorkerCzarModule.h
@@ -82,10 +82,12 @@ class HttpWorkerCzarModule : public xrdsvc::HttpModule {
     /// work of deciphering the message, creating UberJobData objects and Task objects.
     nlohmann::json _handleQueryJob(std::string const& func);

-    /// &&& doc
+    /// Verify some aspects of the query and call _handleQueryStatus.
     nlohmann::json _queryStatus();

-    /// &&& doc
+    /// Reconstruct the message, absorb the lists into this worker's state,
+    /// queue the ComIssue message as needed, and send the lists back to
+    /// the czar.
     nlohmann::json _handleQueryStatus(std::string const& func);
 };

diff --git a/src/xrdsvc/SsiProvider.cc b/src/xrdsvc/SsiProvider.cc
index f7a068411..53463b29e 100644
--- a/src/xrdsvc/SsiProvider.cc
+++ b/src/xrdsvc/SsiProvider.cc
@@ -146,64 +146,17 @@ bool SsiProviderServer::Init(XrdSsiLogger* logP, XrdSsiCluster* clsP, std::strin

 XrdSsiProvider::rStat SsiProviderServer::QueryResource(char const* rName, char const* contact) {
     // Validate resource name based on its proposed type
-
-    ResourceUnit ru(rName);
-    if (ru.unitType() == ResourceUnit::DBCHUNK) {
-        // Extract db and chunk from path and validate result
-
-        // If the chunk exists on our node then tell the caller it is here.
-        if (_chunkInventory.has(ru.db(), ru.chunk())) {
-            LOGS(_log, LOG_LVL_DEBUG, "SsiProvider Query " << rName << " present");
-            return isPresent;
-        }
-
-        // Tell the caller we do not have the chunk.
- LOGS(_log, LOG_LVL_DEBUG, "SsiProvider Query " << rName << " absent"); - return notPresent; - } else if (ru.unitType() == ResourceUnit::QUERY) { - return isPresent; - } - - // Treat other resources as absolute path names of files - boost::filesystem::path const path(rName); - if (path.is_absolute()) { - boost::system::error_code ec; - if (boost::filesystem::exists(path, ec) && !ec.value()) { - LOGS(_log, LOG_LVL_DEBUG, "SsiProvider File Resource " << rName << " recognized"); - return isPresent; - } - } - - LOGS(_log, LOG_LVL_DEBUG, "SsiProvider Query " << rName << " invalid"); return notPresent; } void SsiProviderServer::ResourceAdded(const char* rName) { // Handle resource based on its proposed type - - ResourceUnit ru(rName); - if (ru.unitType() == ResourceUnit::DBCHUNK) { - // Extract db and chunk from path and add the resource to the chunk - // inventory - _chunkInventory.add(ru.db(), ru.chunk()); - LOGS(_log, LOG_LVL_DEBUG, "SsiProvider ResourceAdded " << rName); - return; - } - LOGS(_log, LOG_LVL_DEBUG, "SsiProvider ResourceAdded " << rName << " invalid"); + return; } void SsiProviderServer::ResourceRemoved(const char* rName) { // Handle resource based on its proposed type - - ResourceUnit ru(rName); - if (ru.unitType() == ResourceUnit::DBCHUNK) { - // Extract db and chunk from path and add the resource to the chunk - // inventory - _chunkInventory.remove(ru.db(), ru.chunk()); - LOGS(_log, LOG_LVL_DEBUG, "SsiProvider ResourceRemoved " << rName); - return; - } - LOGS(_log, LOG_LVL_DEBUG, "SsiProvider ResourceRemoved " << rName << " invalid"); + return; } } // namespace lsst::qserv::xrdsvc From 769affb2dcedd625e97a68497ce72af825b0c970 Mon Sep 17 00:00:00 2001 From: John Gates Date: Wed, 23 Oct 2024 12:18:21 -0700 Subject: [PATCH 13/22] Fixed dead worker check. 
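
The czar now judges worker liveness from registry update timeouts configured
in the new [activeworker] section of the czar config. A sketch of the entries
(values are the defaults from CzarConfig.h; INI-style layout assumed):

    [activeworker]
    timeoutAliveSecs = 300    # fully ALIVE if heard from within the last 5 minutes
    timeoutDeadSecs = 600     # considered DEAD after 10 minutes of silence
    maxLifetimeSecs = 3600    # drop undeliverable status message contents after 1 hour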
--- src/cconfig/CzarConfig.h | 28 ++ src/ccontrol/MergingHandler.cc | 374 +-------------------------- src/ccontrol/MergingHandler.h | 9 - src/ccontrol/UserQueryAsyncResult.cc | 3 + src/ccontrol/UserQueryFactory.cc | 10 +- src/ccontrol/UserQueryFactory.h | 4 +- src/ccontrol/UserQuerySelect.cc | 20 +- src/ccontrol/UserQuerySelect.h | 10 +- src/czar/ActiveWorker.cc | 13 +- src/czar/ActiveWorker.h | 29 ++- src/czar/Czar.cc | 11 +- src/czar/Czar.h | 16 +- src/czar/CzarChunkMap.cc | 5 +- src/czar/CzarRegistry.cc | 4 +- src/czar/CzarRegistry.h | 2 +- src/global/ResourceUnit.cc | 2 - src/http/WorkerQueryStatusData.cc | 23 +- src/http/WorkerQueryStatusData.h | 20 +- src/qdisp/CMakeLists.txt | 3 +- src/qdisp/CzarStats.h | 4 +- src/qdisp/Executive.cc | 51 ++-- src/qdisp/Executive.h | 29 ++- src/qdisp/JobBase.cc | 54 ---- src/qdisp/JobBase.h | 76 ------ src/qdisp/JobQuery.cc | 29 ++- src/qdisp/JobQuery.h | 78 +++--- src/qdisp/ResponseHandler.h | 19 +- src/qdisp/SharedResources.h | 64 ----- src/qdisp/UberJob.cc | 62 +++-- src/qdisp/UberJob.h | 40 +-- src/qdisp/testQDisp.cc | 305 +++++++++++++--------- src/qmeta/QMetaMysql.cc | 9 +- src/util/Mutex.h | 6 +- src/util/QdispPool.cc | 268 +++++++++++++++++++ src/util/QdispPool.h | 209 +++++++++++++++ src/util/testMutex.cc | 18 +- src/wbase/UberJobData.cc | 1 - src/wconfig/WorkerConfig.h | 31 +++ src/wcontrol/Foreman.cc | 13 +- src/wcontrol/Foreman.h | 11 +- src/wcontrol/WCzarInfoMap.cc | 186 +++++++++++++ src/wcontrol/WCzarInfoMap.h | 129 +++++++++ src/wcontrol/WorkerStats.h | 2 +- src/wdb/CMakeLists.txt | 2 + src/wdb/testQueryRunner.cc | 80 +----- src/xrdsvc/SsiService.cc | 8 +- 46 files changed, 1331 insertions(+), 1039 deletions(-) delete mode 100644 src/qdisp/JobBase.cc delete mode 100644 src/qdisp/JobBase.h delete mode 100644 src/qdisp/SharedResources.h create mode 100644 src/util/QdispPool.cc create mode 100644 src/util/QdispPool.h create mode 100644 src/wcontrol/WCzarInfoMap.cc create mode 100644 src/wcontrol/WCzarInfoMap.h diff --git a/src/cconfig/CzarConfig.h b/src/cconfig/CzarConfig.h index 6fd1ed0da..e77878e18 100644 --- a/src/cconfig/CzarConfig.h +++ b/src/cconfig/CzarConfig.h @@ -199,6 +199,22 @@ class CzarConfig { /// the OOM situation. unsigned int czarStatsRetainPeriodSec() const { return _czarStatsRetainPeriodSec->getVal(); } + /// A worker is considered fully ALIVE if the last update from the worker has been + /// heard in less than _activeWorkerTimeoutAliveSecs seconds. + int getActiveWorkerTimeoutAliveSecs() const { return _activeWorkerTimeoutAliveSecs->getVal(); } + + /// A worker is considered DEAD if it hasn't been heard from in more than + /// _activeWorkerTimeoutDeadSecs. + int getActiveWorkerTimeoutDeadSecs() const { return _activeWorkerTimeoutDeadSecs->getVal(); } + + /// Max lifetime of a message to be sent to an active worker. If the czar has been + /// trying to send a message to a worker and has failed for this many seconds, + /// it gives up at this point, removing elements of the message to save memory. + int getActiveWorkerMaxLifetimeSecs() const { return _activeWorkerMaxLifetimeSecs->getVal(); } + + /// The maximum number of chunks (basically Jobs) allowed in a single UberJob. 
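+    /// For example, with the default of 10, a query with 95 chunks assigned to a
+    /// single worker would be broken into at least ten UberJobs for that worker.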
+ int getUberJobMaxChunks() const { return _uberJobMaxChunks->getVal(); } + // Parameters of the Czar management service std::string const& replicationInstanceId() const { return _replicationInstanceId->getVal(); } @@ -386,6 +402,18 @@ class CzarConfig { util::ConfigValTInt::create(_configValMap, "replication", "http_port", notReq, 0); CVTUIntPtr _replicationNumHttpThreads = util::ConfigValTUInt::create(_configValMap, "replication", "num_http_threads", notReq, 2); + + // Active Worker + CVTIntPtr _activeWorkerTimeoutAliveSecs = // 5min + util::ConfigValTInt::create(_configValMap, "activeworker", "timeoutAliveSecs", notReq, 60 * 5); + CVTIntPtr _activeWorkerTimeoutDeadSecs = // 10min + util::ConfigValTInt::create(_configValMap, "activeworker", "timeoutDeadSecs", notReq, 60 * 10); + CVTIntPtr _activeWorkerMaxLifetimeSecs = // 1hr + util::ConfigValTInt::create(_configValMap, "activeworker", "maxLifetimeSecs", notReq, 60 * 60); + + // UberJobs + CVTIntPtr _uberJobMaxChunks = + util::ConfigValTInt::create(_configValMap, "uberjob", "maxChunks", notReq, 10); }; } // namespace lsst::qserv::cconfig diff --git a/src/ccontrol/MergingHandler.cc b/src/ccontrol/MergingHandler.cc index db79771f4..9a6ee5b5c 100644 --- a/src/ccontrol/MergingHandler.cc +++ b/src/ccontrol/MergingHandler.cc @@ -65,35 +65,6 @@ using namespace std; namespace { LOG_LOGGER _log = LOG_GET("lsst.qserv.ccontrol.MergingHandler"); -string xrootdStatus2str(XrdCl::XRootDStatus const& s) { - return "status=" + to_string(s.status) + ", code=" + to_string(s.code) + ", errNo=" + to_string(s.errNo) + - ", message='" + s.GetErrorMessage() + "'"; -} - -/** - * Extract the file path (including both slashes) from the XROOTD-style URL. - * Input: - * @code - * "xroot://://"" - * @code - * Output: - * @code - * "//"" - * @code - */ -string xrootUrl2path(string const& xrootUrl) { - string const delim = "//"; - auto firstPos = xrootUrl.find(delim, 0); - if (string::npos != firstPos) { - // Resume serching at the first character following the delimiter. - auto secondPos = xrootUrl.find(delim, firstPos + 2); - if (string::npos != secondPos) { - return xrootUrl.substr(secondPos); - } - } - throw runtime_error("MergingHandler::" + string(__func__) + " illegal file resource url: " + xrootUrl); -} - /** * Instances of this class are used to update statistic counter on starting * and finishing operations with the result files. @@ -115,267 +86,6 @@ lsst::qserv::TimeCountTracker::CALLBACKFUNC const reportFileRecvRate = } }; -bool readXrootFileResourceAndMerge(string const& xrootUrl, - function const& messageIsReady) { - string const context = "MergingHandler::" + string(__func__) + " "; - - LOGS(_log, LOG_LVL_DEBUG, context << "xrootUrl=" << xrootUrl); - - // Track the file while the control flow is staying within the function. - ResultFileTracker const resultFileTracker; - - // The algorithm will read the input file to locate result objects containing rows - // and call the provided callback for each such row. - XrdCl::File file; - XrdCl::XRootDStatus status; - status = file.Open(xrootUrl, XrdCl::OpenFlags::Read); - if (!status.IsOK()) { - LOGS(_log, LOG_LVL_ERROR, - context << "failed to open " << xrootUrl << ", " << xrootdStatus2str(status)); - return false; - } - - // A value of the flag is set by the message processor when it's time to finish - // or abort reading the file. - bool last = false; - - // Temporary buffer for messages read from the file. The buffer will be (re-)allocated - // as needed to get the largest message. 
Note that a size of the messages won't exceed - // a limit set in ProtoHeaderWrap::PROTOBUFFER_HARD_LIMIT. - unique_ptr buf; - size_t bufSize = 0; - - uint64_t offset = 0; // A location of the next byte to be read from the input file. - bool success = true; - try { - while (!last) { - // This starts a timer of the data transmit rate tracker. - auto transmitRateTracker = make_unique>(reportFileRecvRate); - - // Read the frame header that carries a size of the subsequent message. - uint32_t msgSizeBytes = 0; - uint32_t bytesRead = 0; - status = file.Read(offset, sizeof(uint32_t), reinterpret_cast(&msgSizeBytes), bytesRead); - if (!status.IsOK()) { - throw runtime_error(context + "failed to read next frame header (" + - to_string(sizeof(uint32_t)) + " bytes) at offset " + to_string(offset) + - " from " + xrootUrl + ", " + xrootdStatus2str(status)); - } - offset += bytesRead; - - if (bytesRead == 0) break; - if (bytesRead != sizeof(uint32_t)) { - throw runtime_error(context + "read " + to_string(bytesRead) + " bytes instead of " + - to_string(sizeof(uint32_t)) + - " bytes when reading next frame header at offset " + - to_string(offset - bytesRead) + " from " + xrootUrl + ", " + - xrootdStatus2str(status)); - } - if (msgSizeBytes == 0) break; - if (msgSizeBytes > ProtoHeaderWrap::PROTOBUFFER_HARD_LIMIT) { - throw runtime_error(context + "message size of " + to_string(msgSizeBytes) + - " bytes at the frame header read at offset " + - to_string(offset - bytesRead) + " exceeds the hard limit set to " + - to_string(ProtoHeaderWrap::PROTOBUFFER_HARD_LIMIT) + " bytes, from " + - xrootUrl + ", " + xrootdStatus2str(status)); - } - - // (Re-)allocate the buffer if needed. - if (bufSize < msgSizeBytes) { - bufSize = msgSizeBytes; - buf.reset(new char[bufSize]); - } - - // Read the message. - size_t bytes2read = msgSizeBytes; - while (bytes2read != 0) { - uint32_t bytesRead = 0; - status = file.Read(offset, bytes2read, buf.get(), bytesRead); - if (!status.IsOK()) { - throw runtime_error(context + "failed to read " + to_string(bytes2read) + - " bytes at offset " + to_string(offset) + " from " + xrootUrl + ", " + - xrootdStatus2str(status)); - } - if (bytesRead == 0) { - throw runtime_error(context + "read 0 bytes instead of " + to_string(bytes2read) + - " bytes at offset " + to_string(offset) + " from " + xrootUrl + ", " + - xrootdStatus2str(status)); - } - offset += bytesRead; - bytes2read -= bytesRead; - } - - // Destroying the tracker will result in stopping the tracker's timer and - // reporting the file read rate before proceeding to the merge. - transmitRateTracker->addToValue(msgSizeBytes); - transmitRateTracker->setSuccess(); - transmitRateTracker.reset(); - - // Proceed to the result merge - success = messageIsReady(buf.get(), msgSizeBytes, last); - if (!success) break; - } - } catch (exception const& ex) { - LOGS(_log, LOG_LVL_ERROR, ex.what()); - success = false; - } - status = file.Close(); - if (!status.IsOK()) { - LOGS(_log, LOG_LVL_WARN, - context << "failed to close " << xrootUrl << ", " << xrootdStatus2str(status)); - } - - // Remove the file from the worker if it still exists. Report and ignore errors. - // The files will be garbage-collected by workers. 
- XrdCl::FileSystem fileSystem(xrootUrl); - status = fileSystem.Rm(xrootUrl2path(xrootUrl)); - if (!status.IsOK()) { - LOGS(_log, LOG_LVL_WARN, - context << "failed to remove " << xrootUrl << ", " << xrootdStatus2str(status)); - } - return success; -} - -bool readHttpFileAndMerge(string const& httpUrl, - function const& messageIsReady, - shared_ptr const& httpConnPool) { - string const context = "MergingHandler::" + string(__func__) + " "; - - LOGS(_log, LOG_LVL_DEBUG, context << "httpUrl=" << httpUrl); - - // Track the file while the control flow is staying within the function. - ResultFileTracker const resultFileTracker; - - // The data transmit rate tracker is set up before reading each data message. - unique_ptr> transmitRateTracker; - - // A location of the next byte to be read from the input file. The variable - // is used for error reporting. - uint64_t offset = 0; - - // Temporary buffer for messages read from the file. The buffer gets automatically - // resized to fit the largest message. - unique_ptr msgBuf; - size_t msgBufSize = 0; - size_t msgBufNext = 0; // An index of the next character in the buffer. - - // Fixed-size buffer to store the message size. - string msgSizeBuf(sizeof(uint32_t), '\0'); - size_t msgSizeBufNext = 0; // An index of the next character in the buffer. - - // The size of the next/current message. The variable is set after succesfully parsing - // the message length header and is reset back to 0 after parsing the message body. - // The value is stays 0 while reading the frame header. - uint32_t msgSizeBytes = 0; - bool success = true; - try { - string const noClientData; - vector const noClientHeaders; - http::ClientConfig clientConfig; - clientConfig.httpVersion = CURL_HTTP_VERSION_1_1; // same as in qhttp - clientConfig.bufferSize = CURL_MAX_READ_SIZE; // 10 MB in the current version of libcurl - clientConfig.tcpKeepAlive = true; - clientConfig.tcpKeepIdle = 5; // the default is 60 sec - clientConfig.tcpKeepIntvl = 5; // the default is 60 sec - http::Client reader(http::Method::GET, httpUrl, noClientData, noClientHeaders, clientConfig, - httpConnPool); - reader.read([&](char const* inBuf, size_t inBufSize) { - // A value of the flag is set by the message processor when it's time to finish - // or abort reading the file. - bool last = false; - char const* next = inBuf; - char const* const end = inBuf + inBufSize; - while ((next < end) && !last) { - if (msgSizeBytes == 0) { - // Continue or finish reading the frame header. - size_t const bytes2read = - std::min(sizeof(uint32_t) - msgSizeBufNext, (size_t)(end - next)); - std::memcpy(msgSizeBuf.data() + msgSizeBufNext, next, bytes2read); - next += bytes2read; - offset += bytes2read; - msgSizeBufNext += bytes2read; - if (msgSizeBufNext == sizeof(uint32_t)) { - // Done reading the frame header. - msgSizeBufNext = 0; - // Parse and evaluate the message length. - msgSizeBytes = *(reinterpret_cast(msgSizeBuf.data())); - if (msgSizeBytes == 0) { - throw runtime_error(context + "message size is 0 at offset " + - to_string(offset - sizeof(uint32_t)) + ", file: " + httpUrl); - } - if (msgSizeBytes > ProtoHeaderWrap::PROTOBUFFER_HARD_LIMIT) { - throw runtime_error(context + "message size " + to_string(msgSizeBytes) + - " at offset " + to_string(offset - sizeof(uint32_t)) + - " exceeds the hard limit of " + - to_string(ProtoHeaderWrap::PROTOBUFFER_HARD_LIMIT) + - ", file: " + httpUrl); - } - // Extend the message buffer (if needed). 
Note that buffer never gets - // truncated to avoid excessive memory deallocations/allocations. - if (msgBufSize < msgSizeBytes) { - msgBufSize = msgSizeBytes; - msgBuf.reset(new char[msgBufSize]); - } - // Starts the tracker to measure the performance of the network I/O. - transmitRateTracker = - make_unique>(reportFileRecvRate); - } - } else { - // Continue or finish reading the message body. - size_t const bytes2read = - std::min((size_t)msgSizeBytes - msgBufNext, (size_t)(end - next)); - std::memcpy(msgBuf.get() + msgBufNext, next, bytes2read); - next += bytes2read; - offset += bytes2read; - msgBufNext += bytes2read; - if (msgBufNext == msgSizeBytes) { - // Done reading message body. - msgBufNext = 0; - - // Destroying the tracker will result in stopping the tracker's timer and - // reporting the file read rate before proceeding to the merge. - if (transmitRateTracker != nullptr) { - transmitRateTracker->addToValue(msgSizeBytes); - transmitRateTracker->setSuccess(); - transmitRateTracker.reset(); - } - - // Parse and evaluate the message. - bool const success = messageIsReady(msgBuf.get(), msgSizeBytes, last); - if (!success) { - throw runtime_error(context + "message processing failed at offset " + - to_string(offset - msgSizeBytes) + ", file: " + httpUrl); - } - // Reset the variable to prepare for reading the next header & message (if any). - msgSizeBytes = 0; - } - } - } - }); - if (msgSizeBufNext != 0) { - throw runtime_error(context + "short read of the message header at offset " + - to_string(offset - msgSizeBytes) + ", file: " + httpUrl); - } - if (msgBufNext != 0) { - throw runtime_error(context + "short read of the message body at offset " + - to_string(offset - msgSizeBytes) + ", file: " + httpUrl); - } - } catch (exception const& ex) { - LOGS(_log, LOG_LVL_ERROR, string(__func__) + " " + ex.what()); - success = false; - } - - // Remove the file from the worker if it still exists. Report and ignore errors. - // The files will be garbage-collected by workers. - try { - http::Client remover(http::Method::DELETE, httpUrl); - remover.read([](char const* inBuf, size_t inBufSize) {}); - } catch (exception const& ex) { - LOGS(_log, LOG_LVL_WARN, context << "failed to remove " << httpUrl << ", ex: " << ex.what()); - } - return success; -} - std::tuple readHttpFileAndMergeHttp( lsst::qserv::qdisp::UberJob::Ptr const& uberJob, string const& httpUrl, function const& messageIsReady, @@ -559,69 +269,6 @@ MergingHandler::MergingHandler(std::shared_ptr merger, std: MergingHandler::~MergingHandler() { LOGS(_log, LOG_LVL_DEBUG, __func__ << " " << _tableName); } -bool MergingHandler::flush(proto::ResponseSummary const& responseSummary, uint32_t& resultRows) { - _wName = responseSummary.wname(); - - // This is needed to ensure the job query would be staying alive for the duration - // of the operation to prevent inconsistency witin the application. 
- auto const jobBase = getJobBase().lock(); - if (jobBase == nullptr) { - LOGS(_log, LOG_LVL_ERROR, __func__ << " failed, jobBase was NULL"); - return false; - } - auto const jobQuery = std::dynamic_pointer_cast(jobBase); - - LOGS(_log, LOG_LVL_TRACE, - "MergingHandler::" << __func__ << " jobid=" << responseSummary.jobid() - << " transmitsize=" << responseSummary.transmitsize() - << " rowcount=" << responseSummary.rowcount() << " rowSize=" - << " attemptcount=" << responseSummary.attemptcount() << " errorcode=" - << responseSummary.errorcode() << " errormsg=" << responseSummary.errormsg()); - - if (responseSummary.errorcode() != 0 || !responseSummary.errormsg().empty()) { - _error = util::Error(responseSummary.errorcode(), responseSummary.errormsg(), - util::ErrorCode::MYSQLEXEC); - _setError(ccontrol::MSG_RESULT_ERROR, _error.getMsg()); - LOGS(_log, LOG_LVL_ERROR, - "MergingHandler::" << __func__ << " error from worker:" << responseSummary.wname() - << " error: " << _error); - return false; - } - - // Dispatch result processing to the corresponidng method which depends on - // the result delivery protocol configured at the worker. - // Notify the file reader when all rows have been read by setting 'last = true'. - auto const dataMerger = [&](char const* buf, uint32_t size, bool& last) { - last = true; - proto::ResponseData responseData; - if (responseData.ParseFromArray(buf, size) && responseData.IsInitialized()) { - bool const success = _merge(responseSummary, responseData, jobQuery); - if (success) { - resultRows += responseData.row_size(); - last = resultRows >= responseSummary.rowcount(); - } - return success; - } - throw runtime_error("MergingHandler::flush ** message deserialization failed **"); - }; - - bool success = false; - if (!responseSummary.fileresource_xroot().empty()) { - success = ::readXrootFileResourceAndMerge(responseSummary.fileresource_xroot(), dataMerger); - } else if (!responseSummary.fileresource_http().empty()) { - success = ::readHttpFileAndMerge(responseSummary.fileresource_http(), dataMerger, - MergingHandler::_getHttpConnPool()); - } else { - string const err = "Unexpected result delivery protocol"; - LOGS(_log, LOG_LVL_ERROR, __func__ << " " << err); - throw util::Bug(ERR_LOC, err); - } - if (success) { - _infileMerger->mergeCompleteFor(responseSummary.jobid()); - } - return success; -} - void MergingHandler::errorFlush(std::string const& msg, int code) { _setError(code, msg); // Might want more info from result service. @@ -629,20 +276,6 @@ void MergingHandler::errorFlush(std::string const& msg, int code) { LOGS(_log, LOG_LVL_ERROR, "Error receiving result."); } -bool MergingHandler::finished() const { return _flushed; } - -bool MergingHandler::reset() { - // If we've pushed any bits to the merger successfully, we have to undo them - // to reset to a fresh state. For now, we will just fail if we've already - // begun merging. If we implement the ability to retract a partial result - // merge, then we can use it and do something better. - if (_flushed) { - return false; // Can't reset if we have already pushed state. - } - _initState(); - return true; -} - // Note that generally we always have an _infileMerger object except during // a unit test. I suppose we could try to figure out how to create one. // @@ -698,12 +331,11 @@ tuple MergingHandler::flushHttp(string const& fileUrl, uint64_t expe // This is needed to ensure the job query would be staying alive for the duration // of the operation to prevent inconsistency within the application. 
- auto const jobBase = getJobBase().lock(); - if (jobBase == nullptr) { - LOGS(_log, LOG_LVL_ERROR, __func__ << " failed, jobBase was NULL"); + auto const uberJob = getUberJob().lock(); + if (uberJob == nullptr) { + LOGS(_log, LOG_LVL_ERROR, __func__ << " failed, uberJob was NULL"); return {success, shouldCancel}; // both should still be false } - auto const uberJob = std::dynamic_pointer_cast(jobBase); LOGS(_log, LOG_LVL_TRACE, "MergingHandler::" << __func__ << " uberJob=" << uberJob->getIdStr() << " fileUrl=" << fileUrl); diff --git a/src/ccontrol/MergingHandler.h b/src/ccontrol/MergingHandler.h index 1152dc932..aa4e06dd0 100644 --- a/src/ccontrol/MergingHandler.h +++ b/src/ccontrol/MergingHandler.h @@ -71,10 +71,6 @@ class MergingHandler : public qdisp::ResponseHandler { /// @param tableName target table for incoming data MergingHandler(std::shared_ptr merger, std::string const& tableName); - /// Process the response and read the result file if no error was reported by a worker. - /// @return true if successful (no error) - bool flush(proto::ResponseSummary const& responseSummary, uint32_t& resultRows) override; - /// @see ResponseHandler::flushHttp /// @see MerginHandler::_mergeHttp std::tuple flushHttp(std::string const& fileUrl, uint64_t expectedRows, @@ -86,11 +82,6 @@ class MergingHandler : public qdisp::ResponseHandler { /// Signal an unrecoverable error condition. No further calls are expected. void errorFlush(std::string const& msg, int code) override; - /// @return true if the receiver has completed its duties. - bool finished() const override; - - bool reset() override; ///< Reset the state that a request can be retried. - /// Print a string representation of the receiver to an ostream std::ostream& print(std::ostream& os) const override; diff --git a/src/ccontrol/UserQueryAsyncResult.cc b/src/ccontrol/UserQueryAsyncResult.cc index a3edbbcc2..beb1089a7 100644 --- a/src/ccontrol/UserQueryAsyncResult.cc +++ b/src/ccontrol/UserQueryAsyncResult.cc @@ -85,6 +85,8 @@ void UserQueryAsyncResult::submit() { // if there are messages already it means the error was detected, stop right here if (_messageStore->messageCount() > 0) { + LOGS(_log, LOG_LVL_WARN, + "UserQueryAsyncResult::submit giving up, messageCount=" << _messageStore->messageCount()); return; } @@ -92,6 +94,7 @@ void UserQueryAsyncResult::submit() { if (_qInfo.czarId() != _qMetaCzarId) { // TODO: tell user which czar was it? 
std::string message = "Query originated from different czar"; + LOGS(_log, LOG_LVL_WARN, "UserQueryAsyncResult::submit giving up, message=" << message); _messageStore->addErrorMessage("SYSTEM", message); return; } diff --git a/src/ccontrol/UserQueryFactory.cc b/src/ccontrol/UserQueryFactory.cc index 85ba8a7dc..79b778b37 100644 --- a/src/ccontrol/UserQueryFactory.cc +++ b/src/ccontrol/UserQueryFactory.cc @@ -68,6 +68,7 @@ #include "rproc/InfileMerger.h" #include "sql/SqlConnection.h" #include "sql/SqlConnectionFactory.h" +#include "util/QdispPool.h" namespace { LOG_LOGGER _log = LOG_GET("lsst.qserv.ccontrol.UserQueryFactory"); @@ -215,7 +216,7 @@ UserQueryFactory::~UserQueryFactory() { } UserQuery::Ptr UserQueryFactory::newUserQuery(std::string const& aQuery, std::string const& defaultDb, - qdisp::SharedResources::Ptr const& qdispSharedResources, + util::QdispPool::Ptr const& qdispPool, std::string const& userQueryId, std::string const& msgTableName, std::string const& resultDb) { // result location could potentially be specified by SUBMIT command, for now @@ -308,20 +309,23 @@ UserQuery::Ptr UserQueryFactory::newUserQuery(std::string const& aQuery, std::st std::shared_ptr infileMergerConfig; if (sessionValid) { executive = - qdisp::Executive::create(*_executiveConfig, messageStore, qdispSharedResources, + qdisp::Executive::create(*_executiveConfig, messageStore, qdispPool, _userQuerySharedResources->queryStatsData, qs, _asioIoService); infileMergerConfig = std::make_shared(_userQuerySharedResources->mysqlResultConfig); infileMergerConfig->debugNoMerge = _debugNoMerge; } + auto czarConfig = cconfig::CzarConfig::instance(); + int uberJobMaxChunks = czarConfig->getUberJobMaxChunks(); + // This, effectively invalid, UserQuerySelect object should report errors from both `errorExtra` // and errors that the QuerySession `qs` has stored internally. auto uq = std::make_shared( qs, messageStore, executive, _userQuerySharedResources->databaseModels, infileMergerConfig, _userQuerySharedResources->secondaryIndex, _userQuerySharedResources->queryMetadata, _userQuerySharedResources->queryStatsData, _userQuerySharedResources->semaMgrConnections, - _userQuerySharedResources->qMetaCzarId, errorExtra, async, resultDb); + _userQuerySharedResources->qMetaCzarId, errorExtra, async, resultDb, uberJobMaxChunks); if (sessionValid) { uq->qMetaRegister(resultLocation, msgTableName); uq->setupMerger(); diff --git a/src/ccontrol/UserQueryFactory.h b/src/ccontrol/UserQueryFactory.h index a467ea07a..90e510979 100644 --- a/src/ccontrol/UserQueryFactory.h +++ b/src/ccontrol/UserQueryFactory.h @@ -42,7 +42,7 @@ // Local headers #include "global/stringTypes.h" -#include "qdisp/SharedResources.h" +#include "util/QdispPool.h" namespace lsst::qserv::ccontrol { class UserQuery; @@ -82,7 +82,7 @@ class UserQueryFactory : private boost::noncopyable { /// @param msgTableName: Name of the message table without database name. 
/// @return new UserQuery object std::shared_ptr newUserQuery(std::string const& query, std::string const& defaultDb, - qdisp::SharedResources::Ptr const& qdispSharedResources, + std::shared_ptr const& qdispPool, std::string const& userQueryId, std::string const& msgTableName, std::string const& resultDb); diff --git a/src/ccontrol/UserQuerySelect.cc b/src/ccontrol/UserQuerySelect.cc index a22eb7b88..41c60ec76 100644 --- a/src/ccontrol/UserQuerySelect.cc +++ b/src/ccontrol/UserQuerySelect.cc @@ -132,7 +132,8 @@ UserQuerySelect::UserQuerySelect(std::shared_ptr const& qs, std::shared_ptr const& queryMetadata, std::shared_ptr const& queryStatsData, std::shared_ptr const& semaMgrConn, qmeta::CzarId czarId, - std::string const& errorExtra, bool async, std::string const& resultDb) + std::string const& errorExtra, bool async, std::string const& resultDb, + int uberJobMaxChunks) : _qSession(qs), _messageStore(messageStore), _executive(executive), @@ -145,7 +146,8 @@ UserQuerySelect::UserQuerySelect(std::shared_ptr const& qs, _qMetaCzarId(czarId), _errorExtra(errorExtra), _resultDb(resultDb), - _async(async) {} + _async(async), + _uberJobMaxChunks(uberJobMaxChunks) {} std::string UserQuerySelect::getError() const { std::string div = (_errorExtra.size() && _qSession->getError().size()) ? " " : ""; @@ -303,7 +305,6 @@ void UserQuerySelect::submit() { } /// At this point the executive has a map of all jobs with the chunkIds as the key. - _maxChunksPerUberJob = 2; // &&& set in config // This is needed to prevent Czar::_monitor from starting things before they are ready. _executive->setReadyToExecute(); buildAndSendUberJobs(); @@ -320,6 +321,7 @@ void UserQuerySelect::submit() { } void UserQuerySelect::buildAndSendUberJobs() { + // TODO:UJ Is special handling needed for the dummy chunk, 1234567890 ? string const funcN("UserQuerySelect::" + string(__func__) + " QID=" + to_string(_qMetaQueryId)); LOGS(_log, LOG_LVL_DEBUG, funcN << " start"); @@ -428,7 +430,7 @@ void UserQuerySelect::buildAndSendUberJobs() { // Add this job to the appropriate UberJob, making the UberJob if needed. string workerId = targetWorker->getWorkerId(); auto& ujVect = workerJobMap[workerId]; - if (ujVect.empty() || ujVect.back()->getJobCount() >= _maxChunksPerUberJob) { + if (ujVect.empty() || ujVect.back()->getJobCount() >= _uberJobMaxChunks) { auto ujId = _uberJobIdSeq++; // keep ujId consistent string uberResultName = _ttn->make(ujId); auto respHandler = make_shared(_infileMerger, uberResultName); @@ -455,7 +457,6 @@ void UserQuerySelect::buildAndSendUberJobs() { // Add worker contact info to UberJobs. The czar can't do anything without // the contact map, so it will wait. This should only ever be an issue at startup. auto const wContactMap = czRegistry->waitForWorkerContactMap(); - LOGS(_log, LOG_LVL_DEBUG, funcN << " " << _executive->dumpUberJobCounts()); for (auto const& [wIdKey, ujVect] : workerJobMap) { auto iter = wContactMap->find(wIdKey); if (iter == wContactMap->end()) { @@ -470,9 +471,10 @@ void UserQuerySelect::buildAndSendUberJobs() { } _executive->addUberJobs(ujVect); for (auto const& ujPtr : ujVect) { - _executive->runUberJob(ujPtr); + _executive->queueUberJob(ujPtr); } } + LOGS(_log, LOG_LVL_DEBUG, funcN << " " << _executive->dumpUberJobCounts()); } /// Block until a submit()'ed query completes. 
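An aside on the hunk above: buildAndSendUberJobs() packs jobs per worker, opening a new UberJob whenever the worker's newest one already holds _uberJobMaxChunks jobs. Below is a minimal standalone sketch of that grouping rule; the names `packJobs` and `Batch` are illustrative stand-ins, not part of the Qserv API.

```cpp
#include <cstdio>
#include <map>
#include <string>
#include <utility>
#include <vector>

// One Batch stands in for one UberJob; the ints stand in for Jobs (chunk ids).
using Batch = std::vector<int>;

// Group (workerId, chunkId) assignments into per-worker batches of at most
// uberJobMaxChunks entries each, mirroring the
// "ujVect.empty() || ujVect.back()->getJobCount() >= _uberJobMaxChunks" test above.
std::map<std::string, std::vector<Batch>> packJobs(
        std::vector<std::pair<std::string, int>> const& assignments, int uberJobMaxChunks) {
    std::map<std::string, std::vector<Batch>> workerJobMap;
    for (auto const& [workerId, chunkId] : assignments) {
        auto& batches = workerJobMap[workerId];
        if (batches.empty() || batches.back().size() >= static_cast<size_t>(uberJobMaxChunks)) {
            batches.emplace_back();  // start a new UberJob for this worker
        }
        batches.back().push_back(chunkId);
    }
    return workerJobMap;
}

int main() {
    // Three chunks on w1 with a cap of 2 yield batches of sizes 2 and 1.
    auto grouped = packJobs({{"w1", 10}, {"w1", 11}, {"w1", 12}, {"w2", 20}}, 2);
    std::printf("w1 batches: %zu, w2 batches: %zu\n", grouped["w1"].size(), grouped["w2"].size());
    return 0;
}
```

In the patch itself each batch becomes a qdisp::UberJob that is handed to Executive::queueUberJob(), so dispatch order is governed by the QdispPool rather than by the loop that built the batches.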
@@ -510,14 +512,14 @@ QueryState UserQuerySelect::join() { QueryState state = SUCCESS; if (successful) { _qMetaUpdateStatus(qmeta::QInfo::COMPLETED, collectedRows, collectedBytes, finalRows); - LOGS(_log, LOG_LVL_INFO, "Joined everything (success)"); + LOGS(_log, LOG_LVL_INFO, "Joined everything (success) QID=" << getQueryId()); } else if (_killed) { // status is already set to ABORTED - LOGS(_log, LOG_LVL_ERROR, "Joined everything (killed)"); + LOGS(_log, LOG_LVL_ERROR, "Joined everything (killed) QID=" << getQueryId()); state = ERROR; } else { _qMetaUpdateStatus(qmeta::QInfo::FAILED, collectedRows, collectedBytes, finalRows); - LOGS(_log, LOG_LVL_ERROR, "Joined everything (failure!)"); + LOGS(_log, LOG_LVL_ERROR, "Joined everything (failure!) QID=" << getQueryId()); state = ERROR; } auto const czarConfig = cconfig::CzarConfig::instance(); diff --git a/src/ccontrol/UserQuerySelect.h b/src/ccontrol/UserQuerySelect.h index a01b973cd..08e22a6c0 100644 --- a/src/ccontrol/UserQuerySelect.h +++ b/src/ccontrol/UserQuerySelect.h @@ -42,7 +42,6 @@ // Qserv headers #include "ccontrol/UserQuery.h" #include "css/StripingParams.h" -#include "qdisp/SharedResources.h" #include "qmeta/QInfo.h" #include "qmeta/QStatus.h" #include "qmeta/types.h" @@ -95,7 +94,8 @@ class UserQuerySelect : public UserQuery { std::shared_ptr const& queryMetadata, std::shared_ptr const& queryStatsData, std::shared_ptr const& semaMgrConn, qmeta::CzarId czarId, - std::string const& errorExtra, bool async, std::string const& resultDb); + std::string const& errorExtra, bool async, std::string const& resultDb, + int uberJobMaxChunks); UserQuerySelect(UserQuerySelect const&) = delete; UserQuerySelect& operator=(UserQuerySelect const&) = delete; @@ -199,10 +199,8 @@ class UserQuerySelect : public UserQuery { std::string _resultDb; ///< Result database TODO:UJ same as resultLoc??) bool _async; ///< true for async query - /// TODO:UJ The maximum number of chunks allowed in an UberJob. At the very - /// least, this needs to be set in the configuration. However, it may also - /// be useful to change this based on the nature of each UserQuery. - int _maxChunksPerUberJob = 1; + /// The maximum number of chunks allowed in an UberJob, set from config. + int const _uberJobMaxChunks; std::atomic _uberJobIdSeq{1}; ///< Sequence number for UberJobs in this query. std::shared_ptr _ttn; ///< Temporary table name generator. diff --git a/src/czar/ActiveWorker.cc b/src/czar/ActiveWorker.cc index ef6302767..e2e356fdd 100644 --- a/src/czar/ActiveWorker.cc +++ b/src/czar/ActiveWorker.cc @@ -80,6 +80,7 @@ void ActiveWorker::_changeStateTo(State newState, double secsSinceUpdate, string void ActiveWorker::updateStateAndSendMessages(double timeoutAliveSecs, double timeoutDeadSecs, double maxLifetime) { + LOGS(_log, LOG_LVL_TRACE, cName(__func__) << " start"); bool newlyDeadWorker = false; http::WorkerContactInfo::Ptr wInfo_; { @@ -89,9 +90,14 @@ void ActiveWorker::updateStateAndSendMessages(double timeoutAliveSecs, double ti LOGS(_log, LOG_LVL_ERROR, cName(__func__) << " no WorkerContactInfo"); return; } - double secsSinceUpdate = (wInfo_ == nullptr) ? timeoutDeadSecs : wInfo_->timeSinceRegUpdateSeconds(); + double secsSinceUpdate = wInfo_->timeSinceRegUpdateSeconds(); + LOGS(_log, LOG_LVL_TRACE, + cName(__func__) << " wInfo=" << wInfo_->dump() + << " secsSince=" << wInfo_->timeSinceRegUpdateSeconds() + << " secsSinceUpdate=" << secsSinceUpdate); // Update the last time the registry contacted this worker. 
+    // TODO:UJ - This needs to be added to the dashboard.
     switch (_state) {
         case ALIVE: {
             if (secsSinceUpdate >= timeoutAliveSecs) {
@@ -230,6 +236,11 @@ string ActiveWorker::_dump() const {
     return os.str();
 }
 
+ActiveWorkerMap::ActiveWorkerMap(std::shared_ptr<cconfig::CzarConfig> const& czarConfig)
+        : _timeoutAliveSecs(czarConfig->getActiveWorkerTimeoutAliveSecs()),
+          _timeoutDeadSecs(czarConfig->getActiveWorkerTimeoutDeadSecs()),
+          _maxLifetime(czarConfig->getActiveWorkerMaxLifetimeSecs()) {}
+
 void ActiveWorkerMap::updateMap(http::WorkerContactInfo::WCMap const& wcMap,
                                 http::CzarContactInfo::Ptr const& czInfo,
                                 std::string const& replicationInstanceId,
diff --git a/src/czar/ActiveWorker.h b/src/czar/ActiveWorker.h
index b2e1f8c6c..3c4c16c59 100644
--- a/src/czar/ActiveWorker.h
+++ b/src/czar/ActiveWorker.h
@@ -35,6 +35,10 @@
 #include "http/WorkerQueryStatusData.h"
 #include "util/Bug.h"
 
+namespace lsst::qserv::cconfig {
+class CzarConfig;
+}
+
 // This header declarations
 namespace lsst::qserv::czar {
 
@@ -44,7 +48,7 @@ namespace lsst::qserv::czar {
 /// have finished or need to be cancelled.
 /// - maintain list of done/cancelled queries for an active worker, and send
 ///   that list to the worker. Once the worker has accepted the list, remove
-///   all of those queryId's from the list.
+///   all of those queryId's from the lists.
 /// - maintain a list of killed UberJobs. If an UberJob is killed, nothing
 ///   will ever look for its files, so they should be deleted, and the
 ///   worker should avoid working on Tasks for that UberJob.
@@ -57,9 +61,8 @@ namespace lsst::qserv::czar {
 ///   them. If the worker isn't told, it will continue working on
 ///   the UberJob until it finishes, and then find out the UberJob was killed
 ///   when it tries to return results to the czar. The worker should delete
-///   files for said UberJob at that point).
-///   So, this should be very rare, only results in extra load, and therefore
-///   is a low priority.
+///   files for said UberJob at that point.
+///   So, this should be very rare and only result in extra load.
 ///
 /// If a worker goes missing from the registry, it is considered DEAD and may be
 /// removed after a period of time.
@@ -69,7 +72,7 @@
 ///
 /// When a worker becomes DEAD: (see Czar::_monitor).
 /// - Affected UberJobs are killed.
-/// - UberJobs are built to handle unassigned jobs where dead workers are skipped and
+/// - New UberJobs are built to handle unassigned jobs where dead workers are skipped and
 ///   the jobs are assigned to alternate workers.
 ///
 class ActiveWorker : public std::enable_shared_from_this<ActiveWorker> {
@@ -186,6 +189,9 @@ class ActiveWorkerMap {
     ActiveWorkerMap() = default;
     ActiveWorkerMap(ActiveWorkerMap const&) = delete;
     ActiveWorkerMap operator=(ActiveWorkerMap const&) = delete;
+
+    ActiveWorkerMap(std::shared_ptr<cconfig::CzarConfig> const& czarConfig);
+
     ~ActiveWorkerMap() = default;
 
     std::string cName(const char* fName) { return std::string("ActiveWorkerMap::") + fName + " "; }
@@ -214,7 +220,7 @@ class ActiveWorkerMap {
     /// Add `qId` to the list of query ids where the worker must hold onto result
     /// files but all incomplete Tasks can be stopped. This is used for `rowLimitComplete`
     /// where enough rows have been found to complete a user query with a LIMIT
-    ///clause. The czar may still need to collect the result files from the worker.
+    /// clause. The czar may still need to collect the result files from the worker.
     /// Once the czar has completed the user query, the `qId` will be added to
     /// `addToDoneDeleteFiles` so the workers will delete the files.
void addToDoneKeepFiles(QueryId qId); @@ -223,9 +229,14 @@ class ActiveWorkerMap { std::map _awMap; ///< Key is worker id. mutable std::mutex _awMapMtx; ///< protects _awMap; - double _timeoutAliveSecs = 60.0 * 5.0; ///< &&& set from config. 5min - double _timeoutDeadSecs = 60.0 * 10.0; ///< &&& set from config. 10min - double _maxLifetime = 60.0 * 60.0; ///< &&& set from config. 1hr + /// @see CzarConfig::getActiveWorkerTimeoutAliveSecs() + double _timeoutAliveSecs = 60.0 * 5.0; + + /// @see CzarConfig::getActiveWorkerTimeoutDeadSecs() + double _timeoutDeadSecs = 60.0 * 10.0; + + /// @see CzarConfig::getActiveWorkerMaxLifetimeSecs() + double _maxLifetime = 60.0 * 60.0; bool _czarCancelAfterRestart = false; CzarIdType _czarCancelAfterRestartCzId = 0; diff --git a/src/czar/Czar.cc b/src/czar/Czar.cc index c05400ab7..db70bcbfe 100644 --- a/src/czar/Czar.cc +++ b/src/czar/Czar.cc @@ -56,7 +56,6 @@ #include "proto/worker.pb.h" #include "qdisp/CzarStats.h" #include "qdisp/Executive.h" -#include "qdisp/SharedResources.h" #include "qproc/DatabaseModels.h" #include "rproc/InfileMerger.h" #include "sql/SqlConnection.h" @@ -152,7 +151,7 @@ Czar::Czar(string const& configFilePath, string const& czarName) _idCounter(), _uqFactory(), _clientToQuery(), - _activeWorkerMap(new ActiveWorkerMap()) { + _activeWorkerMap(new ActiveWorkerMap(_czarConfig)) { // set id counter to milliseconds since the epoch, mod 1 year. struct timeval tv; gettimeofday(&tv, nullptr); @@ -201,9 +200,6 @@ Czar::Czar(string const& configFilePath, string const& czarName) _qdispPool = make_shared(qPoolSize, maxPriority, vectRunSizes, vectMinRunningSizes); qdisp::CzarStats::setup(_qdispPool); - - _qdispSharedResources = qdisp::SharedResources::create(_qdispPool); - int xrootdCBThreadsMax = _czarConfig->getXrootdCBThreadsMax(); int xrootdCBThreadsInit = _czarConfig->getXrootdCBThreadsInit(); LOGS(_log, LOG_LVL_INFO, "config xrootdCBThreadsMax=" << xrootdCBThreadsMax); @@ -294,8 +290,7 @@ SubmitResult Czar::submitQuery(string const& query, map const& h ccontrol::UserQuery::Ptr uq; { lock_guard lock(_mutex); - uq = _uqFactory->newUserQuery(query, defaultDb, getQdispSharedResources(), userQueryId, msgTableName, - resultDb); + uq = _uqFactory->newUserQuery(query, defaultDb, getQdispPool(), userQueryId, msgTableName, resultDb); } // Add logging context with query ID @@ -714,7 +709,7 @@ void Czar::killIncompleteUbjerJobsOn(std::string const& restartedWorkerId) { for (auto const& [eKey, wPtrExec] : execMap) { auto exec = wPtrExec.lock(); if (exec != nullptr) { - exec->killIncompleteUberJobsOn(restartedWorkerId); + exec->killIncompleteUberJobsOnWorker(restartedWorkerId); } } } diff --git a/src/czar/Czar.h b/src/czar/Czar.h index 6574f33b6..78b02237a 100644 --- a/src/czar/Czar.h +++ b/src/czar/Czar.h @@ -42,7 +42,6 @@ #include "global/intTypes.h" #include "global/stringTypes.h" #include "mysql/MySqlConfig.h" -#include "qdisp/SharedResources.h" #include "util/ConfigStore.h" #include "util/Timer.h" @@ -121,9 +120,6 @@ class Czar { */ static Ptr getCzar() { return _czar; } - /// Return a pointer to QdispSharedResources - qdisp::SharedResources::Ptr getQdispSharedResources() { return _qdispSharedResources; } - /// Remove all old tables in the qservResult database. 
void removeOldResultTables(); @@ -201,11 +197,6 @@ class Czar { IdToQuery _idToQuery; ///< maps query ID to query (for currently running queries) std::mutex _mutex; ///< protects _uqFactory, _clientToQuery, and _idToQuery - /// Thread pool for handling Responses from XrdSsi, - /// the PsuedoFifo to prevent czar from calling most recent requests, - /// and any other resources for use by query executives. - qdisp::SharedResources::Ptr _qdispSharedResources; - util::Timer _lastRemovedTimer; ///< Timer to limit table deletions. std::mutex _lastRemovedMtx; ///< protects _lastRemovedTimer @@ -245,6 +236,13 @@ class Czar { /// A combined priority queue and thread pool to regulate czar communications /// with workers. Once created, the pointer never changes. + /// TODO:UJ - It would be better to have a pool for each worker as it + /// may be possible for a worker to have communications + /// problems in a way that would wedge the pool. This can + /// probably be done fairly easily by having pools + /// attached to ActiveWorker in _activeWorkerMap. + /// This was not possible in xrootd as the czar had + /// no reasonable way to know where Jobs were going. std::shared_ptr _qdispPool; }; diff --git a/src/czar/CzarChunkMap.cc b/src/czar/CzarChunkMap.cc index 58c675262..c064f60d1 100644 --- a/src/czar/CzarChunkMap.cc +++ b/src/czar/CzarChunkMap.cc @@ -333,11 +333,14 @@ bool CzarFamilyMap::_read() { // better to wait for new maps if something changed. std::lock_guard gLock(_familyMapMtx); qmeta::QMetaChunkMap qChunkMap = _qmeta->getChunkMap(_lastUpdateTime); - if (_lastUpdateTime >= qChunkMap.updateTime) { + if (_lastUpdateTime == qChunkMap.updateTime) { LOGS(_log, LOG_LVL_DEBUG, cName(__func__) << " no need to read " << util::TimeUtils::timePointToDateTimeString(_lastUpdateTime) << " db=" << util::TimeUtils::timePointToDateTimeString(qChunkMap.updateTime)); + // &&& Should a flag be set here to alter worker aliveness check as nothing has changed? TODO:UJ + // &&& Reason being that a brief loss of the registry could leave all workers marked as dead, when + // &&& they are still alive. return false; } diff --git a/src/czar/CzarRegistry.cc b/src/czar/CzarRegistry.cc index 432dfb2aa..b1bbe7974 100644 --- a/src/czar/CzarRegistry.cc +++ b/src/czar/CzarRegistry.cc @@ -131,13 +131,13 @@ void CzarRegistry::_registryWorkerInfoLoop() { // TODO: Is there a better thing to do than just log this here? } else { http::WorkerContactInfo::WCMapPtr wMap = _buildMapFromJson(response); - // Compare the new map to the existing map and replace if different. + // Update the values in the map { auto czInfo = http::CzarContactInfo::create(_czarConfig->name(), _czarConfig->id(), _czarConfig->replicationHttpPort(), util::get_current_host_fqdn(), czarStartTime); lock_guard lck(_cmapMtx); - if (wMap != nullptr && !_compareMapContactInfo(*wMap)) { + if (wMap != nullptr) { _contactMap = wMap; _latestMapUpdate = CLOCK::now(); _activeWorkerMap->updateMap(*_contactMap, czInfo, replicationInstanceId, diff --git a/src/czar/CzarRegistry.h b/src/czar/CzarRegistry.h index aef90ea44..b7233f15d 100644 --- a/src/czar/CzarRegistry.h +++ b/src/czar/CzarRegistry.h @@ -120,7 +120,7 @@ class CzarRegistry { http::WorkerContactInfo::WCMapPtr _contactMap; TIMEPOINT _latestMapUpdate; ///< The last time the _contactMap was updated, unrelated to ///< WorkerContactInfo update. 
- mutable MUTEX _cmapMtx; /// Protects _contactMap, _latestUpdate + mutable MUTEX _cmapMtx; /// Protects _contactMap, _latestUpdate /// Map for tracking worker aliveness, it has its own internal mutex. std::shared_ptr const _activeWorkerMap; diff --git a/src/global/ResourceUnit.cc b/src/global/ResourceUnit.cc index 816d469ff..3bbe5372b 100644 --- a/src/global/ResourceUnit.cc +++ b/src/global/ResourceUnit.cc @@ -53,7 +53,6 @@ std::string ResourceUnit::path() const { return ss.str(); } - std::string ResourceUnit::prefix(UnitType const& r) { switch (r) { case DBCHUNK: @@ -72,7 +71,6 @@ std::string ResourceUnit::makePath(int chunk, std::string const& db) { return _pathSep + prefix(UnitType::DBCHUNK) + _pathSep + db + _pathSep + std::to_string(chunk); } - void ResourceUnit::setAsDbChunk(std::string const& db, int chunk) { _unitType = DBCHUNK; _db = db; diff --git a/src/http/WorkerQueryStatusData.cc b/src/http/WorkerQueryStatusData.cc index 2188920b5..8f4ac38f2 100644 --- a/src/http/WorkerQueryStatusData.cc +++ b/src/http/WorkerQueryStatusData.cc @@ -30,6 +30,7 @@ #include "http/MetaModule.h" #include "http/RequestBodyJSON.h" #include "util/common.h" +#include "util/TimeUtils.h" // LSST headers #include "lsst/log/Log.h" @@ -120,6 +121,12 @@ WorkerContactInfo::Ptr WorkerContactInfo::createFromJsonWorker(nlohmann::json co return nullptr; } +void WorkerContactInfo::setRegUpdateTime(TIMEPOINT updateTime) { + std::lock_guard lg(_rMtx); + _regUpdateTime = updateTime; + LOGS(_log, LOG_LVL_TRACE, cName(__func__) << " " << _dump()); +} + string WorkerContactInfo::dump() const { lock_guard lg(_rMtx); return _dump(); @@ -128,7 +135,8 @@ string WorkerContactInfo::dump() const { string WorkerContactInfo::_dump() const { stringstream os; os << "workerContactInfo{" - << "id=" << wId << " host=" << _wHost << " mgHost=" << _wManagementHost << " port=" << _wPort << "}"; + << "id=" << wId << " host=" << _wHost << " mgHost=" << _wManagementHost << " port=" << _wPort + << " update=" << util::TimeUtils::timePointToDateTimeString(_regUpdateTime) << "}"; return os.str(); } @@ -324,6 +332,19 @@ void WorkerQueryStatusData::addDeadUberJobs(QueryId qId, std::vector } } +void WorkerQueryStatusData::setWInfo(WorkerContactInfo::Ptr const& wInfo_) { + std::lock_guard lgI(_infoMtx); + if (_wInfo == nullptr) { + _wInfo = wInfo_; + return; + } + if (wInfo_ != nullptr) { + // This only changes host and port values of _wInfo. 
+ _wInfo->changeBaseInfo(*wInfo_); + } + LOGS(_log, LOG_LVL_DEBUG, cName(__func__) << " " << _wInfo->dump()); +} + void WorkerQueryStatusData::addDeadUberJob(QueryId qId, UberJobId ujId, TIMEPOINT tm) { lock_guard mapLg(mapMtx); auto& ujMap = qIdDeadUberJobs[qId]; diff --git a/src/http/WorkerQueryStatusData.h b/src/http/WorkerQueryStatusData.h index 79d1e04f2..dbf961ec8 100644 --- a/src/http/WorkerQueryStatusData.h +++ b/src/http/WorkerQueryStatusData.h @@ -150,10 +150,7 @@ class WorkerContactInfo { return (wId == oWId && _wHost == oWHost && _wManagementHost == oWManagementHost && _wPort == oWPort); } - void setRegUpdateTime(TIMEPOINT updateTime) { - std::lock_guard lg(_rMtx); - _regUpdateTime = updateTime; - } + void setRegUpdateTime(TIMEPOINT updateTime); TIMEPOINT getRegUpdateTime(TIMEPOINT updateTime) { std::lock_guard lg(_rMtx); @@ -252,17 +249,7 @@ class WorkerQueryStatusData { ~WorkerQueryStatusData() = default; - void setWInfo(WorkerContactInfo::Ptr const& wInfo_) { - std::lock_guard lgI(_infoMtx); - if (_wInfo == nullptr) { - _wInfo = wInfo_; - return; - } - if (wInfo_ != nullptr) { - // This only changes host and port values of _wInfo. - _wInfo->changeBaseInfo(*wInfo_); - } - } + void setWInfo(WorkerContactInfo::Ptr const& wInfo_); WorkerContactInfo::Ptr getWInfo() const { std::lock_guard lgI(_infoMtx); @@ -378,7 +365,7 @@ class WorkerQueryStatusData { WorkerContactInfo::Ptr _wInfo; ///< Information needed to contact the worker. CzarContactInfo::Ptr const _czInfo; ///< Information needed to contact the czar. - mutable MUTEX _infoMtx; ///< protects _wInfo + mutable MUTEX _infoMtx; ///< protects _wInfo std::string const _replicationInstanceId; ///< Used for message verification. std::string const _replicationAuthKey; ///< Used for message verification. @@ -387,7 +374,6 @@ class WorkerQueryStatusData { std::string _dump() const; }; - /// This class is used to send/receive a message from the worker to a specific /// czar when there has been a communication issue with the worker sending UberJob /// file ready messages. If there have been timeouts, the worker will send this diff --git a/src/qdisp/CMakeLists.txt b/src/qdisp/CMakeLists.txt index fc3193ba4..38cae1ec1 100644 --- a/src/qdisp/CMakeLists.txt +++ b/src/qdisp/CMakeLists.txt @@ -5,7 +5,6 @@ target_sources(qdisp PRIVATE ChunkMeta.cc CzarStats.cc Executive.cc - JobBase.cc JobDescription.cc JobQuery.cc UberJob.cc @@ -44,6 +43,6 @@ target_link_libraries(testQDisp ) # This is failing in github actions CI but not when running locally on my dev machine. -# add_test(NAME testQDisp COMMAND testQDisp) +add_test(NAME testQDisp COMMAND testQDisp) # set_tests_properties(testQDisp PROPERTIES WILL_FAIL 1) diff --git a/src/qdisp/CzarStats.h b/src/qdisp/CzarStats.h index 123654ece..aaa40bf9b 100644 --- a/src/qdisp/CzarStats.h +++ b/src/qdisp/CzarStats.h @@ -210,8 +210,8 @@ class CzarStats : std::enable_shared_from_this { private: CzarStats(std::shared_ptr const& qdispPool); - static Ptr _globalCzarStats; ///< Pointer to the global instance. - static MUTEX _globalMtx; ///< Protects `_globalCzarStats` + static Ptr _globalCzarStats; ///< Pointer to the global instance. + static MUTEX _globalMtx; ///< Protects `_globalCzarStats` /// Connection to get information about the czar's pool of dispatch threads. 
std::shared_ptr _qdispPool; diff --git a/src/qdisp/Executive.cc b/src/qdisp/Executive.cc index 83bbaadca..9df2bbf08 100644 --- a/src/qdisp/Executive.cc +++ b/src/qdisp/Executive.cc @@ -89,12 +89,12 @@ namespace lsst::qserv::qdisp { //////////////////////////////////////////////////////////////////////// // class Executive implementation //////////////////////////////////////////////////////////////////////// -Executive::Executive(ExecutiveConfig const& c, shared_ptr const& ms, - SharedResources::Ptr const& sharedResources, shared_ptr const& qStatus, +Executive::Executive(ExecutiveConfig const& cfg, shared_ptr const& ms, + util::QdispPool::Ptr const& qdispPool, shared_ptr const& qStatus, shared_ptr const& querySession) - : _config(c), + : _config(cfg), _messageStore(ms), - _qdispPool(sharedResources->getQdispPool()), + _qdispPool(qdispPool), _qMeta(qStatus), _querySession(querySession) { _secondsBetweenQMetaUpdates = chrono::seconds(_config.secondsBetweenChunkUpdates); @@ -107,7 +107,8 @@ Executive::~Executive() { qdisp::CzarStats::get()->deleteQuery(); qdisp::CzarStats::get()->deleteJobs(_incompleteJobs.size()); // Remove this executive from the map. - if (czar::Czar::getCzar()->getExecutiveFromMap(getId()) != nullptr) { + auto cz = czar::Czar::getCzar(); // cz can be null in unit tests. + if (cz != nullptr && cz->getExecutiveFromMap(getId()) != nullptr) { LOGS(_log, LOG_LVL_ERROR, cName(__func__) + " pointer in map should be invalid QID=" << getId()); } if (_asyncTimer != nullptr) { @@ -117,12 +118,12 @@ Executive::~Executive() { } Executive::Ptr Executive::create(ExecutiveConfig const& c, shared_ptr const& ms, - SharedResources::Ptr const& sharedResources, + std::shared_ptr const& qdispPool, shared_ptr const& qMeta, shared_ptr const& querySession, boost::asio::io_service& asioIoService) { LOGS(_log, LOG_LVL_DEBUG, "Executive::" << __func__); - Executive::Ptr ptr(new Executive(c, ms, sharedResources, qMeta, querySession)); + Executive::Ptr ptr(new Executive(c, ms, qdispPool, qMeta, querySession)); // Start the query progress monitoring timer (if enabled). The query status // will be sampled on each expiration event of the timer. Note that the timer @@ -195,7 +196,7 @@ JobQuery::Ptr Executive::add(JobDescription::Ptr const& jobDesc) { QSERV_LOGCONTEXT_QUERY_JOB(jobQuery->getQueryId(), jobQuery->getJobId()); { - lock_guard lock(_cancelled.getMutex()); + lock_guard lock(_cancelled.getMutex()); if (_cancelled) { LOGS(_log, LOG_LVL_DEBUG, "Executive already cancelled, ignoring add(" << jobDesc->id() << ")"); @@ -234,7 +235,7 @@ void Executive::queueFileCollect(util::PriorityCommand::Ptr const& cmd) { } } -void Executive::runUberJob(std::shared_ptr const& uberJob) { +void Executive::queueUberJob(std::shared_ptr const& uberJob) { auto runUberJobFunc = [uberJob](util::CmdData*) { uberJob->runUberJob(); }; auto cmd = util::PriorityCommand::Ptr(new util::PriorityCommand(runUberJobFunc)); @@ -276,27 +277,7 @@ void Executive::addUberJobs(std::vector> const& uJobsTo for (auto const& uJob : uJobsToAdd) { UberJobId ujId = uJob->getJobId(); _uberJobsMap[ujId] = uJob; - } -} - -void Executive::killIncompleteUberJobsOn(std::string const& restartedWorkerId) { - // Work with a copy to reduce lock time. 
- std::map> ujobsMap; - { - lock_guard lck(_uberJobsMapMtx); - ujobsMap = _uberJobsMap; - } - for (auto&& [ujKey, uj] : ujobsMap) { - if (uj == nullptr) continue; - auto wContactInfo = uj->getWorkerContactInfo(); - if (wContactInfo->wId == restartedWorkerId) { - if (uj->getStatus()->getState() != qmeta::JobStatus::COMPLETE) { - // All jobs in the uberjob will be set as unassigned, which - // will lead to Czar::_monitor() reassigning them to new - // UberJobs. (Unless this query was cancelled.) - uj->killUberJob(); - } - } + LOGS(_log, LOG_LVL_DEBUG, cName(__func__) << " ujId=" << ujId << " uj.sz=" << uJob->getJobCount()); } } @@ -351,6 +332,7 @@ bool Executive::join() { // To join, we make sure that all of the chunks added so far are complete. // Check to see if _requesters is empty, if not, then sleep on a condition. _waitAllUntilEmpty(); + LOGS(_log, LOG_LVL_INFO, cName(__func__) << " wait done"); // Okay to merge. probably not the Executive's responsibility struct successF { static bool func(Executive::JobMap::value_type const& entry) { @@ -380,7 +362,9 @@ bool Executive::join() { } _empty = (sCount == _requestCount); LOGS(_log, LOG_LVL_DEBUG, - "Flag set to _empty=" << _empty << ", sCount=" << sCount << ", requestCount=" << _requestCount); + cName(__func__) << " " + << "Flag set to _empty=" << _empty << ", sCount=" << sCount + << ", requestCount=" << _requestCount); return _empty || isRowLimitComplete(); } @@ -498,7 +482,10 @@ void Executive::_squashSuperfluous() { void Executive::sendWorkersEndMsg(bool deleteResults) { LOGS(_log, LOG_LVL_INFO, cName(__func__) << " terminating this query deleteResults=" << deleteResults); - czar::Czar::getCzar()->getCzarRegistry()->endUserQueryOnWorkers(_id, deleteResults); + auto cz = czar::Czar::getCzar(); + if (cz != nullptr) { // Possible in unit tests. + cz->getCzarRegistry()->endUserQueryOnWorkers(_id, deleteResults); + } } void Executive::killIncompleteUberJobsOnWorker(std::string const& workerId) { diff --git a/src/qdisp/Executive.h b/src/qdisp/Executive.h index 9c76dae11..12a8e4fc4 100644 --- a/src/qdisp/Executive.h +++ b/src/qdisp/Executive.h @@ -41,7 +41,6 @@ #include "global/stringTypes.h" #include "qdisp/JobDescription.h" #include "qdisp/ResponseHandler.h" -#include "qdisp/SharedResources.h" #include "qdisp/UberJob.h" #include "qmeta/JobStatus.h" #include "util/EventThread.h" @@ -105,14 +104,16 @@ class Executive : public std::enable_shared_from_this { /// If c->serviceUrl == ExecutiveConfig::getMockStr(), then use XrdSsiServiceMock /// instead of a real XrdSsiService static Executive::Ptr create(ExecutiveConfig const& c, std::shared_ptr const& ms, - SharedResources::Ptr const& sharedResources, + std::shared_ptr const& qdispPool, std::shared_ptr const& qMeta, std::shared_ptr const& querySession, boost::asio::io_service& asioIoService); - ~Executive(); + virtual ~Executive(); - std::string cName(const char* funcName = "") { return std::string("Executive::") + funcName; } + std::string cName(const char* funcName = "") { + return std::string("Executive::") + funcName + " " + getIdStr(); + } /// Set the UserQuerySelect object for this query so this Executive can ask it to make new /// UberJobs in the future, if needed. @@ -128,7 +129,7 @@ class Executive : public std::enable_shared_from_this { std::shared_ptr add(JobDescription::Ptr const& s); // Queue `uberJob` to be run using the QDispPool. 
-    void runUberJob(std::shared_ptr<UberJob> const& uberJob);
+    void queueUberJob(std::shared_ptr<UberJob> const& uberJob);
 
     /// Queue `cmd`, using the QDispPool, so it can be used to collect the result file.
     void queueFileCollect(std::shared_ptr<util::PriorityCommand> const& cmd);
 
@@ -147,9 +148,6 @@ class Executive : public std::enable_shared_from_this<Executive> {
     /// Squash all the jobs.
     void squash();
 
-    /// &&& doc
-    void killIncompleteUberJobsOnWorker(std::string const& workerId);
-
     bool getEmpty() { return _empty; }
 
     /// These values cannot be set until information has been collected from
@@ -203,7 +201,7 @@ class Executive : public std::enable_shared_from_this<Executive> {
     /// Call UserQuerySelect::buildAndSendUberJobs to make new UberJobs for
     /// unassigned jobs.
-    void assignJobsToUberJobs();
+    virtual void assignJobsToUberJobs();
 
     int getTotalJobs() { return _totalJobs; }
 
@@ -226,14 +224,17 @@ class Executive : public std::enable_shared_from_this<Executive> {
     /// @param deleteResults - If true, delete all result files for this query on the workers.
     void sendWorkersEndMsg(bool deleteResults);
 
-    /// &&& doc
-    void killIncompleteUberJobsOn(std::string const& restartedWorkerId);
+    /// Complete UberJobs have their results on the czar; the
+    /// incomplete UberJobs need to be stopped and possibly reassigned.
+    void killIncompleteUberJobsOnWorker(std::string const& workerId);
 
-private:
-    Executive(ExecutiveConfig const& c, std::shared_ptr const& ms,
-              SharedResources::Ptr const& sharedResources, std::shared_ptr const& qStatus,
+protected:
+    Executive(ExecutiveConfig const& cfg, std::shared_ptr const& ms,
+              std::shared_ptr<util::QdispPool> const& sharedResources,
+              std::shared_ptr const& qStatus,
               std::shared_ptr const& querySession);
 
+private:
     void _setupLimit();
 
     bool _track(int refNum, std::shared_ptr const& r);
diff --git a/src/qdisp/JobBase.cc b/src/qdisp/JobBase.cc
deleted file mode 100644
index a5ef5a8c8..000000000
--- a/src/qdisp/JobBase.cc
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * LSST Data Management System
- *
- * This product includes software developed by the
- * LSST Project (http://www.lsst.org/).
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the LSST License Statement and
- * the GNU General Public License along with this program. If not,
- * see <http://www.lsstcorp.org/LegalNotices/>.
- */ - -// Class header -#include "qdisp/JobBase.h" - -// System headers -#include - -// Qserv headers - -// LSST headers -#include "lsst/log/Log.h" - -using namespace std; - -namespace { -LOG_LOGGER _log = LOG_GET("lsst.qserv.qdisp.JobBase"); -} - -namespace lsst { namespace qserv { namespace qdisp { - -std::ostream& JobBase::dumpOS(std::ostream& os) const { - os << "JobBase no data members"; - return os; -} - -std::string JobBase::dump() const { - std::ostringstream os; - dumpOS(os); - return os.str(); -} - -std::ostream& operator<<(std::ostream& os, JobBase const& jb) { return jb.dumpOS(os); } - -}}} // namespace lsst::qserv::qdisp diff --git a/src/qdisp/JobBase.h b/src/qdisp/JobBase.h deleted file mode 100644 index a030d1612..000000000 --- a/src/qdisp/JobBase.h +++ /dev/null @@ -1,76 +0,0 @@ -/* - * LSST Data Management System - * - * This product includes software developed by the - * LSST Project (http://www.lsst.org/). - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the LSST License Statement and - * the GNU General Public License along with this program. If not, - * see . - */ -#ifndef LSST_QSERV_QDISP_JOBBASE_H -#define LSST_QSERV_QDISP_JOBBASE_H - -// System headers -#include -#include - -// Qserv headers -#include "global/intTypes.h" - -namespace lsst::qserv::qmeta { -class JobStatus; -} - -// This header declarations -namespace lsst::qserv::qdisp { - -class Executive; -class QdispPool; -class ResponseHandler; -class QueryRequest; - -/// Base class for JobQuery and UberJob. -/// TODO:UJ This could use a lot of cleanup. Once UberJobs are fully in effect, there's no need -/// for this base class as it won't be possible to send a JobQuery to a worker without -/// putting it in an UberJob first. The UberJob is a wrapper that stores worker contact -/// info. 
-// &&& delete this class as JobQuery and UberJob should no longer have much in common -class JobBase : public std::enable_shared_from_this { -public: - using Ptr = std::shared_ptr; - - JobBase() = default; - JobBase(JobBase const&) = delete; - JobBase& operator=(JobBase const&) = delete; - virtual ~JobBase() = default; - - virtual QueryId getQueryId() const = 0; - virtual UberJobId getJobId() const = 0; - virtual std::string const& getIdStr() const = 0; - virtual std::shared_ptr getRespHandler() = 0; - virtual std::shared_ptr getStatus() = 0; - virtual bool getScanInteractive() const = 0; - virtual bool isQueryCancelled() = 0; - virtual void callMarkCompleteFunc(bool success) = 0; - virtual std::shared_ptr getExecutive() = 0; - - virtual std::ostream& dumpOS(std::ostream& os) const; - - std::string dump() const; - friend std::ostream& operator<<(std::ostream& os, JobBase const& jb); -}; - -} // namespace lsst::qserv::qdisp - -#endif // LSST_QSERV_QDISP_JOBBASE_H diff --git a/src/qdisp/JobQuery.cc b/src/qdisp/JobQuery.cc index e0f8ecfba..62e281d59 100644 --- a/src/qdisp/JobQuery.cc +++ b/src/qdisp/JobQuery.cc @@ -45,8 +45,7 @@ namespace lsst::qserv::qdisp { JobQuery::JobQuery(Executive::Ptr const& executive, JobDescription::Ptr const& jobDescription, qmeta::JobStatus::Ptr const& jobStatus, QueryId qid) - : JobBase(), - _executive(executive), + : _executive(executive), _jobDescription(jobDescription), _jobStatus(jobStatus), _qid(qid), @@ -64,11 +63,9 @@ bool JobQuery::cancel(bool superfluous) { QSERV_LOGCONTEXT_QUERY_JOB(getQueryId(), getJobId()); LOGS(_log, LOG_LVL_DEBUG, "JobQuery::cancel()"); if (_cancelled.exchange(true) == false) { - lock_guard lock(_rmutex); + VMUTEX_NOT_HELD(_jqMtx); + lock_guard lock(_jqMtx); - //&&&bool cancelled = false; - - //&&&if (!cancelled) { ostringstream os; os << _idStr << " cancel"; LOGS(_log, LOG_LVL_DEBUG, os.str()); @@ -81,7 +78,6 @@ bool JobQuery::cancel(bool superfluous) { return false; } executive->markCompleted(getJobId(), false); - //&&&} if (!superfluous) { _jobDescription->respHandler()->processCancel(); } @@ -107,6 +103,7 @@ bool JobQuery::isQueryCancelled() { bool JobQuery::_setUberJobId(UberJobId ujId) { QSERV_LOGCONTEXT_QUERY_JOB(getQueryId(), getJobId()); + VMUTEX_HELD(_jqMtx); if (_uberJobId >= 0 && ujId != _uberJobId) { LOGS(_log, LOG_LVL_DEBUG, __func__ << " couldn't change UberJobId as ujId=" << ujId << " is owned by " << _uberJobId); @@ -118,7 +115,8 @@ bool JobQuery::_setUberJobId(UberJobId ujId) { bool JobQuery::unassignFromUberJob(UberJobId ujId) { QSERV_LOGCONTEXT_QUERY_JOB(getQueryId(), getJobId()); - std::lock_guard lock(_rmutex); + VMUTEX_NOT_HELD(_jqMtx); + lock_guard lock(_jqMtx); if (_uberJobId < 0) { LOGS(_log, LOG_LVL_INFO, __func__ << " UberJobId already unassigned. 
attempt by ujId=" << ujId); return true; @@ -137,16 +135,21 @@ bool JobQuery::unassignFromUberJob(UberJobId ujId) { } int JobQuery::getAttemptCount() const { - std::lock_guard lock(_rmutex); + VMUTEX_NOT_HELD(_jqMtx); + lock_guard lock(_jqMtx); return _jobDescription->getAttemptCount(); } -void JobQuery::callMarkCompleteFunc(bool success) { - throw util::Bug(ERR_LOC, "&&& JobQuery::callMarkCompleteFunc should not be called, ever"); -} - ostream& JobQuery::dumpOS(ostream& os) const { return os << "{" << getIdStr() << _jobDescription << " " << _jobStatus << "}"; } +std::string JobQuery::dump() const { + std::ostringstream os; + dumpOS(os); + return os.str(); +} + +std::ostream& operator<<(std::ostream& os, JobQuery const& jq) { return jq.dumpOS(os); } + } // namespace lsst::qserv::qdisp diff --git a/src/qdisp/JobQuery.h b/src/qdisp/JobQuery.h index 9a8e13962..7c22d7f74 100644 --- a/src/qdisp/JobQuery.h +++ b/src/qdisp/JobQuery.h @@ -33,20 +33,18 @@ // Qserv headers #include "qdisp/Executive.h" -#include "qdisp/JobBase.h" #include "qdisp/JobDescription.h" #include "qdisp/ResponseHandler.h" #include "util/InstanceCount.h" +#include "util/Mutex.h" namespace lsst::qserv::qdisp { class QueryRequest; -/// This class is used to describe, monitor, and control a single query to a worker. -/// TODO:UJ once all Jobs are sent out as UberJobs, the purpose of this class is a bit -/// vague. It's components should probably be split between UberJob and -/// JobDescription. -class JobQuery : public JobBase { +/// This class is used to describe and monitor the queries for a +/// chunk on the worker. +class JobQuery { public: typedef std::shared_ptr Ptr; @@ -54,48 +52,39 @@ class JobQuery : public JobBase { static JobQuery::Ptr create(Executive::Ptr const& executive, JobDescription::Ptr const& jobDescription, qmeta::JobStatus::Ptr const& jobStatus, QueryId qid) { Ptr jq = Ptr(new JobQuery(executive, jobDescription, jobStatus, qid)); - jq->_setup(); return jq; } virtual ~JobQuery(); - QueryId getQueryId() const override { return _qid; } - JobId getJobId() const override { return _jobDescription->id(); } - std::string const& getIdStr() const override { return _idStr; } - std::shared_ptr getRespHandler() override { return _jobDescription->respHandler(); } - bool getScanInteractive() const override { return _jobDescription->getScanInteractive(); } + QueryId getQueryId() const { return _qid; } + JobId getJobId() const { return _jobDescription->id(); } + std::string const& getIdStr() const { return _idStr; } + std::shared_ptr getRespHandler() { return _jobDescription->respHandler(); } JobDescription::Ptr getDescription() { return _jobDescription; } - - qmeta::JobStatus::Ptr getStatus() override { return _jobStatus; } - - void callMarkCompleteFunc(bool success) override; + qmeta::JobStatus::Ptr getStatus() { return _jobStatus; } bool cancel(bool superfluous = false); - bool isQueryCancelled() override; - - std::shared_ptr getExecutive() override { return _executive.lock(); } - - std::ostream& dumpOS(std::ostream& os) const override; + bool isQueryCancelled(); - /// Make a copy of the job description. JobQuery::_setup() must be called after creation. - /// Do not call this directly, use create. - JobQuery(Executive::Ptr const& executive, JobDescription::Ptr const& jobDescription, - qmeta::JobStatus::Ptr const& jobStatus, QueryId qid); + std::shared_ptr getExecutive() { return _executive.lock(); } /// If the UberJob is unassigned, change the _uberJobId to ujId. 
bool setUberJobId(UberJobId ujId) { - std::lock_guard lock(_rmutex); + VMUTEX_NOT_HELD(_jqMtx); + std::lock_guard lock(_jqMtx); return _setUberJobId(ujId); } UberJobId getUberJobId() const { - std::lock_guard lock(_rmutex); + VMUTEX_NOT_HELD(_jqMtx); + std::lock_guard lock(_jqMtx); return _getUberJobId(); } bool isInUberJob() const { - std::lock_guard lock(_rmutex); + VMUTEX_NOT_HELD(_jqMtx); + std::lock_guard lock(_jqMtx); return _isInUberJob(); } @@ -105,22 +94,32 @@ class JobQuery : public JobBase { /// @return true if job is unassigned. bool unassignFromUberJob(UberJobId ujId); + std::ostream& dumpOS(std::ostream& os) const; + std::string dump() const; + friend std::ostream& operator<<(std::ostream& os, JobQuery const& jq); + protected: - void _setup() { - JobBase::Ptr jbPtr = shared_from_this(); - _jobDescription->respHandler()->setJobQuery(jbPtr); - } + /// Make a copy of the job description. JobQuery::_setup() must be called after creation. + /// Do not call this directly, use create. + JobQuery(Executive::Ptr const& executive, JobDescription::Ptr const& jobDescription, + qmeta::JobStatus::Ptr const& jobStatus, QueryId qid); /// @return true if _uberJobId was set, it can only be set if it is unassigned /// or by the current owner. - /// NOTE: _rmutex must be held before calling this + /// NOTE: _jqMtx must be held before calling this bool _setUberJobId(UberJobId ujId); - /// NOTE: _rmutex must be held before calling this - UberJobId _getUberJobId() const { return _uberJobId; } + /// NOTE: _jqMtx must be held before calling this + UberJobId _getUberJobId() const { + VMUTEX_HELD(_jqMtx); + return _uberJobId; + } - /// NOTE: _rmutex must be held before calling this - bool _isInUberJob() const { return _uberJobId >= 0; } + /// NOTE: _jqMtx must be held before calling this + bool _isInUberJob() const { + VMUTEX_HELD(_jqMtx); + return _uberJobId >= 0; + } // Values that don't change once set. std::weak_ptr _executive; @@ -134,10 +133,7 @@ class JobQuery : public JobBase { std::string const _idStr; ///< Identifier string for logging. // Values that need mutex protection - // TODO:UJ recursive can probably go away with as well as _inSsi. - mutable std::recursive_mutex _rmutex; ///< protects _jobDescription, - ///< _queryRequestPtr, _uberJobId, - ///< and _inSsi + mutable MUTEX _jqMtx; ///< protects _jobDescription, _queryRequestPtr, _uberJobId // Cancellation std::atomic _cancelled{false}; ///< Lock to make sure cancel() is only called once. diff --git a/src/qdisp/ResponseHandler.h b/src/qdisp/ResponseHandler.h index 66c1d8dc8..03f22b18d 100644 --- a/src/qdisp/ResponseHandler.h +++ b/src/qdisp/ResponseHandler.h @@ -42,7 +42,8 @@ class ResponseSummary; namespace lsst::qserv::qdisp { -class JobBase; +class JobQuery; +class UberJob; /// ResponseHandler is an interface that handles result bytes. Tasks are /// submitted to an Executive instance naming a resource unit (what resource is @@ -57,15 +58,9 @@ class ResponseHandler { typedef std::shared_ptr Ptr; ResponseHandler() {} - void setJobQuery(std::shared_ptr const& jobBase) { _jobBase = jobBase; } + void setUberJob(std::weak_ptr const& ujPtr) { _uberJob = ujPtr; } virtual ~ResponseHandler() {} - /// Process a request for pulling and merging a job result into the result table - /// @param responseSummary - worker response to be analyzed and processed - /// @param resultRows - number of result rows in this result. 
- /// @return true if successful (no error) - virtual bool flush(proto::ResponseSummary const& responseSummary, uint32_t& resultRows) = 0; - /// Collect result data from the worker and merge it with the query result table. /// @return success - true if the operation was successful /// @return shouldCancel - if success was false, this being true indicates there @@ -80,10 +75,6 @@ class ResponseHandler { /// Signal an unrecoverable error condition. No further calls are expected. virtual void errorFlush(std::string const& msg, int code) = 0; - /// @return true if the receiver has completed its duties. - virtual bool finished() const = 0; - virtual bool reset() = 0; ///< Reset the state that a request can be retried. - /// Print a string representation of the receiver to an ostream virtual std::ostream& print(std::ostream& os) const = 0; @@ -96,10 +87,10 @@ class ResponseHandler { /// Scrub the results from jobId-attempt from the result table. virtual void prepScrubResults(int jobId, int attempt) = 0; - std::weak_ptr getJobBase() { return _jobBase; } + std::weak_ptr getUberJob() { return _uberJob; } private: - std::weak_ptr _jobBase; + std::weak_ptr _uberJob; }; inline std::ostream& operator<<(std::ostream& os, ResponseHandler const& r) { return r.print(os); } diff --git a/src/qdisp/SharedResources.h b/src/qdisp/SharedResources.h deleted file mode 100644 index 0bfadcebf..000000000 --- a/src/qdisp/SharedResources.h +++ /dev/null @@ -1,64 +0,0 @@ -/* - * LSST Data Management System - * - * This product includes software developed by the - * LSST Project (http://www.lsst.org/). - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the LSST License Statement and - * the GNU General Public License along with this program. If not, - * see . - */ - -#ifndef LSST_QSERV_SHAREDRESOURCES_H -#define LSST_QSERV_SHAREDRESOURCES_H - -// System headers -#include - -namespace lsst::qserv::util { // &&& delete -class QdispPool; -} - -namespace lsst::qserv::qdisp { - -/// Put resources that all Executives need to share in one class to reduce -/// the number of arguments passed. -/// This class should be kept simple so it can easily be included in headers -/// without undue compiler performances problems. -// &&& there's nothing in here but qdisppool!? Try to delete, but there -// &&& will probably be unit test issues. -class SharedResources { -public: - using Ptr = std::shared_ptr; - - static Ptr create(std::shared_ptr const& qdispPool) { - return Ptr(new SharedResources(qdispPool)); - } - - SharedResources() = delete; - SharedResources(SharedResources const&) = delete; - SharedResources& operator=(SharedResources const&) = delete; - ~SharedResources() = default; - - std::shared_ptr getQdispPool() { return _qdispPool; } //&&& delete - -private: - SharedResources(std::shared_ptr const& qdispPool) : _qdispPool(qdispPool) {} - - /// Thread pool for handling Responses from XrdSsi. 
- std::shared_ptr _qdispPool; -}; - -} // namespace lsst::qserv::qdisp - -#endif // LSST_QSERV_SHAREDRESOURCES_H diff --git a/src/qdisp/UberJob.cc b/src/qdisp/UberJob.cc index d2c14181e..ffedb593d 100644 --- a/src/qdisp/UberJob.cc +++ b/src/qdisp/UberJob.cc @@ -68,8 +68,7 @@ UberJob::Ptr UberJob::create(Executive::Ptr const& executive, UberJob::UberJob(Executive::Ptr const& executive, std::shared_ptr const& respHandler, int queryId, int uberJobId, qmeta::CzarId czarId, int rowLimit, czar::CzarChunkMap::WorkerChunksData::Ptr const& workerData) - : JobBase(), - _executive(executive), + : _executive(executive), _respHandler(respHandler), _queryId(queryId), _uberJobId(uberJobId), @@ -79,8 +78,8 @@ UberJob::UberJob(Executive::Ptr const& executive, std::shared_ptrsetJobQuery(jbPtr); + UberJob::Ptr ujPtr = shared_from_this(); + _respHandler->setUberJob(ujPtr); } bool UberJob::addJob(JobQuery::Ptr const& job) { @@ -97,7 +96,7 @@ bool UberJob::addJob(JobQuery::Ptr const& job) { return success; } -bool UberJob::runUberJob() { +void UberJob::runUberJob() { LOGS(_log, LOG_LVL_DEBUG, cName(__func__) << " start"); // Build the uberjob payload for each job. nlohmann::json uj; @@ -175,7 +174,7 @@ bool UberJob::runUberJob() { } else { setStatusIfOk(qmeta::JobStatus::REQUEST, cName(__func__) + " transmitSuccess"); // locks _jobsMtx } - return false; + return; } void UberJob::prepScrubResults() { @@ -217,15 +216,6 @@ bool UberJob::isQueryCancelled() { return exec->getCancelled(); } -bool UberJob::getScanInteractive() const { - auto exec = _executive.lock(); - if (exec == nullptr) { - LOGS(_log, LOG_LVL_WARN, cName(__func__) << " _executive == nullptr"); - return false; // Safer to assume the worst. - } - return exec->getScanInteractive(); -} - bool UberJob::_setStatusIfOk(qmeta::JobStatus::State newState, string const& msg) { // must be locked _jobsMtx auto currentState = _jobStatus->getState(); @@ -254,7 +244,7 @@ bool UberJob::_setStatusIfOk(qmeta::JobStatus::State newState, string const& msg } void UberJob::callMarkCompleteFunc(bool success) { - LOGS(_log, LOG_LVL_DEBUG, "UberJob::callMarkCompleteFunc success=" << success); + LOGS(_log, LOG_LVL_DEBUG, cName(__func__) << " success=" << success); lock_guard lck(_jobsMtx); // Need to set this uberJob's status, however exec->markCompleted will set @@ -311,16 +301,15 @@ json UberJob::importResultFile(string const& fileUrl, uint64_t rowCount, uint64_ return _importResultError(false, "setStatusFail", "could not set status to RESPONSE_READY"); } - JobBase::Ptr jBaseThis = shared_from_this(); - weak_ptr ujThis = std::dynamic_pointer_cast(jBaseThis); - + weak_ptr ujThis = weak_from_this(); // TODO:UJ lambda may not be the best way to do this, alsocheck synchronization - may need a mutex for // merging. 
- auto fileCollectFunc = [ujThis, fileUrl, rowCount](util::CmdData*) { + string const idStr = _idStr; + auto fileCollectFunc = [ujThis, fileUrl, rowCount, idStr](util::CmdData*) { auto ujPtr = ujThis.lock(); if (ujPtr == nullptr) { LOGS(_log, LOG_LVL_DEBUG, - "UberJob::importResultFile::fileCollectFunction uberjob ptr is null " << fileUrl); + "UberJob::fileCollectFunction uberjob ptr is null " << idStr << " " << fileUrl); return; } uint64_t resultRows = 0; @@ -424,8 +413,15 @@ json UberJob::_importResultError(bool shouldCancel, string const& errorType, str return jsRet; } -nlohmann::json UberJob::_importResultFinish(uint64_t resultRows) { +void UberJob::_importResultFinish(uint64_t resultRows) { LOGS(_log, LOG_LVL_DEBUG, cName(__func__) << " start"); + + auto exec = _executive.lock(); + if (exec == nullptr) { + LOGS(_log, LOG_LVL_DEBUG, cName(__func__) << " executive is null"); + return; + } + /// If this is called, the file has been collected and the worker should delete it /// /// This function should call markComplete for all jobs in the uberjob @@ -433,22 +429,16 @@ nlohmann::json UberJob::_importResultFinish(uint64_t resultRows) { bool const statusSet = setStatusIfOk(qmeta::JobStatus::RESPONSE_DONE, getIdStr() + " _importResultFinish"); if (!statusSet) { - LOGS(_log, LOG_LVL_DEBUG, cName(__func__) << " failed to set status " << getIdStr()); - return {{"success", 0}, {"errortype", "statusMismatch"}, {"note", "failed to set status"}}; - } - auto exec = _executive.lock(); - if (exec == nullptr) { - LOGS(_log, LOG_LVL_DEBUG, cName(__func__) << " executive is null"); - return {{"success", 0}, {"errortype", "cancelled"}, {"note", "executive is null"}}; + LOGS(_log, LOG_LVL_ERROR, cName(__func__) << " failed to set status, squashing " << getIdStr()); + // Something has gone very wrong + exec->squash(); + return; } bool const success = true; callMarkCompleteFunc(success); // sets status to COMPLETE exec->addResultRows(resultRows); exec->checkLimitRowComplete(); - - json jsRet = {{"success", 1}, {"errortype", ""}, {"note", ""}}; - return jsRet; } nlohmann::json UberJob::_workerErrorFinish(bool deleteData, std::string const& errorType, @@ -510,4 +500,12 @@ std::ostream& UberJob::dumpOS(std::ostream& os) const { return os; } +std::string UberJob::dump() const { + std::ostringstream os; + dumpOS(os); + return os.str(); +} + +std::ostream& operator<<(std::ostream& os, UberJob const& uj) { return uj.dumpOS(os); } + } // namespace lsst::qserv::qdisp diff --git a/src/qdisp/UberJob.h b/src/qdisp/UberJob.h index 0015a772a..fe24da7ce 100644 --- a/src/qdisp/UberJob.h +++ b/src/qdisp/UberJob.h @@ -27,7 +27,7 @@ #include "qmeta/types.h" #include "czar/CzarChunkMap.h" // Need nested class. TODO:UJ Make non-nested? #include "czar/CzarRegistry.h" // Need nested class. TODO:UJ Make non-nested? -#include "qdisp/JobBase.h" +#include "qdisp/Executive.h" #include "qmeta/JobStatus.h" namespace lsst::qserv::util { @@ -46,7 +46,7 @@ class JobQuery; /// When this UberJobCompletes, all the Jobs it contains are registered as completed. /// If this UberJob fails, it will be destroyed, un-assigning all of its Jobs. /// Those Jobs will need to be reassigned to new UberJobs, or the query cancelled. 
-class UberJob : public JobBase { +class UberJob : public std::enable_shared_from_this { public: using Ptr = std::shared_ptr; @@ -63,24 +63,23 @@ class UberJob : public JobBase { std::string cName(const char* funcN) const { return std::string("UberJob::") + funcN + " " + getIdStr(); } bool addJob(std::shared_ptr const& job); - bool runUberJob(); - /// &&&doc + /// Make a json version of this UberJob and send it to its worker. + virtual void runUberJob(); + + /// Kill this UberJob and unassign all Jobs so they can be used in a new UberJob if needed. void killUberJob(); - QueryId getQueryId() const override { return _queryId; } - UberJobId getJobId() const override { + QueryId getQueryId() const { return _queryId; } + UberJobId getJobId() const { return _uberJobId; - } // TODO:UJ change name when JobBase no longer needed. - std::string const& getIdStr() const override { return _idStr; } - std::shared_ptr getRespHandler() override { return _respHandler; } - std::shared_ptr getStatus() override { - return _jobStatus; - } // TODO:UJ relocate to JobBase - bool getScanInteractive() const override; ///< probably not called TODO:UJ - bool isQueryCancelled() override; // TODO:UJ relocate to JobBase - void callMarkCompleteFunc(bool success) override; ///< call markComplete for all jobs in this UberJob. - std::shared_ptr getExecutive() override { return _executive.lock(); } + } // &&& TODO:UJ change name when JobBase no longer needed. + std::string const& getIdStr() const { return _idStr; } + std::shared_ptr getRespHandler() { return _respHandler; } + std::shared_ptr getStatus() { return _jobStatus; } + bool isQueryCancelled(); + void callMarkCompleteFunc(bool success); ///< call markComplete for all jobs in this UberJob. + std::shared_ptr getExecutive() { return _executive.lock(); } /// Return false if not ok to set the status to newState, otherwise set the state for /// this UberJob and all jobs it contains to newState. @@ -114,13 +113,16 @@ class UberJob : public JobBase { /// Handle an error from the worker. nlohmann::json workerError(int errorCode, std::string const& errorMsg); - std::ostream& dumpOS(std::ostream& os) const override; + std::ostream& dumpOS(std::ostream& os) const; + std::string dump() const; + friend std::ostream& operator<<(std::ostream& os, UberJob const& uj); -private: +protected: UberJob(std::shared_ptr const& executive, std::shared_ptr const& respHandler, int queryId, int uberJobId, qmeta::CzarId czarId, int rowLimit, czar::CzarChunkMap::WorkerChunksData::Ptr const& workerData); +private: /// Used to setup elements that can't be done in the constructor. void _setup(); @@ -138,7 +140,7 @@ class UberJob : public JobBase { std::string const& note); /// Let the executive know that all Jobs in UberJob are complete. - nlohmann::json _importResultFinish(uint64_t resultRows); + void _importResultFinish(uint64_t resultRows); /// Let the Executive know about errors while handling results. 
nlohmann::json _workerErrorFinish(bool successful, std::string const& errorType = std::string(), diff --git a/src/qdisp/testQDisp.cc b/src/qdisp/testQDisp.cc index d3d2fa9f6..1afb8712f 100644 --- a/src/qdisp/testQDisp.cc +++ b/src/qdisp/testQDisp.cc @@ -38,9 +38,9 @@ // Qserv headers #include "ccontrol/MergingHandler.h" #include "global/ResourceUnit.h" +#include "qdisp/CzarStats.h" #include "qdisp/Executive.h" #include "qdisp/JobQuery.h" -#include "qdisp/SharedResources.h" #include "qmeta/MessageStore.h" #include "qproc/ChunkQuerySpec.h" #include "qproc/TaskMsgFactory.h" @@ -49,13 +49,14 @@ namespace test = boost::test_tools; using namespace lsst::qserv; +using namespace std; namespace { LOG_LOGGER _log = LOG_GET("lsst.qserv.qdisp.testQDisp"); } typedef util::Sequential SequentialInt; -typedef std::vector RequesterVector; +typedef vector RequesterVector; namespace lsst::qserv::qproc { @@ -66,18 +67,139 @@ class MockTaskMsgFactory : public TaskMsgFactory { public: MockTaskMsgFactory(std::string const& mockPayload_) : TaskMsgFactory(), mockPayload(mockPayload_) {} - std::shared_ptr makeMsgJson(ChunkQuerySpec const& s, std::string const& chunkResultName, - QueryId queryId, int jobId, int attemptCount, - qmeta::CzarId czarId) override { + shared_ptr makeMsgJson(ChunkQuerySpec const& s, std::string const& chunkResultName, + QueryId queryId, int jobId, int attemptCount, + qmeta::CzarId czarId) override { return jsPtr; } - std::string mockPayload; - std::shared_ptr jsPtr; + string mockPayload; + shared_ptr jsPtr; }; } // namespace lsst::qserv::qproc +namespace lsst::qserv::qdisp { + +class ExecutiveUT; + +class TestInfo : public ResponseHandler { +public: + using Ptr = std::shared_ptr; + + TestInfo() {} + virtual ~TestInfo() {} + + bool goWait() { + unique_lock ulock(_infoMtx); + _infoCV.wait(ulock, [this]() { return _go == true; }); + return _ok; + } + + void setGo(bool val) { + lock_guard lg(_infoMtx); + _go = val; + _infoCV.notify_all(); + } + + // virtual function that won't be needed + std::tuple flushHttp(std::string const& fileUrl, uint64_t expectedRows, + uint64_t& resultRows) override { + return {true, false}; + } + void flushHttpError(int errorCode, std::string const& errorMsg, int status) override {} + void errorFlush(std::string const& msg, int code) override{}; + Error getError() const override { return util::Error(); } + void processCancel() override{}; + void prepScrubResults(int jobId, int attempt) override{}; + + /// Print a string representation of the receiver to an ostream + std::ostream& print(std::ostream& os) const override { + os << "TestInfo ujCount=" << ujCount; + return os; + } + + atomic ujCount = 0; + +private: + bool _ok = true; + bool _go = true; + mutex _infoMtx; + condition_variable _infoCV; +}; + +/// Version of UberJob specifically for this unit test. 
+class UberJobUT : public UberJob { +public: + using PtrUT = std::shared_ptr; + + UberJobUT(std::shared_ptr const& executive, + std::shared_ptr const& respHandler, int queryId, int uberJobId, + qmeta::CzarId czarId, int rowLimit, czar::CzarChunkMap::WorkerChunksData::Ptr const& workerData, + TestInfo::Ptr const& testInfo_) + : UberJob(executive, respHandler, queryId, uberJobId, czarId, rowLimit, workerData), + testInfo(testInfo_) {} + + void runUberJob() override { + LOGS(_log, LOG_LVL_INFO, "runUberJob() chunkId=" << chunkId); + bool ok = testInfo->goWait(); + int c = -1; + if (ok) { + c = ++(testInfo->ujCount); + } + callMarkCompleteFunc(ok); + LOGS(_log, LOG_LVL_INFO, "runUberJob() end chunkId=" << chunkId << " c=" << c); + } + + TestInfo::Ptr testInfo; + int chunkId = -1; +}; + +/// Version of Executive specifically for this unit test. +class ExecutiveUT : public Executive { +public: + using PtrUT = shared_ptr; + + ~ExecutiveUT() override = default; + + ExecutiveUT(ExecutiveConfig const& cfg, shared_ptr const& ms, + util::QdispPool::Ptr const& qdispPool, shared_ptr const& qStatus, + shared_ptr const& querySession, TestInfo::Ptr const& testInfo_) + : Executive(cfg, ms, qdispPool, qStatus, querySession), testInfo(testInfo_) {} + + void assignJobsToUberJobs() override { + vector ujVect; + + // Make an UberJobUnitTest for each job + qdisp::Executive::ChunkIdJobMapType unassignedChunks = unassignedChunksInQuery(); + for (auto const& [chunkId, jqPtr] : unassignedChunks) { + auto exec = shared_from_this(); + PtrUT execUT = dynamic_pointer_cast(exec); + auto uJob = UberJobUT::PtrUT(new UberJobUT(execUT, testInfo, getId(), ujId++, czarId, rowLimit, + targetWorker, testInfo)); + uJob->chunkId = chunkId; + uJob->addJob(jqPtr); + ujVect.push_back(uJob); + } + + // Queue up the jobs to be run. + addUberJobs(ujVect); + for (auto const& ujPtr : ujVect) { + queueUberJob(ujPtr); + } + LOGS(_log, LOG_LVL_INFO, "assignJobsToUberJobs() end"); + } + + CzarIdType czarId = 1; + UberJobId ujId = 1; + int rowLimit = 0; + czar::CzarChunkMap::WorkerChunksData::Ptr targetWorker = nullptr; + + TestInfo::Ptr testInfo; +}; + +} // namespace lsst::qserv::qdisp + qdisp::JobDescription::Ptr makeMockJobDescription(qdisp::Executive::Ptr const& ex, int sequence, ResourceUnit const& ru, std::string msg, std::shared_ptr const& mHandler) { @@ -94,13 +216,15 @@ qdisp::JobDescription::Ptr makeMockJobDescription(qdisp::Executive::Ptr const& e // that we return a shared pointer to the last constructed JobQuery object. // This only makes sense for single query jobs. // + std::shared_ptr addMockRequests(qdisp::Executive::Ptr const& ex, SequentialInt& sequence, - int chunkID, std::string msg, RequesterVector& rv) { - ResourceUnit ru; + int startingChunkId, std::string msg, RequesterVector& rv) { std::shared_ptr jobQuery; int copies = rv.size(); - ru.setAsDbChunk("Mock", chunkID); for (int j = 0; j < copies; ++j) { + ResourceUnit ru; + int chunkId = startingChunkId + j; + ru.setAsDbChunk("Mock", chunkId); // The job copies the JobDescription. qdisp::JobDescription::Ptr job = makeMockJobDescription(ex, sequence.incr(), ru, msg, rv[j]); jobQuery = ex->add(job); @@ -108,12 +232,9 @@ std::shared_ptr addMockRequests(qdisp::Executive::Ptr const& ex return jobQuery; } -/** Start adds 'copies' number of test requests that each sleep for 'millisecs' time - * before signaling to 'ex' that they are done. - * Returns time to complete in seconds. 
- */ -std::shared_ptr executiveTest(qdisp::Executive::Ptr const& ex, SequentialInt& sequence, +std::shared_ptr executiveTest(qdisp::ExecutiveUT::PtrUT const& ex, SequentialInt& sequence, int chunkId, std::string msg, int copies) { + LOGS(_log, LOG_LVL_INFO, "executiveTest start"); // Test class Executive::add // Modeled after ccontrol::UserQuery::submit() ResourceUnit ru; @@ -125,14 +246,17 @@ std::shared_ptr executiveTest(qdisp::Executive::Ptr const& ex, for (int j = 0; j < copies; ++j) { rv.push_back(mh); } - return addMockRequests(ex, sequence, chunkId, msg, rv); + auto ret = addMockRequests(ex, sequence, chunkId, msg, rv); + ex->assignJobsToUberJobs(); + LOGS(_log, LOG_LVL_INFO, "executiveTest end"); + return ret; } /** This function is run in a separate thread to fail the test if it takes too long * for the jobs to complete. */ void timeoutFunc(std::atomic& flagDone, int millisecs) { - LOGS_DEBUG("timeoutFunc"); + LOGS_INFO("timeoutFunc"); int total = 0; bool done = flagDone; int maxTime = millisecs * 1000; @@ -141,7 +265,7 @@ void timeoutFunc(std::atomic& flagDone, int millisecs) { total += sleepTime; usleep(sleepTime); done = flagDone; - LOGS_DEBUG("timeoutFunc done=" << done << " total=" << total); + LOGS_INFO("timeoutFunc done=" << done << " total=" << total); } LOGS_ERROR("timeoutFunc done=" << done << " total=" << total << " timedOut=" << (total >= maxTime)); BOOST_REQUIRE(done == true); @@ -157,21 +281,21 @@ class SetupTest { qdisp::ExecutiveConfig::Ptr conf; std::shared_ptr ms; util::QdispPool::Ptr qdispPool; - qdisp::SharedResources::Ptr sharedResources; - qdisp::Executive::Ptr ex; + qdisp::ExecutiveUT::PtrUT ex; std::shared_ptr jqTest; // used only when needed - boost::asio::io_service asioIoService; + qdisp::TestInfo::Ptr testInfo = qdisp::TestInfo::Ptr(new qdisp::TestInfo()); - SetupTest(const char* request) { + SetupTest(const char* request, util::QdispPool::Ptr const& qPool_) : qdispPool(qPool_) { + LOGS(_log, LOG_LVL_INFO, "SetupTest start"); qrMsg = request; str = qdisp::ExecutiveConfig::getMockStr(); conf = std::make_shared(str, 0); // No updating of QMeta. ms = std::make_shared(); - qdispPool = std::make_shared(true); - sharedResources = qdisp::SharedResources::create(qdispPool); - + auto tInfo = qdisp::TestInfo::Ptr(new qdisp::TestInfo()); std::shared_ptr qStatus; // No updating QStatus, nullptr - ex = qdisp::Executive::create(*conf, ms, sharedResources, qStatus, nullptr, asioIoService); + ex = qdisp::ExecutiveUT::PtrUT( + new qdisp::ExecutiveUT(*conf, ms, qdispPool, qStatus, nullptr, testInfo)); + LOGS(_log, LOG_LVL_INFO, "SetupTest end"); } ~SetupTest() {} }; @@ -185,7 +309,19 @@ BOOST_AUTO_TEST_SUITE(Suite) int chunkId = 1234; int millisInt = 50000; +util::QdispPool::Ptr globalQdispPool; +qdisp::CzarStats::Ptr globalCzarStats; + BOOST_AUTO_TEST_CASE(Executive) { + int qPoolSize = 1000; + int maxPriority = 2; + vector vectRunSizes = {50, 50, 50, 50}; + vector vectMinRunningSizes = {0, 1, 3, 3}; + globalQdispPool = util::QdispPool::Ptr( + new util::QdispPool(qPoolSize, maxPriority, vectRunSizes, vectMinRunningSizes)); + qdisp::CzarStats::setup(globalQdispPool); + globalCzarStats = qdisp::CzarStats::get(); + // Variables for all executive sub-tests. Note that all executive tests // are full roundtrip tests. So, if these succeed then it's likely all // other query tests will succeed. So, much of this is redundant. 
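TestInfo above is both the ResponseHandler stub and a gate: UberJobUT::runUberJob() blocks in goWait() whenever the test has called setGo(false), which is how the ExecutiveCancel tests below keep jobs parked until squash() has run. The mechanism is a plain condition-variable latch; a minimal standalone sketch (the class and member names are illustrative):

    #include <condition_variable>
    #include <mutex>

    class Gate {
    public:
        // Block until open() is called; returns immediately if already open.
        void wait() {
            std::unique_lock<std::mutex> lock(_mtx);
            _cv.wait(lock, [this] { return _open; });
        }

        // Release all current and future waiters.
        void open() {
            {
                std::lock_guard<std::mutex> lg(_mtx);
                _open = true;
            }
            _cv.notify_all();
        }

    private:
        bool _open = false;
        std::mutex _mtx;
        std::condition_variable _cv;
    };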
@@ -197,52 +333,52 @@ BOOST_AUTO_TEST_CASE(Executive) { // Test single instance { - LOGS_DEBUG("Executive single query test"); - SetupTest tEnv("respdata"); + LOGS_INFO("Executive single query test"); + SetupTest tEnv("respdata", globalQdispPool); SequentialInt sequence(0); tEnv.jqTest = executiveTest(tEnv.ex, sequence, chunkId, tEnv.qrMsg, 1); jobs = 1; - LOGS_DEBUG("jobs=1"); + LOGS_INFO("jobs=1"); tEnv.ex->join(); - LOGS_DEBUG("Executive single query test checking"); + LOGS_INFO("Executive single query test checking"); BOOST_CHECK(tEnv.jqTest->getStatus()->getInfo().state == qmeta::JobStatus::COMPLETE); BOOST_CHECK(tEnv.ex->getEmpty() == true); } // Test 4 jobs { - LOGS_DEBUG("Executive four parallel jobs test"); - SetupTest tEnv("respdata"); + LOGS_INFO("Executive four parallel jobs test"); + SetupTest tEnv("respdata", globalQdispPool); SequentialInt sequence(0); executiveTest(tEnv.ex, sequence, chunkId, tEnv.qrMsg, 4); jobs += 4; - LOGS_DEBUG("ex->joining()"); + LOGS_INFO("ex->joining()"); tEnv.ex->join(); - LOGS_DEBUG("Executive four parallel jobs test checking"); + LOGS_INFO("Executive four parallel jobs test checking"); BOOST_CHECK(tEnv.ex->getEmpty() == true); } // Test that we can detect ex._empty == false. { - LOGS_DEBUG("Executive detect non-empty job queue test"); - SetupTest tEnv("respdata"); + LOGS_INFO("Executive detect non-empty job queue test"); + SetupTest tEnv("respdata", globalQdispPool); SequentialInt sequence(0); executiveTest(tEnv.ex, sequence, chunkId, tEnv.qrMsg, 5); jobs += 5; BOOST_CHECK(tEnv.ex->getEmpty() == false); - LOGS_DEBUG("ex->joining()"); + LOGS_INFO("ex->joining()"); tEnv.ex->join(); - LOGS_DEBUG("ex->join() joined"); + LOGS_INFO("ex->join() joined"); BOOST_CHECK(tEnv.ex->getEmpty() == true); } done = true; timeoutT.join(); - LOGS_DEBUG("Executive test end"); + LOGS_INFO("Executive test end"); } BOOST_AUTO_TEST_CASE(MessageStore) { - LOGS_DEBUG("MessageStore test start"); + LOGS_INFO("MessageStore test start"); qmeta::MessageStore ms; BOOST_CHECK(ms.messageCount() == 0); ms.addMessage(123, "EXECUTIVE", 456, "test1"); @@ -253,112 +389,37 @@ BOOST_AUTO_TEST_CASE(MessageStore) { BOOST_CHECK(ms.messageCount(-12) == 2); qmeta::QueryMessage qm = ms.getMessage(1); BOOST_CHECK(qm.chunkId == 124 && qm.code == -12 && str.compare(qm.description) == 0); - LOGS_DEBUG("MessageStore test end"); -} - -BOOST_AUTO_TEST_CASE(QueryRequest) { - { - LOGS_DEBUG("QueryRequest error retry test"); - // Setup Executive and for retry test when receiving an error - // Note executive maps RESPONSE_ERROR to RESULT_ERROR - SetupTest tEnv("resperror"); - SequentialInt sequence(0); - tEnv.jqTest = executiveTest(tEnv.ex, sequence, chunkId, tEnv.qrMsg, 1); - tEnv.ex->join(); - BOOST_CHECK(tEnv.jqTest->getStatus()->getInfo().state == qmeta::JobStatus::RESULT_ERROR); - //&&&BOOST_CHECK(qdisp::XrdSsiServiceMock::getFinCount() > 1); // Retried, eh? 
- //&&&BOOST_CHECK(qdisp::XrdSsiServiceMock::getFinCount() == qdisp::XrdSsiServiceMock::getReqCount()); - } - - { - LOGS_DEBUG("QueryRequest error noretry test 2"); - // Setup Executive and for no retry test when receiving an error - // Note executive maps RESPONSE_ERROR to RESULT_ERROR - SetupTest tEnv("resperrnr"); - SequentialInt sequence(0); - tEnv.jqTest = executiveTest(tEnv.ex, sequence, chunkId, tEnv.qrMsg, 1); - tEnv.ex->join(); - BOOST_CHECK(tEnv.jqTest->getStatus()->getInfo().state == qmeta::JobStatus::RESULT_ERROR); - //&&&BOOST_CHECK(qdisp::XrdSsiServiceMock::getFinCount() == 1); - } - - { - LOGS_DEBUG("QueryRequest stream with data error test"); - // Setup Executive and for no retry test when receiving an error - // Note executive maps RESPONSE_DATA_NACK to RESULT_ERROR - SetupTest tEnv("respstrerr"); - SequentialInt sequence(0); - tEnv.jqTest = executiveTest(tEnv.ex, sequence, chunkId, tEnv.qrMsg, 1); - tEnv.ex->join(); - LOGS_DEBUG("tEnv.jqTest->...state = " << tEnv.jqTest->getStatus()->getInfo().state); - BOOST_CHECK(tEnv.jqTest->getStatus()->getInfo().state == qmeta::JobStatus::RESULT_ERROR); - //&&&BOOST_CHECK(qdisp::XrdSsiServiceMock::getFinCount() == 1); // No retries! - } - - // We wish we could do the stream response with no results test but the - // needed information is too complex to figure out (well, one day we will). - // So, we've commented this out but the framework exists modulo the needed - // responses (see XrdSsiMocks::Agent). So, this gets punted into the - // integration test (too bad). - /* &&& check if this is possible - { - LOGS_DEBUG("QueryRequest stream with no results test"); - SetupTest tEnv("respstream"); - SequentialInt sequence(0); - tEnv.jqTest = executiveTest(tEnv.ex, sequence, chunkId, tEnv.qrMsg, 1); - tEnv.ex->join(); - BOOST_CHECK(tEnv.jqTest->getStatus()->getInfo().state == - qmeta::JobStatus::COMPLETE); - BOOST_CHECK(qdisp::XrdSsiServiceMock::getFinCount() == 1); - } - */ - LOGS_DEBUG("QueryRequest test end"); + LOGS_INFO("MessageStore test end"); } BOOST_AUTO_TEST_CASE(ExecutiveCancel) { // Test that aJobQuery can be cancelled and ends in correct state // { - LOGS_DEBUG("ExecutiveCancel: squash it test"); - SetupTest tEnv("respdata"); - //&&&qdisp::XrdSsiServiceMock::setGo(false); // Can't let jobs run or they are untracked before + LOGS_INFO("ExecutiveCancel: squash it test"); + SetupTest tEnv("respdata", globalQdispPool); + tEnv.testInfo->setGo(false); // Can't let jobs run or they are untracked before // squash SequentialInt sequence(0); tEnv.jqTest = executiveTest(tEnv.ex, sequence, chunkId, tEnv.qrMsg, 1); tEnv.ex->squash(); - //&&&qdisp::XrdSsiServiceMock::setGo(true); usleep(250000); // Give mock threads a quarter second to complete. tEnv.ex->join(); BOOST_CHECK(tEnv.jqTest->isQueryCancelled() == true); - // Note that the query might not have actually called ProcessRequest() - // but if it did, then it must have called Finished() with cancel. - // - //&&&BOOST_CHECK(qdisp::XrdSsiServiceMock::getCanCount() == qdisp::XrdSsiServiceMock::getReqCount()); } // Test that multiple JobQueries are cancelled. 
{ - LOGS_DEBUG("ExecutiveCancel: squash 20 test"); - SetupTest tEnv("respdata"); - //&&&qdisp::XrdSsiServiceMock::setGo(false); // Can't let jobs run or they are untracked before + LOGS_INFO("ExecutiveCancel: squash 20 test"); + SetupTest tEnv("respdata", globalQdispPool); // squash SequentialInt sequence(0); executiveTest(tEnv.ex, sequence, chunkId, tEnv.qrMsg, 20); tEnv.ex->squash(); tEnv.ex->squash(); // check that squashing twice doesn't cause issues. - //&&&qdisp::XrdSsiServiceMock::setGo(true); - usleep(250000); // Give mock threads a quarter second to complete. + usleep(250000); // Give mock threads a quarter second to complete. tEnv.ex->join(); - // Note that the cancel count might not be 20 as some queries will cancel - // themselves before they get around to issuing ProcessRequest(). - // - //&&&BOOST_CHECK(qdisp::XrdSsiServiceMock::getCanCount() == qdisp::XrdSsiServiceMock::getReqCount()); } } -BOOST_AUTO_TEST_CASE(ServiceMock) { - // Verify that our service object did not see anything unusual. - //&&&BOOST_CHECK(qdisp::XrdSsiServiceMock::isAOK()); -} - BOOST_AUTO_TEST_SUITE_END() diff --git a/src/qmeta/QMetaMysql.cc b/src/qmeta/QMetaMysql.cc index 3535c66fe..97a5797ac 100644 --- a/src/qmeta/QMetaMysql.cc +++ b/src/qmeta/QMetaMysql.cc @@ -43,6 +43,7 @@ #include "sql/SqlConnection.h" #include "sql/SqlConnectionFactory.h" #include "sql/SqlResults.h" +#include "util/TimeUtils.h" using namespace std; @@ -852,6 +853,8 @@ QMetaChunkMap QMetaMysql::getChunkMap(chrono::time_point c // Check if the table needs to be read. Note that the default value of // the previous update timestamp always forces an attempt to read the map. auto const updateTime = _getChunkMapUpdateTime(lock); + LOGS(_log, LOG_LVL_INFO, + "QMetaMysql::getChunkMap updateTime=" << util::TimeUtils::timePointToDateTimeString(updateTime)); bool const force = (prevUpdateTime == chrono::time_point()) || (prevUpdateTime < updateTime); if (!force) { @@ -899,8 +902,9 @@ chrono::time_point QMetaMysql::_getChunkMapUpdateTime(lock sql::SqlErrorObject errObj; sql::SqlResults results; string const tableName = "chunkMapStatus"; - string const query = - "SELECT TIME_TO_SEC(`update_time`) FROM `" + tableName + "` ORDER BY `update_time` DESC LIMIT 1"; + string const query = "SELECT UNIX_TIMESTAMP(`update_time`) FROM `" + tableName + + "` ORDER BY `update_time` DESC LIMIT 1"; + LOGS(_log, LOG_LVL_DEBUG, "Executing query: " << query); if (!_conn->runQuery(query, results, errObj)) { LOGS(_log, LOG_LVL_ERROR, "query failed: " << query); @@ -917,6 +921,7 @@ chrono::time_point QMetaMysql::_getChunkMapUpdateTime(lock throw ConsistencyError(ERR_LOC, "Too many rows in result set of query " + query); } try { + LOGS(_log, LOG_LVL_TRACE, "QMetaMysql::_getChunkMapUpdateTime " << updateTime[0]); return chrono::time_point() + chrono::seconds(stol(updateTime[0])); } catch (exception const& ex) { string const msg = "Failed to parse result set of query " + query + ", ex: " + string(ex.what()); diff --git a/src/util/Mutex.h b/src/util/Mutex.h index 1d6c0b046..991db6b18 100644 --- a/src/util/Mutex.h +++ b/src/util/Mutex.h @@ -32,7 +32,7 @@ #include "util/Bug.h" -#define USING_VMUTEX 1 // &&& Should be replaced by variable in build. +#define USING_VMUTEX 0 // &&& Should be replaced by variable in build. 
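The QMetaMysql::_getChunkMapUpdateTime() change above is more than cosmetic: MySQL's TIME_TO_SEC() converts only the time-of-day portion of update_time to seconds, discarding the date entirely, while UNIX_TIMESTAMP() returns seconds since the epoch, which is what the caller reconstructs with chrono::seconds(stol(updateTime[0])). A small sketch of that reconstruction (the helper name is illustrative, and this assumes the time_point is based on the system clock):

    #include <chrono>
    #include <string>

    // UNIX_TIMESTAMP(update_time) arrives as a decimal string of epoch seconds,
    // so the time_point is rebuilt by adding those seconds to the epoch.
    std::chrono::system_clock::time_point toTimePoint(std::string const& epochSeconds) {
        return std::chrono::system_clock::time_point() + std::chrono::seconds(std::stol(epochSeconds));
    }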
#ifdef MUTEX_UNITTEST #define USING_VMUTEX 1 @@ -50,7 +50,7 @@ #define VMUTEX_NOT_HELD(vmtx) \ if (vmtx.lockedByThread()) throw lsst::qserv::util::Bug(ERR_LOC, "mutex not unlocked!"); -#else // not USING_VMUTEX +#else // not USING_VMUTEX #define MUTEX std::mutex @@ -58,7 +58,7 @@ #define VMUTEX_NOT_HELD(vmtx) ; -#endif // USING_VMUTEX +#endif // USING_VMUTEX // This header declarations namespace lsst::qserv::util { diff --git a/src/util/QdispPool.cc b/src/util/QdispPool.cc new file mode 100644 index 000000000..02d2e1c41 --- /dev/null +++ b/src/util/QdispPool.cc @@ -0,0 +1,268 @@ +// -*- LSST-C++ -*- +/* + * LSST Data Management System + * Copyright 2018 AURA/LSST. + * + * This product includes software developed by the + * LSST Project (http://www.lsst.org/). + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the LSST License Statement and + * the GNU General Public License along with this program. If not, + * see . + */ + +// Class header +#include "util/QdispPool.h" + +// LSST headers +#include "lsst/log/Log.h" + +// Qserv headers +#include "util/Bug.h" +#include "util/common.h" + +namespace { +LOG_LOGGER _log = LOG_GET("lsst.qserv.util.QdispPool"); +} + +namespace lsst::qserv::util { + +///< @Return true if the queue could be added. +bool PriorityQueue::addPriQueue(int priority, int minRunning, int maxRunning) { + std::lock_guard lock(_mtx); + auto q = std::make_shared(priority, minRunning, maxRunning); + // std::pair item(priority, q); + auto item = std::make_pair(priority, q); + auto ret = _queues.insert(item); + if (!ret.second) { + LOGS(_log, LOG_LVL_ERROR, "Failed addPriQueue priority=" << priority << " minRunning=" << minRunning); + } + return ret.second; +} + +/// The pool needs to be able to place commands in this queue for shutdown. 
+void PriorityQueue::queCmd(util::Command::Ptr const& cmd) { + { + std::lock_guard lock(_mtx); + auto iter = _queues.find(_defaultPriority); + if (iter == _queues.end()) { + throw util::Bug(ERR_LOC, "PriorityQueue default priority queue not found a!"); + } + iter->second->queCmd(cmd); + _changed = true; + } + _cv.notify_one(); +} + +void PriorityQueue::queCmd(PriorityCommand::Ptr const& cmd, int priority) { + { + std::lock_guard lock(_mtx); + if (cmd->_queued.exchange(true) == true) { + throw util::Bug(ERR_LOC, + "PriorityQueue::queCmd cmd has already been queued and cannot be queued twice."); + } + auto iter = _queues.find(priority); + if (iter == _queues.end()) { + // give it the default priority + LOGS(_log, LOG_LVL_WARN, + "queCmd invalid priority=" << priority << " using default priority=" << _defaultPriority); + iter = _queues.find(_defaultPriority); + if (iter == _queues.end()) { + throw util::Bug(ERR_LOC, "PriorityQueue default priority queue not found b!"); + } + } + cmd->_priority = priority; + iter->second->queCmd(cmd); + LOGS(_log, LOG_LVL_DEBUG, "priQue p=" << priority << _statsStr()); + _changed = true; + } + _cv.notify_one(); +} + +std::atomic localLogLimiter(0); + +util::Command::Ptr PriorityQueue::getCmd(bool wait) { + util::Command::Ptr ptr; + std::unique_lock uLock(_mtx); + while (true) { + _changed = false; + ++localLogLimiter; + // Log this every once in while to INFO so there's some idea of system + // load without generating crushing amounts of log messages. + if (localLogLimiter % 500 == 0) { + LOGS(_log, LOG_LVL_INFO, "priQueGet " << _statsStr()); + } else { + LOGS(_log, LOG_LVL_DEBUG, "priQueGet " << _statsStr()); + } + + /// Make sure minimum number of jobs running per priority. + if (!_shuttingDown) { + // If shutting down, this could prevent all jobs from completing. + // Goes from highest to lowest priority queue + for (auto const& elem : _queues) { + PriQ::Ptr const& que = elem.second; + if (que->running < que->getMinRunning()) { + ptr = que->getCmd(false); // no wait + if (ptr != nullptr) { + return ptr; + } + } + } + } + + // Since all the minimums are met, just run the first command found. + for (auto const& elem : _queues) { + PriQ::Ptr const& que = elem.second; + // If this queue has no running threads, or + if (que->running < que->getMaxRunning()) { + ptr = que->getCmd(false); // no wait + if (ptr != nullptr) { + _changed = true; + _cv.notify_one(); + return ptr; + } + } + } + + // If nothing was found, wait or return nullptr. + if (wait) { + LOGS(_log, LOG_LVL_DEBUG, "getCmd wait " << _statsStr()); + _cv.wait(uLock, [this]() { return _changed; }); + } else { + return ptr; + } + } +} + +void PriorityQueue::prepareShutdown() { + std::lock_guard lock(_mtx); + _shuttingDown = true; +} + +void PriorityQueue::_incrDecrRunningCount(util::Command::Ptr const& cmd, int incrDecr) { + std::lock_guard lock(_mtx); + PriorityCommand::Ptr priCmd = std::dynamic_pointer_cast(cmd); + if (priCmd != nullptr) { + int priority = priCmd->_priority; + auto iter = _queues.find(priority); + if (iter != _queues.end()) { + iter->second->running += incrDecr; + return; + } + } else if (cmd != nullptr) { + // Non-PriorityCommands go on the default queue. 
+ auto iter = _queues.find(_defaultPriority);
+ if (iter != _queues.end()) {
+ iter->second->running += incrDecr;
+ }
+ }
+ _cv.notify_one();
+}
+
+void PriorityQueue::commandStart(util::Command::Ptr const& cmd) {
+ // Increase running count by 1
+ _incrDecrRunningCount(cmd, 1);
+}
+
+void PriorityQueue::commandFinish(util::Command::Ptr const& cmd) {
+ // Reduce running count by 1
+ _incrDecrRunningCount(cmd, -1);
+}
+
+std::vector<PriorityQueue::PriQ::Stats> PriorityQueue::stats() const {
+ std::lock_guard<std::mutex> const lock(_mtx);
+ return _stats();
+}
+
+std::vector<PriorityQueue::PriQ::Stats> PriorityQueue::_stats() const {
+ std::vector<PriQ::Stats> result;
+ for (auto const& elem : _queues) {
+ PriQ::Ptr const& queue = elem.second;
+ result.push_back(queue->stats());
+ }
+ return result;
+}
+
+std::string PriorityQueue::_statsStr() const {
+ std::stringstream os;
+ for (auto const& queueStats : _stats()) {
+ os << "(pr=" << queueStats.priority << ":sz=" << queueStats.size << ":r=" << queueStats.running
+ << ")";
+ }
+ return os.str();
+}
+
+nlohmann::json PriorityQueue::getJson() const {
+ std::lock_guard<std::mutex> const lock(_mtx);
+ nlohmann::json jsArray = nlohmann::json::array();
+ for (auto const& queueStats : _stats()) {
+ nlohmann::json js;
+ js["priority"] = queueStats.priority;
+ js["size"] = queueStats.size;
+ js["running"] = queueStats.running;
+ jsArray.push_back(js);
+ }
+ return jsArray;
+}
+
+QdispPool::QdispPool(int poolSize, int largestPriority, std::vector<int> const& maxRunSizes,
+ std::vector<int> const& minRunningSizes) {
+ std::stringstream os;
+ os << "poolSize(max " << maxPoolSize() << ")=" << poolSize << " maxPriority(1 to "
+ << defaultPriority() - 2 << ")=" << largestPriority
+ << " maxRunSizes=" << util::prettyCharList(maxRunSizes)
+ << " minRunningSizes=" << util::prettyCharList(minRunningSizes);
+ if (poolSize < 1 || poolSize > maxPoolSize() || largestPriority < 0 ||
+ maxRunSizes.size() < static_cast<size_t>(largestPriority) + 1 ||
+ largestPriority > defaultPriority() - 2) {
+ LOGS(_log, LOG_LVL_ERROR, "QdispPool invalid parameter " << os.str());
+ throw std::invalid_argument(os.str());
+ }
+
+ LOGS(_log, LOG_LVL_INFO, "QdispPool creating " << os.str());
+ _prQueue = std::make_shared<PriorityQueue>(defaultPriority(), 1, 1); // default (lowest) priority.
+ for (unsigned int pri = 0; pri <= static_cast<unsigned int>(largestPriority); ++pri) {
+ size_t const minRun = minRunningSizes.size() > pri ? minRunningSizes[pri] : 1;
+ size_t const maxRun = maxRunSizes.size() > pri ? maxRunSizes[pri] : 1;
+ LOGS(_log, LOG_LVL_INFO, "creating priQ pri=" << pri << " min=" << minRun << " max=" << maxRun);
+ _prQueue->addPriQueue(pri, minRun, maxRun);
+ }
+ // This pool does not kick threads out when they take time (but little CPU) to process,
+ // so maxPoolThreads is just slightly larger than poolSize.
+ _pool = util::ThreadPool::newThreadPool(poolSize, _prQueue);
+}
+
+QdispPool::QdispPool(bool unitTest) {
+ if (not unitTest) {
+ std::string msg(
+ "QdispPool::QdispPool(bool unitTest) "
+ "This constructor is only meant for use with unit tests.");
+ LOGS(_log, LOG_LVL_ERROR, msg);
+ throw std::invalid_argument(msg);
+ } else {
+ _prQueue = std::make_shared<PriorityQueue>(100, 1, 1); // default (lowest) priority.
+ unsigned int poolSize = 50;
+ _pool = util::ThreadPool::newThreadPool(poolSize, _prQueue);
+ _prQueue->addPriQueue(0, 1, 3); // Highest priority - interactive queries
+ _prQueue->addPriQueue(1, 1, 3); // Outgoing shared scan queries.
+ _prQueue->addPriQueue(2, 1, 3); // FAST queries (Object table)
+ _prQueue->addPriQueue(3, 1, 3); // MEDIUM queries (Source table)
+ _prQueue->addPriQueue(4, 1, 3); // SLOW queries (Object Extra table)
+ _prQueue->addPriQueue(5, 1, 3); // FAST large results
+ _prQueue->addPriQueue(6, 1, 3); // MEDIUM large results
+ _prQueue->addPriQueue(7, 1, 3); // Everything else (slow things)
+ }
+}
+
+} // namespace lsst::qserv::util
diff --git a/src/util/QdispPool.h b/src/util/QdispPool.h
new file mode 100644
index 000000000..562450624
--- /dev/null
+++ b/src/util/QdispPool.h
@@ -0,0 +1,209 @@
+// -*- LSST-C++ -*-
+/*
+ * LSST Data Management System
+ * Copyright 2018 LSST Corporation.
+ *
+ * This product includes software developed by the
+ * LSST Project (http://www.lsst.org/).
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the LSST License Statement and
+ * the GNU General Public License along with this program. If not,
+ * see .
+ */
+#ifndef LSST_QSERV_UTIL_QDISPPOOL_H
+#define LSST_QSERV_UTIL_QDISPPOOL_H
+
+// System headers
+#include 
+#include 
+
+// Third-party headers
+#include 
+
+// Qserv headers
+#include "util/ThreadPool.h"
+
+namespace lsst::qserv::util {
+
+class PriorityQueue;
+
+class PriorityCommand : public util::CommandTracked {
+public:
+ using Ptr = std::shared_ptr<PriorityCommand>;
+ PriorityCommand() = default;
+ explicit PriorityCommand(std::function<void(util::CmdData*)> func) : CommandTracked(func) {}
+ ~PriorityCommand() override = default;
+ friend PriorityQueue;
+
+private:
+ int _priority{0}; ///< Need to know what queue this was placed on.
+ /// Priority commands can only be queued once, or PriorityQueue accounting
+ /// can be contaminated: this flag causes a Bug to be thrown if queued twice.
+ std::atomic<bool> _queued{false};
+};
+
+/// FIFO priority queue. Elements with the same priority are handled in
+/// a FIFO manner. Lower integer values are higher priority.
+/// Higher priority queues get asked first when a thread becomes available,
+/// but the system reserves room so that each priority has at least
+/// a minimum number of threads running.
+class PriorityQueue : public util::CommandQueue {
+public:
+ using Ptr = std::shared_ptr<PriorityQueue>;
+
+ /// A queue for handling all messages of a given priority.
+ class PriQ : public util::CommandQueue {
+ public:
+ using Ptr = std::shared_ptr<PriQ>;
+
+ /// A snapshot status of the queue for logging or monitoring purposes.
+ struct Stats {
+ Stats(int priority_, size_t size_, int running_)
+ : priority(priority_), size(size_), running(running_) {}
+ int priority;
+ size_t size;
+ int running;
+ };
+
+ explicit PriQ(int priority, int minRunning, int maxRunning)
+ : _priority(priority), _minRunning(minRunning), _maxRunning(maxRunning) {}
+ ~PriQ() override = default;
+ int getPriority() const { return _priority; }
+ int getMinRunning() const { return _minRunning; }
+ int getMaxRunning() const { return _maxRunning; }
+
+ Stats stats() const { return Stats(_priority, const_cast<PriQ*>(this)->size(), running); }
+
+ std::atomic<int> running{0}; ///< number of jobs of this priority currently running.
+ private: + int const _priority; ///< priority value of this queue + int const _minRunning; ///< minimum number of threads (unless nothing on this queue to run) + int const _maxRunning; ///< maximum number of threads for this PriQ to use. + }; + + PriorityQueue() = delete; + PriorityQueue(PriorityQueue const&) = delete; + PriorityQueue& operator=(PriorityQueue const&) = delete; + + PriorityQueue(int defaultPriority, int minRunning, int maxRunning) : _defaultPriority(defaultPriority) { + _queues[_defaultPriority] = std::make_shared(_defaultPriority, minRunning, maxRunning); + } + + ///< @Return true if the queue could be added. + bool addPriQueue(int priority, int minRunning, int spareThreads); + + /// The pool needs to be able to place commands in this queue for shutdown. + void queCmd(util::Command::Ptr const& cmd) override; + + void queCmd(PriorityCommand::Ptr const& cmd, int priority); + + util::Command::Ptr getCmd(bool wait = true) override; + void prepareShutdown(); + + void commandStart(util::Command::Ptr const& cmd) override; + void commandFinish(util::Command::Ptr const& cmd) override; + + /// @return a snapshot of statistics for all queues (one element per queue) + std::vector stats() const; + + /// @return a json object with queue information. + nlohmann::json getJson() const; + +private: + /// @note a lock on _mtx must be held before calling the method + /// @return a snapshot of statistics for all queues (one element per queue) + std::vector _stats() const; + + /// @note a lock on _mtx must be held before calling the method + /// @return the stringified representation of the statistics for all queues + std::string _statsStr() const; + + void _incrDecrRunningCount(util::Command::Ptr const& cmd, int incrDecr); + + mutable std::mutex _mtx; + std::condition_variable _cv; + bool _shuttingDown{false}; + bool _changed{false}; + + std::map _queues; + int _defaultPriority{1}; +}; + +/// This class is used to provide a pool of threads for handling out going +/// and incoming messages from xrootd as well as a system for prioritizing +/// the messages. +/// This has not worked entirely as intended. Reducing the number of threads +/// had negative impacts on xrootd, but other changes have been made such that +/// reducing the size of the thread pools can be tried again. +/// What it does do is prioritize outgoing messages (typically jobs going to +/// workers), allow interactive queries to be handled quickly, even under +/// substantial loads, and it gives a good idea of how busy the czar really +/// is. Large numbers of queued items in any of the scan queries, or large +/// results would be good indicators to avoid giving a particular czar more +/// user queries. +/// +class QdispPool { +public: + typedef std::shared_ptr Ptr; + + /// Default priority, the lowest possible priority. + static int defaultPriority() { return 100; } + /// This should be more than enough. + static int maxPoolSize() { return 20000; } + + /// poolSize - total number of threads in the pool + /// largestPriority - highest priority is 0, lowest possible priority is + /// 100 and is reserved for default priority. largestPriority=4 would + /// result in PriorityQueues's being created for + /// priorities 0, 1, 2, 3, 4, and 100. Priority 100 is + /// meant for changing aspects of the pool and shutdown. + /// runSizes - Each entry represents the maximum number of concurrent running + /// commands for a priority given by the position in the array. + /// If a position is undefined, the default value is 1. + /// ex. 
5, 10, 10, 3, 3 would apply to the priorities above as + /// priority 0 can have up to 5 concurrent running commands + /// priorities 1 and 2 can have up to 10 + /// priorities 3 and 4 can have up to 3 + /// minRunningSizes - Each entry represents the minimum number of threads + /// to be running (defaults to 0). Non-zero values can keep + /// lower priorities from being completely starved and/or + /// reduce deadlocks from high priorities depending on lower + /// priorities. + QdispPool(int poolSize, int largestPriority, std::vector const& maxRunSizes, + std::vector const& minRunningSizes); + QdispPool() = delete; + explicit QdispPool(bool unitTest); + QdispPool(QdispPool const&) = delete; + QdispPool& operator=(QdispPool const&) = delete; + + /// Lower priority numbers are higher priority. + /// Invalid priorities get the lowest priority (high priority number). + void queCmd(PriorityCommand::Ptr const& cmd, int priority) { _prQueue->queCmd(cmd, priority); } + + /// Commands on queue's with priority lower than default may not be run. + void shutdownPool() { + _prQueue->prepareShutdown(); + _pool->shutdownPool(); + } + + /// @return a json object with queue information. + nlohmann::json getJson() const { return _prQueue->getJson(); } + +private: + PriorityQueue::Ptr _prQueue; + util::ThreadPool::Ptr _pool; +}; + +} // namespace lsst::qserv::util + +#endif /* LSST_QSERV_UTIL_QDISPPOOL_H_ */ diff --git a/src/util/testMutex.cc b/src/util/testMutex.cc index 6d22be4e7..e1da95c9d 100644 --- a/src/util/testMutex.cc +++ b/src/util/testMutex.cc @@ -60,7 +60,7 @@ BOOST_AUTO_TEST_SUITE(Suite) BOOST_AUTO_TEST_CASE(MutexTest) { // Test the interface of class Mutex to comply with expectations // of the standard std::lock_guard. - LOGS_DEBUG("MutexTest begins"); + LOGS_INFO("MutexTest begins"); // The mutex won't be locked by anyone Mutex mtx1; @@ -128,13 +128,13 @@ BOOST_AUTO_TEST_CASE(MutexTest) { } BOOST_CHECK_EQUAL(counter, steps * numThreads); } - LOGS_DEBUG("MutexTest ends"); + LOGS_INFO("MutexTest ends"); } BOOST_AUTO_TEST_CASE(VMutexTest) { // Test the interface of class Mutex to comply with expectations // of the standard std::lock_guard. - LOGS_DEBUG("VMutexTest begins"); + LOGS_INFO("VMutexTest begins"); // The mutex won't be locked by anyone VMutex mtx1; @@ -207,12 +207,12 @@ BOOST_AUTO_TEST_CASE(VMutexTest) { BOOST_CHECK_EQUAL(counter, steps * numThreads); } - LOGS_DEBUG("VMutexTest ends"); + LOGS_INFO("VMutexTest ends"); } BOOST_AUTO_TEST_CASE(LockTest1) { // Test locking a mutex created on stack using a special class util::Lock. - LOGS_DEBUG("LockTest1 begins"); + LOGS_INFO("LockTest1 begins"); // The mutex won't be locked by anyone Mutex mtx1; @@ -226,7 +226,7 @@ BOOST_AUTO_TEST_CASE(LockTest1) { Lock const lock(mtx2, "LockTes1t: main thread"); BOOST_CHECK(mtx2.lockedByThread()); } - LOGS_DEBUG(!mtx2.lockedByThread()); + LOGS_INFO(!mtx2.lockedByThread()); // Lock this mutex in each of two separate threads. Let each thread // to wait for a random period of time within some interval before @@ -274,7 +274,7 @@ BOOST_AUTO_TEST_CASE(LockTest1) { } BOOST_CHECK_EQUAL(counter, steps * numThreads); } - LOGS_DEBUG("LockTest1 ends"); + LOGS_INFO("LockTest1 ends"); } BOOST_AUTO_TEST_CASE(LockTest2) { @@ -282,7 +282,7 @@ BOOST_AUTO_TEST_CASE(LockTest2) { // a shared pointer using a special class util::Lock. The test implements // the same testing algorithm as the previous test, except it will be testing // a different way of constructing the lock. 
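// The construction pattern this test exercises looks roughly like the
// following sketch (assuming the shared_ptr overload of util::Lock mirrors
// the by-reference one used in LockTest1; the string is only a debug label):
//
//   auto mtx = std::make_shared<Mutex>();
//   {
//       Lock const lock(mtx, "LockTest2: sketch");
//       // mtx->lockedByThread() is true inside this scope.
//   }
//   // ...and false again once the Lock has been destroyed.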
- LOGS_DEBUG("LockTest2 begins"); + LOGS_INFO("LockTest2 begins"); // The mutex won't be locked by anyone shared_ptr const mtx1 = make_shared(); @@ -344,7 +344,7 @@ BOOST_AUTO_TEST_CASE(LockTest2) { } BOOST_CHECK_EQUAL(counter, steps * numThreads); } - LOGS_DEBUG("LockTest2 ends"); + LOGS_INFO("LockTest2 ends"); } BOOST_AUTO_TEST_SUITE_END() diff --git a/src/wbase/UberJobData.cc b/src/wbase/UberJobData.cc index 08551bf75..a70793f2a 100644 --- a/src/wbase/UberJobData.cc +++ b/src/wbase/UberJobData.cc @@ -209,7 +209,6 @@ void UJTransmitCmd::action(util::CmdData* data) { try { json const response = client.readAsJson(); if (0 != response.at("success").get()) { - LOGS(_log, LOG_LVL_WARN, cName(__func__) << " &&& success url=" << _url); transmitSuccess = true; } else { LOGS(_log, LOG_LVL_WARN, cName(__func__) << " Transmit success == 0"); diff --git a/src/wconfig/WorkerConfig.h b/src/wconfig/WorkerConfig.h index 584aa3209..f82750971 100644 --- a/src/wconfig/WorkerConfig.h +++ b/src/wconfig/WorkerConfig.h @@ -221,6 +221,29 @@ class WorkerConfig { /// @return the port number of the worker XROOTD service for serving result files uint16_t resultsXrootdPort() const { return _resultsXrootdPort->getVal(); } + /// The size + int getQPoolSize() const { return _qPoolSize->getVal(); } + + /// The highest priority number, such as 2, which results + /// in queues for priorities 0, 1, 2, and 100; where 0 is the + /// highest priority. + /// @see util::QdispPool + int getQPoolMaxPriority() const { return _qPoolMaxPriority->getVal(); } + + /// The maximum number of running threads at each priority, + /// "30:20:20:10" with _qPoolMaxPriority=2 allows 30 threads + /// at priority 0, 20 threads at priorities 1+2, and 10 threads + /// at priority 100. + /// @see util::QdispPool + std::string getQPoolRunSizes() const { return _qPoolRunSizes->getVal(); } + + /// The minimum number of running threads per priority, + /// "3:3:3:3" with _qPoolMaxPriority=2 means that a thread at priority + /// 0 would not start if it meant that there would not be enough threads + /// left to have running for each of priorities 1, 2, and 100. 
+ /// @see util::QdispPool + std::string getQPoolMinRunningSizes() const { return _qPoolMinRunningSizes->getVal(); } + /// @return the number of the BOOST ASIO threads for servicing HTGTP requests size_t resultsNumHttpThreads() const { return _resultsNumHttpThreads->getVal(); } @@ -399,6 +422,14 @@ class WorkerConfig { CVTStrPtr _mysqlHostname = util::ConfigValTStr::create(_configValMap, "mysql", "hostname", required, "none"); CVTStrPtr _mysqlDb = util::ConfigValTStr::create(_configValMap, "mysql", "db", notReq, ""); + + CVTIntPtr _qPoolSize = util::ConfigValTInt::create(_configValMap, "qpool", "Size", notReq, 50); + CVTIntPtr _qPoolMaxPriority = + util::ConfigValTInt::create(_configValMap, "qpool", "MaxPriority", notReq, 2); + CVTStrPtr _qPoolRunSizes = + util::ConfigValTStr::create(_configValMap, "qpool", "RunSizes", notReq, "30:20:20:10"); + CVTStrPtr _qPoolMinRunningSizes = + util::ConfigValTStr::create(_configValMap, "qpool", "MinRunningSizes", notReq, "3:3:3:3"); }; } // namespace lsst::qserv::wconfig diff --git a/src/wcontrol/Foreman.cc b/src/wcontrol/Foreman.cc index db4d7626f..b96dbed58 100644 --- a/src/wcontrol/Foreman.cc +++ b/src/wcontrol/Foreman.cc @@ -87,7 +87,9 @@ Foreman::Ptr Foreman::create(Scheduler::Ptr const& scheduler, unsigned int poolS unsigned int maxPoolThreads, mysql::MySqlConfig const& mySqlConfig, wpublish::QueriesAndChunks::Ptr const& queries, std::shared_ptr const& chunkInventory, - std::shared_ptr const& sqlConnMgr) { + std::shared_ptr const& sqlConnMgr, int qPoolSize, + int maxPriority, std::string const& vectRunSizesStr, + std::string const& vectMinRunningSizesStr) { // Latch static std::atomic globalForemanSet{false}; if (globalForemanSet.exchange(true) == true) { @@ -95,7 +97,7 @@ Foreman::Ptr Foreman::create(Scheduler::Ptr const& scheduler, unsigned int poolS } Ptr fm = Ptr(new Foreman(scheduler, poolSize, maxPoolThreads, mySqlConfig, queries, chunkInventory, - sqlConnMgr)); + sqlConnMgr, qPoolSize, maxPriority, vectRunSizesStr, vectMinRunningSizesStr)); _globalForeman = fm; return _globalForeman; } @@ -103,7 +105,8 @@ Foreman::Ptr Foreman::create(Scheduler::Ptr const& scheduler, unsigned int poolS Foreman::Foreman(Scheduler::Ptr const& scheduler, unsigned int poolSize, unsigned int maxPoolThreads, mysql::MySqlConfig const& mySqlConfig, wpublish::QueriesAndChunks::Ptr const& queries, std::shared_ptr const& chunkInventory, - std::shared_ptr const& sqlConnMgr) + std::shared_ptr const& sqlConnMgr, int qPoolSize, int maxPriority, + std::string const& vectRunSizesStr, std::string const& vectMinRunningSizesStr) : _scheduler(scheduler), _mySqlConfig(mySqlConfig), _queries(queries), @@ -132,11 +135,7 @@ Foreman::Foreman(Scheduler::Ptr const& scheduler, unsigned int poolSize, unsigne _mark = make_shared(ERR_LOC, "Forman Test Msg"); - int qPoolSize = 50; // &&& TODO:UJ put in config - int maxPriority = 2; // &&& TODO:UJ put in config - string vectRunSizesStr = "10:10:10:10"; // &&& TODO:UJ put in config vector vectRunSizes = util::String::parseToVectInt(vectRunSizesStr, ":", 1); - string vectMinRunningSizesStr = "0:1:3:3"; // &&& TODO:UJ put in config vector vectMinRunningSizes = util::String::parseToVectInt(vectMinRunningSizesStr, ":", 0); LOGS(_log, LOG_LVL_INFO, "INFO wPool config qPoolSize=" << qPoolSize << " maxPriority=" << maxPriority << " vectRunSizes=" diff --git a/src/wcontrol/Foreman.h b/src/wcontrol/Foreman.h index d00eed2a6..ed2f78518 100644 --- a/src/wcontrol/Foreman.h +++ b/src/wcontrol/Foreman.h @@ -115,7 +115,8 @@ class Foreman { 
mysql::MySqlConfig const& mySqlConfig, std::shared_ptr const& queries, std::shared_ptr const& chunkInventory, - std::shared_ptr const& sqlConnMgr); + std::shared_ptr const& sqlConnMgr, int qPoolSize, int maxPriority, + std::string const& vectRunSizesStr, std::string const& vectMinRunningSizesStr); ~Foreman(); @@ -151,7 +152,8 @@ class Foreman { Foreman(Scheduler::Ptr const& scheduler, unsigned int poolSize, unsigned int maxPoolThreads, mysql::MySqlConfig const& mySqlConfig, std::shared_ptr const& queries, std::shared_ptr const& chunkInventory, - std::shared_ptr const& sqlConnMgr); + std::shared_ptr const& sqlConnMgr, int qPoolSize, int maxPriority, + std::string const& vectRunSizesStr, std::string const& vectMinRunningSizesStr); /// Startup time of worker, sent to czars so they can detect that the worker was /// was restarted when this value changes. @@ -185,6 +187,11 @@ class Foreman { std::shared_ptr const _httpServer; /// Combined priority queue and thread pool for communicating with czars. + /// TODO:UJ - It would be better to have a pool for each czar as it + /// may be possible for a czar to have communications + /// problems in a way that would wedge the pool. This can + /// probably be done fairly easily by having pools + /// attached to wcontrol::WCzarInfoMap. std::shared_ptr _wPool; /// Map of czar information for all czars that have contacted this worker. diff --git a/src/wcontrol/WCzarInfoMap.cc b/src/wcontrol/WCzarInfoMap.cc new file mode 100644 index 000000000..4e7aa1196 --- /dev/null +++ b/src/wcontrol/WCzarInfoMap.cc @@ -0,0 +1,186 @@ +/* + * LSST Data Management System + * + * This product includes software developed by the + * LSST Project (http://www.lsst.org/). + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the LSST License Statement and + * the GNU General Public License along with this program. If not, + * see . 
+ */
+
+// Class header
+#include "wcontrol/WCzarInfoMap.h"
+
+#include 
+#include 
+#include 
+
+// Third party headers
+#include "nlohmann/json.hpp"
+
+// Qserv headers
+#include "http/Client.h"
+#include "http/WorkerQueryStatusData.h"
+#include "util/Bug.h"
+#include "util/Histogram.h"
+#include "wbase/UberJobData.h"
+#include "wconfig/WorkerConfig.h"
+#include "wcontrol/Foreman.h"
+#include "wpublish/QueriesAndChunks.h"
+
+// LSST headers
+#include "lsst/log/Log.h"
+
+using namespace std;
+
+using namespace std::chrono_literals;
+
+namespace {
+LOG_LOGGER _log = LOG_GET("lsst.qserv.wcontrol.WCzarInfoMap");
+}
+
+namespace lsst::qserv::wcontrol {
+
+WCzarInfo::WCzarInfo(CzarIdType czarId_)
+ : czarId(czarId_),
+ _workerCzarComIssue(http::WorkerCzarComIssue::create(
+ wconfig::WorkerConfig::instance()->replicationInstanceId(),
+ wconfig::WorkerConfig::instance()->replicationAuthKey())) {}
+
+void WCzarInfo::czarMsgReceived(TIMEPOINT tm) {
+ unique_lock<mutex> uniLock(_wciMtx);
+ _lastTouch = tm;
+ if (_alive.exchange(true) == false) {
+ uniLock.unlock();
+ LOGS(_log, LOG_LVL_WARN, cName(__func__) << " was dead and is now alive");
+ _workerCzarComIssue->setThoughtCzarWasDead(true);
+ }
+}
+
+void WCzarInfo::sendWorkerCzarComIssueIfNeeded(http::WorkerContactInfo::Ptr const& wInfo_,
+ http::CzarContactInfo::Ptr const& czInfo_) {
+ unique_lock<mutex> uniLock(_wciMtx);
+ if (_workerCzarComIssue->needToSend()) {
+ // Sending more than one of these messages at a time could cause
+ // races, and it would be a problem if one were stuck in a queue,
+ // so the message gets its own thread.
+ if (_msgThreadRunning.exchange(true) == true) {
+ LOGS(_log, LOG_LVL_INFO, cName(__func__) << " message thread already running");
+ return;
+ }
+ _workerCzarComIssue->setContactInfo(wInfo_, czInfo_);
+ auto selfPtr = weak_from_this();
+ auto thrdFunc = [selfPtr]() {
+ auto sPtr = selfPtr.lock();
+ if (sPtr == nullptr) {
+ LOGS(_log, LOG_LVL_WARN, "WCzarInfo::sendWorkerCzarComIssueIfNeeded thrdFunc sPtr was null");
+ return;
+ }
+ sPtr->_sendMessage();
+ };
+
+ thread thrd(thrdFunc);
+ thrd.detach();
+ }
+}
+
+void WCzarInfo::_sendMessage() {
+ // Make certain _msgThreadRunning is set to false when this function ends.
+ class ClearMsgThreadRunning {
+ public:
+ ClearMsgThreadRunning(WCzarInfo* wcInfo) : _wcInfo(wcInfo) {}
+ ~ClearMsgThreadRunning() { _wcInfo->_msgThreadRunning = false; }
+ WCzarInfo* const _wcInfo;
+ };
+ ClearMsgThreadRunning clearMsgThreadRunning(this);
+
+ auto const method = http::Method::POST;
+
+ unique_lock<mutex> uniLock(_wciMtx);
+ auto czInfo = _workerCzarComIssue->getCzarInfo();
+ // If thoughtCzarWasDead is set now, it needs to be cleared on successful reception from czar.
+ bool needToClearThoughtCzarWasDead = _workerCzarComIssue->getThoughtCzarWasDead();
+ if (czInfo == nullptr) {
+ LOGS(_log, LOG_LVL_ERROR, cName(__func__) << " czar info was null");
+ return;
+ }
+ vector<string> const headers = {"Content-Type: application/json"};
+ string const url =
+ "http://" + czInfo->czHostName + ":" + to_string(czInfo->czPort) + "/workerczarcomissue";
+ auto jsReqPtr = _workerCzarComIssue->serializeJson();
+ uniLock.unlock(); // Must unlock before communication
+
+ auto requestStr = jsReqPtr->dump();
+ http::Client client(method, url, requestStr, headers);
+ bool transmitSuccess = false;
+ try {
+ LOGS(_log, LOG_LVL_DEBUG, cName(__func__) << " read start");
+ nlohmann::json const response = client.readAsJson();
+ LOGS(_log, LOG_LVL_DEBUG, cName(__func__) << " read end");
+ uniLock.lock();
+ if (0 != response.at("success").get<int>()) {
+ transmitSuccess = true;
+ if (needToClearThoughtCzarWasDead) {
+ _workerCzarComIssue->setThoughtCzarWasDead(false);
+ }
+ } else {
+ LOGS(_log, LOG_LVL_WARN, cName(__func__) << " Transmit success == 0");
+ // There's no point in re-sending as the czar got the message and didn't like
+ // it.
+ // TODO:UJ &&& maybe add this czId+ujId to a list of failed uberjobs that can be put
+ // TODO:UJ &&& status return??? Probably overkill.
+ }
+ } catch (exception const& ex) {
+ LOGS(_log, LOG_LVL_WARN, cName(__func__) + " " + requestStr + " failed, ex: " + ex.what());
+ }
+
+ if (!transmitSuccess) {
+ // If this fails, thoughtCzarWasDead was not cleared, so needToSend()
+ // remains true and the message should be re-sent the next time
+ // sendWorkerCzarComIssueIfNeeded() runs.
+ LOGS(_log, LOG_LVL_ERROR, cName(__func__) << " failed to send message");
+ }
+}
+
+bool WCzarInfo::checkAlive(TIMEPOINT tmMark) {
+ lock_guard<mutex> lg(_wciMtx);
+ if (_alive) {
+ auto timeSinceContact = tmMark - _lastTouch;
+ if (timeSinceContact >= 120s) { // TODO:UJ get _deadTime from config &&&
+ // Contact with the czar has timed out.
+ LOGS(_log, LOG_LVL_ERROR, cName(__func__) << " czar timeout");
+ _alive = false;
+ // Kill all queries from this czar
+ auto fMan = Foreman::getForeman();
+ if (fMan != nullptr) {
+ auto queriesAndChunks = fMan->getQueriesAndChunks();
+ if (queriesAndChunks != nullptr) {
+ queriesAndChunks->killAllQueriesFromCzar(czarId);
+ }
+ }
+ }
+ }
+ return _alive;
+}
+
+WCzarInfo::Ptr WCzarInfoMap::getWCzarInfo(CzarIdType czId) {
+ std::lock_guard<std::mutex> lg(_wczMapMtx);
+ auto iter = _wczMap.find(czId);
+ if (iter == _wczMap.end()) {
+ LOGS(_log, LOG_LVL_INFO, cName(__func__) << " new czar contacted " << czId);
+ auto const newCzInfo = WCzarInfo::create(czId);
+ _wczMap[czId] = newCzInfo;
+ return newCzInfo;
+ }
+ return iter->second;
+}
+
+} // namespace lsst::qserv::wcontrol
diff --git a/src/wcontrol/WCzarInfoMap.h b/src/wcontrol/WCzarInfoMap.h
new file mode 100644
index 000000000..11703350a
--- /dev/null
+++ b/src/wcontrol/WCzarInfoMap.h
@@ -0,0 +1,129 @@
+/*
+ * LSST Data Management System
+ *
+ * This product includes software developed by the
+ * LSST Project (http://www.lsst.org/).
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the LSST License Statement and
+ * the GNU General Public License along with this program. If not,
+ * see <http://www.lsstcorp.org/LegalNotices/>.
+ */
+#ifndef LSST_QSERV_WCONTROL_WCZARINFOMAP_H
+#define LSST_QSERV_WCONTROL_WCZARINFOMAP_H
+
+// System headers
+#include
+#include
+#include
+#include
+
+// Third-party headers
+
+// Qserv headers
+#include "global/clock_defs.h"
+#include "global/intTypes.h"
+
+namespace lsst::qserv::http {
+class CzarContactInfo;
+class WorkerContactInfo;
+class WorkerCzarComIssue;
+}  // namespace lsst::qserv::http
+
+namespace lsst::qserv::wbase {
+class UJTransmitCmd;
+}
+
+namespace lsst::qserv::wcontrol {
+
+class Foreman;
+
+/// This class is used to send the "/workerczarcomissue" message from the worker
+/// to the czar and then used by the czar to handle the message; the message
+/// itself is made with WorkerCzarComIssue.
+/// The general concept is that WorkerCzarComIssue exists on both the worker
+/// and the czar and messages keep them in sync.
+/// This class assumes the czarId is correct and there are no duplicate czarIds.
+class WCzarInfo : public std::enable_shared_from_this<WCzarInfo> {
+public:
+    using Ptr = std::shared_ptr<WCzarInfo>;
+
+    std::string cName(const char* funcN) {
+        return std::string("WCzarInfo::") + funcN + " czId=" + std::to_string(czarId);
+    }
+
+    WCzarInfo() = delete;
+    ~WCzarInfo() = default;
+
+    static Ptr create(CzarIdType czarId_) { return Ptr(new WCzarInfo(czarId_)); }
+
+    /// If there were communication issues, start a thread to send the WorkerCzarComIssue message.
+    void sendWorkerCzarComIssueIfNeeded(std::shared_ptr<http::WorkerContactInfo> const& wInfo_,
+                                        std::shared_ptr<http::CzarContactInfo> const& czInfo_);
+
+    /// Called by the worker after the czar successfully replied to the original
+    /// message from the worker.
+    void czarMsgReceived(TIMEPOINT tm);
+
+    bool isAlive() const { return _alive; }
+
+    /// Check if the czar is still considered to be alive, or whether it has timed out.
+    bool checkAlive(TIMEPOINT tmMark);
+
+    std::shared_ptr<http::WorkerCzarComIssue> getWorkerCzarComIssue();
+
+    CzarIdType const czarId;
+
+private:
+    WCzarInfo(CzarIdType czarId_);
+
+    void _sendMessage();
+
+    std::atomic<bool> _alive{true};
+    TIMEPOINT _lastTouch{CLOCK::now()};
+
+    /// This class tracks communication problems and prepares a message
+    /// to inform the czar of the problem.
+    std::shared_ptr<http::WorkerCzarComIssue> _workerCzarComIssue;
+    mutable std::mutex _wciMtx;  ///< protects all private members.
+
+    /// true when running a thread to send a message to the czar
+    /// with _sendMessage()
+    std::atomic<bool> _msgThreadRunning{false};
+};
+
+/// Each worker talks to multiple czars and needs a WCzarInfo object for each
+/// czar; this class keeps track of those objects.
+class WCzarInfoMap {
+public:
+    using Ptr = std::shared_ptr<WCzarInfoMap>;
+
+    std::string cName(const char* funcN) { return std::string("WCzarInfoMap::") + funcN; }
+
+    ~WCzarInfoMap() = default;
+
+    static Ptr create() { return Ptr(new WCzarInfoMap()); }
+
+    /// Return the WCzarInfo ptr associated with czId, creating a new one if needed.
+    WCzarInfo::Ptr getWCzarInfo(CzarIdType czId);
+
+private:
+    WCzarInfoMap() = default;
+
+    std::map<CzarIdType, WCzarInfo::Ptr> _wczMap;
+
+    mutable std::mutex _wczMapMtx;
+};
+
+}  // namespace lsst::qserv::wcontrol
+
+#endif  // LSST_QSERV_WCONTROL_WCZARINFOMAP_H
diff --git a/src/wcontrol/WorkerStats.h b/src/wcontrol/WorkerStats.h
index 0acdaa2b3..afcde1ed9 100644
--- a/src/wcontrol/WorkerStats.h
+++ b/src/wcontrol/WorkerStats.h
@@ -77,7 +77,7 @@ class WorkerStats : std::enable_shared_from_this {
 private:
     WorkerStats();
     static Ptr _globalWorkerStats;  ///< Pointer to the global instance.
- static MUTEX _globalMtx; ///< Protects `_globalWorkerStats` + static MUTEX _globalMtx; ///< Protects `_globalWorkerStats` std::atomic _queueCount{ 0}; ///< Number of buffers on queues (there are many queues, one per ChannelShared) diff --git a/src/wdb/CMakeLists.txt b/src/wdb/CMakeLists.txt index 6120078f8..c363a4e24 100644 --- a/src/wdb/CMakeLists.txt +++ b/src/wdb/CMakeLists.txt @@ -37,4 +37,6 @@ wdb_tests( testQueryRunner ) +# For this test to work, a mariadb server needs to be available. +# This functionality is covered by integration tests. set_tests_properties(testQueryRunner PROPERTIES WILL_FAIL 1) diff --git a/src/wdb/testQueryRunner.cc b/src/wdb/testQueryRunner.cc index 927109647..c4b703a92 100644 --- a/src/wdb/testQueryRunner.cc +++ b/src/wdb/testQueryRunner.cc @@ -54,12 +54,6 @@ namespace util = lsst::qserv::util; using lsst::qserv::mysql::MySqlConfig; using lsst::qserv::mysql::MySqlConnection; -/* &&& -using lsst::qserv::proto::TaskMsg; -using lsst::qserv::proto::TaskMsg_Fragment; -using lsst::qserv::proto::TaskMsg_Subchunk; -*/ - using lsst::qserv::wbase::FileChannelShared; using lsst::qserv::wbase::SendChannel; using lsst::qserv::wbase::Task; @@ -72,22 +66,6 @@ using lsst::qserv::wdb::QueryRunner; using lsst::qserv::wpublish::QueriesAndChunks; struct Fixture { - /* &&& - shared_ptr newTaskMsg() { - shared_ptr t = make_shared(); - t->set_chunkid(3240); // hardcoded - t->set_db("LSST"); // hardcoded - auto scanTbl = t->add_scantable(); - scanTbl->set_db("LSST"); - scanTbl->set_table("Object"); - scanTbl->set_lockinmemory(false); - scanTbl->set_scanrating(1); - lsst::qserv::proto::TaskMsg::Fragment* f = t->add_fragment(); - f->add_query("SELECT AVG(yFlux_PS) from LSST.Object_3240"); - return t; - } - */ - struct MsgInfo { string const db = "LSST"; string const table = "Object"; @@ -98,7 +76,6 @@ struct Fixture { string const czarHostName = "cz5host"; int const czarPort = 3437; string const targWorkerId = "a_worker"; - // &&& make mock foreman instead of nullptr? std::shared_ptr foreman; int const queryId = 23; int const jobId = 1; @@ -132,15 +109,6 @@ struct Fixture { auto& jsJobMsg = *jsJobMsgPtr; auto& chunkScanTables = jsJobMsg["chunkScanTables"]; - /* &&& - for (auto const& sTbl : chunkQuerySpec.scanInfo.infoTables) { - nlohmann::json cst = {{"db", sTbl.db}, - {"table", sTbl.table}, - {"lockInMemory", sTbl.lockInMemory}, - {"tblScanRating", sTbl.scanRating}}; - chunkScanTables.push_back(move(cst)); - } - */ nlohmann::json cst = {{"db", mInfo.db}, {"table", mInfo.table}, {"lockInMemory", mInfo.lockInMemory}, @@ -148,32 +116,6 @@ struct Fixture { chunkScanTables.push_back(move(cst)); auto& jsFragments = jsJobMsg["queryFragments"]; - /* &&& - if (chunkQuerySpec.nextFragment.get()) { - ChunkQuerySpec const* sPtr = &chunkQuerySpec; - while (sPtr) { - LOGS(_log, LOG_LVL_TRACE, "nextFragment"); - for (unsigned int t = 0; t < (sPtr->queries).size(); t++) { - LOGS(_log, LOG_LVL_DEBUG, __func__ << " q=" << (sPtr->queries).at(t)); - } - for (auto const& sbi : sPtr->subChunkIds) { - LOGS(_log, LOG_LVL_DEBUG, __func__ << " sbi=" << sbi); - } - // Linked fragments will not have valid subChunkTables vectors, - // So, we reuse the root fragment's vector. 
- _addFragmentJson(jsFragments, resultTable, chunkQuerySpec.subChunkTables, sPtr->subChunkIds, - sPtr->queries); - sPtr = sPtr->nextFragment.get(); - } - } else { - LOGS(_log, LOG_LVL_TRACE, "no nextFragment"); - for (unsigned int t = 0; t < (chunkQuerySpec.queries).size(); t++) { - LOGS(_log, LOG_LVL_TRACE, (chunkQuerySpec.queries).at(t)); - } - _addFragmentJson(jsFragments, resultTable, chunkQuerySpec.subChunkTables, - chunkQuerySpec.subChunkIds, chunkQuerySpec.queries); - } - */ nlohmann::json jsFrag = {{"resultTable", mInfo.resultName}, {"queries", nlohmann::json::array()}, {"subchunkTables", nlohmann::json::array()}, @@ -207,25 +149,9 @@ struct Fixture { } }; -BOOST_FIXTURE_TEST_SUITE(Basic, Fixture) +BOOST_FIXTURE_TEST_SUITE(Basic, Fixture, *boost::unit_test::timeout(20)) BOOST_AUTO_TEST_CASE(Simple) { - /* &&& - WorkerConfig::create(); - shared_ptr msg(newTaskMsg()); - shared_ptr sendC(SendChannel::newNopChannel()); - auto sc = FileChannelShared::create(sendC, msg->czarid()); - FakeBackend::Ptr backend = make_shared(); - shared_ptr crm = ChunkResourceMgr::newMgr(backend); - SqlConnMgr::Ptr sqlConnMgr = make_shared(20, 15); - auto const queries = queriesAndChunks(); - //&&& auto taskVect = Task::createTasks(msg, sc, crm, newMySqlConfig(), sqlConnMgr, queries); - auto taskVect = Task::createTasks(msg, sc, crm, newMySqlConfig(), sqlConnMgr, queries); - Task::Ptr task = taskVect[0]; - QueryRunner::Ptr a(QueryRunner::newQueryRunner(task, crm, newMySqlConfig(), sqlConnMgr, queries)); - BOOST_CHECK(a->runQuery()); - */ - WorkerConfig::create(); MsgInfo mInfo; auto msgJson = newTaskJson(mInfo); @@ -233,7 +159,7 @@ BOOST_AUTO_TEST_CASE(Simple) { auto sChannel = FileChannelShared::create(sendC, mInfo.czarId); FakeBackend::Ptr backend = make_shared(); shared_ptr crm = ChunkResourceMgr::newMgr(backend); - SqlConnMgr::Ptr sqlConnMgr = make_shared(20, 15); + SqlConnMgr::Ptr sqlConnMgr = make_shared(20, 9); auto const queries = queriesAndChunks(); auto ujData = lsst::qserv::wbase::UberJobData::create( mInfo.uberJobId, mInfo.czarName, mInfo.czarId, mInfo.czarHostName, mInfo.czarPort, mInfo.queryId, @@ -258,7 +184,7 @@ BOOST_AUTO_TEST_CASE(Output) { auto sc = FileChannelShared::create(sendC, mInfo.czarId); FakeBackend::Ptr backend = make_shared(); shared_ptr crm = ChunkResourceMgr::newMgr(backend); - SqlConnMgr::Ptr sqlConnMgr = make_shared(20, 15); + SqlConnMgr::Ptr sqlConnMgr = make_shared(20, 9); auto const queries = queriesAndChunks(); auto ujData = lsst::qserv::wbase::UberJobData::create( mInfo.uberJobId, mInfo.czarName, mInfo.czarId, mInfo.czarHostName, mInfo.czarPort, mInfo.queryId, diff --git a/src/xrdsvc/SsiService.cc b/src/xrdsvc/SsiService.cc index 4ae7d6e76..d2d1507a7 100644 --- a/src/xrdsvc/SsiService.cc +++ b/src/xrdsvc/SsiService.cc @@ -236,8 +236,14 @@ SsiService::SsiService(XrdSsiLogger* log) { LOGS(_log, LOG_LVL_WARN, "config sqlConnMgr" << *sqlConnMgr); LOGS(_log, LOG_LVL_WARN, "maxPoolThreads=" << maxPoolThreads); + int qPoolSize = workerConfig->getQPoolSize(); + int maxPriority = workerConfig->getQPoolMaxPriority(); + string vectRunSizesStr = workerConfig->getQPoolRunSizes(); + string vectMinRunningSizesStr = workerConfig->getQPoolMinRunningSizes(); + _foreman = wcontrol::Foreman::create(blendSched, poolSize, maxPoolThreads, mySqlConfig, queries, - ::makeChunkInventory(mySqlConfig), sqlConnMgr); + ::makeChunkInventory(mySqlConfig), sqlConnMgr, qPoolSize, + maxPriority, vectRunSizesStr, vectMinRunningSizesStr); // Watch to see if the log configuration is changed. 
// If LSST_LOG_CONFIG is not defined, there's no good way to know what log From fa16b4fa1e2a5ed0eb5de72e7646507da5366e24 Mon Sep 17 00:00:00 2001 From: John Gates Date: Mon, 18 Nov 2024 12:40:53 -0800 Subject: [PATCH 14/22] Created protojson namespace. --- src/CMakeLists.txt | 2 + src/czar/ActiveWorker.cc | 14 +- src/czar/ActiveWorker.h | 31 +++-- src/czar/CMakeLists.txt | 1 + src/czar/CzarRegistry.cc | 22 +-- src/czar/CzarRegistry.h | 10 +- src/czar/HttpCzarWorkerModule.cc | 8 +- src/http/CMakeLists.txt | 4 +- src/protojson/CMakeLists.txt | 37 ++++++ .../WorkerQueryStatusData.cc | 48 +++---- .../WorkerQueryStatusData.h | 14 +- src/{http => protojson}/testStatusData.cc | 37 +++--- src/qdisp/UberJob.h | 6 +- src/wbase/Task.cc | 125 ++++++++++++++++++ src/wbase/Task.h | 12 ++ src/wcontrol/WCzarInfoMap.cc | 8 +- src/wcontrol/WCzarInfoMap.h | 12 +- src/wsched/testSchedulers.cc | 2 +- src/xrdsvc/HttpWorkerCzarModule.cc | 6 +- 19 files changed, 296 insertions(+), 103 deletions(-) create mode 100644 src/protojson/CMakeLists.txt rename src/{http => protojson}/WorkerQueryStatusData.cc (90%) rename src/{http => protojson}/WorkerQueryStatusData.h (98%) rename src/{http => protojson}/testStatusData.cc (83%) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index f5d8a98ee..9fb37440e 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -68,6 +68,7 @@ add_subdirectory(mysql) add_subdirectory(parser) add_subdirectory(partition) add_subdirectory(proto) +add_subdirectory(protojson) add_subdirectory(proxy) add_subdirectory(qana) add_subdirectory(qdisp) @@ -102,6 +103,7 @@ target_link_libraries(qserv_common PUBLIC mysql sql util + protojson ) install( diff --git a/src/czar/ActiveWorker.cc b/src/czar/ActiveWorker.cc index e2e356fdd..68f7be092 100644 --- a/src/czar/ActiveWorker.cc +++ b/src/czar/ActiveWorker.cc @@ -57,14 +57,14 @@ string ActiveWorker::getStateStr(State st) { return string("unknown"); } -bool ActiveWorker::compareContactInfo(http::WorkerContactInfo const& wcInfo) const { +bool ActiveWorker::compareContactInfo(protojson::WorkerContactInfo const& wcInfo) const { lock_guard lg(_aMtx); auto wInfo_ = _wqsData->getWInfo(); if (wInfo_ == nullptr) return false; return wInfo_->isSameContactInfo(wcInfo); } -void ActiveWorker::setWorkerContactInfo(http::WorkerContactInfo::Ptr const& wcInfo) { +void ActiveWorker::setWorkerContactInfo(protojson::WorkerContactInfo::Ptr const& wcInfo) { LOGS(_log, LOG_LVL_INFO, cName(__func__) << " new info=" << wcInfo->dump()); lock_guard lg(_aMtx); _wqsData->setWInfo(wcInfo); @@ -82,7 +82,7 @@ void ActiveWorker::updateStateAndSendMessages(double timeoutAliveSecs, double ti double maxLifetime) { LOGS(_log, LOG_LVL_TRACE, cName(__func__) << " start"); bool newlyDeadWorker = false; - http::WorkerContactInfo::Ptr wInfo_; + protojson::WorkerContactInfo::Ptr wInfo_; { lock_guard lg(_aMtx); wInfo_ = _wqsData->getWInfo(); @@ -155,7 +155,7 @@ void ActiveWorker::updateStateAndSendMessages(double timeoutAliveSecs, double ti qdisppool->queCmd(cmd, 1); } -void ActiveWorker::_sendStatusMsg(http::WorkerContactInfo::Ptr const& wInf, +void ActiveWorker::_sendStatusMsg(protojson::WorkerContactInfo::Ptr const& wInf, std::shared_ptr const& jsWorkerReqPtr) { auto& jsWorkerReq = *jsWorkerReqPtr; auto const method = http::Method::POST; @@ -214,7 +214,7 @@ void ActiveWorker::addDeadUberJob(QueryId qId, UberJobId ujId) { _wqsData->addDeadUberJob(qId, ujId, now); } -http::WorkerContactInfo::Ptr ActiveWorker::getWInfo() const { +protojson::WorkerContactInfo::Ptr ActiveWorker::getWInfo() 
const { std::lock_guard lg(_aMtx); if (_wqsData == nullptr) return nullptr; return _wqsData->getWInfo(); @@ -241,8 +241,8 @@ ActiveWorkerMap::ActiveWorkerMap(std::shared_ptr const& cza _timeoutDeadSecs(czarConfig->getActiveWorkerTimeoutDeadSecs()), _maxLifetime(czarConfig->getActiveWorkerMaxLifetimeSecs()) {} -void ActiveWorkerMap::updateMap(http::WorkerContactInfo::WCMap const& wcMap, - http::CzarContactInfo::Ptr const& czInfo, +void ActiveWorkerMap::updateMap(protojson::WorkerContactInfo::WCMap const& wcMap, + protojson::CzarContactInfo::Ptr const& czInfo, std::string const& replicationInstanceId, std::string const& replicationAuthKey) { // Go through wcMap, update existing entries in _awMap, create new entries for those that don't exist, diff --git a/src/czar/ActiveWorker.h b/src/czar/ActiveWorker.h index 3c4c16c59..d462f0d0e 100644 --- a/src/czar/ActiveWorker.h +++ b/src/czar/ActiveWorker.h @@ -32,7 +32,7 @@ #include "nlohmann/json.hpp" // qserv headers -#include "http/WorkerQueryStatusData.h" +#include "protojson/WorkerQueryStatusData.h" #include "util/Bug.h" namespace lsst::qserv::cconfig { @@ -92,8 +92,9 @@ class ActiveWorker : public std::enable_shared_from_this { static std::string getStateStr(State st); - static Ptr create(http::WorkerContactInfo::Ptr const& wInfo, http::CzarContactInfo::Ptr const& czInfo, - std::string const& replicationInstanceId, std::string const& replicationAuthKey) { + static Ptr create(protojson::WorkerContactInfo::Ptr const& wInfo, + protojson::CzarContactInfo::Ptr const& czInfo, std::string const& replicationInstanceId, + std::string const& replicationAuthKey) { return Ptr(new ActiveWorker(wInfo, czInfo, replicationInstanceId, replicationAuthKey)); } @@ -105,14 +106,14 @@ class ActiveWorker : public std::enable_shared_from_this { _wqsData->setCzarCancelAfterRestart(czId, lastQId); } - http::WorkerContactInfo::Ptr getWInfo() const; + protojson::WorkerContactInfo::Ptr getWInfo() const; ~ActiveWorker() = default; /// Return true if there were differences in worker id, host, or port values. - bool compareContactInfo(http::WorkerContactInfo const& wcInfo) const; + bool compareContactInfo(protojson::WorkerContactInfo const& wcInfo) const; - void setWorkerContactInfo(http::WorkerContactInfo::Ptr const& wcInfo); + void setWorkerContactInfo(protojson::WorkerContactInfo::Ptr const& wcInfo); /// Check this workers state (by looking at contact information) and queue /// the WorkerQueryStatusData message `_wqsData` to be sent if this worker @@ -148,10 +149,11 @@ class ActiveWorker : public std::enable_shared_from_this { std::string dump() const; private: - ActiveWorker(http::WorkerContactInfo::Ptr const& wInfo, http::CzarContactInfo::Ptr const& czInfo, - std::string const& replicationInstanceId, std::string const& replicationAuthKey) - : _wqsData(http::WorkerQueryStatusData::create(wInfo, czInfo, replicationInstanceId, - replicationAuthKey)) { + ActiveWorker(protojson::WorkerContactInfo::Ptr const& wInfo, + protojson::CzarContactInfo::Ptr const& czInfo, std::string const& replicationInstanceId, + std::string const& replicationAuthKey) + : _wqsData(protojson::WorkerQueryStatusData::create(wInfo, czInfo, replicationInstanceId, + replicationAuthKey)) { if (_wqsData == nullptr) { throw util::Bug(ERR_LOC, "ActiveWorker _wqsData null"); } @@ -163,7 +165,7 @@ class ActiveWorker : public std::enable_shared_from_this { /// Send the `jsWorkerReqPtr` json message to the worker referenced by `wInf` to /// transmit the `_wqsData` state. 
- void _sendStatusMsg(http::WorkerContactInfo::Ptr const& wInf, + void _sendStatusMsg(protojson::WorkerContactInfo::Ptr const& wInf, std::shared_ptr const& jsWorkerReqPtr); /// Dump a log string for this object. @@ -172,7 +174,7 @@ class ActiveWorker : public std::enable_shared_from_this { /// Contains data that needs to be sent to workers about finished/cancelled /// user queries and UberJobs. It must not be null. - http::WorkerQueryStatusData::Ptr const _wqsData; + protojson::WorkerQueryStatusData::Ptr const _wqsData; State _state{QUESTIONABLE}; ///< current state of this worker. @@ -198,8 +200,9 @@ class ActiveWorkerMap { /// Use information gathered from the registry to update the map. The registry /// contains last contact time (used for determining aliveness) and worker contact information. - void updateMap(http::WorkerContactInfo::WCMap const& wcMap, http::CzarContactInfo::Ptr const& czInfo, - std::string const& replicationInstanceId, std::string const& replicationAuthKey); + void updateMap(protojson::WorkerContactInfo::WCMap const& wcMap, + protojson::CzarContactInfo::Ptr const& czInfo, std::string const& replicationInstanceId, + std::string const& replicationAuthKey); /// If this is to be called, it must be called before Czar::_monitor is started: /// It tells the workers all queries from `czId` with QueryIds less than `lastQId` diff --git a/src/czar/CMakeLists.txt b/src/czar/CMakeLists.txt index 3d9d32695..023b175b0 100644 --- a/src/czar/CMakeLists.txt +++ b/src/czar/CMakeLists.txt @@ -27,6 +27,7 @@ target_include_directories(czar PRIVATE target_link_libraries(czar PUBLIC cconfig http + protojson qdisp qhttp util diff --git a/src/czar/CzarRegistry.cc b/src/czar/CzarRegistry.cc index b1bbe7974..e81b0e168 100644 --- a/src/czar/CzarRegistry.cc +++ b/src/czar/CzarRegistry.cc @@ -70,7 +70,7 @@ CzarRegistry::~CzarRegistry() { } } -http::WorkerContactInfo::WCMapPtr CzarRegistry::getWorkerContactMap() const { +protojson::WorkerContactInfo::WCMapPtr CzarRegistry::getWorkerContactMap() const { lock_guard lockG(_cmapMtx); return _contactMap; } @@ -130,12 +130,12 @@ void CzarRegistry::_registryWorkerInfoLoop() { LOGS(_log, LOG_LVL_ERROR, requestContext + " was denied, error: '" + error + "'."); // TODO: Is there a better thing to do than just log this here? 
} else { - http::WorkerContactInfo::WCMapPtr wMap = _buildMapFromJson(response); + protojson::WorkerContactInfo::WCMapPtr wMap = _buildMapFromJson(response); // Update the values in the map { - auto czInfo = http::CzarContactInfo::create(_czarConfig->name(), _czarConfig->id(), - _czarConfig->replicationHttpPort(), - util::get_current_host_fqdn(), czarStartTime); + auto czInfo = protojson::CzarContactInfo::create( + _czarConfig->name(), _czarConfig->id(), _czarConfig->replicationHttpPort(), + util::get_current_host_fqdn(), czarStartTime); lock_guard lck(_cmapMtx); if (wMap != nullptr) { _contactMap = wMap; @@ -153,16 +153,16 @@ void CzarRegistry::_registryWorkerInfoLoop() { } } -http::WorkerContactInfo::WCMapPtr CzarRegistry::_buildMapFromJson(nlohmann::json const& response) { +protojson::WorkerContactInfo::WCMapPtr CzarRegistry::_buildMapFromJson(nlohmann::json const& response) { auto const& jsServices = response.at("services"); auto const& jsWorkers = jsServices.at("workers"); - auto wMap = http::WorkerContactInfo::WCMapPtr(new http::WorkerContactInfo::WCMap()); + auto wMap = protojson::WorkerContactInfo::WCMapPtr(new protojson::WorkerContactInfo::WCMap()); for (auto const& [key, value] : jsWorkers.items()) { auto const& jsQserv = value.at("qserv"); LOGS(_log, LOG_LVL_DEBUG, __func__ << " key=" << key << " jsQ=" << jsQserv); // The names for items here are different than the names used by workers. - auto wInfo = http::WorkerContactInfo::createFromJsonRegistry(key, jsQserv); + auto wInfo = protojson::WorkerContactInfo::createFromJsonRegistry(key, jsQserv); LOGS(_log, LOG_LVL_DEBUG, __func__ << " wInfot=" << wInfo->dump()); auto iter = wMap->find(key); @@ -180,7 +180,7 @@ http::WorkerContactInfo::WCMapPtr CzarRegistry::_buildMapFromJson(nlohmann::json return wMap; } -bool CzarRegistry::_compareMapContactInfo(http::WorkerContactInfo::WCMap const& other) const { +bool CzarRegistry::_compareMapContactInfo(protojson::WorkerContactInfo::WCMap const& other) const { VMUTEX_HELD(_cmapMtx); if (_contactMap == nullptr) { // If _contactMap is null, it needs to be replaced. @@ -202,8 +202,8 @@ bool CzarRegistry::_compareMapContactInfo(http::WorkerContactInfo::WCMap const& return true; } -http::WorkerContactInfo::WCMapPtr CzarRegistry::waitForWorkerContactMap() const { - http::WorkerContactInfo::WCMapPtr contMap = nullptr; +protojson::WorkerContactInfo::WCMapPtr CzarRegistry::waitForWorkerContactMap() const { + protojson::WorkerContactInfo::WCMapPtr contMap = nullptr; while (contMap == nullptr) { { lock_guard lockG(_cmapMtx); diff --git a/src/czar/CzarRegistry.h b/src/czar/CzarRegistry.h index b7233f15d..08d24a7bc 100644 --- a/src/czar/CzarRegistry.h +++ b/src/czar/CzarRegistry.h @@ -71,12 +71,12 @@ class CzarRegistry { /// Return _contactMap, the object that the returned pointer points to is /// constant and no attempts should be made to change it. - http::WorkerContactInfo::WCMapPtr getWorkerContactMap() const; + protojson::WorkerContactInfo::WCMapPtr getWorkerContactMap() const; /// Return _contactMap, the object that the returned pointer points to is /// constant and no attempts should be made to change it. This /// function will wait forever for a valid contact map to be ready. - http::WorkerContactInfo::WCMapPtr waitForWorkerContactMap() const; + protojson::WorkerContactInfo::WCMapPtr waitForWorkerContactMap() const; /// Send all live workers the `WorkerQueryStatusData` message for /// that worker. 
This may result in the worker sending back the @@ -104,11 +104,11 @@ class CzarRegistry { void _registryWorkerInfoLoop(); /// Build a new WorkerContactMap from the json `response` - http::WorkerContactInfo::WCMapPtr _buildMapFromJson(nlohmann::json const& response); + protojson::WorkerContactInfo::WCMapPtr _buildMapFromJson(nlohmann::json const& response); /// Return true if maps are the same size and all of the elements have the same contact info. /// NOTE: _cmapMtx must be held when calling. - bool _compareMapContactInfo(http::WorkerContactInfo::WCMap const& other) const; + bool _compareMapContactInfo(protojson::WorkerContactInfo::WCMap const& other) const; std::shared_ptr const _czarConfig; ///< Pointer to the CzarConfig. @@ -117,7 +117,7 @@ class CzarRegistry { std::thread _czarWorkerInfoThrd; ///< This thread continuously collects worker contact information. /// Pointer to the map of worker contact information. - http::WorkerContactInfo::WCMapPtr _contactMap; + protojson::WorkerContactInfo::WCMapPtr _contactMap; TIMEPOINT _latestMapUpdate; ///< The last time the _contactMap was updated, unrelated to ///< WorkerContactInfo update. mutable MUTEX _cmapMtx; /// Protects _contactMap, _latestUpdate diff --git a/src/czar/HttpCzarWorkerModule.cc b/src/czar/HttpCzarWorkerModule.cc index 1c80e4c85..266fdbdbe 100644 --- a/src/czar/HttpCzarWorkerModule.cc +++ b/src/czar/HttpCzarWorkerModule.cc @@ -104,7 +104,7 @@ json HttpCzarWorkerModule::_handleJobError(string const& func) { // Parse and verify the json message and then kill the UberJob. json jsRet = {{"success", 0}, {"errortype", "unknown"}, {"note", "initialized"}}; try { - // See qdisp::UberJob::runUberJob() for json message construction. + // See qdisp::UberJob::runUberJob() for json message construction. &&& string const targetWorkerId = body().required("workerid"); string const czarName = body().required("czar"); qmeta::CzarId const czarId = body().required("czarid"); @@ -147,7 +147,7 @@ json HttpCzarWorkerModule::_handleJobReady(string const& func) { try { // &&& TODO:UJ file response - move construction and parsing // &&& TODO:UJ to a class so it can be added to WorkerCzarComIssue - // See qdisp::UberJob::runUberJob() for json message construction. + // See qdisp::UberJob::runUberJob() for json message construction. 
&&& string const targetWorkerId = body().required("workerid"); string const czarName = body().required("czar"); qmeta::CzarId const czarId = body().required("czarid"); @@ -190,8 +190,8 @@ json HttpCzarWorkerModule::_handleWorkerCzarComIssue(string const& func) { string const replicationInstanceId = cconfig::CzarConfig::instance()->replicationInstanceId(); string const replicationAuthKey = cconfig::CzarConfig::instance()->replicationAuthKey(); auto const& jsReq = body().objJson; - auto wccIssue = - http::WorkerCzarComIssue::createFromJson(jsReq, replicationInstanceId, replicationAuthKey); + auto wccIssue = protojson::WorkerCzarComIssue::createFromJson(jsReq, replicationInstanceId, + replicationAuthKey); auto wId = wccIssue->getWorkerInfo()->wId; if (wccIssue->getThoughtCzarWasDead()) { diff --git a/src/http/CMakeLists.txt b/src/http/CMakeLists.txt index 61097f9f2..f2ea0e0a8 100644 --- a/src/http/CMakeLists.txt +++ b/src/http/CMakeLists.txt @@ -19,7 +19,7 @@ target_sources(http PRIVATE RequestBodyJSON.cc RequestQuery.cc Url.cc - WorkerQueryStatusData.cc +# &&& WorkerQueryStatusData.cc ) target_link_libraries(http PUBLIC @@ -52,6 +52,6 @@ http_tests( testAsyncReq testRequestBodyJSON testRequestQuery - testStatusData +# &&& testStatusData testUrl ) diff --git a/src/protojson/CMakeLists.txt b/src/protojson/CMakeLists.txt new file mode 100644 index 000000000..3c6bbdf80 --- /dev/null +++ b/src/protojson/CMakeLists.txt @@ -0,0 +1,37 @@ +add_library(protojson SHARED) + +target_sources(protojson PRIVATE + WorkerQueryStatusData.cc +) + +target_link_libraries(protojson PUBLIC + curl + http + log + qhttp + util + Boost::filesystem + Boost::regex + Boost::system + cpp-httplib +) + +install(TARGETS protojson) + +function(PROTOJSON_TESTS) + foreach(TEST IN ITEMS ${ARGV}) + add_executable(${TEST} ${TEST}.cc) + target_link_libraries(${TEST} PUBLIC + global + http + protojson + Boost::unit_test_framework + Threads::Threads + ) + add_test(NAME ${TEST} COMMAND ${TEST}) + endforeach() +endfunction() + +protojson_tests( + testStatusData +) diff --git a/src/http/WorkerQueryStatusData.cc b/src/protojson/WorkerQueryStatusData.cc similarity index 90% rename from src/http/WorkerQueryStatusData.cc rename to src/protojson/WorkerQueryStatusData.cc index 8f4ac38f2..e8015c8db 100644 --- a/src/http/WorkerQueryStatusData.cc +++ b/src/protojson/WorkerQueryStatusData.cc @@ -20,9 +20,8 @@ */ // Class header -#include "http/WorkerQueryStatusData.h" +#include "protojson/WorkerQueryStatusData.h" -// System headers #include // Qserv headers @@ -39,10 +38,10 @@ using namespace std; using namespace nlohmann; namespace { -LOG_LOGGER _log = LOG_GET("lsst.qserv.http.WorkerQueryStatusData"); +LOG_LOGGER _log = LOG_GET("lsst.qserv.protojson.WorkerQueryStatusData"); } // namespace -namespace lsst::qserv::http { +namespace lsst::qserv::protojson { json CzarContactInfo::serializeJson() const { json jsCzar; @@ -56,11 +55,11 @@ json CzarContactInfo::serializeJson() const { CzarContactInfo::Ptr CzarContactInfo::createFromJson(nlohmann::json const& czJson) { try { - auto czName_ = RequestBodyJSON::required(czJson, "name"); - auto czId_ = RequestBodyJSON::required(czJson, "id"); - auto czPort_ = RequestBodyJSON::required(czJson, "management-port"); - auto czHostName_ = RequestBodyJSON::required(czJson, "management-host-name"); - auto czStartupTime_ = RequestBodyJSON::required(czJson, "czar-startup-time"); + auto czName_ = http::RequestBodyJSON::required(czJson, "name"); + auto czId_ = http::RequestBodyJSON::required(czJson, "id"); + auto czPort_ 
= http::RequestBodyJSON::required(czJson, "management-port"); + auto czHostName_ = http::RequestBodyJSON::required(czJson, "management-host-name"); + auto czStartupTime_ = http::RequestBodyJSON::required(czJson, "czar-startup-time"); return create(czName_, czId_, czPort_, czHostName_, czStartupTime_); } catch (invalid_argument const& exc) { LOGS(_log, LOG_LVL_ERROR, string("CzarContactInfo::createJson invalid ") << exc.what()); @@ -93,10 +92,10 @@ json WorkerContactInfo::_serializeJson() const { WorkerContactInfo::Ptr WorkerContactInfo::createFromJsonRegistry(string const& wId_, nlohmann::json const& regJson) { try { - auto wHost_ = RequestBodyJSON::required(regJson, "host-addr"); - auto wManagementHost_ = RequestBodyJSON::required(regJson, "management-host-name"); - auto wPort_ = RequestBodyJSON::required(regJson, "management-port"); - auto updateTimeInt = RequestBodyJSON::required(regJson, "update-time-ms"); + auto wHost_ = http::RequestBodyJSON::required(regJson, "host-addr"); + auto wManagementHost_ = http::RequestBodyJSON::required(regJson, "management-host-name"); + auto wPort_ = http::RequestBodyJSON::required(regJson, "management-port"); + auto updateTimeInt = http::RequestBodyJSON::required(regJson, "update-time-ms"); TIMEPOINT updateTime_ = TIMEPOINT(chrono::milliseconds(updateTimeInt)); return create(wId_, wHost_, wManagementHost_, wPort_, updateTime_); @@ -109,10 +108,10 @@ WorkerContactInfo::Ptr WorkerContactInfo::createFromJsonRegistry(string const& w WorkerContactInfo::Ptr WorkerContactInfo::createFromJsonWorker(nlohmann::json const& wJson, TIMEPOINT updateTime_) { try { - auto wId_ = RequestBodyJSON::required(wJson, "id"); - auto wHost_ = RequestBodyJSON::required(wJson, "host"); - auto wManagementHost_ = RequestBodyJSON::required(wJson, "management-host-name"); - auto wPort_ = RequestBodyJSON::required(wJson, "management-port"); + auto wId_ = http::RequestBodyJSON::required(wJson, "id"); + auto wHost_ = http::RequestBodyJSON::required(wJson, "host"); + auto wManagementHost_ = http::RequestBodyJSON::required(wJson, "management-host-name"); + auto wPort_ = http::RequestBodyJSON::required(wJson, "management-port"); return create(wId_, wHost_, wManagementHost_, wPort_, updateTime_); } catch (invalid_argument const& exc) { @@ -276,10 +275,12 @@ WorkerQueryStatusData::Ptr WorkerQueryStatusData::createFromJson(nlohmann::json WorkerQueryStatusData::create(wInfo_, czInfo_, replicationInstanceId_, replicationAuthKey_); wqsData->parseLists(jsWorkerReq, updateTm); - bool czarRestart = RequestBodyJSON::required(jsWorkerReq, "czarrestart"); + bool czarRestart = http::RequestBodyJSON::required(jsWorkerReq, "czarrestart"); if (czarRestart) { - auto restartCzarId = RequestBodyJSON::required(jsWorkerReq, "czarrestartcancelczid"); - auto restartQueryId = RequestBodyJSON::required(jsWorkerReq, "czarrestartcancelqid"); + auto restartCzarId = + http::RequestBodyJSON::required(jsWorkerReq, "czarrestartcancelczid"); + auto restartQueryId = + http::RequestBodyJSON::required(jsWorkerReq, "czarrestartcancelqid"); wqsData->setCzarCancelAfterRestart(restartCzarId, restartQueryId); } return wqsData; @@ -413,7 +414,7 @@ bool WorkerQueryStatusData::handleResponseJson(nlohmann::json const& jsResp) { } bool workerRestarted = false; - auto workerStartupTime = RequestBodyJSON::required(jsResp, "w-startup-time"); + auto workerStartupTime = http::RequestBodyJSON::required(jsResp, "w-startup-time"); LOGS(_log, LOG_LVL_ERROR, cName(__func__) << " workerStartupTime=" << workerStartupTime); if 
(!_wInfo->checkWStartupTime(workerStartupTime)) { LOGS(_log, LOG_LVL_ERROR, @@ -478,7 +479,8 @@ WorkerCzarComIssue::Ptr WorkerCzarComIssue::createFromJson(nlohmann::json const& } auto wccIssue = create(replicationInstanceId_, replicationAuthKey_); wccIssue->setContactInfo(wInfo_, czInfo_); - wccIssue->_thoughtCzarWasDead = RequestBodyJSON::required(jsCzarReq, "thoughtczarwasdead"); + wccIssue->_thoughtCzarWasDead = + http::RequestBodyJSON::required(jsCzarReq, "thoughtczarwasdead"); return wccIssue; } catch (invalid_argument const& exc) { LOGS(_log, LOG_LVL_ERROR, string("WorkerQueryStatusData::createJson invalid ") << exc.what()); @@ -506,4 +508,4 @@ string WorkerCzarComIssue::_dump() const { return os.str(); } -} // namespace lsst::qserv::http +} // namespace lsst::qserv::protojson diff --git a/src/http/WorkerQueryStatusData.h b/src/protojson/WorkerQueryStatusData.h similarity index 98% rename from src/http/WorkerQueryStatusData.h rename to src/protojson/WorkerQueryStatusData.h index dbf961ec8..1e7132dbc 100644 --- a/src/http/WorkerQueryStatusData.h +++ b/src/protojson/WorkerQueryStatusData.h @@ -37,7 +37,7 @@ #include "util/Mutex.h" // This header declarations -namespace lsst::qserv::http { +namespace lsst::qserv::protojson { /// This class just contains the czar id and network contact information. class CzarContactInfo : public std::enable_shared_from_this { @@ -463,6 +463,16 @@ class WorkerCzarComIssue { mutable MUTEX _wciMtx; ///< protects all members. }; -} // namespace lsst::qserv::http +class WorkerUberJobMsg { +public: + using Ptr = std::shared_ptr; + + static Ptr create(); + +private: + WorkerUberJobMsg(); +}; + +} // namespace lsst::qserv::protojson #endif // LSST_QSERV_HTTP_WORKERQUERYSTATUSDATA_H diff --git a/src/http/testStatusData.cc b/src/protojson/testStatusData.cc similarity index 83% rename from src/http/testStatusData.cc rename to src/protojson/testStatusData.cc index ba537d3ea..8dd226080 100644 --- a/src/http/testStatusData.cc +++ b/src/protojson/testStatusData.cc @@ -28,10 +28,8 @@ // Qserv headers #include "global/clock_defs.h" -#include "http/WorkerQueryStatusData.h" - -// LSST headers #include "lsst/log/Log.h" +#include "protojson/WorkerQueryStatusData.h" // Boost unit test header #define BOOST_TEST_MODULE RequestQuery @@ -39,7 +37,7 @@ using namespace std; namespace test = boost::test_tools; -using namespace lsst::qserv::http; +using namespace lsst::qserv::protojson; BOOST_AUTO_TEST_SUITE(Suite) @@ -55,15 +53,16 @@ BOOST_AUTO_TEST_CASE(WorkerQueryStatusData) { int czrPort = 2022; string const czrHost("cz_host"); - auto czarA = lsst::qserv::http::CzarContactInfo::create(czrName, czrId, czrPort, czrHost, cxrStartTime); + auto czarA = + lsst::qserv::protojson::CzarContactInfo::create(czrName, czrId, czrPort, czrHost, cxrStartTime); auto czarAJs = czarA->serializeJson(); - auto czarB = lsst::qserv::http::CzarContactInfo::createFromJson(czarAJs); + auto czarB = lsst::qserv::protojson::CzarContactInfo::createFromJson(czarAJs); BOOST_REQUIRE(czarA->compare(*czarB)); - auto czarC = - lsst::qserv::http::CzarContactInfo::create("different", czrId, czrPort, czrHost, cxrStartTime); + auto czarC = lsst::qserv::protojson::CzarContactInfo::create("different", czrId, czrPort, czrHost, + cxrStartTime); BOOST_REQUIRE(!czarA->compare(*czarC)); auto start = lsst::qserv::CLOCK::now(); @@ -78,15 +77,15 @@ BOOST_AUTO_TEST_CASE(WorkerQueryStatusData) { BOOST_REQUIRE(workerA->isSameContactInfo(*workerA1)); // WorkerQueryStatusData - auto wqsdA = 
lsst::qserv::http::WorkerQueryStatusData::create(workerA, czarA, replicationInstanceId, - replicationAuthKey); + auto wqsdA = lsst::qserv::protojson::WorkerQueryStatusData::create(workerA, czarA, replicationInstanceId, + replicationAuthKey); double maxLifetime = 300.0; auto jsDataA = wqsdA->serializeJson(maxLifetime); // Check that empty lists work. - auto wqsdA1 = lsst::qserv::http::WorkerQueryStatusData::createFromJson(*jsDataA, replicationInstanceId, - replicationAuthKey, start1Sec); + auto wqsdA1 = lsst::qserv::protojson::WorkerQueryStatusData::createFromJson( + *jsDataA, replicationInstanceId, replicationAuthKey, start1Sec); auto jsDataA1 = wqsdA1->serializeJson(maxLifetime); BOOST_REQUIRE(*jsDataA == *jsDataA1); @@ -108,7 +107,7 @@ BOOST_AUTO_TEST_CASE(WorkerQueryStatusData) { jsDataA = wqsdA->serializeJson(maxLifetime); auto start5Sec = start + 5s; - auto workerAFromJson = lsst::qserv::http::WorkerQueryStatusData::createFromJson( + auto workerAFromJson = lsst::qserv::protojson::WorkerQueryStatusData::createFromJson( *jsDataA, replicationInstanceId, replicationAuthKey, start5Sec); auto jsWorkerAFromJson = workerAFromJson->serializeJson(maxLifetime); BOOST_REQUIRE(*jsDataA == *jsWorkerAFromJson); @@ -120,7 +119,7 @@ BOOST_AUTO_TEST_CASE(WorkerQueryStatusData) { jsDataA = wqsdA->serializeJson(maxLifetime); BOOST_REQUIRE(*jsDataA != *jsWorkerAFromJson); - workerAFromJson = lsst::qserv::http::WorkerQueryStatusData::createFromJson( + workerAFromJson = lsst::qserv::protojson::WorkerQueryStatusData::createFromJson( *jsDataA, replicationInstanceId, replicationAuthKey, start5Sec); jsWorkerAFromJson = workerAFromJson->serializeJson(maxLifetime); BOOST_REQUIRE(*jsDataA == *jsWorkerAFromJson); @@ -153,7 +152,8 @@ BOOST_AUTO_TEST_CASE(WorkerCzarComIssue) { int czrPort = 2022; string const czrHost("cz_host"); - auto czarA = lsst::qserv::http::CzarContactInfo::create(czrName, czrId, czrPort, czrHost, cxrStartTime); + auto czarA = + lsst::qserv::protojson::CzarContactInfo::create(czrName, czrId, czrPort, czrHost, cxrStartTime); auto czarAJs = czarA->serializeJson(); auto start = lsst::qserv::CLOCK::now(); @@ -161,7 +161,8 @@ BOOST_AUTO_TEST_CASE(WorkerCzarComIssue) { auto jsWorkerA = workerA->serializeJson(); // WorkerCzarComIssue - auto wccIssueA = lsst::qserv::http::WorkerCzarComIssue::create(replicationInstanceId, replicationAuthKey); + auto wccIssueA = + lsst::qserv::protojson::WorkerCzarComIssue::create(replicationInstanceId, replicationAuthKey); wccIssueA->setContactInfo(workerA, czarA); BOOST_REQUIRE(wccIssueA->needToSend() == false); wccIssueA->setThoughtCzarWasDead(true); @@ -169,8 +170,8 @@ BOOST_AUTO_TEST_CASE(WorkerCzarComIssue) { auto jsIssueA = wccIssueA->serializeJson(); - auto wccIssueA1 = lsst::qserv::http::WorkerCzarComIssue::createFromJson(*jsIssueA, replicationInstanceId, - replicationAuthKey); + auto wccIssueA1 = lsst::qserv::protojson::WorkerCzarComIssue::createFromJson( + *jsIssueA, replicationInstanceId, replicationAuthKey); auto jsIssueA1 = wccIssueA1->serializeJson(); BOOST_REQUIRE(*jsIssueA == *jsIssueA1); diff --git a/src/qdisp/UberJob.h b/src/qdisp/UberJob.h index fe24da7ce..ce719d50d 100644 --- a/src/qdisp/UberJob.h +++ b/src/qdisp/UberJob.h @@ -98,11 +98,11 @@ class UberJob : public std::enable_shared_from_this { /// Set the worker information needed to send messages to the worker believed to /// be responsible for the chunks handled in this UberJob. 
- void setWorkerContactInfo(http::WorkerContactInfo::Ptr const& wContactInfo) { + void setWorkerContactInfo(protojson::WorkerContactInfo::Ptr const& wContactInfo) { _wContactInfo = wContactInfo; } - http::WorkerContactInfo::Ptr getWorkerContactInfo() { return _wContactInfo; } + protojson::WorkerContactInfo::Ptr getWorkerContactInfo() { return _wContactInfo; } /// Get the data for the worker that should handle this UberJob. czar::CzarChunkMap::WorkerChunksData::Ptr getWorkerData() { return _workerData; } @@ -167,7 +167,7 @@ class UberJob : public std::enable_shared_from_this { czar::CzarChunkMap::WorkerChunksData::Ptr _workerData; // TODO:UJ this may not be needed // Contact information for the target worker. - http::WorkerContactInfo::Ptr _wContactInfo; // Change to ActiveWorker &&& ??? + protojson::WorkerContactInfo::Ptr _wContactInfo; // Change to ActiveWorker &&& ??? }; } // namespace lsst::qserv::qdisp diff --git a/src/wbase/Task.cc b/src/wbase/Task.cc index 6653e22a2..5252b3936 100644 --- a/src/wbase/Task.cc +++ b/src/wbase/Task.cc @@ -301,6 +301,131 @@ std::vector Task::createTasksForChunk( return vect; } +//&&& +std::vector Task::createTasksForUnitTest( + std::shared_ptr const& ujData, nlohmann::json const& jsJobs, + std::shared_ptr const& sendChannel, proto::ScanInfo const& scanInfo, + bool scanInteractive, int maxTableSizeMb, + std::shared_ptr const& chunkResourceMgr + //&&&mysql::MySqlConfig const& mySqlConfig, std::shared_ptr const& sqlConnMgr, + //&&&std::shared_ptr const& queriesAndChunks, + //&&&uint16_t resultsHttpPort = 8080) { +) { + QueryId qId = ujData->getQueryId(); + UberJobId ujId = ujData->getUberJobId(); + CzarIdType czId = ujData->getCzarId(); + + //&&& wpublish::QueryStatistics::Ptr queryStats = queriesAndChunks->addQueryId(qId, czId); + //&&& UserQueryInfo::Ptr userQueryInfo = queryStats->getUserQueryInfo(); + + string funcN(__func__); + funcN += " QID=" + to_string(qId) + " "; + + vector vect; + for (auto const& job : jsJobs) { + json const& jsJobDesc = job["jobdesc"]; + http::RequestBodyJSON rbJobDesc(jsJobDesc); + // See qproc::TaskMsgFactory::makeMsgJson for message construction. 
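+        // For orientation, a "jobdesc" object presumably looks roughly like the
+        // following (a hand-written illustration with values borrowed from the unit
+        // tests, not output captured from qserv):
+        //   {"czarId": 1, "queryId": 23, "jobId": 1, "attemptCount": 0,
+        //    "querySpecDb": "LSST", "scanPriority": 1, "scanInteractive": true,
+        //    "maxTableSize": 5000, "chunkId": 3240,
+        //    "queryFragments": [{"resultTable": "r_23", "queries": [{"subQuery": "SELECT ..."}],
+        //                        "subchunkTables": [], "subchunkIds": []}]}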
+ auto const jdCzarId = rbJobDesc.required("czarId"); + auto const jdQueryId = rbJobDesc.required("queryId"); + if (jdQueryId != qId) { + throw TaskException(ERR_LOC, string("ujId=") + to_string(ujId) + " qId=" + to_string(qId) + + " QueryId mismatch Job qId=" + to_string(jdQueryId)); + } + auto const jdJobId = rbJobDesc.required("jobId"); + auto const jdAttemptCount = rbJobDesc.required("attemptCount"); + auto const jdQuerySpecDb = rbJobDesc.required("querySpecDb"); + auto const jdScanPriority = rbJobDesc.required("scanPriority"); + auto const jdScanInteractive = rbJobDesc.required("scanInteractive"); + auto const jdMaxTableSizeMb = rbJobDesc.required("maxTableSize"); + auto const jdChunkId = rbJobDesc.required("chunkId"); + LOGS(_log, LOG_LVL_TRACE, + funcN << " jd cid=" << jdCzarId << " jdQId=" << jdQueryId << " jdJobId=" << jdJobId + << " jdAtt=" << jdAttemptCount << " jdQDb=" << jdQuerySpecDb + << " jdScanPri=" << jdScanPriority << " interactive=" << jdScanInteractive + << " maxTblSz=" << jdMaxTableSizeMb << " chunkId=" << jdChunkId); + + auto const jdQueryFragments = rbJobDesc.required("queryFragments"); + int fragmentNumber = 0; + for (auto const& frag : jdQueryFragments) { + vector fragSubQueries; + vector fragSubchunkIds; + vector fragSubTables; + LOGS(_log, LOG_LVL_DEBUG, funcN << " frag=" << frag); + http::RequestBodyJSON rbFrag(frag); + auto const& jsQueries = rbFrag.required("queries"); + // TODO:UJ move to uberjob???, these should be the same for all jobs + for (auto const& subQ : jsQueries) { + http::RequestBodyJSON rbSubQ(subQ); + auto const subQuery = rbSubQ.required("subQuery"); + LOGS(_log, LOG_LVL_DEBUG, funcN << " subQuery=" << subQuery); + fragSubQueries.push_back(subQuery); + } + auto const& resultTable = rbFrag.required("resultTable"); + auto const& jsSubIds = rbFrag.required("subchunkIds"); + for (auto const& scId : jsSubIds) { + fragSubchunkIds.push_back(scId); + } + auto const& jsSubTables = rbFrag.required("subchunkTables"); + + for (auto const& scDbTable : jsSubTables) { // TODO:UJ are these the same for all jobs? 
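+                // Each subchunkTables entry presumably has the form
+                // {"scDb": "LSST", "scTable": "Object"} (illustrative values; only
+                // the two keys read below are required).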
+ http::RequestBodyJSON rbScDbTable(scDbTable); + string scDb = rbScDbTable.required("scDb"); + string scTable = rbScDbTable.required("scTable"); + TaskDbTbl scDbTbl(scDb, scTable); + fragSubTables.push_back(scDbTbl); + } + + for (string const& fragSubQ : fragSubQueries) { + //&&&size_t templateId = userQueryInfo->addTemplate(fragSubQ); + if (fragSubchunkIds.empty()) { + bool const noSubchunks = false; + int const subchunkId = -1; + /* &&& + auto task = Task::Ptr(new Task( + ujData, jdJobId, jdAttemptCount, jdChunkId, fragmentNumber, templateId, + noSubchunks, subchunkId, jdQuerySpecDb, scanInfo, scanInteractive, maxTableSizeMb, + fragSubTables, fragSubchunkIds, sendChannel, queryStats, resultsHttpPort)); + */ + auto task = Task::Ptr(new Task(ujData, jdJobId, jdAttemptCount, jdChunkId, fragmentNumber, + 0, noSubchunks, subchunkId, jdQuerySpecDb, scanInfo, + scanInteractive, maxTableSizeMb, fragSubTables, + fragSubchunkIds, sendChannel, nullptr, 0)); + + vect.push_back(task); + } else { + for (auto subchunkId : fragSubchunkIds) { + bool const hasSubchunks = true; + /* &&& + auto task = Task::Ptr(new Task(ujData, jdJobId, jdAttemptCount, jdChunkId, + fragmentNumber, templateId, hasSubchunks, subchunkId, + jdQuerySpecDb, scanInfo, scanInteractive, + maxTableSizeMb, fragSubTables, fragSubchunkIds, + sendChannel, queryStats, resultsHttpPort)); + */ + auto task = Task::Ptr(new Task( + ujData, jdJobId, jdAttemptCount, jdChunkId, fragmentNumber, 0, hasSubchunks, + subchunkId, jdQuerySpecDb, scanInfo, scanInteractive, maxTableSizeMb, + fragSubTables, fragSubchunkIds, sendChannel, nullptr, 0)); + + vect.push_back(task); + } + } + } + ++fragmentNumber; + } + } + + /* &&& shouldn't need this + for (auto taskPtr : vect) { + // newQueryRunner sets the `_taskQueryRunner` pointer in `task`. 
+ taskPtr->setTaskQueryRunner(wdb::QueryRunner::newQueryRunner(taskPtr, chunkResourceMgr, mySqlConfig, + sqlConnMgr, queriesAndChunks)); + } + */ + return vect; +} + void Task::action(util::CmdData* data) { string tIdStr = getIdStr(); if (_queryStarted.exchange(true)) { diff --git a/src/wbase/Task.h b/src/wbase/Task.h index e3ba8b336..2ac709e48 100644 --- a/src/wbase/Task.h +++ b/src/wbase/Task.h @@ -174,6 +174,18 @@ class Task : public util::CommandForThreadPool { std::shared_ptr const& queriesAndChunks, uint16_t resultsHttpPort = 8080); + //&&& + static std::vector createTasksForUnitTest( + std::shared_ptr const& ujData, nlohmann::json const& jsJobs, + std::shared_ptr const& sendChannel, proto::ScanInfo const& scanInfo, + bool scanInteractive, int maxTableSizeMb, + std::shared_ptr const& chunkResourceMgr + //&&&mysql::MySqlConfig const& mySqlConfig, std::shared_ptr const& + //sqlConnMgr, + //&&&std::shared_ptr const& queriesAndChunks, + //&&&uint16_t resultsHttpPort = 8080); + ); + std::shared_ptr getSendChannel() const { return _sendChannel; } void resetSendChannel() { _sendChannel.reset(); } ///< reset the shared pointer for FileChannelShared std::string user; ///< Incoming username diff --git a/src/wcontrol/WCzarInfoMap.cc b/src/wcontrol/WCzarInfoMap.cc index 4e7aa1196..831022c44 100644 --- a/src/wcontrol/WCzarInfoMap.cc +++ b/src/wcontrol/WCzarInfoMap.cc @@ -31,7 +31,7 @@ // qserv headers #include "http/Client.h" -#include "http/WorkerQueryStatusData.h" +#include "protojson/WorkerQueryStatusData.h" #include "util/Bug.h" #include "util/Histogram.h" #include "wbase/UberJobData.h" @@ -54,7 +54,7 @@ namespace lsst::qserv::wcontrol { WCzarInfo::WCzarInfo(CzarIdType czarId_) : czarId(czarId_), - _workerCzarComIssue(http::WorkerCzarComIssue::create( + _workerCzarComIssue(protojson::WorkerCzarComIssue::create( wconfig::WorkerConfig::instance()->replicationInstanceId(), wconfig::WorkerConfig::instance()->replicationAuthKey())) {} @@ -68,8 +68,8 @@ void WCzarInfo::czarMsgReceived(TIMEPOINT tm) { } } -void WCzarInfo::sendWorkerCzarComIssueIfNeeded(http::WorkerContactInfo::Ptr const& wInfo_, - http::CzarContactInfo::Ptr const& czInfo_) { +void WCzarInfo::sendWorkerCzarComIssueIfNeeded(protojson::WorkerContactInfo::Ptr const& wInfo_, + protojson::CzarContactInfo::Ptr const& czInfo_) { unique_lock uniLock(_wciMtx); if (_workerCzarComIssue->needToSend()) { // Having more than one of this message being sent at one time diff --git a/src/wcontrol/WCzarInfoMap.h b/src/wcontrol/WCzarInfoMap.h index 11703350a..46f297daf 100644 --- a/src/wcontrol/WCzarInfoMap.h +++ b/src/wcontrol/WCzarInfoMap.h @@ -33,11 +33,11 @@ #include "global/clock_defs.h" #include "global/intTypes.h" -namespace lsst::qserv::http { +namespace lsst::qserv::protojson { class CzarContactInfo; class WorkerContactInfo; class WorkerCzarComIssue; -} // namespace lsst::qserv::http +} // namespace lsst::qserv::protojson namespace lsst::qserv::wbase { class UJTransmitCmd; @@ -67,8 +67,8 @@ class WCzarInfo : public std::enable_shared_from_this { static Ptr create(CzarIdType czarId_) { return Ptr(new WCzarInfo(czarId_)); } /// If there were communication issues, start a thread to send the WorkerCzarComIssue message. - void sendWorkerCzarComIssueIfNeeded(std::shared_ptr const& wInfo_, - std::shared_ptr const& czInfo_); + void sendWorkerCzarComIssueIfNeeded(std::shared_ptr const& wInfo_, + std::shared_ptr const& czInfo_); /// Called by the worker after the czar successfully replied to the original /// message from the worker. 
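The send path declared above combines three ingredients: an atomic exchange() so at most one sender thread starts, a weak_ptr capture so a detached thread never touches a destroyed owner, and an RAII guard that clears the running flag on every exit from _sendMessage(). A minimal standalone sketch of that combination, using only the standard library (the Sender class and its members are hypothetical stand-ins, not the qserv types):

#include <atomic>
#include <chrono>
#include <iostream>
#include <memory>
#include <thread>

class Sender : public std::enable_shared_from_this<Sender> {
public:
    void sendIfNeeded() {
        // exchange() makes the check-and-set atomic: only one thread wins.
        if (_running.exchange(true)) return;
        std::weak_ptr<Sender> weakSelf = weak_from_this();
        std::thread([weakSelf] {
            auto self = weakSelf.lock();
            if (self == nullptr) return;  // Owner already destroyed; nothing to do.
            self->_send();
        }).detach();
    }

private:
    void _send() {
        // RAII guard: _running is cleared on every exit path from this function.
        struct ClearRunning {
            explicit ClearRunning(std::atomic<bool>& f) : flag(f) {}
            ~ClearRunning() { flag = false; }
            std::atomic<bool>& flag;
        } guard(_running);
        std::cout << "sending...\n";
    }

    std::atomic<bool> _running{false};
};

int main() {
    auto s = std::make_shared<Sender>();
    s->sendIfNeeded();
    // Give the detached thread a moment to run before main exits.
    std::this_thread::sleep_for(std::chrono::milliseconds(100));
}

The detach() is safe here precisely because the thread holds only a weak_ptr; whichever of the thread or the owner finishes first, nothing dangles.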
@@ -79,7 +79,7 @@ class WCzarInfo : public std::enable_shared_from_this { /// Check if the czar is still considered to be alive, or it timed out. bool checkAlive(TIMEPOINT tmMark); - std::shared_ptr getWorkerCzarComIssue(); + std::shared_ptr getWorkerCzarComIssue(); CzarIdType const czarId; @@ -93,7 +93,7 @@ class WCzarInfo : public std::enable_shared_from_this { /// This class tracks communication problems and prepares a message /// to inform the czar of the problem. - std::shared_ptr _workerCzarComIssue; + std::shared_ptr _workerCzarComIssue; mutable std::mutex _wciMtx; ///< protects all private members. /// true when running a thread to send a message to the czar diff --git a/src/wsched/testSchedulers.cc b/src/wsched/testSchedulers.cc index 4bf41ec08..b28b6b060 100644 --- a/src/wsched/testSchedulers.cc +++ b/src/wsched/testSchedulers.cc @@ -103,7 +103,7 @@ struct SchedulerFixture { SchedulerFixture(void) { counter = 20; } ~SchedulerFixture(void) {} - /* &&& Instead of using messages, make a Task::createUnitTest() function + /* &&& Instead of using messages, make a Task::createTasksForUnitTest() function void addSomeFragments(TaskMsgPtr const& t, int numberOfFragments) { for (int i = 0; i < numberOfFragments; ++i) { TaskMsg::Fragment* f = t->add_fragment(); diff --git a/src/xrdsvc/HttpWorkerCzarModule.cc b/src/xrdsvc/HttpWorkerCzarModule.cc index 594fcec5f..08b153682 100644 --- a/src/xrdsvc/HttpWorkerCzarModule.cc +++ b/src/xrdsvc/HttpWorkerCzarModule.cc @@ -36,8 +36,8 @@ #include "http/MetaModule.h" #include "http/RequestBodyJSON.h" #include "http/RequestQuery.h" -#include "http/WorkerQueryStatusData.h" #include "mysql/MySqlUtils.h" +#include "protojson/WorkerQueryStatusData.h" #include "qmeta/types.h" #include "util/String.h" #include "util/Timer.h" @@ -251,8 +251,8 @@ json HttpWorkerCzarModule::_handleQueryStatus(std::string const& func) { auto const replicationAuthKey = workerConfig->replicationAuthKey(); auto const& jsReq = body().objJson; - auto wqsData = http::WorkerQueryStatusData::createFromJson(jsReq, replicationInstanceId, - replicationAuthKey, now); + auto wqsData = protojson::WorkerQueryStatusData::createFromJson(jsReq, replicationInstanceId, + replicationAuthKey, now); auto const czInfo = wqsData->getCzInfo(); LOGS(_log, LOG_LVL_TRACE, " HttpWorkerCzarModule::_handleQueryStatus req=" << jsReq.dump()); From 831b5eca59f9eb436a97ce652275d8ba41724536 Mon Sep 17 00:00:00 2001 From: Fritz Mueller Date: Tue, 26 Nov 2024 06:02:32 +0000 Subject: [PATCH 15/22] clang-format --- src/qdisp/testQDisp.cc | 6 +++--- src/wbase/SendChannel.h | 2 +- src/wbase/Task.h | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/qdisp/testQDisp.cc b/src/qdisp/testQDisp.cc index 1afb8712f..c0759ce60 100644 --- a/src/qdisp/testQDisp.cc +++ b/src/qdisp/testQDisp.cc @@ -108,10 +108,10 @@ class TestInfo : public ResponseHandler { return {true, false}; } void flushHttpError(int errorCode, std::string const& errorMsg, int status) override {} - void errorFlush(std::string const& msg, int code) override{}; + void errorFlush(std::string const& msg, int code) override {}; Error getError() const override { return util::Error(); } - void processCancel() override{}; - void prepScrubResults(int jobId, int attempt) override{}; + void processCancel() override {}; + void prepScrubResults(int jobId, int attempt) override {}; /// Print a string representation of the receiver to an ostream std::ostream& print(std::ostream& os) const override { diff --git a/src/wbase/SendChannel.h 
b/src/wbase/SendChannel.h index 8ba90ea4a..56f2a598c 100644 --- a/src/wbase/SendChannel.h +++ b/src/wbase/SendChannel.h @@ -68,5 +68,5 @@ class SendChannel { std::atomic _destroying{false}; }; -}} // namespace lsst::qserv::wbase +}} // namespace lsst::qserv::wbase #endif // LSST_QSERV_WBASE_SENDCHANNEL_H diff --git a/src/wbase/Task.h b/src/wbase/Task.h index 2ac709e48..118037edf 100644 --- a/src/wbase/Task.h +++ b/src/wbase/Task.h @@ -181,7 +181,7 @@ class Task : public util::CommandForThreadPool { bool scanInteractive, int maxTableSizeMb, std::shared_ptr const& chunkResourceMgr //&&&mysql::MySqlConfig const& mySqlConfig, std::shared_ptr const& - //sqlConnMgr, + // sqlConnMgr, //&&&std::shared_ptr const& queriesAndChunks, //&&&uint16_t resultsHttpPort = 8080); ); From 9c4c602696dd8eb0d484f8686644a830f18baa2b Mon Sep 17 00:00:00 2001 From: John Gates Date: Fri, 22 Nov 2024 07:33:48 -0800 Subject: [PATCH 16/22] Added unit test. --- src/protojson/CMakeLists.txt | 2 + src/protojson/UberJobMsg.cc | 700 +++++++++++++++++++++++++ src/protojson/UberJobMsg.h | 297 +++++++++++ src/protojson/WorkerQueryStatusData.cc | 1 + src/protojson/WorkerQueryStatusData.h | 6 +- src/protojson/testUberJobMsg.cc | 108 ++++ src/qdisp/JobDescription.cc | 5 +- src/qdisp/JobDescription.h | 2 + src/qdisp/UberJob.cc | 42 ++ src/xrdsvc/HttpWorkerCzarModule.cc | 8 +- 10 files changed, 1165 insertions(+), 6 deletions(-) create mode 100644 src/protojson/UberJobMsg.cc create mode 100644 src/protojson/UberJobMsg.h create mode 100644 src/protojson/testUberJobMsg.cc diff --git a/src/protojson/CMakeLists.txt b/src/protojson/CMakeLists.txt index 3c6bbdf80..8bb732635 100644 --- a/src/protojson/CMakeLists.txt +++ b/src/protojson/CMakeLists.txt @@ -1,6 +1,7 @@ add_library(protojson SHARED) target_sources(protojson PRIVATE + UberJobMsg.cc WorkerQueryStatusData.cc ) @@ -34,4 +35,5 @@ endfunction() protojson_tests( testStatusData + testUberJobMsg ) diff --git a/src/protojson/UberJobMsg.cc b/src/protojson/UberJobMsg.cc new file mode 100644 index 000000000..b19d4c1e2 --- /dev/null +++ b/src/protojson/UberJobMsg.cc @@ -0,0 +1,700 @@ +/* + * LSST Data Management System + * + * This product includes software developed by the + * LSST Project (http://www.lsst.org/). + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the LSST License Statement and + * the GNU General Public License along with this program. If not, + * see . 
+ */ + +// Class header +#include "protojson/UberJobMsg.h" + +#include + +// Qserv headers +#include "http/Client.h" +#include "http/MetaModule.h" +#include "http/RequestBodyJSON.h" +#include "qdisp/JobQuery.h" +#include "qdisp/JobDescription.h" +#include "qproc/ChunkQuerySpec.h" +#include "util/common.h" +#include "util/TimeUtils.h" + +// LSST headers +#include "lsst/log/Log.h" + +using namespace std; +using namespace nlohmann; + +namespace { +LOG_LOGGER _log = LOG_GET("lsst.qserv.protojson.UberJobMsg"); +} // namespace + +namespace lsst::qserv::protojson { + +UberJobMsg::UberJobMsg(unsigned int metaVersion, std::string const& replicationInstanceId, + std::string const& replicationAuthKey, + //&&&CzarContactInfo::Ptr const& czInfo, WorkerContactInfo::Ptr const& wInfo, + CzarContactInfo::Ptr const& czInfo, string const& workerId, QueryId qId, + UberJobId ujId, int rowLimit, int maxTableSizeMB, + std::vector> const& jobs) + : _metaVersion(metaVersion), + _replicationInstanceId(replicationInstanceId), + _replicationAuthKey(replicationAuthKey), + _czInfo(czInfo), + _workerId(workerId), + //&&&_workerId(wInfo->wId), + //&&&_wInfo(wInfo), + _qId(qId), + _ujId(ujId), + _rowLimit(rowLimit), + _maxTableSizeMB(maxTableSizeMB) { + //&&&_jobs(jobs) { + LOGS(_log, LOG_LVL_WARN, "&&& UberJobMsg::UberJobMsg start"); + + for (auto& jobPtr : jobs) { + LOGS(_log, LOG_LVL_WARN, "&&& UberJobMsg::UberJobMsg loop"); + // This creates the JobMsg objects for all relates jobs and their fragments. + auto jobMsg = JobMsg::create(jobPtr, _jobSubQueryTempMap, _jobDbTablesMap); + _jobMsgVect.push_back(jobMsg); + } + LOGS(_log, LOG_LVL_WARN, "&&& UberJobMsg::UberJobMsg end"); +} + +json UberJobMsg::serializeJson() const { + LOGS(_log, LOG_LVL_WARN, "&&& UberJobMsg::serializeJson a"); + + json ujmJson = {{"version", _metaVersion}, + {"instance_id", _replicationInstanceId}, + {"auth_key", _replicationAuthKey}, + {"worker", _workerId}, + {"queryid", _qId}, + {"uberjobid", _ujId}, + {"czarinfo", _czInfo->serializeJson()}, + {"rowlimit", _rowLimit}, + {"subqueries_map", _jobSubQueryTempMap->serializeJson()}, + {"dbtables_map", _jobDbTablesMap->serializeJson()}, + {"maxtablesizemb", _maxTableSizeMB}, + {"jobs", json::array()}}; + LOGS(_log, LOG_LVL_WARN, "&&& UberJobMsg::serializeJson b"); + + auto& jsJobs = ujmJson["jobs"]; + LOGS(_log, LOG_LVL_WARN, "&&& UberJobMsg::serializeJson c"); + for (auto const& jbMsg : _jobMsgVect) { + LOGS(_log, LOG_LVL_WARN, "&&& UberJobMsg::serializeJson c1"); + json jsJob = jbMsg->serializeJson(); + jsJobs.push_back(jsJob); + } + LOGS(_log, LOG_LVL_WARN, "&&& UberJobMsg::serializeJson d"); + + LOGS(_log, LOG_LVL_WARN, cName(__func__) << " &&& ujmJson=" << ujmJson); + + return ujmJson; +} + +UberJobMsg::Ptr UberJobMsg::createFromJson(nlohmann::json const& ujmJson) { + LOGS(_log, LOG_LVL_WARN, "&&& UberJobMsg::createFromJson a"); + LOGS(_log, LOG_LVL_WARN, "&&& UberJobMsg::createFromJson ujmJson=" << ujmJson); + try { + if (ujmJson["version"] != http::MetaModule::version) { + LOGS(_log, LOG_LVL_ERROR, "UberJobMsg::createFromJson bad version " << ujmJson["version"]); + return nullptr; + } + + LOGS(_log, LOG_LVL_WARN, "&&& UberJobMsg::createFromJson b"); + auto czInfo_ = CzarContactInfo::createFromJson(ujmJson["czarinfo"]); + if (czInfo_ == nullptr) { + LOGS(_log, LOG_LVL_ERROR, "UberJobMsg::createFromJson czar could not be parsed in " << ujmJson); + return nullptr; + } + + LOGS(_log, LOG_LVL_WARN, "&&& UberJobMsg::createFromJson c"); + auto metaVersion = http::RequestBodyJSON::required(ujmJson, 
"version"); + LOGS(_log, LOG_LVL_WARN, "&&& UberJobMsg::createFromJson d"); + auto replicationInstanceId = http::RequestBodyJSON::required(ujmJson, "instance_id"); + LOGS(_log, LOG_LVL_WARN, "&&& UberJobMsg::createFromJson e"); + auto replicationAuthKey = http::RequestBodyJSON::required(ujmJson, "auth_key"); + LOGS(_log, LOG_LVL_WARN, "&&& UberJobMsg::createFromJson f"); + auto workerId = http::RequestBodyJSON::required(ujmJson, "worker"); + LOGS(_log, LOG_LVL_WARN, "&&& UberJobMsg::createFromJson g"); + auto qId = http::RequestBodyJSON::required(ujmJson, "queryid"); + LOGS(_log, LOG_LVL_WARN, "&&& UberJobMsg::createFromJson h"); + auto ujId = http::RequestBodyJSON::required(ujmJson, "uberjobid"); + LOGS(_log, LOG_LVL_WARN, "&&& UberJobMsg::createFromJson i"); + auto rowLimit = http::RequestBodyJSON::required(ujmJson, "rowlimit"); + LOGS(_log, LOG_LVL_WARN, "&&& UberJobMsg::createFromJson j"); + auto maxTableSizeMB = http::RequestBodyJSON::required(ujmJson, "maxtablesizemb"); + LOGS(_log, LOG_LVL_WARN, "&&& UberJobMsg::createFromJson k"); + auto czInfo = CzarContactInfo::createFromJson(ujmJson["czarinfo"]); + LOGS(_log, LOG_LVL_WARN, "&&& UberJobMsg::createFromJson l"); + auto jsUjJobs = http::RequestBodyJSON::required(ujmJson, "jobs"); + + LOGS(_log, LOG_LVL_INFO, + " &&& " << metaVersion << replicationInstanceId << replicationAuthKey << workerId << qId << ujId + << rowLimit << jsUjJobs); + + std::vector> emptyJobs; + + Ptr ujmPtr = Ptr(new UberJobMsg(metaVersion, replicationInstanceId, replicationAuthKey, czInfo, + workerId, qId, ujId, rowLimit, maxTableSizeMB, emptyJobs)); + + LOGS(_log, LOG_LVL_WARN, "&&& UberJobMsg::createFromJson m"); + auto const& jsSubQueriesMap = http::RequestBodyJSON::required(ujmJson, "subqueries_map"); + LOGS(_log, LOG_LVL_WARN, "&&& UberJobMsg::createFromJson n"); + ujmPtr->_jobSubQueryTempMap = JobSubQueryTempMap::createFromJson(jsSubQueriesMap); + + LOGS(_log, LOG_LVL_WARN, "&&& UberJobMsg::createFromJson o"); + auto jsDbTablesMap = http::RequestBodyJSON::required(ujmJson, "dbtables_map"); + LOGS(_log, LOG_LVL_WARN, "&&& UberJobMsg::createFromJson p"); + ujmPtr->_jobDbTablesMap = JobDbTablesMap::createFromJson(jsDbTablesMap); + + LOGS(_log, LOG_LVL_WARN, "&&& UberJobMsg::createFromJson q"); + for (auto const& jsUjJob : jsUjJobs) { + LOGS(_log, LOG_LVL_WARN, "&&& UberJobMsg::createFromJson q1"); + JobMsg::Ptr jobMsgPtr = + JobMsg::createFromJson(jsUjJob, ujmPtr->_jobSubQueryTempMap, ujmPtr->_jobDbTablesMap); + ujmPtr->_jobMsgVect.push_back(jobMsgPtr); + } + LOGS(_log, LOG_LVL_WARN, "&&& UberJobMsg::createFromJson end"); + + return ujmPtr; + } catch (invalid_argument const& exc) { + LOGS(_log, LOG_LVL_ERROR, "UberJobMsg::createFromJson invalid " << exc.what() << " json=" << ujmJson); + } + LOGS(_log, LOG_LVL_WARN, "&&& UberJobMsg::createFromJson end error"); + return nullptr; +} + +std::string UberJobMsg::dump() const { + stringstream os; + os << "&&& NEEDS CODE"; + return os.str(); +} + +JobMsg::Ptr JobMsg::create(std::shared_ptr const& jobPtr, + JobSubQueryTempMap::Ptr const& jobSubQueryTempMap, + JobDbTablesMap::Ptr const& jobDbTablesMap) { + auto jMsg = Ptr(new JobMsg(jobPtr, jobSubQueryTempMap, jobDbTablesMap)); + return jMsg; +} + +JobMsg::JobMsg(std::shared_ptr const& jobPtr, + JobSubQueryTempMap::Ptr const& jobSubQueryTempMap, JobDbTablesMap::Ptr const& jobDbTablesMap) + : _jobSubQueryTempMap(jobSubQueryTempMap), _jobDbTablesMap(jobDbTablesMap) { + LOGS(_log, LOG_LVL_WARN, "&&& JobMsg::JobMsg start"); + auto const descr = jobPtr->getDescription(); + if 
(descr == nullptr) {
+        throw util::Bug(ERR_LOC, cName(__func__) + " description=null for job=" + jobPtr->getIdStr());
+    }
+    LOGS(_log, LOG_LVL_WARN, "&&& JobMsg::JobMsg a");
+    auto chunkQuerySpec = descr->getChunkQuerySpec();
+    _jobId = descr->id();
+    //&&&{"attemptCount", attemptCount},
+    LOGS(_log, LOG_LVL_WARN, "&&& JobMsg::JobMsg b");
+    _attemptCount = descr->getAttemptCount();  // &&& may need to increment descr->AttemptCount at this time.
+    //&&&{"querySpecDb", chunkQuerySpec.db},
+    LOGS(_log, LOG_LVL_WARN, "&&& JobMsg::JobMsg c");
+    _chunkQuerySpecDb = chunkQuerySpec->db;
+    //&&&{"scanPriority", chunkQuerySpec.scanInfo.scanRating},
+    LOGS(_log, LOG_LVL_WARN, "&&& JobMsg::JobMsg d");
+    _scanRating = chunkQuerySpec->scanInfo.scanRating;
+    //&&&{"scanInteractive", chunkQuerySpec.scanInteractive},
+    LOGS(_log, LOG_LVL_WARN, "&&& JobMsg::JobMsg e");
+    _scanInteractive = chunkQuerySpec->scanInteractive;
+    //&&&{"maxTableSize", (cconfig::CzarConfig::instance()->getMaxTableSizeMB())},
+    //_maxTableSizeMB;  // &&& move up to UberJob
+    //&&&{"chunkScanTables", nlohmann::json::array()},
+    //&&&{"chunkId", chunkQuerySpec.chunkId},
+    LOGS(_log, LOG_LVL_WARN, "&&& JobMsg::JobMsg f");
+    _chunkId = chunkQuerySpec->chunkId;
+    //&&&{"queryFragments", nlohmann::json::array()}}));
+    LOGS(_log, LOG_LVL_WARN, "&&& JobMsg::JobMsg g");
+    _chunkResultName = descr->getChunkResultName();
+
+    // Add scan tables (&&& not sure if this is the same for all jobs or not)
+    LOGS(_log, LOG_LVL_WARN, "&&& JobMsg::JobMsg h");
+    for (auto const& sTbl : chunkQuerySpec->scanInfo.infoTables) {
+        LOGS(_log, LOG_LVL_WARN, "&&& JobMsg::JobMsg h1");
+        /* &&&
+        nlohmann::json cst = {{"db", sTbl.db},
+                              {"table", sTbl.table},
+                              {"lockInMemory", sTbl.lockInMemory},
+                              {"tblScanRating", sTbl.scanRating}};
+        chunkScanTables.push_back(move(cst));
+        */
+        int index = jobDbTablesMap->findDbTable(make_pair(sTbl.db, sTbl.table));
+        LOGS(_log, LOG_LVL_WARN, "&&& JobMsg::JobMsg h2");
+        jobDbTablesMap->setScanRating(index, sTbl.scanRating, sTbl.lockInMemory);
+        LOGS(_log, LOG_LVL_WARN, "&&& JobMsg::JobMsg h3");
+        _chunkScanTableIndexes.push_back(index);
+        LOGS(_log, LOG_LVL_WARN, "&&& JobMsg::JobMsg h4");
+    }
+
+    // Add fragments
+    LOGS(_log, LOG_LVL_WARN, "&&& JobMsg::JobMsg i");
+    _jobFragments =
+            JobFragment::createVect(*chunkQuerySpec, jobSubQueryTempMap, jobDbTablesMap, _chunkResultName);
+    LOGS(_log, LOG_LVL_WARN, "&&& JobMsg::JobMsg end");
+}
+
+nlohmann::json JobMsg::serializeJson() const {
+    LOGS(_log, LOG_LVL_WARN, "&&& JobMsg::serializeJson a");
+    auto jsJobMsg =
+            nlohmann::json({//&&&{"czarId", czarId},
+                            //&&&{"queryId", queryId},
+                            {"jobId", _jobId},
+                            {"attemptCount", _attemptCount},
+                            {"querySpecDb", _chunkQuerySpecDb},
+                            {"scanPriority", _scanRating},
+                            {"scanInteractive", _scanInteractive},
+                            //&&&{"maxTableSize", (cconfig::CzarConfig::instance()->getMaxTableSizeMB())},
+                            //&&&{"chunkScanTables", nlohmann::json::array()},
+                            {"chunkId", _chunkId},
+                            {"chunkresultname", _chunkResultName},
+                            {"chunkscantables_indexes", nlohmann::json::array()},
+                            {"queryFragments", json::array()}});
+
+    // These are indexes into _jobDbTablesMap, which is shared between all JobMsg in this UberJobMsg.
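+    // For illustration (shape inferred from the keys above; values taken from the
+    // unit test data, not this code), a serialized JobMsg looks roughly like:
+    //   {"jobId": 0, "attemptCount": 0, "querySpecDb": "qcase01", "scanPriority": 0,
+    //    "scanInteractive": true, "chunkId": 1234567890, "chunkresultname": "r_1_...",
+    //    "chunkscantables_indexes": [], "queryFragments": [...]}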
+ LOGS(_log, LOG_LVL_WARN, "&&& JobMsg::serializeJson b"); + auto& jsqCstIndexes = jsJobMsg["chunkscantables_indexes"]; + LOGS(_log, LOG_LVL_WARN, "&&& JobMsg::serializeJson c"); + for (auto const& index : _chunkScanTableIndexes) { + LOGS(_log, LOG_LVL_WARN, "&&& JobMsg::serializeJson c1"); + jsqCstIndexes.push_back(index); + } + + LOGS(_log, LOG_LVL_WARN, "&&& JobMsg::serializeJson d"); + auto& jsqFrags = jsJobMsg["queryFragments"]; + LOGS(_log, LOG_LVL_WARN, "&&& JobMsg::serializeJson e"); + for (auto& jFrag : _jobFragments) { + LOGS(_log, LOG_LVL_WARN, "&&& JobMsg::serializeJson e1"); + auto jsFrag = jFrag->serializeJson(); + LOGS(_log, LOG_LVL_WARN, "&&& JobMsg::serializeJson e2"); + jsqFrags.push_back(jsFrag); + } + + LOGS(_log, LOG_LVL_WARN, "&&& JobMsg::serializeJson end"); + return jsJobMsg; +} + +JobMsg::JobMsg(JobSubQueryTempMap::Ptr const& jobSubQueryTempMap, JobDbTablesMap::Ptr const& jobDbTablesMap, + JobId jobId, int attemptCount, std::string const& chunkQuerySpecDb, int scanRating, + bool scanInteractive, int chunkId, std::string const& chunkResultName) + : _jobId(jobId), + _attemptCount(attemptCount), + _chunkQuerySpecDb(chunkQuerySpecDb), + _scanRating(scanRating), + _scanInteractive(scanInteractive), + _chunkId(chunkId), + _chunkResultName(chunkResultName), + _jobSubQueryTempMap(jobSubQueryTempMap), + _jobDbTablesMap(jobDbTablesMap) {} + +JobMsg::Ptr JobMsg::createFromJson(nlohmann::json const& ujJson, + JobSubQueryTempMap::Ptr const& jobSubQueryTempMap, + JobDbTablesMap::Ptr const& jobDbTablesMap) { + LOGS(_log, LOG_LVL_WARN, "&&& JobMsg::createFromJson a"); + LOGS(_log, LOG_LVL_WARN, "&&& JobMsg::createFromJson ujJson=" << ujJson); + JobId jobId = http::RequestBodyJSON::required(ujJson, "jobId"); + LOGS(_log, LOG_LVL_WARN, "&&& JobMsg::createFromJson b"); + int attemptCount = http::RequestBodyJSON::required(ujJson, "attemptCount"); + LOGS(_log, LOG_LVL_WARN, "&&& JobMsg::createFromJson c"); + string chunkQuerySpecDb = http::RequestBodyJSON::required(ujJson, "querySpecDb"); + LOGS(_log, LOG_LVL_WARN, "&&& JobMsg::createFromJson d"); + int scanRating = http::RequestBodyJSON::required(ujJson, "scanPriority"); + LOGS(_log, LOG_LVL_WARN, "&&& JobMsg::createFromJson e"); + bool scanInteractive = http::RequestBodyJSON::required(ujJson, "scanInteractive"); + LOGS(_log, LOG_LVL_WARN, "&&& JobMsg::createFromJson f"); + int chunkId = http::RequestBodyJSON::required(ujJson, "chunkId"); + LOGS(_log, LOG_LVL_WARN, "&&& JobMsg::createFromJson g"); + string chunkResultName = http::RequestBodyJSON::required(ujJson, "chunkresultname"); + LOGS(_log, LOG_LVL_WARN, "&&& JobMsg::createFromJson h"); + + json jsQFrags = http::RequestBodyJSON::required(ujJson, "queryFragments"); + LOGS(_log, LOG_LVL_WARN, "&&& JobMsg::createFromJson i"); + + Ptr jMsgPtr = Ptr(new JobMsg(jobSubQueryTempMap, jobDbTablesMap, jobId, attemptCount, chunkQuerySpecDb, + scanRating, scanInteractive, chunkId, chunkResultName)); + LOGS(_log, LOG_LVL_WARN, "&&& JobMsg::createFromJson j"); + jMsgPtr->_jobFragments = JobFragment::createVectFromJson( + jsQFrags, jMsgPtr->_jobSubQueryTempMap, jMsgPtr->_jobDbTablesMap, jMsgPtr->_chunkResultName); + + LOGS(_log, LOG_LVL_WARN, "&&& JobMsg::createFromJson end"); + return jMsgPtr; +} + +json JobSubQueryTempMap::serializeJson() const { + LOGS(_log, LOG_LVL_WARN, "&&& JobSubQueryTempMap::serializeJson a"); + + // std::map _qTemplateMap; + json jsSubQueryTemplateMap = {{"subquerytemplate_map", json::array()}}; + + LOGS(_log, LOG_LVL_WARN, "&&& JobSubQueryTempMap::serializeJson b"); + 
LOGS(_log, LOG_LVL_WARN,
+         "&&& JobSubQueryTempMap::serializeJson jsSubQueryTemplateMap=" << jsSubQueryTemplateMap);
+    auto& jsSqtMap = jsSubQueryTemplateMap["subquerytemplate_map"];
+    LOGS(_log, LOG_LVL_WARN, "&&& JobSubQueryTempMap::serializeJson c");
+    for (auto const& [key, templ] : _qTemplateMap) {
+        LOGS(_log, LOG_LVL_WARN, "&&& JobSubQueryTempMap::serializeJson c1");
+        json jsElem = {{"index", key}, {"template", templ}};
+        LOGS(_log, LOG_LVL_WARN, "&&& JobSubQueryTempMap::serializeJson c2");
+        jsSqtMap.push_back(jsElem);
+    }
+
+    LOGS(_log, LOG_LVL_WARN, "&&& JobSubQueryTempMap::serializeJson e");
+    LOGS(_log, LOG_LVL_WARN, cName(__func__) << " &&& " << jsSqtMap);
+
+    LOGS(_log, LOG_LVL_WARN, "&&& JobSubQueryTempMap::serializeJson end");
+    return jsSubQueryTemplateMap;
+}
+
+JobSubQueryTempMap::Ptr JobSubQueryTempMap::createFromJson(nlohmann::json const& ujJson) {
+    LOGS(_log, LOG_LVL_WARN, "JobSubQueryTempMap::createFromJson a");
+    Ptr sqtMapPtr = create();
+    LOGS(_log, LOG_LVL_WARN, "JobSubQueryTempMap::createFromJson b");
+    auto& sqtMap = sqtMapPtr->_qTemplateMap;
+    LOGS(_log, LOG_LVL_WARN, "&&& JobSubQueryTempMap::createFromJson " << ujJson);
+    auto const& jsElements = ujJson["subquerytemplate_map"];
+    LOGS(_log, LOG_LVL_WARN, "JobSubQueryTempMap::createFromJson c");
+    for (auto const& jsElem : jsElements) {
+        LOGS(_log, LOG_LVL_WARN, "JobSubQueryTempMap::createFromJson c1");
+        LOGS(_log, LOG_LVL_WARN, "JobSubQueryTempMap::createFromJson jsElem=" << jsElem);
+        //&&&int index = jsElem["index"];
+        int index = http::RequestBodyJSON::required<int>(jsElem, "index");
+        LOGS(_log, LOG_LVL_WARN, "JobSubQueryTempMap::createFromJson c2");
+        //&&&string templ = jsElem["template"];
+        string templ = http::RequestBodyJSON::required<string>(jsElem, "template");
+        LOGS(_log, LOG_LVL_WARN, "JobSubQueryTempMap::createFromJson c3");
+        auto res = sqtMap.insert(make_pair(index, templ));
+        LOGS(_log, LOG_LVL_WARN, "JobSubQueryTempMap::createFromJson c4");
+        if (!res.second) {
+            throw invalid_argument(sqtMapPtr->cName(__func__) + "index=" + to_string(index) + "=" + templ +
+                                   " index already found in " + to_string(ujJson));
+        }
+        LOGS(_log, LOG_LVL_WARN, "JobSubQueryTempMap::createFromJson c5");
+    }
+    LOGS(_log, LOG_LVL_WARN, "JobSubQueryTempMap::createFromJson end");
+    return sqtMapPtr;
+}
+
+int JobSubQueryTempMap::findSubQueryTemp(string const& qTemp) {
+    LOGS(_log, LOG_LVL_WARN, "&&& JobSubQueryTempMap::findSubQueryTemp start");
+    // The number of templates is expected to be small, less than 4,
+    // so this shouldn't be horribly expensive.
+    LOGS(_log, LOG_LVL_WARN, "&&& JobSubQueryTempMap::findSubQueryTemp qTemp=" << qTemp);
+    for (auto const& [key, temp] : _qTemplateMap) {
+        LOGS(_log, LOG_LVL_WARN, "&&& JobSubQueryTempMap::findSubQueryTemp key=" << key << " t=" << temp);
+        if (temp == qTemp) {
+            LOGS(_log, LOG_LVL_WARN, "&&& JobSubQueryTempMap::findSubQueryTemp end key=" << key);
+            return key;
+        }
+    }
+
+    LOGS(_log, LOG_LVL_WARN, "&&& JobSubQueryTempMap::findSubQueryTemp endloop");
+    // Need to insert
+    int index = _qTemplateMap.size();
+    LOGS(_log, LOG_LVL_WARN, "&&& JobSubQueryTempMap::findSubQueryTemp index=" << index);
+    _qTemplateMap[index] = qTemp;
+    LOGS(_log, LOG_LVL_WARN, "&&& JobSubQueryTempMap::findSubQueryTemp end");
+    return index;
+}
+
+int JobDbTablesMap::findDbTable(pair<string, string> const& dbTablePair) {
+    // The number of db+table pairs is expected to be small, less than 4,
+    // so this shouldn't be horribly expensive.
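+    // Linear scan over the existing entries: return the matching index if the
+    // pair is already present; otherwise fall through and insert it at index
+    // _dbTableMap.size().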
+ for (auto const& [key, dbTbl] : _dbTableMap) { + if (dbTablePair == dbTbl) { + return key; + } + } + + // Need to insert + int index = _dbTableMap.size(); + _dbTableMap[index] = dbTablePair; + return index; +} + +json JobDbTablesMap::serializeJson() const { + json jsDbTablesMap = {{"dbtable_map", json::array()}, {"scanrating_map", json::array()}}; + + auto& jsDbTblMap = jsDbTablesMap["dbtable_map"]; + for (auto const& [key, valPair] : _dbTableMap) { + json jsDbTbl = {{"index", key}, {"db", valPair.first}, {"table", valPair.second}}; + jsDbTblMap.push_back(jsDbTbl); + } + + auto& jsScanRatingMap = jsDbTablesMap["scanrating_map"]; + for (auto const& [key, valPair] : _scanRatingMap) { + json jsScanR = {{"index", key}, {"scanrating", valPair.first}, {"lockinmem", valPair.second}}; + jsScanRatingMap.push_back(jsScanR); + } + + LOGS(_log, LOG_LVL_WARN, cName(__func__) << " &&& " << jsDbTablesMap); + + return jsDbTablesMap; +} + +JobDbTablesMap::Ptr JobDbTablesMap::createFromJson(nlohmann::json const& ujJson) { + Ptr dbTablesMapPtr = create(); + auto& dbTblMap = dbTablesMapPtr->_dbTableMap; + auto& scanRMap = dbTablesMapPtr->_scanRatingMap; + + LOGS(_log, LOG_LVL_WARN, "&&& JobDbTablesMap::createFromJson " << ujJson); + + json const& jsDbTbl = ujJson["dbtable_map"]; + LOGS(_log, LOG_LVL_WARN, "&&& JobDbTablesMap::createFromJson dbtbl=" << jsDbTbl); + for (auto const& jsElem : jsDbTbl) { + //&&&int index = jsElem["index"]; + int index = http::RequestBodyJSON::required(jsElem, "index"); + //&&&string db = jsElem["db"]; + string db = http::RequestBodyJSON::required(jsElem, "db"); + //&&&string tbl = jsElem["table"]; + string tbl = http::RequestBodyJSON::required(jsElem, "table"); + auto res = dbTblMap.insert(make_pair(index, make_pair(db, tbl))); + if (!res.second) { + throw invalid_argument(dbTablesMapPtr->cName(__func__) + " index=" + to_string(index) + "=" + db + + +"." + tbl + " index already found in " + to_string(jsDbTbl)); + } + } + + json const& jsScanR = ujJson["scanrating_map"]; + LOGS(_log, LOG_LVL_WARN, "&&& JobDbTablesMap::createFromJson jsScanR=" << jsScanR); + for (auto const& jsElem : jsScanR) { + //&&&int index = jsElem["index"]; + int index = http::RequestBodyJSON::required(jsElem, "index"); + //&&&int scanR = jsElem["scanrating"]; + int scanR = http::RequestBodyJSON::required(jsElem, "scanrating"); + //&&&bool lockInMem = jsElem["lockinmem"]; + bool lockInMem = http::RequestBodyJSON::required(jsElem, "lockinmem"); + auto res = scanRMap.insert(make_pair(index, make_pair(scanR, lockInMem))); + if (!res.second) { + throw invalid_argument(dbTablesMapPtr->cName(__func__) + " index=" + to_string(index) + "=" + + to_string(scanR) + +", " + to_string(lockInMem) + + " index already found in " + to_string(jsDbTbl)); + } + } + + return dbTablesMapPtr; +} + +void JobDbTablesMap::setScanRating(int index, int scanRating, bool lockInMemory) { + auto iter = _scanRatingMap.find(index); + if (iter == _scanRatingMap.end()) { + _scanRatingMap[index] = make_pair(scanRating, lockInMemory); + } else { + auto& elem = *iter; + auto& pr = elem.second; + auto& [sRating, lInMem] = pr; + if (sRating != scanRating || lInMem != lockInMemory) { + auto [dbName, tblName] = getDbTable(index); + LOGS(_log, LOG_LVL_ERROR, + cName(__func__) << " unexpected change in scanRating for " << dbName << "." 
<< tblName + << " from " << sRating << " to " << scanRating << " lockInMemory from " + << lInMem << " to " << lockInMemory); + if (scanRating > sRating) { + sRating = scanRating; + lInMem = lockInMemory; + } + } + } +} + +JobFragment::JobFragment(JobSubQueryTempMap::Ptr const& jobSubQueryTempMap, + JobDbTablesMap::Ptr const& jobDbTablesMap, std::string const& resultTblName) + : _jobSubQueryTempMap(jobSubQueryTempMap), + _jobDbTablesMap(jobDbTablesMap), + _resultTblName(resultTblName) { + LOGS(_log, LOG_LVL_WARN, + "&&& JobFragment::JobFragment _jobSubQueryTempMap!=nullptr=" << (_jobSubQueryTempMap != nullptr)); + LOGS(_log, LOG_LVL_WARN, + "&&& JobFragment::JobFragment _jobDbTablesMap!=nullptr=" << (_jobDbTablesMap != nullptr)); + LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::JobFragment resultTblName=" << resultTblName); +} + +vector JobFragment::createVect(qproc::ChunkQuerySpec const& chunkQuerySpec, + JobSubQueryTempMap::Ptr const& jobSubQueryTempMap, + JobDbTablesMap::Ptr const& jobDbTablesMap, + string const& resultTable) { + LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::createVect start"); + + vector jFragments; + LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::createVect a"); + if (chunkQuerySpec.nextFragment.get()) { + LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::createVect a1"); + qproc::ChunkQuerySpec const* sPtr = &chunkQuerySpec; + while (sPtr) { + LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::createVect a1a"); + LOGS(_log, LOG_LVL_TRACE, "nextFragment"); + for (unsigned int t = 0; t < (sPtr->queries).size(); t++) { // &&& del loop + LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::createVect a1a1"); + LOGS(_log, LOG_LVL_DEBUG, __func__ << " q=" << (sPtr->queries).at(t)); + } + LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::createVect a2"); + for (auto const& sbi : sPtr->subChunkIds) { // &&& del loop + LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::createVect a2a"); + LOGS(_log, LOG_LVL_DEBUG, __func__ << " sbi=" << sbi); + } + // Linked fragments will not have valid subChunkTables vectors, + // So, we reuse the root fragment's vector. 
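+            // For that reason the call below passes chunkQuerySpec.subChunkTables
+            // (the root fragment's tables) for every linked fragment rather than
+            // sPtr->subChunkTables.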
+ LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::createVect a3"); + _addFragment(jFragments, resultTable, chunkQuerySpec.subChunkTables, sPtr->subChunkIds, + sPtr->queries, jobSubQueryTempMap, jobDbTablesMap); + sPtr = sPtr->nextFragment.get(); + } + LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::createVect a4"); + } else { + LOGS(_log, LOG_LVL_TRACE, "no nextFragment"); + LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::createVect b1"); + for (unsigned int t = 0; t < (chunkQuerySpec.queries).size(); t++) { // &&& del loop + LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::createVect b1a"); + LOGS(_log, LOG_LVL_TRACE, (chunkQuerySpec.queries).at(t)); + } + LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::createVect b2"); + _addFragment(jFragments, resultTable, chunkQuerySpec.subChunkTables, chunkQuerySpec.subChunkIds, + chunkQuerySpec.queries, jobSubQueryTempMap, jobDbTablesMap); + LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::createVect b3"); + } + + LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::createVect end"); + return jFragments; +} + +void JobFragment::_addFragment(std::vector& jFragments, std::string const& resultTblName, + DbTableSet const& subChunkTables, std::vector const& subchunkIds, + std::vector const& queries, + JobSubQueryTempMap::Ptr const& subQueryTemplates, + JobDbTablesMap::Ptr const& dbTablesMap) { + LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::_addFragment a"); + Ptr jFrag = Ptr(new JobFragment(subQueryTemplates, dbTablesMap, resultTblName)); + + // queries: The query string is stored in `_jobSubQueryTempMap` and the list of + // integer indexes, `_subQueryTempIndexes`, points back to the specific template. + LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::_addFragment b"); + for (auto& qry : queries) { + LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::_addFragment b1"); + int index = jFrag->_jobSubQueryTempMap->findSubQueryTemp(qry); + LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::_addFragment b2"); + jFrag->_jobSubQueryTempIndexes.push_back(index); + LOGS(_log, LOG_LVL_INFO, jFrag->cName(__func__) << "&&& added frag=" << qry << " index=" << index); + LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::_addFragment b4"); + } + + // Add the db+table pairs to the subchunks for the fragment. + LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::_addFragment c"); + for (auto& tbl : subChunkTables) { + LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::_addFragment c1"); + int index = jFrag->_jobDbTablesMap->findDbTable(make_pair(tbl.db, tbl.table)); + LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::_addFragment c2"); + jFrag->_jobDbTablesIndexes.push_back(index); + LOGS(_log, LOG_LVL_INFO, + jFrag->cName(__func__) << "&&& added dbtbl=" << tbl.db << "." 
<< tbl.table + << " index=" << index); + } + LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::_addFragment d"); + + // Add subchunk id numbers + for (auto& subchunkId : subchunkIds) { + LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::_addFragment d1"); + jFrag->_subchunkIds.push_back(subchunkId); + LOGS(_log, LOG_LVL_INFO, jFrag->cName(__func__) << "&&& added subchunkId=" << subchunkId); + } + LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::_addFragment e"); + + jFragments.push_back(move(jFrag)); + LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::_addFragment end"); +} + +nlohmann::json JobFragment::serializeJson() const { + LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::serializeJson a"); + + json jsFragment = {{"resulttblname", _resultTblName}, + {"subquerytemplate_indexes", _jobSubQueryTempIndexes}, + {"dbtables_indexes", _jobDbTablesIndexes}, + {"subchunkids", _subchunkIds}}; + LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::serializeJson b"); + + LOGS(_log, LOG_LVL_WARN, cName(__func__) << " &&& " << jsFragment); + + LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::serializeJson end"); + return jsFragment; +} + +JobFragment::Vect JobFragment::createVectFromJson(nlohmann::json const& jsFrags, + JobSubQueryTempMap::Ptr const& jobSubQueryTempMap, + JobDbTablesMap::Ptr const& dbTablesMap, + std::string const& resultTblName) { + LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::createVectFromJson " << jsFrags); + LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::createVectFromJson a"); + + JobFragment::Vect jobFragments; + + for (auto const& jsFrag : jsFrags) { + LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::createVectFromJson b"); + Ptr jobFrag = Ptr(new JobFragment(jobSubQueryTempMap, dbTablesMap, resultTblName)); + + jobFrag->_resultTblName = http::RequestBodyJSON::required(jsFrag, "resulttblname"); + if (jobFrag->_resultTblName != resultTblName) { + // &&& hoping to remove _resultTblName from JobFragment. + LOGS(_log, LOG_LVL_ERROR, + jobFrag->cName(__func__) + " _resultTblName != resultTblName for " + to_string(jsFrag)); + throw util::Bug(ERR_LOC, jobFrag->cName(__func__) + " _resultTblName != resultTblName for " + + to_string(jsFrag)); + } + + LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::createVectFromJson c"); + //&&&std::vector _jobSubQueryTempIndexes; ///< &&& doc + jobFrag->_jobSubQueryTempIndexes = jsFrag["subquerytemplate_indexes"].get>(); + for (int j : jobFrag->_jobSubQueryTempIndexes) { + try { + string tem = jobSubQueryTempMap->getSubQueryTemp(j); + LOGS(_log, LOG_LVL_WARN, jobFrag->cName(__func__) << " &&&T j=" << j << " =" << tem); + } catch (std::out_of_range const& ex) { + LOGS(_log, LOG_LVL_ERROR, + jobFrag->cName(__func__) << " index=" << j << " not found in template map " << jsFrag); + // rethrow as something callers expect. + throw std::invalid_argument(jobFrag->cName(__func__) + " template index=" + to_string(j) + + " " + ex.what()); + } + } + + LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::createVectFromJson d"); + jobFrag->_jobDbTablesIndexes = jsFrag["dbtables_indexes"].get>(); + for (int j : jobFrag->_jobDbTablesIndexes) { + try { + auto dbTblPr = dbTablesMap->getDbTable(j); + LOGS(_log, LOG_LVL_WARN, + jobFrag->cName(__func__) + << " &&&T j=" << j << " =" << dbTblPr.first << "." << dbTblPr.second); + } catch (std::out_of_range const& ex) { + LOGS(_log, LOG_LVL_ERROR, + jobFrag->cName(__func__) << " index=" << j << " not found in dbTable map " << jsFrag); + // rethrow as something callers expect. 
+                throw std::invalid_argument(jobFrag->cName(__func__) + " dbtable index=" + to_string(j) +
+                                            " " + ex.what());
+            }
+        }
+
+        LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::createVectFromJson e");
+        jobFrag->_subchunkIds = jsFrag["subchunkids"].get<std::vector<int>>();
+        jobFragments.push_back(jobFrag);
+    }
+
+    LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::createVectFromJson end");
+    return jobFragments;
+}
+
+} // namespace lsst::qserv::protojson
diff --git a/src/protojson/UberJobMsg.h b/src/protojson/UberJobMsg.h
new file mode 100644
index 000000000..b9dafaa28
--- /dev/null
+++ b/src/protojson/UberJobMsg.h
@@ -0,0 +1,297 @@
+/*
+ * LSST Data Management System
+ *
+ * This product includes software developed by the
+ * LSST Project (http://www.lsst.org/).
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the LSST License Statement and
+ * the GNU General Public License along with this program. If not,
+ * see .
+ */
+#ifndef LSST_QSERV_PROTOJSON_UBERJOBMSG_H
+#define LSST_QSERV_PROTOJSON_UBERJOBMSG_H
+
+// System headers
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+// Third party headers
+#include "nlohmann/json.hpp"
+
+// qserv headers
+#include "global/clock_defs.h"
+#include "global/DbTable.h"
+#include "global/intTypes.h"
+#include "protojson/WorkerQueryStatusData.h"
+
+namespace lsst::qserv::qdisp {
+class JobQuery;
+}
+
+namespace lsst::qserv::qproc {
+class ChunkQuerySpec;
+}
+
+// This header declarations
+namespace lsst::qserv::protojson {
+
+/// This class is used to store query template strings in a reasonably
+/// concise fashion.
+/// The same templates recur frequently, so each individual occurrence
+/// is replaced with an integer index, and this class is used to recover the
+/// original template.
+class JobSubQueryTempMap {
+public:
+    using Ptr = std::shared_ptr<JobSubQueryTempMap>;
+
+    std::string cName(const char* fName) const { return std::string("JobSubQueryTempMap::") + fName; }
+
+    JobSubQueryTempMap(JobSubQueryTempMap const&) = delete;
+
+    static Ptr create() { return Ptr(new JobSubQueryTempMap()); }
+
+    /// &&& doc
+    static Ptr createFromJson(nlohmann::json const& ujJson);
+
+    /// Find or insert qTemp into the map and return its index.
+    int findSubQueryTemp(std::string const& qTemp);
+
+    /// Return the SubQueryTemp string at `index`.
+    /// @throws std::out_of_range
+    std::string getSubQueryTemp(int index) { return _qTemplateMap.at(index); }
+
+    nlohmann::json serializeJson() const;
+
+private:
+    JobSubQueryTempMap() = default;
+
+    std::map<int, std::string> _qTemplateMap;
+};
+
+/// This class is used to store db.table names in a reasonably concise fashion.
+/// The same db+table name pairs recur frequently, so each individual occurrence
+/// is replaced with an integer index, and this class is used to recover the
+/// complete names.
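+/// For example (illustrative values, not taken from the source): if "db1.Object"
+/// is assigned index 0, every job that scans db1.Object serializes only the
+/// integer 0, and the receiving worker resolves it back through this map.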
+class JobDbTablesMap { +public: + using Ptr = std::shared_ptr; + + std::string cName(const char* fName) const { return std::string("JobDbTablesMap::") + fName; } + + JobDbTablesMap(JobDbTablesMap const&) = delete; + + static Ptr create() { return Ptr(new JobDbTablesMap()); } + + /// &&& doc + static Ptr createFromJson(nlohmann::json const& ujJson); + + /// Find or insert the db.table pair into the map and return its index. + int findDbTable(std::pair const& dbTablePair); + + /// Return the db.table pair at `index`. + /// @throws std::out_of_range + std::pair getDbTable(int index) { return _dbTableMap.at(index); } + + /// &&& doc + void setScanRating(int index, int scanRating, bool lockInMemory); + + /// Return scanRating(int) and lockInMemory(bool) for the dbTable at `index`. + /// TODO:UJ &&& lockInMemory is expected to go away. + std::pair getScanRating(int index) { return _scanRatingMap[index]; } + + nlohmann::json serializeJson() const; + +private: + JobDbTablesMap() = default; + + /// Map of db name and table name pairs: db first, table second. + /// The order in the map is arbitrary, but must be consistent + /// so that lookups using the int index always return the same pair. + std::map> _dbTableMap; + + /// Key is dbTable index, val is scanRating(int) lockInMemory(bool) + std::map> _scanRatingMap; +}; + +/// This class stores the contents of a query fragment, which will be reconstructed +/// and run on a worker to help answer a user query. +class JobFragment { +public: + using Ptr = std::shared_ptr; + using Vect = std::vector; + + std::string cName(const char* fName) const { return std::string("JobFragment::") + fName; } + + JobFragment() = delete; + JobFragment(JobFragment const&) = delete; + + static Vect createVect(qproc::ChunkQuerySpec const& chunkQuerySpec, + JobSubQueryTempMap::Ptr const& jobSubQueryTempMap, + JobDbTablesMap::Ptr const& dbTablesMap, std::string const& resultTblName); + + /// &&& doc + static Vect createVectFromJson(nlohmann::json const& ujJson, + JobSubQueryTempMap::Ptr const& jobSubQueryTempMap, + JobDbTablesMap::Ptr const& dbTablesMap, std::string const& resultTblName); + + /// Return a json version of the contents of this class. + nlohmann::json serializeJson() const; + +private: + JobFragment(JobSubQueryTempMap::Ptr const& subQueryTemplates, JobDbTablesMap::Ptr const& dbTablesMap, + std::string const& resultTblName); + + /// &&& doc + static void _addFragment(std::vector& jFragments, std::string const& resultTblName, + DbTableSet const& subChunkTables, std::vector const& subchunkIds, + std::vector const& queries, + JobSubQueryTempMap::Ptr const& subQueryTemplates, + JobDbTablesMap::Ptr const& dbTablesMap); + + JobSubQueryTempMap::Ptr _jobSubQueryTempMap; ///< &&& doc + std::vector _jobSubQueryTempIndexes; ///< &&& doc + + JobDbTablesMap::Ptr _jobDbTablesMap; ///< &&& doc + std::vector _jobDbTablesIndexes; ///< &&& doc + + std::vector _subchunkIds; ///< &&& doc + + std::string _resultTblName; ///< &&& doc &&& probably not needed here. Replace with + ///< JobMsg::_chunkResultName field. +}; + +/// This class is used to store the information for a single Job (the queries and metadata +/// required to collect rows from a single chunk) in a reasonable manner. 
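+/// A JobMsg does not carry the query template or db.table strings themselves;
+/// it stores integer indexes into the JobSubQueryTempMap and JobDbTablesMap
+/// shared by the enclosing UberJobMsg, which keeps repeated strings out of the
+/// serialized message.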
+class JobMsg { +public: + using Ptr = std::shared_ptr; + using Vect = std::vector; + std::string cName(const char* fnc) const { return std::string("JobMsg::") + fnc; } + + JobMsg() = delete; + JobMsg(JobMsg const&) = delete; + JobMsg& operator=(JobMsg const&) = delete; + + static Ptr create(std::shared_ptr const& jobs, + JobSubQueryTempMap::Ptr const& jobSubQueryTempMap, + JobDbTablesMap::Ptr const& jobDbTablesMap); + + /// &&& doc + static Ptr createFromJson(nlohmann::json const& ujJson, JobSubQueryTempMap::Ptr const& subQueryTemplates, + JobDbTablesMap::Ptr const& dbTablesMap); + + /// Return a json version of the contents of this class. + nlohmann::json serializeJson() const; + +private: + JobMsg(std::shared_ptr const& jobPtr, JobSubQueryTempMap::Ptr const& jobSubQueryTempMap, + JobDbTablesMap::Ptr const& jobDbTablesMap); + + JobMsg(JobSubQueryTempMap::Ptr const& jobSubQueryTempMap, JobDbTablesMap::Ptr const& jobDbTablesMap, + JobId jobId, int attemptCount, std::string const& chunkQuerySpecDb, int scanRating, + bool scanInteractive, int chunkId, std::string const& chunkResultName); + + JobId _jobId; + int _attemptCount; + std::string _chunkQuerySpecDb; + int _scanRating; + bool _scanInteractive; + int _chunkId; + std::string _chunkResultName; + JobFragment::Vect _jobFragments; + + JobSubQueryTempMap::Ptr _jobSubQueryTempMap; ///< Map of all query templates related to this UberJob. + JobDbTablesMap::Ptr _jobDbTablesMap; ///< Map of all db.tables related to this UberJob. + + std::vector _chunkScanTableIndexes; ///< list of indexes into _jobDbTablesMap. +}; + +/// This class stores an UberJob, a collection of Jobs meant for a +/// specific worker, so it can be converted to and from a json format +/// and sent to a worker. +/// There are several fields which are the same for each job, so these +/// values are stored in maps and the individual Jobs and Fragments +/// use integer indexes to reduce the size of the final message. +class UberJobMsg : public std::enable_shared_from_this { +public: + using Ptr = std::shared_ptr; + std::string cName(const char* fnc) const { return std::string("UberJobMsg::") + fnc; } + + UberJobMsg() = delete; + UberJobMsg(UberJobMsg const&) = delete; + UberJobMsg& operator=(UberJobMsg const&) = delete; + + static Ptr create(unsigned int metaVersion, std::string const& replicationInstanceId, + std::string const& replicationAuthKey, CzarContactInfo::Ptr const& czInfo, + WorkerContactInfo::Ptr const& wInfo, QueryId qId, UberJobId ujId, int rowLimit, + int maxTableSizeMB, std::vector> const& jobs) { + return Ptr(new UberJobMsg(metaVersion, replicationInstanceId, replicationAuthKey, czInfo, wInfo->wId, + qId, ujId, rowLimit, maxTableSizeMB, jobs)); + } + + static Ptr createFromJson(nlohmann::json const& ujJson); + + /// Return a json version of the contents of this class. 
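+    /// The result is expected to round-trip: createFromJson(serializeJson())
+    /// should rebuild an equivalent message (this is what testUberJobMsg.cc checks).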
+ nlohmann::json serializeJson() const; + + std::string dump() const; + +private: + UberJobMsg(unsigned int metaVersion, std::string const& replicationInstanceId, + std::string const& replicationAuthKey, + //&&&CzarContactInfo::Ptr const& czInfo, WorkerContactInfo::Ptr const& wInfo, + CzarContactInfo::Ptr const& czInfo, std::string const& workerId, QueryId qId, UberJobId ujId, + int rowLimit, int maxTableSizeMB, std::vector> const& jobs); + + unsigned int _metaVersion; // "version", http::MetaModule::version + // czar + std::string _replicationInstanceId; // "instance_id", czarConfig->replicationInstanceId() + std::string _replicationAuthKey; //"auth_key", czarConfig->replicationAuthKey() + //&&& auto [ciwId, ciwHost, ciwManagment, ciwPort] = _wContactInfo->getAll(); (string, string, string, + //int) + CzarContactInfo::Ptr _czInfo; + std::string _workerId; // "worker", ciwId + //&&&WorkerContactInfo::Ptr _wInfo; // &&& probably not needed + // &&& {"czarinfo", + //&&&std::string _czarName; // "name", czarConfig->name() + //&&&qmeta::czarId _czarId; // "id", czarConfig->id() + //&&&uint16_t _czarManagementPort; // "management-port", czarConfig->replicationHttpPort() + //&&&std::string _czarManagementHostName; // "management-host-name", util::get_current_host_fqdn() + // &&& } + // &&&{"uberjob", + QueryId _qId; // "queryid", _queryId + UberJobId _ujId; // "uberjobid", _uberJobId + //&&& CzarIdType _czarId; // "czarid", _czarId + int _rowLimit; // "rowlimit", _rowLimit + int _maxTableSizeMB; // &&& Need to add initialization. + + std::vector> _jobs; // &&& needs to be replaced with jobData + // &&& }; + + /// Map of all query templates related to this UberJob. + JobSubQueryTempMap::Ptr _jobSubQueryTempMap{JobSubQueryTempMap::create()}; + + /// Map of all db.tables related to this UberJob. + JobDbTablesMap::Ptr _jobDbTablesMap{JobDbTablesMap::create()}; + + /// List of all job data in this UberJob. "jobs", json::array() + JobMsg::Vect _jobMsgVect; +}; + +} // namespace lsst::qserv::protojson + +#endif // LSST_QSERV_PROTOJSON_UBERJOBMSG_H diff --git a/src/protojson/WorkerQueryStatusData.cc b/src/protojson/WorkerQueryStatusData.cc index e8015c8db..ea3916b6f 100644 --- a/src/protojson/WorkerQueryStatusData.cc +++ b/src/protojson/WorkerQueryStatusData.cc @@ -270,6 +270,7 @@ WorkerQueryStatusData::Ptr WorkerQueryStatusData::createFromJson(nlohmann::json LOGS(_log, LOG_LVL_ERROR, "WorkerQueryStatusData::createJson czar or worker info could not be parsed in " << jsWorkerReq); + return nullptr; } auto wqsData = WorkerQueryStatusData::create(wInfo_, czInfo_, replicationInstanceId_, replicationAuthKey_); diff --git a/src/protojson/WorkerQueryStatusData.h b/src/protojson/WorkerQueryStatusData.h index 1e7132dbc..73aebe244 100644 --- a/src/protojson/WorkerQueryStatusData.h +++ b/src/protojson/WorkerQueryStatusData.h @@ -18,8 +18,8 @@ * the GNU General Public License along with this program. If not, * see . 
*/ -#ifndef LSST_QSERV_HTTP_WORKERQUERYSTATUSDATA_H -#define LSST_QSERV_HTTP_WORKERQUERYSTATUSDATA_H +#ifndef LSST_QSERV_PROTOJSON_WORKERQUERYSTATUSDATA_H +#define LSST_QSERV_PROTOJSON_WORKERQUERYSTATUSDATA_H // System headers #include @@ -475,4 +475,4 @@ class WorkerUberJobMsg { } // namespace lsst::qserv::protojson -#endif // LSST_QSERV_HTTP_WORKERQUERYSTATUSDATA_H +#endif // LSST_QSERV_PROTOJSON_WORKERQUERYSTATUSDATA_H diff --git a/src/protojson/testUberJobMsg.cc b/src/protojson/testUberJobMsg.cc new file mode 100644 index 000000000..0db8db8c5 --- /dev/null +++ b/src/protojson/testUberJobMsg.cc @@ -0,0 +1,108 @@ +/* + * LSST Data Management System + * + * This product includes software developed by the + * LSST Project (http://www.lsst.org/). + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the LSST License Statement and + * the GNU General Public License along with this program. If not, + * see . + */ + +// System headers +#include +#include +#include +#include +#include + +#include "nlohmann/json.hpp" + +// Qserv headers +#include "global/clock_defs.h" +#include "lsst/log/Log.h" +#include "protojson/UberJobMsg.h" + +// Boost unit test header +#define BOOST_TEST_MODULE RequestQuery +#include + +namespace { +LOG_LOGGER _log = LOG_GET("lsst.qserv.protojson.testUberJobMsg"); +} + +using namespace std; +namespace test = boost::test_tools; +using namespace lsst::qserv::protojson; + +BOOST_AUTO_TEST_SUITE(Suite) +/* &&& +std::string testA() { + std::string ta = +R"({"maxtablesizemb":5432,"auth_key":"replauthkey","czarinfo":{"czar-startup-time":1732658208085,"id":1,"management-host-name":"3a8b68cf9b67","management-port":40865,"name":"proxy"},"dbtables_map":{"dbtable_map":[],"scanrating_map":[]},"instance_id":"qserv_proj","jobs":[{"attemptCount":0,"chunkId":1234567890,"chunkscantables_indexes":[],"jobId":0,"queryFragments":[{"dbtables_indexes":[],"resulttblname":"r_1_a0d45001254932466b784acf90323565_1234567890_0","subchunkids":[],"subquerytemplate_indexes":[0]}],"querySpecDb":"qcase01","scanInteractive":true,"scanPriority":0}],"queryid":1,"rowlimit":0,"subqueries_map":{"subquerytemplate_map":[{"index":0,"template":"SELECT +`qcase01.Filter`.`filterId` AS `filterId`,`qcase01.Filter`.`filterName` AS +`filterName`,`qcase01.Filter`.`photClam` AS `photClam`,`qcase01.Filter`.`photBW` AS `photBW` FROM +`qcase01`.`Filter` AS `qcase01.Filter` WHERE +(`qcase01.Filter`.`filterId`<<1)=2"}]},"uberjobid":2,"version":39,"worker":"6c56ba9b-ac40-11ef-acb7-0242c0a8030a"})"; + return ta; +} +*/ + +std::string testA() { + std::string ta = + 
R"({"maxtablesizemb":5432,"auth_key":"replauthkey","czarinfo":{"czar-startup-time":1732658208085,"id":1,"management-host-name":"3a8b68cf9b67","management-port":40865,"name":"proxy"},"dbtables_map":{"dbtable_map":[],"scanrating_map":[]},"instance_id":"qserv_proj","jobs":[{"attemptCount":0,"chunkId":1234567890,"chunkresultname":"r_1_a0d45001254932466b784acf90323565_1234567890_0","chunkscantables_indexes":[],"jobId":0,"queryFragments":[{"dbtables_indexes":[],"resulttblname":"r_1_a0d45001254932466b784acf90323565_1234567890_0","subchunkids":[],"subquerytemplate_indexes":[0]}],"querySpecDb":"qcase01","scanInteractive":true,"scanPriority":0}],"queryid":1,"rowlimit":0,"subqueries_map":{"subquerytemplate_map":[{"index":0,"template":"SELECT `qcase01.Filter`.`filterId` AS `filterId`,`qcase01.Filter`.`filterName` AS `filterName`,`qcase01.Filter`.`photClam` AS `photClam`,`qcase01.Filter`.`photBW` AS `photBW` FROM `qcase01`.`Filter` AS `qcase01.Filter` WHERE (`qcase01.Filter`.`filterId`<<1)=2"}]},"uberjobid":2,"version":39,"worker":"6c56ba9b-ac40-11ef-acb7-0242c0a8030a"})"; + return ta; +} + +BOOST_AUTO_TEST_CASE(WorkerQueryStatusData) { + string const replicationInstanceId = "repliInstId"; + string const replicationAuthKey = "repliIAuthKey"; + + uint64_t cxrStartTime = lsst::qserv::millisecSinceEpoch(lsst::qserv::CLOCK::now() - 5s); + //&&&uint64_t wkrStartTime = lsst::qserv::millisecSinceEpoch(lsst::qserv::CLOCK::now() - 10s); + + string const czrName("czar_name"); + lsst::qserv::CzarIdType const czrId = 32; + int czrPort = 2022; + string const czrHost("cz_host"); + LOGS(_log, LOG_LVL_WARN, "&&& testUJM a"); + auto czarA = + lsst::qserv::protojson::CzarContactInfo::create(czrName, czrId, czrPort, czrHost, cxrStartTime); + + LOGS(_log, LOG_LVL_WARN, "&&& testUJM b"); + string jsStr = testA(); + nlohmann::json js = nlohmann::json::parse(jsStr); + UberJobMsg::Ptr ujm = UberJobMsg::createFromJson(js); + BOOST_REQUIRE(ujm != nullptr); + LOGS(_log, LOG_LVL_WARN, "&&& testUJM c"); + + nlohmann::json jsUjm = ujm->serializeJson(); + LOGS(_log, LOG_LVL_WARN, "&&& testUJM d"); + + LOGS(_log, LOG_LVL_INFO, "js=" << js); + LOGS(_log, LOG_LVL_INFO, "jsUjm=" << jsUjm); + + UberJobMsg::Ptr ujmCreated = UberJobMsg::createFromJson(jsUjm); + LOGS(_log, LOG_LVL_INFO, "ujmCreated=" << ujmCreated); + nlohmann::json jsUjmCreated = ujmCreated->serializeJson(); + + bool createdMatchesOriginal = jsUjm == jsUjmCreated; + if (!createdMatchesOriginal) { + LOGS(_log, LOG_LVL_ERROR, "jsUjm != jsUjmCreated"); + LOGS(_log, LOG_LVL_ERROR, "jsUjm=" << jsUjm); + LOGS(_log, LOG_LVL_ERROR, "jsUjmCreated=" << jsUjmCreated); + } + BOOST_REQUIRE(createdMatchesOriginal); +} + +BOOST_AUTO_TEST_SUITE_END() diff --git a/src/qdisp/JobDescription.cc b/src/qdisp/JobDescription.cc index dca19f52c..e5786c8f5 100644 --- a/src/qdisp/JobDescription.cc +++ b/src/qdisp/JobDescription.cc @@ -89,10 +89,11 @@ bool JobDescription::incrAttemptCountScrubResultsJson(std::shared_ptr } // build the request - auto js = _taskMsgFactory->makeMsgJson(*_chunkQuerySpec, _chunkResultName, _queryId, _jobId, + auto js = _taskMsgFactory->makeMsgJson(*_chunkQuerySpec, _chunkResultName, _queryId, + _jobId, // &&& should be able to delete this _attemptCount, _czarId); LOGS(_log, LOG_LVL_DEBUG, "JobDescription::" << __func__ << " js=" << (*js)); - _jsForWorker = js; + _jsForWorker = js; // &&& should be able to delete _jsForWorker return true; } diff --git a/src/qdisp/JobDescription.h b/src/qdisp/JobDescription.h index a3a208c1d..10a9f13ba 100644 --- 
a/src/qdisp/JobDescription.h +++ b/src/qdisp/JobDescription.h @@ -82,6 +82,8 @@ class JobDescription { ResourceUnit const& resource() const { return _resource; } std::shared_ptr respHandler() { return _respHandler; } int getAttemptCount() const { return _attemptCount; } + std::shared_ptr getChunkQuerySpec() { return _chunkQuerySpec; } + std::string getChunkResultName() { return _chunkResultName; } bool getScanInteractive() const; int getScanRating() const; diff --git a/src/qdisp/UberJob.cc b/src/qdisp/UberJob.cc index ffedb593d..8768458f7 100644 --- a/src/qdisp/UberJob.cc +++ b/src/qdisp/UberJob.cc @@ -37,6 +37,7 @@ #include "http/Client.h" #include "http/MetaModule.h" #include "proto/worker.pb.h" +#include "protojson/UberJobMsg.h" #include "qdisp/JobQuery.h" #include "qmeta/JobStatus.h" #include "util/Bug.h" @@ -98,10 +99,12 @@ bool UberJob::addJob(JobQuery::Ptr const& job) { void UberJob::runUberJob() { LOGS(_log, LOG_LVL_DEBUG, cName(__func__) << " start"); + LOGS(_log, LOG_LVL_ERROR, "&&& jsonTESTrequest start"); // Build the uberjob payload for each job. nlohmann::json uj; unique_lock jobsLock(_jobsMtx); auto exec = _executive.lock(); +#if 1 // &&& for (auto const& jqPtr : _jobs) { jqPtr->getDescription()->incrAttemptCountScrubResultsJson(exec, true); } @@ -112,6 +115,7 @@ void UberJob::runUberJob() { string const url = "http://" + ciwHost + ":" + to_string(ciwPort) + "/queryjob"; vector const headers = {"Content-Type: application/json"}; auto const& czarConfig = cconfig::CzarConfig::instance(); + // See xrdsvc::httpWorkerCzarModule::_handleQueryJob for json message parsing. json request = {{"version", http::MetaModule::version}, {"instance_id", czarConfig->replicationInstanceId()}, @@ -144,6 +148,44 @@ void UberJob::runUberJob() { jsJobs.push_back(jsJob); jbPtr->getDescription()->resetJsForWorker(); // no longer needed. } +#else // &&& + LOGS(_log, LOG_LVL_ERROR, "&&& jsonTESTrequest a"); + // Send the uberjob to the worker + auto const method = http::Method::POST; + auto [ciwId, ciwHost, ciwManagment, ciwPort] = _wContactInfo->getAll(); + LOGS(_log, LOG_LVL_ERROR, "&&& jsonTESTrequest b"); + string const url = "http://" + ciwHost + ":" + to_string(ciwPort) + "/queryjob"; + vector const headers = {"Content-Type: application/json"}; + auto const& czarConfig = cconfig::CzarConfig::instance(); + LOGS(_log, LOG_LVL_ERROR, "&&& jsonTESTrequest c"); + + int maxTableSizeMB = czarConfig->getMaxTableSizeMB(); + LOGS(_log, LOG_LVL_ERROR, "&&& jsonTESTrequest d"); + auto czInfo = protojson::CzarContactInfo::create( + czarConfig->name(), czarConfig->id(), czarConfig->replicationHttpPort(), + util::get_current_host_fqdn(), czar::Czar::czarStartupTime); + LOGS(_log, LOG_LVL_ERROR, "&&& jsonTESTrequest e"); + auto uberJobMsg = protojson::UberJobMsg::create( + http::MetaModule::version, czarConfig->replicationInstanceId(), czarConfig->replicationAuthKey(), + czInfo, _wContactInfo, _queryId, _uberJobId, _rowLimit, maxTableSizeMB, _jobs); + LOGS(_log, LOG_LVL_ERROR, "&&& jsonTESTrequest f"); + json request = uberJobMsg->serializeJson(); + LOGS(_log, LOG_LVL_ERROR, "&&& jsonTESTrequest g"); + LOGS(_log, LOG_LVL_ERROR, "&&& jsonTESTrequest=" << request); + + { // &&& testing only, delete + auto parsedReq = protojson::UberJobMsg::createFromJson(request); + json jsParsedReq = parsedReq->serializeJson(); + if (request == jsParsedReq) { + LOGS(_log, LOG_LVL_ERROR, cName(__func__) << " &&& YAY!!! 
"); + } else { + LOGS(_log, LOG_LVL_ERROR, cName(__func__) << " &&& request != jsParsedReq"); + LOGS(_log, LOG_LVL_ERROR, "&&& request=" << request); + LOGS(_log, LOG_LVL_ERROR, "&&& jsParsedReq=" << jsParsedReq); + } + } + +#endif // &&& jobsLock.unlock(); // unlock so other _jobsMtx threads can advance while this waits for transmit LOGS(_log, LOG_LVL_DEBUG, cName(__func__) << " REQ " << request); diff --git a/src/xrdsvc/HttpWorkerCzarModule.cc b/src/xrdsvc/HttpWorkerCzarModule.cc index 08b153682..609f89e82 100644 --- a/src/xrdsvc/HttpWorkerCzarModule.cc +++ b/src/xrdsvc/HttpWorkerCzarModule.cc @@ -109,7 +109,8 @@ json HttpWorkerCzarModule::_handleQueryJob(string const& func) { json jsRet; vector ujTasks; try { - // See qdisp::UberJob::runUberJob() for json message construction. +#if 1 // &&& + // See qdisp::UberJob::runUberJob() for json message construction. auto const& jsReq = body().objJson; string const targetWorkerId = body().required("worker"); @@ -198,6 +199,11 @@ json HttpWorkerCzarModule::_handleQueryJob(string const& func) { } scanInfo.scanRating = jdScanPriority; } +#else // &&& + auto const& jsReq = body().objJson; + auto uberJobMsg = protojson::UberJobMsg::createFromJson(jsReq); + // && fill in values +#endif //&&& ujData->setScanInteractive(jdScanInteractive); From d51fa7dc5faa943f18208d67407b860b03ca824a Mon Sep 17 00:00:00 2001 From: John Gates Date: Thu, 5 Dec 2024 10:18:49 -0800 Subject: [PATCH 17/22] Reworked the UberJob json message. --- src/ccontrol/UserQuerySelect.cc | 1 + src/proto/CMakeLists.txt | 1 - src/protojson/CMakeLists.txt | 1 + src/{proto => protojson}/ScanTableInfo.cc | 56 ++++++- src/{proto => protojson}/ScanTableInfo.h | 31 ++-- src/protojson/UberJobMsg.cc | 81 +++++++---- src/protojson/UberJobMsg.h | 73 ++++++++-- src/protojson/testUberJobMsg.cc | 12 +- src/qana/ScanTablePlugin.cc | 18 +-- src/qana/ScanTablePlugin.h | 6 +- src/qdisp/Executive.h | 9 ++ src/qdisp/JobDescription.cc | 2 +- src/qdisp/UberJob.cc | 15 +- src/qproc/ChunkQuerySpec.h | 8 +- src/qproc/QuerySession.cc | 6 +- src/qproc/QuerySession.h | 2 + src/qproc/TaskMsgFactory.cc | 6 +- src/qproc/testQueryAnaGeneral.cc | 6 +- src/query/QueryContext.h | 4 +- src/wbase/Task.cc | 170 +++++++++++++++++++++- src/wbase/Task.h | 32 ++-- src/wdb/testQueryRunner.cc | 14 +- src/wpublish/QueriesAndChunks.cc | 10 +- src/wsched/BlendScheduler.cc | 4 +- src/wsched/ChunkTasksQueue.cc | 2 +- src/wsched/ChunkTasksQueue.h | 2 +- src/wsched/GroupScheduler.cc | 2 +- src/wsched/testSchedulers.cc | 10 +- src/xrdsvc/HttpWorkerCzarModule.cc | 68 +++++++-- src/xrdsvc/SsiService.cc | 10 +- 30 files changed, 510 insertions(+), 152 deletions(-) rename src/{proto => protojson}/ScanTableInfo.cc (64%) rename src/{proto => protojson}/ScanTableInfo.h (75%) diff --git a/src/ccontrol/UserQuerySelect.cc b/src/ccontrol/UserQuerySelect.cc index 41c60ec76..7f1da8353 100644 --- a/src/ccontrol/UserQuerySelect.cc +++ b/src/ccontrol/UserQuerySelect.cc @@ -261,6 +261,7 @@ void UserQuerySelect::submit() { } _executive->setScanInteractive(_qSession->getScanInteractive()); + _executive->setScanInfo(_qSession->getScanInfo()); string dbName(""); bool dbNameSet = false; diff --git a/src/proto/CMakeLists.txt b/src/proto/CMakeLists.txt index 925976832..00616f9e8 100644 --- a/src/proto/CMakeLists.txt +++ b/src/proto/CMakeLists.txt @@ -11,7 +11,6 @@ target_sources(proto PRIVATE ${PROTO_PB_HDRS} FrameBuffer.cc ProtoHeaderWrap.cc - ScanTableInfo.cc ) target_link_libraries(proto PUBLIC diff --git a/src/protojson/CMakeLists.txt 
b/src/protojson/CMakeLists.txt index 8bb732635..8ac88b4cd 100644 --- a/src/protojson/CMakeLists.txt +++ b/src/protojson/CMakeLists.txt @@ -1,6 +1,7 @@ add_library(protojson SHARED) target_sources(protojson PRIVATE + ScanTableInfo.cc UberJobMsg.cc WorkerQueryStatusData.cc ) diff --git a/src/proto/ScanTableInfo.cc b/src/protojson/ScanTableInfo.cc similarity index 64% rename from src/proto/ScanTableInfo.cc rename to src/protojson/ScanTableInfo.cc index 101e1a8d7..ae7deb254 100644 --- a/src/proto/ScanTableInfo.cc +++ b/src/protojson/ScanTableInfo.cc @@ -22,16 +22,27 @@ */ // Class header -#include "proto/ScanTableInfo.h" +#include "protojson/ScanTableInfo.h" // System headers #include #include // Qserv headers +#include "http/RequestBodyJSON.h" #include "util/IterableFormatter.h" -namespace lsst::qserv::proto { +// LSST headers +#include "lsst/log/Log.h" + +namespace { +LOG_LOGGER _log = LOG_GET("lsst.qserv.protojson.ScanTableInfo"); +} // namespace + +using namespace std; +using namespace nlohmann; + +namespace lsst::qserv::protojson { /// @return 0 if equal, -1 if this < rhs, 1 if this > rhs int ScanTableInfo::compare(ScanTableInfo const& rhs) const { @@ -87,6 +98,45 @@ void ScanInfo::sortTablesSlowestFirst() { std::sort(infoTables.begin(), infoTables.end(), func); } +nlohmann::json ScanInfo::serializeJson() const { + LOGS(_log, LOG_LVL_WARN, "&&& ScanInfo::serializeJson a"); + auto jsScanInfo = json({{"infoscanrating", scanRating}, {"infotables", json::array()}}); + + LOGS(_log, LOG_LVL_WARN, "&&& ScanInfo::serializeJson b"); + auto& jsInfoTables = jsScanInfo["infotables"]; + LOGS(_log, LOG_LVL_WARN, "&&& ScanInfo::serializeJson c"); + for (auto const& tInfo : infoTables) { + LOGS(_log, LOG_LVL_WARN, "&&& ScanInfo::serializeJson c1"); + json jsTInfo = json({{"sidb", tInfo.db}, + {"sitable", tInfo.table}, + {"sirating", tInfo.scanRating}, + {"silockinmem", tInfo.lockInMemory}}); + + jsInfoTables.push_back(jsTInfo); + } + + LOGS(_log, LOG_LVL_WARN, "&&& ScanInfo::serializeJson end " << jsScanInfo); + return jsScanInfo; +} + +ScanInfo::Ptr ScanInfo::createFromJson(nlohmann::json const& siJson) { + LOGS(_log, LOG_LVL_WARN, "&&& ScanInfo::createFromJson " << siJson); + Ptr siPtr = create(); + auto& iTbls = siPtr->infoTables; + + siPtr->scanRating = http::RequestBodyJSON::required(siJson, "infoscanrating"); + json const& jsTbls = http::RequestBodyJSON::required(siJson, "infotables"); + for (auto const& jsElem : jsTbls) { + auto db = http::RequestBodyJSON::required(jsElem, "sidb"); + auto table = http::RequestBodyJSON::required(jsElem, "sitable"); + auto sRating = http::RequestBodyJSON::required(jsElem, "sirating"); + auto lockInMem = http::RequestBodyJSON::required(jsElem, "silockinmem"); + iTbls.emplace_back(db, table, lockInMem, sRating); + } + + return siPtr; +} + std::ostream& operator<<(std::ostream& os, ScanTableInfo const& tbl) { os << "(db=" << tbl.db << " table=" << tbl.table; os << " lockInMemory=" << tbl.lockInMemory << " scanRating=" << tbl.scanRating << ")"; @@ -98,4 +148,4 @@ std::ostream& operator<<(std::ostream& os, ScanInfo const& info) { return os; } -} // namespace lsst::qserv::proto +} // namespace lsst::qserv::protojson diff --git a/src/proto/ScanTableInfo.h b/src/protojson/ScanTableInfo.h similarity index 75% rename from src/proto/ScanTableInfo.h rename to src/protojson/ScanTableInfo.h index d30e4d04d..061ea0c0f 100644 --- a/src/proto/ScanTableInfo.h +++ b/src/protojson/ScanTableInfo.h @@ -21,21 +21,22 @@ * see . 
*/ -#ifndef LSST_QSERV_PROTO_SCANTABLEINFO_H -#define LSST_QSERV_PROTO_SCANTABLEINFO_H +#ifndef LSST_QSERV_PROTOJSON_SCANTABLEINFO_H +#define LSST_QSERV_PROTOJSON_SCANTABLEINFO_H // System headers +#include #include #include -// Qserv headers -#include "proto/worker.pb.h" +// Third party headers +#include "nlohmann/json.hpp" -namespace lsst::qserv::proto { +namespace lsst::qserv::protojson { /// Structure to store shared scan information for a single table. /// -struct ScanTableInfo { // TODO:UJ check if still useful +struct ScanTableInfo { using ListOf = std::vector; ScanTableInfo() = default; @@ -53,13 +54,25 @@ struct ScanTableInfo { // TODO:UJ check if still useful int scanRating{0}; }; -struct ScanInfo { +/// This class stores information about database table ratings for +/// a user query. +class ScanInfo { +public: + using Ptr = std::shared_ptr; + /// Threshold priority values. Scan priorities are not limited to these values. enum Rating { FASTEST = 0, FAST = 10, MEDIUM = 20, SLOW = 30, SLOWEST = 100 }; ScanInfo() = default; ScanInfo(ScanInfo const&) = default; + static Ptr create() { return Ptr(new ScanInfo()); } + + static Ptr createFromJson(nlohmann::json const& ujJson); + + /// Return a json version of the contents of this class. + nlohmann::json serializeJson() const; + void sortTablesSlowestFirst(); int compareTables(ScanInfo const& rhs); @@ -70,6 +83,6 @@ struct ScanInfo { std::ostream& operator<<(std::ostream& os, ScanTableInfo const& tbl); std::ostream& operator<<(std::ostream& os, ScanInfo const& info); -} // namespace lsst::qserv::proto +} // namespace lsst::qserv::protojson -#endif // LSST_QSERV_PROTO_SCANTABLEINFO_H +#endif // LSST_QSERV_PROTOJSON_SCANTABLEINFO_H diff --git a/src/protojson/UberJobMsg.cc b/src/protojson/UberJobMsg.cc index b19d4c1e2..e92631417 100644 --- a/src/protojson/UberJobMsg.cc +++ b/src/protojson/UberJobMsg.cc @@ -47,22 +47,20 @@ LOG_LOGGER _log = LOG_GET("lsst.qserv.protojson.UberJobMsg"); namespace lsst::qserv::protojson { UberJobMsg::UberJobMsg(unsigned int metaVersion, std::string const& replicationInstanceId, - std::string const& replicationAuthKey, - //&&&CzarContactInfo::Ptr const& czInfo, WorkerContactInfo::Ptr const& wInfo, - CzarContactInfo::Ptr const& czInfo, string const& workerId, QueryId qId, - UberJobId ujId, int rowLimit, int maxTableSizeMB, + std::string const& replicationAuthKey, CzarContactInfo::Ptr const& czInfo, + string const& workerId, QueryId qId, UberJobId ujId, int rowLimit, int maxTableSizeMB, + ScanInfo::Ptr const& scanInfo_, std::vector> const& jobs) : _metaVersion(metaVersion), _replicationInstanceId(replicationInstanceId), _replicationAuthKey(replicationAuthKey), _czInfo(czInfo), _workerId(workerId), - //&&&_workerId(wInfo->wId), - //&&&_wInfo(wInfo), _qId(qId), _ujId(ujId), _rowLimit(rowLimit), - _maxTableSizeMB(maxTableSizeMB) { + _maxTableSizeMB(maxTableSizeMB), + _scanInfo(scanInfo_) { //&&&_jobs(jobs) { LOGS(_log, LOG_LVL_WARN, "&&& UberJobMsg::UberJobMsg start"); @@ -70,7 +68,7 @@ UberJobMsg::UberJobMsg(unsigned int metaVersion, std::string const& replicationI LOGS(_log, LOG_LVL_WARN, "&&& UberJobMsg::UberJobMsg loop"); // This creates the JobMsg objects for all relates jobs and their fragments. 
auto jobMsg = JobMsg::create(jobPtr, _jobSubQueryTempMap, _jobDbTablesMap); - _jobMsgVect.push_back(jobMsg); + _jobMsgVect->push_back(jobMsg); } LOGS(_log, LOG_LVL_WARN, "&&& UberJobMsg::UberJobMsg end"); } @@ -89,12 +87,13 @@ json UberJobMsg::serializeJson() const { {"subqueries_map", _jobSubQueryTempMap->serializeJson()}, {"dbtables_map", _jobDbTablesMap->serializeJson()}, {"maxtablesizemb", _maxTableSizeMB}, + {"scaninfo", _scanInfo->serializeJson()}, {"jobs", json::array()}}; LOGS(_log, LOG_LVL_WARN, "&&& UberJobMsg::serializeJson b"); auto& jsJobs = ujmJson["jobs"]; LOGS(_log, LOG_LVL_WARN, "&&& UberJobMsg::serializeJson c"); - for (auto const& jbMsg : _jobMsgVect) { + for (auto const& jbMsg : *_jobMsgVect) { LOGS(_log, LOG_LVL_WARN, "&&& UberJobMsg::serializeJson c1"); json jsJob = jbMsg->serializeJson(); jsJobs.push_back(jsJob); @@ -122,6 +121,14 @@ UberJobMsg::Ptr UberJobMsg::createFromJson(nlohmann::json const& ujmJson) { return nullptr; } + LOGS(_log, LOG_LVL_WARN, "&&& UberJobMsg::createFromJson b-b"); + auto scanInfo_ = ScanInfo::createFromJson(ujmJson["scaninfo"]); + if (scanInfo_ == nullptr) { + LOGS(_log, LOG_LVL_ERROR, + "UberJobMsg::createFromJson scanInfo could not be parsed in " << ujmJson); + return nullptr; + } + LOGS(_log, LOG_LVL_WARN, "&&& UberJobMsg::createFromJson c"); auto metaVersion = http::RequestBodyJSON::required(ujmJson, "version"); LOGS(_log, LOG_LVL_WARN, "&&& UberJobMsg::createFromJson d"); @@ -150,7 +157,7 @@ UberJobMsg::Ptr UberJobMsg::createFromJson(nlohmann::json const& ujmJson) { std::vector> emptyJobs; Ptr ujmPtr = Ptr(new UberJobMsg(metaVersion, replicationInstanceId, replicationAuthKey, czInfo, - workerId, qId, ujId, rowLimit, maxTableSizeMB, emptyJobs)); + workerId, qId, ujId, rowLimit, maxTableSizeMB, scanInfo_, emptyJobs)); LOGS(_log, LOG_LVL_WARN, "&&& UberJobMsg::createFromJson m"); auto const& jsSubQueriesMap = http::RequestBodyJSON::required(ujmJson, "subqueries_map"); @@ -167,7 +174,7 @@ UberJobMsg::Ptr UberJobMsg::createFromJson(nlohmann::json const& ujmJson) { LOGS(_log, LOG_LVL_WARN, "&&& UberJobMsg::createFromJson q1"); JobMsg::Ptr jobMsgPtr = JobMsg::createFromJson(jsUjJob, ujmPtr->_jobSubQueryTempMap, ujmPtr->_jobDbTablesMap); - ujmPtr->_jobMsgVect.push_back(jobMsgPtr); + ujmPtr->_jobMsgVect->push_back(jobMsgPtr); } LOGS(_log, LOG_LVL_WARN, "&&& UberJobMsg::createFromJson end"); @@ -211,7 +218,7 @@ JobMsg::JobMsg(std::shared_ptr const& jobPtr, _chunkQuerySpecDb = chunkQuerySpec->db; //&&&{"scanPriority", chunkQuerySpec.scanInfo.scanRating}, LOGS(_log, LOG_LVL_WARN, "&&& JobMsg::JobMsg d"); - _scanRating = chunkQuerySpec->scanInfo.scanRating; + _scanRating = chunkQuerySpec->scanInfo->scanRating; //&&&{"scanInteractive", chunkQuerySpec.scanInteractive}, LOGS(_log, LOG_LVL_WARN, "&&& JobMsg::JobMsg e"); _scanInteractive = chunkQuerySpec->scanInteractive; @@ -227,7 +234,7 @@ JobMsg::JobMsg(std::shared_ptr const& jobPtr, // Add scan tables (&&& not sure is this is the same for all jobs or not) LOGS(_log, LOG_LVL_WARN, "&&& JobMsg::JobMsg h"); - for (auto const& sTbl : chunkQuerySpec->scanInfo.infoTables) { + for (auto const& sTbl : chunkQuerySpec->scanInfo->infoTables) { LOGS(_log, LOG_LVL_WARN, "&&& JobMsg::JobMsg h1"); /* &&& nlohmann::json cst = {{"db", sTbl.db}, @@ -280,7 +287,7 @@ nlohmann::json JobMsg::serializeJson() const { LOGS(_log, LOG_LVL_WARN, "&&& JobMsg::serializeJson d"); auto& jsqFrags = jsJobMsg["queryFragments"]; LOGS(_log, LOG_LVL_WARN, "&&& JobMsg::serializeJson e"); - for (auto& jFrag : _jobFragments) { + for (auto& 
jFrag : *_jobFragments) { LOGS(_log, LOG_LVL_WARN, "&&& JobMsg::serializeJson e1"); auto jsFrag = jFrag->serializeJson(); LOGS(_log, LOG_LVL_WARN, "&&& JobMsg::serializeJson e2"); @@ -525,13 +532,13 @@ JobFragment::JobFragment(JobSubQueryTempMap::Ptr const& jobSubQueryTempMap, LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::JobFragment resultTblName=" << resultTblName); } -vector JobFragment::createVect(qproc::ChunkQuerySpec const& chunkQuerySpec, - JobSubQueryTempMap::Ptr const& jobSubQueryTempMap, - JobDbTablesMap::Ptr const& jobDbTablesMap, - string const& resultTable) { +JobFragment::VectPtr JobFragment::createVect(qproc::ChunkQuerySpec const& chunkQuerySpec, + JobSubQueryTempMap::Ptr const& jobSubQueryTempMap, + JobDbTablesMap::Ptr const& jobDbTablesMap, + string const& resultTable) { LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::createVect start"); - vector jFragments; + VectPtr jFragments{new Vect()}; LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::createVect a"); if (chunkQuerySpec.nextFragment.get()) { LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::createVect a1"); @@ -551,7 +558,7 @@ vector JobFragment::createVect(qproc::ChunkQuerySpec const& ch // Linked fragments will not have valid subChunkTables vectors, // So, we reuse the root fragment's vector. LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::createVect a3"); - _addFragment(jFragments, resultTable, chunkQuerySpec.subChunkTables, sPtr->subChunkIds, + _addFragment(*jFragments, resultTable, chunkQuerySpec.subChunkTables, sPtr->subChunkIds, sPtr->queries, jobSubQueryTempMap, jobDbTablesMap); sPtr = sPtr->nextFragment.get(); } @@ -564,7 +571,7 @@ vector JobFragment::createVect(qproc::ChunkQuerySpec const& ch LOGS(_log, LOG_LVL_TRACE, (chunkQuerySpec.queries).at(t)); } LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::createVect b2"); - _addFragment(jFragments, resultTable, chunkQuerySpec.subChunkTables, chunkQuerySpec.subChunkIds, + _addFragment(*jFragments, resultTable, chunkQuerySpec.subChunkTables, chunkQuerySpec.subChunkIds, chunkQuerySpec.queries, jobSubQueryTempMap, jobDbTablesMap); LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::createVect b3"); } @@ -613,11 +620,31 @@ void JobFragment::_addFragment(std::vector& jFragments, std::string const& LOGS(_log, LOG_LVL_INFO, jFrag->cName(__func__) << "&&& added subchunkId=" << subchunkId); } LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::_addFragment e"); + LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::_addFragment " << jFrag->dump()); + LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::_addFragment ee"); jFragments.push_back(move(jFrag)); LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::_addFragment end"); } +string JobFragment::dump() const { + stringstream os; + os << "JobFragment resultTbl=" << _resultTblName << " templateIndexes={"; + for (int j : _jobSubQueryTempIndexes) { + os << j << ", "; + } + os << "} subchunkIds={"; + for (int j : _subchunkIds) { + os << j << ", "; + } + os << "} dbtbl={"; + for (int j : _subchunkIds) { + os << j << ", "; + } + os << "}"; + return os.str(); +} + nlohmann::json JobFragment::serializeJson() const { LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::serializeJson a"); @@ -633,14 +660,14 @@ nlohmann::json JobFragment::serializeJson() const { return jsFragment; } -JobFragment::Vect JobFragment::createVectFromJson(nlohmann::json const& jsFrags, - JobSubQueryTempMap::Ptr const& jobSubQueryTempMap, - JobDbTablesMap::Ptr const& dbTablesMap, - std::string const& resultTblName) { +JobFragment::VectPtr JobFragment::createVectFromJson(nlohmann::json const& jsFrags, + JobSubQueryTempMap::Ptr const& 
jobSubQueryTempMap, + JobDbTablesMap::Ptr const& dbTablesMap, + std::string const& resultTblName) { LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::createVectFromJson " << jsFrags); LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::createVectFromJson a"); - JobFragment::Vect jobFragments; + JobFragment::VectPtr jobFragments{new JobFragment::Vect()}; for (auto const& jsFrag : jsFrags) { LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::createVectFromJson b"); @@ -690,7 +717,7 @@ JobFragment::Vect JobFragment::createVectFromJson(nlohmann::json const& jsFrags, LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::createVectFromJson e"); jobFrag->_subchunkIds = jsFrag["subchunkids"].get>(); - jobFragments.push_back(jobFrag); + jobFragments->push_back(jobFrag); } LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::createVectFromJson end"); diff --git a/src/protojson/UberJobMsg.h b/src/protojson/UberJobMsg.h index b9dafaa28..51dbc24c6 100644 --- a/src/protojson/UberJobMsg.h +++ b/src/protojson/UberJobMsg.h @@ -21,6 +21,8 @@ #ifndef LSST_QSERV_PROTOJSON_UBERJOBMSG_H #define LSST_QSERV_PROTOJSON_UBERJOBMSG_H +#define NEWMSGUJ 0 // &&& delete + // System headers #include #include @@ -36,6 +38,7 @@ #include "global/clock_defs.h" #include "global/DbTable.h" #include "global/intTypes.h" +#include "protojson/ScanTableInfo.h" #include "protojson/WorkerQueryStatusData.h" namespace lsst::qserv::qdisp { @@ -86,7 +89,7 @@ class JobSubQueryTempMap { /// The same db+table name pairs recur frequently, so the individual occurrences /// will be replaced with an integer index and use this class to recover the /// complete names. -class JobDbTablesMap { +class JobDbTablesMap { // &&& this class can probably be deleted public: using Ptr = std::shared_ptr; @@ -133,24 +136,33 @@ class JobFragment { public: using Ptr = std::shared_ptr; using Vect = std::vector; + using VectPtr = std::shared_ptr; std::string cName(const char* fName) const { return std::string("JobFragment::") + fName; } JobFragment() = delete; JobFragment(JobFragment const&) = delete; - static Vect createVect(qproc::ChunkQuerySpec const& chunkQuerySpec, - JobSubQueryTempMap::Ptr const& jobSubQueryTempMap, - JobDbTablesMap::Ptr const& dbTablesMap, std::string const& resultTblName); + static VectPtr createVect(qproc::ChunkQuerySpec const& chunkQuerySpec, + JobSubQueryTempMap::Ptr const& jobSubQueryTempMap, + JobDbTablesMap::Ptr const& dbTablesMap, std::string const& resultTblName); /// &&& doc - static Vect createVectFromJson(nlohmann::json const& ujJson, - JobSubQueryTempMap::Ptr const& jobSubQueryTempMap, - JobDbTablesMap::Ptr const& dbTablesMap, std::string const& resultTblName); + static VectPtr createVectFromJson(nlohmann::json const& ujJson, + JobSubQueryTempMap::Ptr const& jobSubQueryTempMap, + JobDbTablesMap::Ptr const& dbTablesMap, + std::string const& resultTblName); /// Return a json version of the contents of this class. 
nlohmann::json serializeJson() const; + std::vector const& getJobSubQueryTempIndexes() const { return _jobSubQueryTempIndexes; } + std::vector const& getJobDbTablesIndexes() const { return _jobDbTablesIndexes; } + std::vector const& getSubchunkIds() const { return _subchunkIds; } + std::string const& getResultTblName() const { return _resultTblName; } + + std::string dump() const; + private: JobFragment(JobSubQueryTempMap::Ptr const& subQueryTemplates, JobDbTablesMap::Ptr const& dbTablesMap, std::string const& resultTblName); @@ -180,6 +192,7 @@ class JobMsg { public: using Ptr = std::shared_ptr; using Vect = std::vector; + using VectPtr = std::shared_ptr; std::string cName(const char* fnc) const { return std::string("JobMsg::") + fnc; } JobMsg() = delete; @@ -197,6 +210,18 @@ class JobMsg { /// Return a json version of the contents of this class. nlohmann::json serializeJson() const; + JobId getJobId() const { return _jobId; } + int getAttemptCount() const { return _attemptCount; } + std::string getChunkQuerySpecDb() const { return _chunkQuerySpecDb; } + int getScanRating() const { return _scanRating; } + bool getScanInteractive() const { return _scanInteractive; } + int getChunkId() const { return _chunkId; } + std::string getChunkResultName() const { return _chunkResultName; } + + std::vector const& getChunkScanTableIndexes() const { return _chunkScanTableIndexes; } + + JobFragment::VectPtr getJobFragments() const { return _jobFragments; } + private: JobMsg(std::shared_ptr const& jobPtr, JobSubQueryTempMap::Ptr const& jobSubQueryTempMap, JobDbTablesMap::Ptr const& jobDbTablesMap); @@ -212,7 +237,7 @@ class JobMsg { bool _scanInteractive; int _chunkId; std::string _chunkResultName; - JobFragment::Vect _jobFragments; + JobFragment::VectPtr _jobFragments{new JobFragment::Vect()}; JobSubQueryTempMap::Ptr _jobSubQueryTempMap; ///< Map of all query templates related to this UberJob. JobDbTablesMap::Ptr _jobDbTablesMap; ///< Map of all db.tables related to this UberJob. @@ -238,9 +263,10 @@ class UberJobMsg : public std::enable_shared_from_this { static Ptr create(unsigned int metaVersion, std::string const& replicationInstanceId, std::string const& replicationAuthKey, CzarContactInfo::Ptr const& czInfo, WorkerContactInfo::Ptr const& wInfo, QueryId qId, UberJobId ujId, int rowLimit, - int maxTableSizeMB, std::vector> const& jobs) { + int maxTableSizeMB, ScanInfo::Ptr const& scanInfo_, + std::vector> const& jobs) { return Ptr(new UberJobMsg(metaVersion, replicationInstanceId, replicationAuthKey, czInfo, wInfo->wId, - qId, ujId, rowLimit, maxTableSizeMB, jobs)); + qId, ujId, rowLimit, maxTableSizeMB, scanInfo_, jobs)); } static Ptr createFromJson(nlohmann::json const& ujJson); @@ -248,6 +274,20 @@ class UberJobMsg : public std::enable_shared_from_this { /// Return a json version of the contents of this class. 
nlohmann::json serializeJson() const; + QueryId getQueryId() const { return _qId; } + UberJobId getUberJobId() const { return _ujId; } + int getRowLimit() const { return _rowLimit; } + std::string getWorkerId() const { return _workerId; } + int getMaxTableSizeMb() const { return _maxTableSizeMB; } + + CzarContactInfo::Ptr getCzarContactInfo() const { return _czInfo; } + JobSubQueryTempMap::Ptr getJobSubQueryTempMap() const { return _jobSubQueryTempMap; } + JobDbTablesMap::Ptr getJobDbTablesMap() const { return _jobDbTablesMap; } + + JobMsg::VectPtr getJobMsgVect() const { return _jobMsgVect; } + + ScanInfo::Ptr getScanInfo() const { return _scanInfo; } + std::string dump() const; private: @@ -255,14 +295,13 @@ class UberJobMsg : public std::enable_shared_from_this { std::string const& replicationAuthKey, //&&&CzarContactInfo::Ptr const& czInfo, WorkerContactInfo::Ptr const& wInfo, CzarContactInfo::Ptr const& czInfo, std::string const& workerId, QueryId qId, UberJobId ujId, - int rowLimit, int maxTableSizeMB, std::vector> const& jobs); + int rowLimit, int maxTableSizeMB, ScanInfo::Ptr const& scanInfo_, + std::vector> const& jobs); unsigned int _metaVersion; // "version", http::MetaModule::version // czar std::string _replicationInstanceId; // "instance_id", czarConfig->replicationInstanceId() std::string _replicationAuthKey; //"auth_key", czarConfig->replicationAuthKey() - //&&& auto [ciwId, ciwHost, ciwManagment, ciwPort] = _wContactInfo->getAll(); (string, string, string, - //int) CzarContactInfo::Ptr _czInfo; std::string _workerId; // "worker", ciwId //&&&WorkerContactInfo::Ptr _wInfo; // &&& probably not needed @@ -277,9 +316,9 @@ class UberJobMsg : public std::enable_shared_from_this { UberJobId _ujId; // "uberjobid", _uberJobId //&&& CzarIdType _czarId; // "czarid", _czarId int _rowLimit; // "rowlimit", _rowLimit - int _maxTableSizeMB; // &&& Need to add initialization. + int _maxTableSizeMB; // - std::vector> _jobs; // &&& needs to be replaced with jobData + //&&&std::vector> _jobs; // &&& needs to be replaced with jobData // &&& }; /// Map of all query templates related to this UberJob. @@ -289,7 +328,9 @@ class UberJobMsg : public std::enable_shared_from_this { JobDbTablesMap::Ptr _jobDbTablesMap{JobDbTablesMap::create()}; /// List of all job data in this UberJob. 
"jobs", json::array() - JobMsg::Vect _jobMsgVect; + JobMsg::VectPtr _jobMsgVect{new JobMsg::Vect()}; + + ScanInfo::Ptr _scanInfo{ScanInfo::create()}; ///< &&& NEED to add to serialize and createFromJson }; } // namespace lsst::qserv::protojson diff --git a/src/protojson/testUberJobMsg.cc b/src/protojson/testUberJobMsg.cc index 0db8db8c5..a56c77175 100644 --- a/src/protojson/testUberJobMsg.cc +++ b/src/protojson/testUberJobMsg.cc @@ -46,21 +46,17 @@ namespace test = boost::test_tools; using namespace lsst::qserv::protojson; BOOST_AUTO_TEST_SUITE(Suite) -/* &&& +#if 0 //&&& std::string testA() { std::string ta = -R"({"maxtablesizemb":5432,"auth_key":"replauthkey","czarinfo":{"czar-startup-time":1732658208085,"id":1,"management-host-name":"3a8b68cf9b67","management-port":40865,"name":"proxy"},"dbtables_map":{"dbtable_map":[],"scanrating_map":[]},"instance_id":"qserv_proj","jobs":[{"attemptCount":0,"chunkId":1234567890,"chunkscantables_indexes":[],"jobId":0,"queryFragments":[{"dbtables_indexes":[],"resulttblname":"r_1_a0d45001254932466b784acf90323565_1234567890_0","subchunkids":[],"subquerytemplate_indexes":[0]}],"querySpecDb":"qcase01","scanInteractive":true,"scanPriority":0}],"queryid":1,"rowlimit":0,"subqueries_map":{"subquerytemplate_map":[{"index":0,"template":"SELECT -`qcase01.Filter`.`filterId` AS `filterId`,`qcase01.Filter`.`filterName` AS -`filterName`,`qcase01.Filter`.`photClam` AS `photClam`,`qcase01.Filter`.`photBW` AS `photBW` FROM -`qcase01`.`Filter` AS `qcase01.Filter` WHERE -(`qcase01.Filter`.`filterId`<<1)=2"}]},"uberjobid":2,"version":39,"worker":"6c56ba9b-ac40-11ef-acb7-0242c0a8030a"})"; + R"({"maxtablesizemb":5432,"auth_key":"replauthkey","czarinfo":{"czar-startup-time":1732658208085,"id":1,"management-host-name":"3a8b68cf9b67","management-port":40865,"name":"proxy"},"dbtables_map":{"dbtable_map":[],"scanrating_map":[]},"instance_id":"qserv_proj","jobs":[{"attemptCount":0,"chunkId":1234567890,"chunkresultname":"r_1_a0d45001254932466b784acf90323565_1234567890_0","chunkscantables_indexes":[],"jobId":0,"queryFragments":[{"dbtables_indexes":[],"resulttblname":"r_1_a0d45001254932466b784acf90323565_1234567890_0","subchunkids":[],"subquerytemplate_indexes":[0]}],"querySpecDb":"qcase01","scanInteractive":true,"scanPriority":0}],"queryid":1,"rowlimit":0,"subqueries_map":{"subquerytemplate_map":[{"index":0,"template":"SELECT `qcase01.Filter`.`filterId` AS `filterId`,`qcase01.Filter`.`filterName` AS `filterName`,`qcase01.Filter`.`photClam` AS `photClam`,`qcase01.Filter`.`photBW` AS `photBW` FROM `qcase01`.`Filter` AS `qcase01.Filter` WHERE (`qcase01.Filter`.`filterId`<<1)=2"}]},"uberjobid":2,"version":39,"worker":"6c56ba9b-ac40-11ef-acb7-0242c0a8030a"})"; return ta; } -*/ +#endif // &&& std::string testA() { std::string ta = - R"({"maxtablesizemb":5432,"auth_key":"replauthkey","czarinfo":{"czar-startup-time":1732658208085,"id":1,"management-host-name":"3a8b68cf9b67","management-port":40865,"name":"proxy"},"dbtables_map":{"dbtable_map":[],"scanrating_map":[]},"instance_id":"qserv_proj","jobs":[{"attemptCount":0,"chunkId":1234567890,"chunkresultname":"r_1_a0d45001254932466b784acf90323565_1234567890_0","chunkscantables_indexes":[],"jobId":0,"queryFragments":[{"dbtables_indexes":[],"resulttblname":"r_1_a0d45001254932466b784acf90323565_1234567890_0","subchunkids":[],"subquerytemplate_indexes":[0]}],"querySpecDb":"qcase01","scanInteractive":true,"scanPriority":0}],"queryid":1,"rowlimit":0,"subqueries_map":{"subquerytemplate_map":[{"index":0,"template":"SELECT `qcase01.Filter`.`filterId` AS 
`filterId`,`qcase01.Filter`.`filterName` AS `filterName`,`qcase01.Filter`.`photClam` AS `photClam`,`qcase01.Filter`.`photBW` AS `photBW` FROM `qcase01`.`Filter` AS `qcase01.Filter` WHERE (`qcase01.Filter`.`filterId`<<1)=2"}]},"uberjobid":2,"version":39,"worker":"6c56ba9b-ac40-11ef-acb7-0242c0a8030a"})"; + R"({"maxtablesizemb":5432,"auth_key":"replauthkey","czarinfo":{"czar-startup-time":1732658208085,"id":1,"management-host-name":"3a8b68cf9b67","management-port":40865,"name":"proxy"},"dbtables_map":{"dbtable_map":[],"scanrating_map":[]},"scaninfo":{"infoscanrating":0,"infotables":[]},"instance_id":"qserv_proj","jobs":[{"attemptCount":0,"chunkId":1234567890,"chunkresultname":"r_1_a0d45001254932466b784acf90323565_1234567890_0","chunkscantables_indexes":[],"jobId":0,"queryFragments":[{"dbtables_indexes":[],"resulttblname":"r_1_a0d45001254932466b784acf90323565_1234567890_0","subchunkids":[],"subquerytemplate_indexes":[0]}],"querySpecDb":"qcase01","scanInteractive":true,"scanPriority":0}],"queryid":1,"rowlimit":0,"subqueries_map":{"subquerytemplate_map":[{"index":0,"template":"SELECT `qcase01.Filter`.`filterId` AS `filterId`,`qcase01.Filter`.`filterName` AS `filterName`,`qcase01.Filter`.`photClam` AS `photClam`,`qcase01.Filter`.`photBW` AS `photBW` FROM `qcase01`.`Filter`AS`qcase01.Filter` WHERE (`qcase01.Filter`.`filterId`<<1)=2"}]},"uberjobid":2,"version":39,"worker":"6c56ba9b-ac40-11ef-acb7-0242c0a8030a"})"; return ta; } diff --git a/src/qana/ScanTablePlugin.cc b/src/qana/ScanTablePlugin.cc index 8c3fcde00..cf1ecc4df 100644 --- a/src/qana/ScanTablePlugin.cc +++ b/src/qana/ScanTablePlugin.cc @@ -42,7 +42,6 @@ // Qserv headers #include "czar/Czar.h" #include "global/stringTypes.h" -#include "proto/ScanTableInfo.h" #include "query/ColumnRef.h" #include "query/FromList.h" #include "query/QueryContext.h" @@ -67,8 +66,8 @@ void ScanTablePlugin::applyLogical(query::SelectStmt& stmt, query::QueryContext& void ScanTablePlugin::applyFinal(query::QueryContext& context) { int const scanThreshold = _interactiveChunkLimit; if (context.chunkCount < scanThreshold) { - context.scanInfo.infoTables.clear(); - context.scanInfo.scanRating = 0; + context.scanInfo->infoTables.clear(); + context.scanInfo->scanRating = 0; LOGS(_log, LOG_LVL_INFO, "ScanInfo Squash full table scan tables: <" << scanThreshold << " chunks."); } } @@ -95,7 +94,8 @@ StringPairVector filterPartitioned(query::TableRefList const& tList) { return vector; } -proto::ScanInfo ScanTablePlugin::_findScanTables(query::SelectStmt& stmt, query::QueryContext& context) { +protojson::ScanInfo::Ptr ScanTablePlugin::_findScanTables(query::SelectStmt& stmt, + query::QueryContext& context) { // Might be better as a separate plugin // All tables of a query are scan tables if the statement both: @@ -202,15 +202,15 @@ proto::ScanInfo ScanTablePlugin::_findScanTables(query::SelectStmt& stmt, query: // Ask css if any of the tables should be locked in memory and their scan rating. // Use this information to determine scanPriority. 
- proto::ScanInfo scanInfo; + auto scanInfo = protojson::ScanInfo::create(); for (auto& pair : scanTables) { - proto::ScanTableInfo info(pair.first, pair.second); + protojson::ScanTableInfo info(pair.first, pair.second); css::ScanTableParams const params = context.css->getScanTableParams(info.db, info.table); info.lockInMemory = params.lockInMem; info.scanRating = params.scanRating; - scanInfo.infoTables.push_back(info); - scanInfo.scanRating = std::max(scanInfo.scanRating, info.scanRating); - scanInfo.scanRating = std::min(scanInfo.scanRating, static_cast(proto::ScanInfo::SLOWEST)); + scanInfo->infoTables.push_back(info); + scanInfo->scanRating = std::max(scanInfo->scanRating, info.scanRating); + scanInfo->scanRating = std::min(scanInfo->scanRating, static_cast(protojson::ScanInfo::SLOWEST)); LOGS(_log, LOG_LVL_INFO, "ScanInfo " << info.db << "." << info.table << " lockInMemory=" << info.lockInMemory << " rating=" << info.scanRating); diff --git a/src/qana/ScanTablePlugin.h b/src/qana/ScanTablePlugin.h index 145424852..aa069710c 100644 --- a/src/qana/ScanTablePlugin.h +++ b/src/qana/ScanTablePlugin.h @@ -27,7 +27,7 @@ #include "qana/QueryPlugin.h" // Qserv headers -#include "proto/ScanTableInfo.h" +#include "protojson/ScanTableInfo.h" namespace lsst::qserv::qana { @@ -55,8 +55,8 @@ class ScanTablePlugin : public QueryPlugin { std::string name() const override { return "ScanTablePlugin"; } private: - proto::ScanInfo _findScanTables(query::SelectStmt& stmt, query::QueryContext& context); - proto::ScanInfo _scanInfo; + protojson::ScanInfo::Ptr _findScanTables(query::SelectStmt& stmt, query::QueryContext& context); + protojson::ScanInfo::Ptr _scanInfo; int _interactiveChunkLimit; }; diff --git a/src/qdisp/Executive.h b/src/qdisp/Executive.h index 12a8e4fc4..c9bcba0ff 100644 --- a/src/qdisp/Executive.h +++ b/src/qdisp/Executive.h @@ -39,6 +39,7 @@ #include "global/intTypes.h" #include "global/ResourceUnit.h" #include "global/stringTypes.h" +#include "protojson/ScanTableInfo.h" #include "qdisp/JobDescription.h" #include "qdisp/ResponseHandler.h" #include "qdisp/UberJob.h" @@ -228,6 +229,12 @@ class Executive : public std::enable_shared_from_this { /// incomplete UberJobs need to be stopped and possibly reassigned. void killIncompleteUberJobsOnWorker(std::string const& workerId); + // Try to remove this and put in constructor + void setScanInfo(protojson::ScanInfo::Ptr const& scanInfo) { _scanInfo = scanInfo; } + + /// Return a pointer to _scanInfo. + protojson::ScanInfo::Ptr getScanInfo() { return _scanInfo; } + protected: Executive(ExecutiveConfig const& cfg, std::shared_ptr const& ms, std::shared_ptr const& sharedResources, @@ -337,6 +344,8 @@ class Executive : public std::enable_shared_from_this { /// Flag that is set to true when ready to create and run UberJobs. 
std::atomic _readyToExecute{false}; + + protojson::ScanInfo::Ptr _scanInfo; ///< &&& doc }; } // namespace qdisp diff --git a/src/qdisp/JobDescription.cc b/src/qdisp/JobDescription.cc index e5786c8f5..42b7cbfa6 100644 --- a/src/qdisp/JobDescription.cc +++ b/src/qdisp/JobDescription.cc @@ -99,7 +99,7 @@ bool JobDescription::incrAttemptCountScrubResultsJson(std::shared_ptr bool JobDescription::getScanInteractive() const { return _chunkQuerySpec->scanInteractive; } -int JobDescription::getScanRating() const { return _chunkQuerySpec->scanInfo.scanRating; } +int JobDescription::getScanRating() const { return _chunkQuerySpec->scanInfo->scanRating; } ostream& operator<<(ostream& os, JobDescription const& jd) { os << "job(id=" << jd._jobId << " ru=" << jd._resource.path() << " attemptCount=" << jd._attemptCount diff --git a/src/qdisp/UberJob.cc b/src/qdisp/UberJob.cc index 8768458f7..990154803 100644 --- a/src/qdisp/UberJob.cc +++ b/src/qdisp/UberJob.cc @@ -40,6 +40,7 @@ #include "protojson/UberJobMsg.h" #include "qdisp/JobQuery.h" #include "qmeta/JobStatus.h" +#include "qproc/ChunkQuerySpec.h" #include "util/Bug.h" #include "util/common.h" #include "util/QdispPool.h" @@ -104,7 +105,7 @@ void UberJob::runUberJob() { nlohmann::json uj; unique_lock jobsLock(_jobsMtx); auto exec = _executive.lock(); -#if 1 // &&& +#if NEWMSGUJ // &&& for (auto const& jqPtr : _jobs) { jqPtr->getDescription()->incrAttemptCountScrubResultsJson(exec, true); } @@ -153,26 +154,22 @@ void UberJob::runUberJob() { // Send the uberjob to the worker auto const method = http::Method::POST; auto [ciwId, ciwHost, ciwManagment, ciwPort] = _wContactInfo->getAll(); - LOGS(_log, LOG_LVL_ERROR, "&&& jsonTESTrequest b"); string const url = "http://" + ciwHost + ":" + to_string(ciwPort) + "/queryjob"; vector const headers = {"Content-Type: application/json"}; auto const& czarConfig = cconfig::CzarConfig::instance(); - LOGS(_log, LOG_LVL_ERROR, "&&& jsonTESTrequest c"); int maxTableSizeMB = czarConfig->getMaxTableSizeMB(); - LOGS(_log, LOG_LVL_ERROR, "&&& jsonTESTrequest d"); auto czInfo = protojson::CzarContactInfo::create( czarConfig->name(), czarConfig->id(), czarConfig->replicationHttpPort(), util::get_current_host_fqdn(), czar::Czar::czarStartupTime); - LOGS(_log, LOG_LVL_ERROR, "&&& jsonTESTrequest e"); + auto scanInfoPtr = exec->getScanInfo(); + auto uberJobMsg = protojson::UberJobMsg::create( http::MetaModule::version, czarConfig->replicationInstanceId(), czarConfig->replicationAuthKey(), - czInfo, _wContactInfo, _queryId, _uberJobId, _rowLimit, maxTableSizeMB, _jobs); - LOGS(_log, LOG_LVL_ERROR, "&&& jsonTESTrequest f"); + czInfo, _wContactInfo, _queryId, _uberJobId, _rowLimit, maxTableSizeMB, scanInfoPtr, _jobs); json request = uberJobMsg->serializeJson(); - LOGS(_log, LOG_LVL_ERROR, "&&& jsonTESTrequest g"); - LOGS(_log, LOG_LVL_ERROR, "&&& jsonTESTrequest=" << request); + LOGS(_log, LOG_LVL_ERROR, "&&& jsonTESTrequest=" << request); { // &&& testing only, delete auto parsedReq = protojson::UberJobMsg::createFromJson(request); json jsParsedReq = parsedReq->serializeJson(); diff --git a/src/qproc/ChunkQuerySpec.h b/src/qproc/ChunkQuerySpec.h index a8e7cdc64..d7ad75984 100644 --- a/src/qproc/ChunkQuerySpec.h +++ b/src/qproc/ChunkQuerySpec.h @@ -39,7 +39,7 @@ // Qserv headers #include "global/DbTable.h" #include "global/stringTypes.h" -#include "proto/ScanTableInfo.h" +#include "protojson/ScanTableInfo.h" namespace lsst::qserv::qproc { @@ -52,14 +52,16 @@ class ChunkQuerySpec { using Ptr = std::shared_ptr; ChunkQuerySpec() {} - 
ChunkQuerySpec(std::string const& db_, int chunkId_, proto::ScanInfo const& scanInfo_, + //&&&ChunkQuerySpec(std::string const& db_, int chunkId_, protojson::ScanInfo const& scanInfo_, + ChunkQuerySpec(std::string const& db_, int chunkId_, protojson::ScanInfo::Ptr const& scanInfo_, bool scanInteractive_) : db(db_), chunkId(chunkId_), scanInfo(scanInfo_), scanInteractive(scanInteractive_) {} // Contents could change std::string db{""}; ///< dominant db int chunkId{0}; - proto::ScanInfo scanInfo; ///< shared-scan candidates + //&&&protojson::ScanInfo scanInfo; ///< shared-scan candidates + protojson::ScanInfo::Ptr scanInfo; ///< shared-scan candidates // Consider saving subChunkTable templates, and substituting the chunkIds // and subChunkIds into them on-the-fly. bool scanInteractive{false}; diff --git a/src/qproc/QuerySession.cc b/src/qproc/QuerySession.cc index b5fda17ae..969409a4d 100644 --- a/src/qproc/QuerySession.cc +++ b/src/qproc/QuerySession.cc @@ -363,8 +363,8 @@ void QuerySession::print(std::ostream& os) const { os << " needs merge: " << this->needsMerge(); os << " 1st parallel statement: \"" << par << "\""; os << " merge statement: \"" << mer << "\""; - os << " scanRating:" << _context->scanInfo.scanRating; - for (auto const& tbl : _context->scanInfo.infoTables) { + os << " scanRating:" << _context->scanInfo->scanRating; + for (auto const& tbl : _context->scanInfo->infoTables) { os << " ScanTable: " << tbl.db << "." << tbl.table << " lock=" << tbl.lockInMemory << " rating=" << tbl.scanRating; } @@ -402,6 +402,8 @@ std::ostream& operator<<(std::ostream& out, QuerySession const& querySession) { return out; } +protojson::ScanInfo::Ptr QuerySession::getScanInfo() const { return _context->scanInfo; } + ChunkQuerySpec::Ptr QuerySession::buildChunkQuerySpec(query::QueryTemplate::Vect const& queryTemplates, ChunkSpec const& chunkSpec, bool fillInChunkIdTag) const { diff --git a/src/qproc/QuerySession.h b/src/qproc/QuerySession.h index a85634267..a368abc06 100644 --- a/src/qproc/QuerySession.h +++ b/src/qproc/QuerySession.h @@ -175,6 +175,8 @@ class QuerySession { void setScanInteractive(); bool getScanInteractive() const { return _scanInteractive; } + protojson::ScanInfo::Ptr getScanInfo() const; + /** * Print query session to stream. * diff --git a/src/qproc/TaskMsgFactory.cc b/src/qproc/TaskMsgFactory.cc index bf2018669..a5dd4a97a 100644 --- a/src/qproc/TaskMsgFactory.cc +++ b/src/qproc/TaskMsgFactory.cc @@ -79,8 +79,8 @@ std::shared_ptr TaskMsgFactory::makeMsgJson(ChunkQuerySpec const {"jobId", jobId}, {"attemptCount", attemptCount}, {"querySpecDb", chunkQuerySpec.db}, - {"scanPriority", chunkQuerySpec.scanInfo.scanRating}, - {"scanInteractive", chunkQuerySpec.scanInteractive}, + {"scanPriority", chunkQuerySpec.scanInfo->scanRating}, //&&& del ??? + {"scanInteractive", chunkQuerySpec.scanInteractive}, //&&& del ??? 
{"maxTableSize", (cconfig::CzarConfig::instance()->getMaxTableSizeMB())}, {"chunkScanTables", nlohmann::json::array()}, {"chunkId", chunkQuerySpec.chunkId}, @@ -89,7 +89,7 @@ std::shared_ptr TaskMsgFactory::makeMsgJson(ChunkQuerySpec const auto& jsJobMsg = *jsJobMsgPtr; auto& chunkScanTables = jsJobMsg["chunkScanTables"]; - for (auto const& sTbl : chunkQuerySpec.scanInfo.infoTables) { + for (auto const& sTbl : chunkQuerySpec.scanInfo->infoTables) { //&&& probably redundant nlohmann::json cst = {{"db", sTbl.db}, {"table", sTbl.table}, {"lockInMemory", sTbl.lockInMemory}, diff --git a/src/qproc/testQueryAnaGeneral.cc b/src/qproc/testQueryAnaGeneral.cc index cced49a07..94dfcbadb 100644 --- a/src/qproc/testQueryAnaGeneral.cc +++ b/src/qproc/testQueryAnaGeneral.cc @@ -745,9 +745,9 @@ BOOST_AUTO_TEST_CASE(SimpleScan) { BOOST_CHECK_EQUAL(context->dominantDb, std::string("LSST")); BOOST_CHECK(nullptr == context->secIdxRestrictors); BOOST_CHECK(nullptr == context->areaRestrictors); - BOOST_CHECK_EQUAL(context->scanInfo.infoTables.size(), 1U); - if (context->scanInfo.infoTables.size() >= 1) { - auto p = context->scanInfo.infoTables.front(); + BOOST_CHECK_EQUAL(context->scanInfo->infoTables.size(), 1U); + if (context->scanInfo->infoTables.size() >= 1) { + auto p = context->scanInfo->infoTables.front(); BOOST_CHECK_EQUAL(p.db, "LSST"); BOOST_CHECK_EQUAL(p.table, "Object"); } diff --git a/src/query/QueryContext.h b/src/query/QueryContext.h index a0a2ae942..a263bafb9 100644 --- a/src/query/QueryContext.h +++ b/src/query/QueryContext.h @@ -38,7 +38,7 @@ // Local headers #include "css/CssAccess.h" #include "global/stringTypes.h" -#include "proto/ScanTableInfo.h" +#include "protojson/ScanTableInfo.h" #include "qana/QueryMapping.h" #include "query/FromList.h" #include "query/typedefs.h" @@ -83,7 +83,7 @@ class QueryContext { std::shared_ptr databaseModels; ///< contains database schema information. - proto::ScanInfo scanInfo; // Tables scanned (for shared scans) + protojson::ScanInfo::Ptr scanInfo{protojson::ScanInfo::create()}; // Tables scanned (for shared scans) /** * @brief Add a TableRef to the list of tables used by this query. diff --git a/src/wbase/Task.cc b/src/wbase/Task.cc index 5252b3936..c7efe8272 100644 --- a/src/wbase/Task.cc +++ b/src/wbase/Task.cc @@ -49,6 +49,7 @@ #include "http/RequestBodyJSON.h" #include "mysql/MySqlConfig.h" #include "proto/worker.pb.h" +#include "protojson/UberJobMsg.h" #include "util/Bug.h" #include "util/common.h" #include "util/HoldTrack.h" @@ -122,7 +123,7 @@ atomic taskSequence{0}; ///< Unique identifier source for Task. /// the util::CommandThreadPool is not called here. Task::Task(UberJobData::Ptr const& ujData, int jobId, int attemptCount, int chunkId, int fragmentNumber, size_t templateId, bool hasSubchunks, int subchunkId, string const& db, - proto::ScanInfo const& scanInfo, bool scanInteractive, int maxTableSize, + protojson::ScanInfo::Ptr const& scanInfo, bool scanInteractive, int maxTableSize, vector const& fragSubTables, vector const& fragSubchunkIds, shared_ptr const& sc, std::shared_ptr const& queryStats_, uint16_t resultsHttpPort) @@ -166,7 +167,7 @@ Task::Task(UberJobData::Ptr const& ujData, int jobId, int attemptCount, int chun if (!_fragmentHasSubchunks) { /// FUTURE: Why acquire anything if there are no subchunks in the fragment? /// This branch never seems to happen, but this needs to be proven beyond any doubt. 
- for (auto const& scanTbl : scanInfo.infoTables) { + for (auto const& scanTbl : scanInfo->infoTables) { dbTbls_.emplace(scanTbl.db, scanTbl.table); LOGS(_log, LOG_LVL_INFO, "Task::Task scanTbl.db=" << scanTbl.db << " scanTbl.table=" << scanTbl.table); @@ -196,8 +197,8 @@ Task::~Task() {} std::vector Task::createTasksForChunk( std::shared_ptr const& ujData, nlohmann::json const& jsJobs, - std::shared_ptr const& sendChannel, proto::ScanInfo const& scanInfo, - bool scanInteractive, int maxTableSizeMb, + std::shared_ptr const& sendChannel, + protojson::ScanInfo::Ptr const& scanInfo, bool scanInteractive, int maxTableSizeMb, std::shared_ptr const& chunkResourceMgr, mysql::MySqlConfig const& mySqlConfig, std::shared_ptr const& sqlConnMgr, std::shared_ptr const& queriesAndChunks, uint16_t resultsHttpPort) { @@ -271,6 +272,33 @@ std::vector Task::createTasksForChunk( if (fragSubchunkIds.empty()) { bool const noSubchunks = false; int const subchunkId = -1; + { + ostringstream os; + os << "&&&TEST00 "; + os << " &&&TEST"; + os << "; ujData?"; + os << "; jobId=" << jdJobId; + os << "; attemptCount=" << jdAttemptCount; + os << "; chunkId=" << jdChunkId; + os << "; fragmentNumber=" << fragmentNumber; + os << "; templateId=" << templateId; + os << "; noSubchunks=" << noSubchunks; + os << "; subchunkId=" << subchunkId; + os << "; chunkQuerySpecDb?"; + os << "; scanInfo=" << *scanInfo; + os << "; scanInteractive=" << scanInteractive; + os << "; maxTableSizeMb=" << maxTableSizeMb; + os << "; fragSubTables={"; + for (auto const& fsTbl : fragSubTables) { + os << fsTbl.db << "." << fsTbl.tbl << ", "; + } + os << "}"; + os << "; fragSubchunkIds=" << util::printable(fragSubchunkIds); + os << "; sendChannel?"; + os << "; queryStats?"; + os << "; resultsHttpPort=" << resultsHttpPort; + LOGS(_log, LOG_LVL_WARN, "&&&" << os.str()); + } auto task = Task::Ptr(new Task( ujData, jdJobId, jdAttemptCount, jdChunkId, fragmentNumber, templateId, noSubchunks, subchunkId, jdQuerySpecDb, scanInfo, scanInteractive, maxTableSizeMb, @@ -280,6 +308,33 @@ std::vector Task::createTasksForChunk( } else { for (auto subchunkId : fragSubchunkIds) { bool const hasSubchunks = true; + { + ostringstream os; + os << "&&&TEST01 "; + os << " &&&TEST"; + os << "; ujData?"; + os << "; jobId=" << jdJobId; + os << "; attemptCount=" << jdAttemptCount; + os << "; chunkId=" << jdChunkId; + os << "; fragmentNumber=" << fragmentNumber; + os << "; templateId=" << templateId; + os << "; noSubchunks=" << hasSubchunks; + os << "; subchunkId=" << subchunkId; + os << "; chunkQuerySpecDb?"; + os << "; scanInfo=" << *scanInfo; + os << "; scanInteractive=" << scanInteractive; + os << "; maxTableSizeMb=" << maxTableSizeMb; + os << "; fragSubTables={"; + for (auto const& fsTbl : fragSubTables) { + os << fsTbl.db << "." 
<< fsTbl.tbl << ", "; + } + os << "}"; + os << "; fragSubchunkIds=" << util::printable(fragSubchunkIds); + os << "; sendChannel?"; + os << "; queryStats?"; + os << "; resultsHttpPort=" << resultsHttpPort; + LOGS(_log, LOG_LVL_WARN, "&&&" << os.str()); + } auto task = Task::Ptr(new Task(ujData, jdJobId, jdAttemptCount, jdChunkId, fragmentNumber, templateId, hasSubchunks, subchunkId, jdQuerySpecDb, scanInfo, scanInteractive, @@ -301,11 +356,114 @@ std::vector Task::createTasksForChunk( return vect; } +std::vector Task::createTasksFromUberJobMsg( + std::shared_ptr const& ujMsg, std::shared_ptr const& ujData, + std::shared_ptr const& sendChannel, + std::shared_ptr const& chunkResourceMgr, mysql::MySqlConfig const& mySqlConfig, + std::shared_ptr const& sqlConnMgr, + std::shared_ptr const& queriesAndChunks, uint16_t resultsHttpPort) { + QueryId qId = ujData->getQueryId(); + UberJobId ujId = ujData->getUberJobId(); + CzarIdType czId = ujData->getCzarId(); + + vector vect; // List of created tasks to be returned. + + wpublish::QueryStatistics::Ptr queryStats = queriesAndChunks->addQueryId(qId, czId); + UserQueryInfo::Ptr userQueryInfo = queryStats->getUserQueryInfo(); + + string funcN(__func__); + funcN += " QID=" + to_string(qId) + " "; + + if (ujMsg->getQueryId() != qId) { + throw util::Bug(ERR_LOC, "Task::createTasksFromUberJobMsg qId(" + to_string(qId) + + ") did not match ujMsg->qId(" + to_string(ujMsg->getQueryId()) + + ")"); + } + if (ujMsg->getUberJobId() != ujId) { + throw util::Bug(ERR_LOC, "Task::createTasksFromUberJobMsg ujId(" + to_string(ujId) + + ") did not match ujMsg->qId(" + to_string(ujMsg->getUberJobId()) + + ")"); + } + + std::string workerId = ujMsg->getWorkerId(); + auto jobSubQueryTempMap = ujMsg->getJobSubQueryTempMap(); + auto jobDbTablesMap = ujMsg->getJobDbTablesMap(); + auto jobMsgVect = ujMsg->getJobMsgVect(); + int maxTableSizeMb = ujMsg->getMaxTableSizeMb(); + auto scanInfo = ujMsg->getScanInfo(); + + for (auto const& jobMsg : *jobMsgVect) { + JobId jobId = jobMsg->getJobId(); + int attemptCount = jobMsg->getAttemptCount(); + std::string chunkQuerySpecDb = jobMsg->getChunkQuerySpecDb(); + bool scanInteractive = jobMsg->getScanInteractive(); + int chunkId = jobMsg->getChunkId(); + std::string chunkResultName = jobMsg->getChunkResultName(); + + std::vector chunkScanTableIndexes = jobMsg->getChunkScanTableIndexes(); + auto jobFragments = jobMsg->getJobFragments(); + int fragmentNumber = 0; + for (auto const& fMsg : *jobFragments) { + // These need to be constructed for the fragment + vector fragSubQueries; + vector fragSubTables; + vector fragSubchunkIds; + + vector fsqIndexes = fMsg->getJobSubQueryTempIndexes(); + for (int fsqIndex : fsqIndexes) { + string fsqStr = jobSubQueryTempMap->getSubQueryTemp(fsqIndex); + fragSubQueries.push_back(fsqStr); + } + + vector dbTblIndexes = fMsg->getJobDbTablesIndexes(); + for (int dbTblIndex : dbTblIndexes) { + auto [scDb, scTable] = jobDbTablesMap->getDbTable(dbTblIndex); + TaskDbTbl scDbTbl(scDb, scTable); + fragSubTables.push_back(scDbTbl); + } + + fragSubchunkIds = fMsg->getSubchunkIds(); + + for (string const& fragSubQ : fragSubQueries) { + size_t templateId = userQueryInfo->addTemplate(fragSubQ); + if (fragSubchunkIds.empty()) { + bool const noSubchunks = false; + int const subchunkId = -1; + auto task = Task::Ptr(new Task( + ujData, jobId, attemptCount, chunkId, fragmentNumber, templateId, noSubchunks, + subchunkId, chunkQuerySpecDb, scanInfo, scanInteractive, maxTableSizeMb, + fragSubTables, fragSubchunkIds, sendChannel, 
queryStats, resultsHttpPort)); + + vect.push_back(task); + } else { + for (auto subchunkId : fragSubchunkIds) { + bool const hasSubchunks = true; + auto task = Task::Ptr(new Task(ujData, jobId, attemptCount, chunkId, fragmentNumber, + templateId, hasSubchunks, subchunkId, chunkQuerySpecDb, + scanInfo, scanInteractive, maxTableSizeMb, + fragSubTables, fragSubchunkIds, sendChannel, + queryStats, resultsHttpPort)); + vect.push_back(task); + } + } + } + ++fragmentNumber; + } + } + + for (auto taskPtr : vect) { + // newQueryRunner sets the `_taskQueryRunner` pointer in `task`. + taskPtr->setTaskQueryRunner(wdb::QueryRunner::newQueryRunner(taskPtr, chunkResourceMgr, mySqlConfig, + sqlConnMgr, queriesAndChunks)); + } + return vect; +} + //&&& std::vector Task::createTasksForUnitTest( std::shared_ptr const& ujData, nlohmann::json const& jsJobs, - std::shared_ptr const& sendChannel, proto::ScanInfo const& scanInfo, - bool scanInteractive, int maxTableSizeMb, + std::shared_ptr const& sendChannel, + protojson::ScanInfo::Ptr const& scanInfo, bool scanInteractive, int maxTableSizeMb, std::shared_ptr const& chunkResourceMgr //&&&mysql::MySqlConfig const& mySqlConfig, std::shared_ptr const& sqlConnMgr, //&&&std::shared_ptr const& queriesAndChunks, diff --git a/src/wbase/Task.h b/src/wbase/Task.h index 118037edf..8cd661d53 100644 --- a/src/wbase/Task.h +++ b/src/wbase/Task.h @@ -42,7 +42,7 @@ #include "global/DbTable.h" #include "global/intTypes.h" #include "memman/MemMan.h" -#include "proto/ScanTableInfo.h" +#include "protojson/ScanTableInfo.h" #include "wbase/TaskState.h" #include "util/Histogram.h" #include "util/ThreadPool.h" @@ -52,6 +52,10 @@ namespace lsst::qserv::mysql { class MySqlConfig; } +namespace lsst::qserv::protojson { +class UberJobMsg; +} + namespace lsst::qserv::wbase { class FileChannelShared; } @@ -155,7 +159,7 @@ class Task : public util::CommandForThreadPool { // Unfortunately, this will be much easier if it is done after xrootd method is removed. Task(std::shared_ptr const& ujData, int jobId, int attemptCount, int chunkId, int fragmentNumber, size_t templateId, bool hasSubchunks, int subchunkId, std::string const& db, - proto::ScanInfo const& scanInfo, bool scanInteractive, int maxTableSizeMb, + protojson::ScanInfo::Ptr const& scanInfo, bool scanInteractive, int maxTableSizeMb, std::vector const& fragSubTables, std::vector const& fragSubchunkIds, std::shared_ptr const& sc, std::shared_ptr const& queryStats_, uint16_t resultsHttpPort = 8080); @@ -165,10 +169,20 @@ class Task : public util::CommandForThreadPool { virtual ~Task(); /// Read json to generate a vector of one or more task for a chunk. 
- static std::vector createTasksForChunk( + static std::vector createTasksForChunk( /// &&& delete std::shared_ptr const& ujData, nlohmann::json const& jsJobs, - std::shared_ptr const& sendChannel, proto::ScanInfo const& scanInfo, - bool scanInteractive, int maxTableSizeMb, + std::shared_ptr const& sendChannel, + protojson::ScanInfo::Ptr const& scanInfo, bool scanInteractive, int maxTableSizeMb, + std::shared_ptr const& chunkResourceMgr, + mysql::MySqlConfig const& mySqlConfig, std::shared_ptr const& sqlConnMgr, + std::shared_ptr const& queriesAndChunks, + uint16_t resultsHttpPort = 8080); + + /// &&& + static std::vector createTasksFromUberJobMsg( + std::shared_ptr const& uberJobMsg, + std::shared_ptr const& ujData, + std::shared_ptr const& sendChannel, std::shared_ptr const& chunkResourceMgr, mysql::MySqlConfig const& mySqlConfig, std::shared_ptr const& sqlConnMgr, std::shared_ptr const& queriesAndChunks, @@ -177,8 +191,8 @@ class Task : public util::CommandForThreadPool { //&&& static std::vector createTasksForUnitTest( std::shared_ptr const& ujData, nlohmann::json const& jsJobs, - std::shared_ptr const& sendChannel, proto::ScanInfo const& scanInfo, - bool scanInteractive, int maxTableSizeMb, + std::shared_ptr const& sendChannel, + protojson::ScanInfo::Ptr const& scanInfo, bool scanInteractive, int maxTableSizeMb, std::shared_ptr const& chunkResourceMgr //&&&mysql::MySqlConfig const& mySqlConfig, std::shared_ptr const& // sqlConnMgr, @@ -239,7 +253,7 @@ class Task : public util::CommandForThreadPool { int getAttemptCount() const { return _attemptCount; } bool getScanInteractive() { return _scanInteractive; } int64_t getMaxTableSize() const { return _maxTableSize; } - proto::ScanInfo& getScanInfo() { return _scanInfo; } + protojson::ScanInfo::Ptr getScanInfo() { return _scanInfo; } void setOnInteractive(bool val) { _onInteractive = val; } bool getOnInteractive() { return _onInteractive; } bool hasMemHandle() const { return _memHandle != memman::MemMan::HandleType::INVALID; } @@ -359,7 +373,7 @@ class Task : public util::CommandForThreadPool { std::atomic _safeToMoveRunning{false}; ///< false until done with waitForMemMan(). TaskQueryRunner::Ptr _taskQueryRunner; std::weak_ptr _taskScheduler; - proto::ScanInfo _scanInfo; + protojson::ScanInfo::Ptr _scanInfo; bool _scanInteractive; ///< True if the czar thinks this query should be interactive. bool _onInteractive{ false}; ///< True if the scheduler put this task on the interactive (group) scheduler. 
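The Task::createTasksFromUberJobMsg() path above decodes fragments through integer indirection: query templates and db.table pairs are stored once per UberJobMsg, and each fragment carries only indexes into those shared maps (getJobSubQueryTempIndexes(), getJobDbTablesIndexes()). A simplified standalone sketch of that decode step, with plain vectors standing in for JobSubQueryTempMap and JobDbTablesMap:

    // Sketch of the index indirection used when rebuilding fragments from an
    // UberJobMsg: templates and db.table pairs are stored once, and each
    // fragment carries only integer indexes into them. The vectors here are
    // stand-ins for JobSubQueryTempMap and JobDbTablesMap.
    #include <string>
    #include <utility>
    #include <vector>

    int main() {
        std::vector<std::string> subQueryTemps = {"SELECT ... FROM Object_100 ..."};
        std::vector<std::pair<std::string, std::string>> dbTables = {{"LSST", "Object"}};

        std::vector<int> fragTempIndexes = {0};   // fMsg->getJobSubQueryTempIndexes()
        std::vector<int> fragDbTblIndexes = {0};  // fMsg->getJobDbTablesIndexes()

        std::vector<std::string> fragSubQueries;
        for (int i : fragTempIndexes) fragSubQueries.push_back(subQueryTemps.at(i));

        std::vector<std::pair<std::string, std::string>> fragSubTables;
        for (int i : fragDbTblIndexes) fragSubTables.push_back(dbTables.at(i));
        return 0;
    }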
diff --git a/src/wdb/testQueryRunner.cc b/src/wdb/testQueryRunner.cc index c4b703a92..a3da9eb5b 100644 --- a/src/wdb/testQueryRunner.cc +++ b/src/wdb/testQueryRunner.cc @@ -29,7 +29,7 @@ // Qserv headers #include "mysql/MySqlConfig.h" -#include "proto/ScanTableInfo.h" +#include "protojson/ScanTableInfo.h" #include "proto/worker.pb.h" #include "wbase/FileChannelShared.h" #include "wbase/Task.h" @@ -164,9 +164,9 @@ BOOST_AUTO_TEST_CASE(Simple) { auto ujData = lsst::qserv::wbase::UberJobData::create( mInfo.uberJobId, mInfo.czarName, mInfo.czarId, mInfo.czarHostName, mInfo.czarPort, mInfo.queryId, mInfo.rowLimit, mInfo.targWorkerId, mInfo.foreman, mInfo.authKey); - lsst::qserv::proto::ScanInfo scanInfo; - scanInfo.scanRating = mInfo.scanRating; - scanInfo.infoTables.emplace_back(mInfo.db, mInfo.table, mInfo.lockInMemory, mInfo.scanRating); + auto scanInfo = lsst::qserv::protojson::ScanInfo::create(); + scanInfo->scanRating = mInfo.scanRating; + scanInfo->infoTables.emplace_back(mInfo.db, mInfo.table, mInfo.lockInMemory, mInfo.scanRating); vector taskVect = Task::createTasksForChunk(ujData, *msgJson, sChannel, scanInfo, mInfo.scanInteractive, mInfo.maxTableSize, crm, newMySqlConfig(), sqlConnMgr, queries); @@ -189,9 +189,9 @@ BOOST_AUTO_TEST_CASE(Output) { auto ujData = lsst::qserv::wbase::UberJobData::create( mInfo.uberJobId, mInfo.czarName, mInfo.czarId, mInfo.czarHostName, mInfo.czarPort, mInfo.queryId, mInfo.rowLimit, mInfo.targWorkerId, mInfo.foreman, mInfo.authKey); - lsst::qserv::proto::ScanInfo scanInfo; - scanInfo.scanRating = mInfo.scanRating; - scanInfo.infoTables.emplace_back(mInfo.db, mInfo.table, mInfo.lockInMemory, mInfo.scanRating); + auto scanInfo = lsst::qserv::protojson::ScanInfo::create(); + scanInfo->scanRating = mInfo.scanRating; + scanInfo->infoTables.emplace_back(mInfo.db, mInfo.table, mInfo.lockInMemory, mInfo.scanRating); vector taskVect = Task::createTasksForChunk(ujData, *msgJson, sc, scanInfo, mInfo.scanInteractive, mInfo.maxTableSize, crm, newMySqlConfig(), sqlConnMgr, queries); diff --git a/src/wpublish/QueriesAndChunks.cc b/src/wpublish/QueriesAndChunks.cc index c49a09aa1..f29bcc9b7 100644 --- a/src/wpublish/QueriesAndChunks.cc +++ b/src/wpublish/QueriesAndChunks.cc @@ -199,10 +199,10 @@ void QueriesAndChunks::_finishedTaskForChunk(wbase::Task::Ptr const& task, doubl } ul.unlock(); auto iter = res.first->second; - proto::ScanInfo& scanInfo = task->getScanInfo(); + protojson::ScanInfo::Ptr scanInfo = task->getScanInfo(); string tblName; - if (!scanInfo.infoTables.empty()) { - proto::ScanTableInfo& sti = scanInfo.infoTables.at(0); + if (!scanInfo->infoTables.empty()) { + protojson::ScanTableInfo& sti = scanInfo->infoTables.at(0); tblName = ChunkTableStats::makeTableName(sti.db, sti.table); } ChunkTableStats::Ptr tableStats = iter->add(tblName, minutes); @@ -328,8 +328,8 @@ void QueriesAndChunks::examineAll() { } double schedMaxTime = sched->getMaxTimeMinutes(); // Get max time for scheduler // Get the slowest scan table in task. 
- auto begin = task->getScanInfo().infoTables.begin(); - if (begin == task->getScanInfo().infoTables.end()) { + auto begin = task->getScanInfo()->infoTables.begin(); + if (begin == task->getScanInfo()->infoTables.end()) { continue; } string const& slowestTable = begin->db + ":" + begin->table; diff --git a/src/wsched/BlendScheduler.cc b/src/wsched/BlendScheduler.cc index 3e9babc06..ccb335b97 100644 --- a/src/wsched/BlendScheduler.cc +++ b/src/wsched/BlendScheduler.cc @@ -175,7 +175,7 @@ void BlendScheduler::queCmd(std::vector const& cmds) { if (first) { first = false; - auto const& scanTables = task->getScanInfo().infoTables; + auto const& scanTables = task->getScanInfo()->infoTables; bool interactive = task->getScanInteractive(); if (scanTables.size() <= 0 || interactive) { // If there are no scan tables, no point in putting on a shared scan. @@ -186,7 +186,7 @@ void BlendScheduler::queCmd(std::vector const& cmds) { targSched = _group; } else { onInteractive = false; - int scanPriority = task->getScanInfo().scanRating; + int scanPriority = task->getScanInfo()->scanRating; if (LOG_CHECK_LVL(_log, LOG_LVL_DEBUG)) { ostringstream ss; ss << "Blend chose scan for priority=" << scanPriority << " : "; diff --git a/src/wsched/ChunkTasksQueue.cc b/src/wsched/ChunkTasksQueue.cc index de2a09bbb..be534780e 100644 --- a/src/wsched/ChunkTasksQueue.cc +++ b/src/wsched/ChunkTasksQueue.cc @@ -411,7 +411,7 @@ ChunkTasks::ReadyState ChunkTasks::ready(bool useFlexibleLock) { "ChunkTasks " << _chunkId << " got task for chunk " << chunkId << " " << task->getIdStr()); } std::vector tblVect; - for (auto const& tbl : scanInfo.infoTables) { + for (auto const& tbl : scanInfo->infoTables) { memman::TableInfo ti(tbl.db + "/" + tbl.table, lckOptTbl, lckOptIdx); tblVect.push_back(ti); } diff --git a/src/wsched/ChunkTasksQueue.h b/src/wsched/ChunkTasksQueue.h index 84a6be908..9353464e5 100644 --- a/src/wsched/ChunkTasksQueue.h +++ b/src/wsched/ChunkTasksQueue.h @@ -84,7 +84,7 @@ class ChunkTasks { return false; } // compare scanInfo (slower scans first) - int siComp = x->getScanInfo().compareTables(y->getScanInfo()); + int siComp = x->getScanInfo()->compareTables(*(y->getScanInfo())); return siComp < 0; }; void push(wbase::Task::Ptr const& task); diff --git a/src/wsched/GroupScheduler.cc b/src/wsched/GroupScheduler.cc index 5b5c7da27..2429f7ee3 100644 --- a/src/wsched/GroupScheduler.cc +++ b/src/wsched/GroupScheduler.cc @@ -116,7 +116,7 @@ void GroupScheduler::_queCmd(util::Command::Ptr const& cmd, bool keepInThisGroup } auto uqCount = _incrCountForUserQuery(t->getQueryId(), 1); LOGS(_log, LOG_LVL_DEBUG, - getName() << " queCmd uqCount=" << uqCount << " rating=" << t->getScanInfo().scanRating + getName() << " queCmd uqCount=" << uqCount << " rating=" << t->getScanInfo()->scanRating << " interactive=" << t->getScanInteractive()); util::CommandQueue::_cv.notify_one(); } diff --git a/src/wsched/testSchedulers.cc b/src/wsched/testSchedulers.cc index b28b6b060..a103be41d 100644 --- a/src/wsched/testSchedulers.cc +++ b/src/wsched/testSchedulers.cc @@ -33,8 +33,8 @@ // Qserv headers #include "memman/MemManNone.h" #include "mysql/MySqlConfig.h" -#include "proto/ScanTableInfo.h" #include "proto/worker.pb.h" +#include "protojson/ScanTableInfo.h" #include "util/Command.h" #include "util/EventThread.h" #include "wbase/FileChannelShared.h" @@ -188,10 +188,10 @@ struct SchedFixture { queries->setRequiredTasksCompleted(1); // Make it easy to set a baseline. 
} - int const fastest = lsst::qserv::proto::ScanInfo::Rating::FASTEST; - int const fast = lsst::qserv::proto::ScanInfo::Rating::FAST; - int const medium = lsst::qserv::proto::ScanInfo::Rating::MEDIUM; - int const slow = lsst::qserv::proto::ScanInfo::Rating::SLOW; + int const fastest = lsst::qserv::protojson::ScanInfo::Rating::FASTEST; + int const fast = lsst::qserv::protojson::ScanInfo::Rating::FAST; + int const medium = lsst::qserv::protojson::ScanInfo::Rating::MEDIUM; + int const slow = lsst::qserv::protojson::ScanInfo::Rating::SLOW; lsst::qserv::QueryId qIdInc{1}; diff --git a/src/xrdsvc/HttpWorkerCzarModule.cc b/src/xrdsvc/HttpWorkerCzarModule.cc index 609f89e82..652876ed5 100644 --- a/src/xrdsvc/HttpWorkerCzarModule.cc +++ b/src/xrdsvc/HttpWorkerCzarModule.cc @@ -37,6 +37,7 @@ #include "http/RequestBodyJSON.h" #include "http/RequestQuery.h" #include "mysql/MySqlUtils.h" +#include "protojson/UberJobMsg.h" #include "protojson/WorkerQueryStatusData.h" #include "qmeta/types.h" #include "util/String.h" @@ -109,8 +110,8 @@ json HttpWorkerCzarModule::_handleQueryJob(string const& func) { json jsRet; vector ujTasks; try { -#if 1 // &&& - // See qdisp::UberJob::runUberJob() for json message construction. +#if NEWMSGUJ // &&& + // See qdisp::UberJob::runUberJob() for json message construction. auto const& jsReq = body().objJson; string const targetWorkerId = body().required("worker"); @@ -157,7 +158,7 @@ json HttpWorkerCzarModule::_handleQueryJob(string const& func) { // TODO:UJ These items should be stored higher in the message structure as they get // duplicated and should always be the same within an UberJob. QueryId jdQueryId = 0; - proto::ScanInfo scanInfo; + auto scanInfo = protojson::ScanInfo::create(); bool scanInfoSet = false; bool jdScanInteractive = false; int jdMaxTableSize = 0; @@ -192,18 +193,13 @@ json HttpWorkerCzarModule::_handleQueryJob(string const& func) { LOGS(_log, LOG_LVL_TRACE, __func__ << " chunkSDb=" << chunkScanDb << " lockinmem=" << lockInMemory << " csTble=" << chunkScanTable << " tblScanRating=" << tblScanRating); - scanInfo.infoTables.emplace_back(chunkScanDb, chunkScanTable, lockInMemory, - tblScanRating); + scanInfo->infoTables.emplace_back(chunkScanDb, chunkScanTable, lockInMemory, + tblScanRating); scanInfoSet = true; } } - scanInfo.scanRating = jdScanPriority; + scanInfo->scanRating = jdScanPriority; } -#else // &&& - auto const& jsReq = body().objJson; - auto uberJobMsg = protojson::UberJobMsg::createFromJson(jsReq); - // && fill in values -#endif //&&& ujData->setScanInteractive(jdScanInteractive); @@ -216,6 +212,56 @@ json HttpWorkerCzarModule::_handleQueryJob(string const& func) { channelShared->setTaskCount(ujTasks.size()); ujData->addTasks(ujTasks); +#else // &&& + auto const& jsReq = body().objJson; + auto uberJobMsg = protojson::UberJobMsg::createFromJson(jsReq); + + UberJobId ujId = uberJobMsg->getUberJobId(); + auto ujCzInfo = uberJobMsg->getCzarContactInfo(); + auto czarId = ujCzInfo->czId; + QueryId ujQueryId = uberJobMsg->getQueryId(); + int ujRowLimit = uberJobMsg->getRowLimit(); + auto targetWorkerId = uberJobMsg->getWorkerId(); + + // Get or create QueryStatistics and UserQueryInfo instances. + auto queryStats = foreman()->getQueriesAndChunks()->addQueryId(ujQueryId, ujCzInfo->czId); + auto userQueryInfo = queryStats->getUserQueryInfo(); + + if (userQueryInfo->getCancelledByCzar()) { + throw wbase::TaskException( + ERR_LOC, string("Already cancelled by czar. 
ujQueryId=") + to_string(ujQueryId)); + } + if (userQueryInfo->isUberJobDead(ujId)) { + throw wbase::TaskException(ERR_LOC, string("UberJob already dead. ujQueryId=") + + to_string(ujQueryId) + " ujId=" + to_string(ujId)); + } + + /* &&& + auto ujData = wbase::UberJobData::create(ujId, czarName, czarId, czarHostName, czarPort, ujQueryId, + ujRowLimit, targetWorkerId, foreman(), authKey()); + */ + auto ujData = wbase::UberJobData::create(ujId, ujCzInfo->czName, ujCzInfo->czId, ujCzInfo->czHostName, + ujCzInfo->czPort, ujQueryId, ujRowLimit, targetWorkerId, + foreman(), authKey()); + + // Find the entry for this queryId, create a new one if needed. + userQueryInfo->addUberJob(ujData); + /* &&& + auto channelShared = + wbase::FileChannelShared::create(ujData, czarId, czarHostName, czarPort, targetWorkerId); + */ + auto channelShared = wbase::FileChannelShared::create(ujData, ujCzInfo->czId, ujCzInfo->czHostName, + ujCzInfo->czPort, targetWorkerId); + + ujData->setFileChannelShared(channelShared); + + auto ujTasks = wbase::Task::createTasksFromUberJobMsg( + uberJobMsg, ujData, channelShared, foreman()->chunkResourceMgr(), foreman()->mySqlConfig(), + foreman()->sqlConnMgr(), foreman()->queriesAndChunks(), foreman()->httpPort()); + channelShared->setTaskCount(ujTasks.size()); + ujData->addTasks(ujTasks); + +#endif //&&& // At this point, it looks like the message was sent successfully, update // czar touched time. diff --git a/src/xrdsvc/SsiService.cc b/src/xrdsvc/SsiService.cc index d2d1507a7..5d0a813ff 100644 --- a/src/xrdsvc/SsiService.cc +++ b/src/xrdsvc/SsiService.cc @@ -190,11 +190,11 @@ SsiService::SsiService(XrdSsiLogger* log) { workerConfig->getMaxGroupSize(), wsched::SchedulerBase::getMaxPriority()); - int const fastest = lsst::qserv::proto::ScanInfo::Rating::FASTEST; - int const fast = lsst::qserv::proto::ScanInfo::Rating::FAST; - int const medium = lsst::qserv::proto::ScanInfo::Rating::MEDIUM; - int const slow = lsst::qserv::proto::ScanInfo::Rating::SLOW; - int const slowest = lsst::qserv::proto::ScanInfo::Rating::SLOWEST; + int const fastest = lsst::qserv::protojson::ScanInfo::Rating::FASTEST; + int const fast = lsst::qserv::protojson::ScanInfo::Rating::FAST; + int const medium = lsst::qserv::protojson::ScanInfo::Rating::MEDIUM; + int const slow = lsst::qserv::protojson::ScanInfo::Rating::SLOW; + int const slowest = lsst::qserv::protojson::ScanInfo::Rating::SLOWEST; double fastScanMaxMinutes = (double)workerConfig->getScanMaxMinutesFast(); double medScanMaxMinutes = (double)workerConfig->getScanMaxMinutesMed(); double slowScanMaxMinutes = (double)workerConfig->getScanMaxMinutesSlow(); From e1fea4bc9c5583e66f5dd662ca7d12244ed394d7 Mon Sep 17 00:00:00 2001 From: John Gates Date: Fri, 6 Dec 2024 13:40:16 -0800 Subject: [PATCH 18/22] Enabled chunk Id replacement, and added connection pools. 
--- .../templates/proxy/etc/qserv-czar.cnf.jinja | 8 +- src/cconfig/CzarConfig.h | 15 +- src/ccontrol/UserQuerySelect.cc | 27 +- src/czar/Czar.cc | 25 +- src/czar/Czar.h | 14 +- src/czar/CzarChunkMap.cc | 10 + src/protojson/UberJobMsg.cc | 231 ++++-------------- src/protojson/UberJobMsg.h | 25 +- src/protojson/testUberJobMsg.cc | 63 ++++- src/qana/QueryMapping.h | 2 + src/qdisp/Executive.cc | 1 + src/qdisp/JobDescription.h | 3 +- src/qdisp/UberJob.cc | 54 ++-- src/qproc/ChunkQuerySpec.h | 3 +- src/qproc/ChunkSpec.cc | 4 +- src/qproc/QuerySession.cc | 3 +- src/query/QueryTemplate.cc | 16 ++ src/query/QueryTemplate.h | 2 + src/util/InstanceCount.cc | 6 +- src/wsched/BlendScheduler.cc | 1 + 20 files changed, 279 insertions(+), 234 deletions(-) diff --git a/src/admin/templates/proxy/etc/qserv-czar.cnf.jinja b/src/admin/templates/proxy/etc/qserv-czar.cnf.jinja index 7991d0ab0..26e13346f 100644 --- a/src/admin/templates/proxy/etc/qserv-czar.cnf.jinja +++ b/src/admin/templates/proxy/etc/qserv-czar.cnf.jinja @@ -101,17 +101,17 @@ notifyWorkersOnCzarRestart = 1 # Please see util/QdispPool.h QdispPool::QdispPool for more information [qdisppool] #size of the pool -poolSize = 50 +poolSize = 1000 # Low numbers are higher priority. Largest priority 3 creates 4 priority queues 0, 1, 2, 3 # Must be greater than 0. largestPriority = 3 # Maximum number of threads running for each queue. No spaces. Values separated by ':' # Using largestPriority = 2 and vectRunsizes = 3:5:8 # queue 0 would have runSize 3, queue 1 would have runSize 5, and queue 2 would have runSize 8. -vectRunSizes = 50:50:50:50 +vectRunSizes = 800:800:500:500 # Minimum number of threads running for each queue. No spaces. Values separated by ':' -vectMinRunningSizes = 0:1:3:3 -# Maximum number of QueryRequests allowed to be running at one time. +vectMinRunningSizes = 0:3:3:3 +# Maximum number of QueryRequests allowed to be running at one time. &&& unused?? qReqPseudoFifoMaxRunning = 299 [replication] diff --git a/src/cconfig/CzarConfig.h b/src/cconfig/CzarConfig.h index e77878e18..9b6096531 100644 --- a/src/cconfig/CzarConfig.h +++ b/src/cconfig/CzarConfig.h @@ -215,6 +215,9 @@ class CzarConfig { /// The maximum number of chunks (basically Jobs) allowed in a single UberJob. int getUberJobMaxChunks() const { return _uberJobMaxChunks->getVal(); } + /// Return the maximum number of http connections to use for czar commands. 
+    int getCommandMaxHttpConnections() const { return _commandMaxHttpConnections->getVal(); }
+
     // Parameters of the Czar management service

     std::string const& replicationInstanceId() const { return _replicationInstanceId->getVal(); }
@@ -310,7 +313,7 @@ class CzarConfig {
     CVTIntPtr _resultMaxConnections =
             util::ConfigValTInt::create(_configValMap, "resultdb", "maxconnections", notReq, 40);
     CVTIntPtr _resultMaxHttpConnections =
-            util::ConfigValTInt::create(_configValMap, "resultdb", "maxhttpconnections", notReq, 8192);
+            util::ConfigValTInt::create(_configValMap, "resultdb", "maxhttpconnections", notReq, 2000);
     CVTIntPtr _oldestResultKeptDays =
             util::ConfigValTInt::create(_configValMap, "resultdb", "oldestResultKeptDays", notReq, 30);

@@ -361,9 +364,9 @@ class CzarConfig {
     CVTIntPtr _qdispMaxPriority =
             util::ConfigValTInt::create(_configValMap, "qdisppool", "largestPriority", notReq, 2);
     CVTStrPtr _qdispVectRunSizes =
-            util::ConfigValTStr::create(_configValMap, "qdisppool", "vectRunSizes", notReq, "50:50:50:50");
+            util::ConfigValTStr::create(_configValMap, "qdisppool", "vectRunSizes", notReq, "800:800:500:500");
     CVTStrPtr _qdispVectMinRunningSizes =
-            util::ConfigValTStr::create(_configValMap, "qdisppool", "vectMinRunningSizes", notReq, "0:1:3:3");
+            util::ConfigValTStr::create(_configValMap, "qdisppool", "vectMinRunningSizes", notReq, "0:3:3:3");
     CVTIntPtr _xrootdSpread = util::ConfigValTInt::create(_configValMap, "tuning", "xrootdSpread", notReq, 4);
     CVTIntPtr _qMetaSecsBetweenChunkCompletionUpdates = util::ConfigValTInt::create(
@@ -413,7 +416,11 @@ class CzarConfig {

     // UberJobs
     CVTIntPtr _uberJobMaxChunks =
-            util::ConfigValTInt::create(_configValMap, "uberjob", "maxChunks", notReq, 10);
+            util::ConfigValTInt::create(_configValMap, "uberjob", "maxChunks", notReq, 1000);
+
+    /// Maximum number of http connections for czar commands. This interacts with
+    /// `_resultMaxHttpConnections`; too many open connections may cause kernel memory issues.
+    CVTIntPtr _commandMaxHttpConnections =
+            util::ConfigValTInt::create(_configValMap, "uberjob", "commandMaxHttpConnections", notReq, 2000);
 };

 }  // namespace lsst::qserv::cconfig
diff --git a/src/ccontrol/UserQuerySelect.cc b/src/ccontrol/UserQuerySelect.cc
index 7f1da8353..c5e2aef35 100644
--- a/src/ccontrol/UserQuerySelect.cc
+++ b/src/ccontrol/UserQuerySelect.cc
@@ -105,7 +105,9 @@
 #include "rproc/InfileMerger.h"
 #include "sql/Schema.h"
 #include "util/Bug.h"
+#include "util/InstanceCount.h"  //&&&
 #include "util/IterableFormatter.h"
+#include "util/Histogram.h"  //&&&
 #include "util/QdispPool.h"
 #include "util/ThreadPriority.h"
 #include "qdisp/UberJob.h"
@@ -276,7 +278,8 @@ void UserQuerySelect::submit() {
     qproc::ChunkQuerySpec::Ptr cs;
     {
         std::lock_guard<std::mutex> lock(chunksMtx);
-        cs = _qSession->buildChunkQuerySpec(queryTemplates, chunkSpec);
+        bool fillInChunkIdTag = false;  // do not fill in the chunkId
+        cs = _qSession->buildChunkQuerySpec(queryTemplates, chunkSpec, fillInChunkIdTag);
         chunks.push_back(cs->chunkId);
     }
     std::string chunkResultName = _ttn->make(cs->chunkId);
@@ -321,10 +324,15 @@ void UserQuerySelect::submit() {
     }
 }

+util::HistogramRolling histoBuildAndS("&&&uj histoBuildAndS", {0.1, 1.0, 10.0, 100.0, 1000.0}, 1h, 10000);
+util::HistogramRolling histoBuildAndS1("&&&uj histoBuildAndS1", {0.1, 1.0, 10.0, 100.0, 1000.0}, 1h, 10000);
+
 void UserQuerySelect::buildAndSendUberJobs() {
+    util::InstanceCount ic("UserQuerySelect::buildAndSendUberJobs&&&");
     // TODO:UJ Is special handling needed for the dummy chunk, 1234567890 ?
string const funcN("UserQuerySelect::" + string(__func__) + " QID=" + to_string(_qMetaQueryId)); LOGS(_log, LOG_LVL_DEBUG, funcN << " start"); + LOGS(_log, LOG_LVL_WARN, funcN << " &&&uj start " << _uberJobMaxChunks); // Ensure `_monitor()` doesn't do anything until everything is ready. if (!_executive->isReadyToExecute()) { @@ -333,7 +341,9 @@ void UserQuerySelect::buildAndSendUberJobs() { } // Only one thread should be generating UberJobs for this user query at any given time. + LOGS(_log, LOG_LVL_WARN, funcN << " &&&uj lock before"); lock_guard fcLock(_buildUberJobMtx); + LOGS(_log, LOG_LVL_WARN, funcN << " &&&uj lock after"); LOGS(_log, LOG_LVL_DEBUG, "UserQuerySelect::" << __func__ << " totalJobs=" << _executive->getTotalJobs()); vector uberJobs; @@ -341,6 +351,7 @@ void UserQuerySelect::buildAndSendUberJobs() { qdisp::Executive::ChunkIdJobMapType unassignedChunksInQuery = _executive->unassignedChunksInQuery(); if (unassignedChunksInQuery.empty()) { LOGS(_log, LOG_LVL_TRACE, funcN << " no unassigned Jobs"); + LOGS(_log, LOG_LVL_WARN, funcN << " &&&uj no unassigned Jobs"); return; } @@ -380,6 +391,7 @@ void UserQuerySelect::buildAndSendUberJobs() { map> workerJobMap; vector missingChunks; + auto startassign = CLOCK::now(); //&&& // unassignedChunksInQuery needs to be in numerical order so that UberJobs contain chunk numbers in // numerical order. The workers run shared scans in numerical order of chunk id numbers. // Numerical order keeps the number of partially complete UberJobs running on a worker to a minimum, @@ -441,10 +453,15 @@ void UserQuerySelect::buildAndSendUberJobs() { } auto& ujVectBack = ujVect.back(); ujVectBack->addJob(jqPtr); - LOGS(_log, LOG_LVL_DEBUG, + LOGS(_log, LOG_LVL_TRACE, funcN << " ujVectBack{" << ujVectBack->getIdStr() << " jobCnt=" << ujVectBack->getJobCount() << "}"); } + auto endassign = CLOCK::now(); //&&& + std::chrono::duration secsassign = endassign - startassign; // &&& + histoBuildAndS.addEntry(endassign, secsassign.count()); //&&& + LOGS(_log, LOG_LVL_INFO, "&&&uj histo " << histoBuildAndS.getString("")); + auto startwcont = CLOCK::now(); //&&& if (!missingChunks.empty()) { string errStr = funcN + " a worker could not be found for these chunks "; @@ -454,6 +471,7 @@ void UserQuerySelect::buildAndSendUberJobs() { errStr += " they will be retried later."; LOGS(_log, LOG_LVL_ERROR, errStr); } + LOGS(_log, LOG_LVL_WARN, funcN << " &&&uj waitForWorkerContactMap"); // Add worker contact info to UberJobs. The czar can't do anything without // the contact map, so it will wait. This should only ever be an issue at startup. @@ -475,7 +493,12 @@ void UserQuerySelect::buildAndSendUberJobs() { _executive->queueUberJob(ujPtr); } } + auto endwcont = CLOCK::now(); //&&& + std::chrono::duration secswcont = endwcont - startwcont; // &&& + histoBuildAndS1.addEntry(endwcont, secswcont.count()); //&&& + LOGS(_log, LOG_LVL_INFO, "&&&uj histo " << histoBuildAndS1.getString("")); LOGS(_log, LOG_LVL_DEBUG, funcN << " " << _executive->dumpUberJobCounts()); + LOGS(_log, LOG_LVL_WARN, funcN << " &&&uj " << _executive->dumpUberJobCounts()); } /// Block until a submit()'ed query completes. 
diff --git a/src/czar/Czar.cc b/src/czar/Czar.cc
index db70bcbfe..fe62f34e3 100644
--- a/src/czar/Czar.cc
+++ b/src/czar/Czar.cc
@@ -51,6 +51,7 @@
 #include "czar/CzarRegistry.h"
 #include "global/LogContext.h"
 #include "http/Client.h"
+#include "http/ClientConnPool.h"
 #include "http/MetaModule.h"
 #include "http/Method.h"
 #include "proto/worker.pb.h"
@@ -89,13 +90,21 @@ Czar::Ptr Czar::createCzar(string const& configFilePath, string const& czarName)

 void Czar::_monitor() {
     string const funcN("Czar::_monitor");
+    uint16_t loopCount = 0;  // unsigned so it can safely wrap around
     while (_monitorLoop) {
+        ++loopCount;
         this_thread::sleep_for(_monitorSleepTime);
         LOGS(_log, LOG_LVL_DEBUG, funcN << " start0");

         /// Check database for changes in worker chunk assignments and aliveness
         try {
-            _czarFamilyMap->read();
+            // TODO:UJ The read() is incredibly expensive until the database has
+            //   a "changed" field of some kind (preferably a timestamp) to
+            //   indicate the last time it changed.
+            //   For now, just do one read every few times through this loop.
+            if (loopCount % 10 == 0 || true) {  // &&& "|| true" forces a read every pass for now
+                _czarFamilyMap->read();
+            }
         } catch (ChunkMapException const& cmex) {
             // There are probably chunks that don't exist on any alive worker,
             // continue on in hopes that workers will show up with the missing chunks
@@ -104,8 +113,7 @@ void Czar::_monitor() {
         }

         // Send appropriate messages to all ActiveWorkers. This will
-        // check if workers have died by timeout. The response
-        // from the worker include
+        // check if workers have died by timeout.
         _czarRegistry->sendActiveWorkersMessages();

         /// Create new UberJobs (if possible) for all jobs that are
@@ -193,10 +201,10 @@ Czar::Czar(string const& configFilePath, string const& czarName)
     string vectMinRunningSizesStr = _czarConfig->getQdispVectMinRunningSizes();
     vector<int> vectMinRunningSizes = util::String::parseToVectInt(vectMinRunningSizesStr, ":", 0);
     LOGS(_log, LOG_LVL_INFO,
-         "INFO qdisp config qPoolSize=" << qPoolSize << " maxPriority=" << maxPriority << " vectRunSizes="
-                                        << vectRunSizesStr << " -> " << util::prettyCharList(vectRunSizes)
-                                        << " vectMinRunningSizes=" << vectMinRunningSizesStr << " -> "
-                                        << util::prettyCharList(vectMinRunningSizes));
+         " qdisp config qPoolSize=" << qPoolSize << " maxPriority=" << maxPriority << " vectRunSizes="
+                                    << vectRunSizesStr << " -> " << util::prettyCharList(vectRunSizes)
+                                    << " vectMinRunningSizes=" << vectMinRunningSizesStr << " -> "
+                                    << util::prettyCharList(vectMinRunningSizes));
     _qdispPool = make_shared<util::QdispPool>(qPoolSize, maxPriority, vectRunSizes, vectMinRunningSizes);
     qdisp::CzarStats::setup(_qdispPool);

@@ -208,6 +216,9 @@ Czar::Czar(string const& configFilePath, string const& czarName)
     LOGS(_log, LOG_LVL_INFO, "config xrootdSpread=" << xrootdSpread);
     _queryDistributionTestVer = _czarConfig->getQueryDistributionTestVer();

+    _commandHttpPool = shared_ptr<http::ClientConnPool>(
+            new http::ClientConnPool(_czarConfig->getCommandMaxHttpConnections()));
+
     LOGS(_log, LOG_LVL_INFO, "Creating czar instance with name " << czarName);
     LOGS(_log, LOG_LVL_INFO, "Czar config: " << *_czarConfig);

diff --git a/src/czar/Czar.h b/src/czar/Czar.h
index 78b02237a..408df5b10 100644
--- a/src/czar/Czar.h
+++ b/src/czar/Czar.h
@@ -56,6 +56,10 @@ class ActiveWorkerMap;
 class HttpSvc;
 }  // namespace lsst::qserv::czar

+namespace lsst::qserv::http {
+class ClientConnPool;
+}  // namespace lsst::qserv::http
+
 namespace lsst::qserv::util {
 class FileMonitor;
 }  // namespace lsst::qserv::util
@@ -155,6 +159,8 @@ class Czar {

     std::shared_ptr<util::QdispPool> getQdispPool() const { return _qdispPool; }

+
+    std::shared_ptr<http::ClientConnPool> getCommandHttpPool() const { return _commandHttpPool; }
+
     /// Startup time of czar, sent to workers so they can detect that the czar
     /// was restarted when this value changes.
     static uint64_t const czarStartupTime;
@@ -228,7 +234,7 @@ class Czar {
     std::atomic<bool> _monitorLoop{true};

     /// Wait time between checks. TODO:UJ set from config
-    std::chrono::milliseconds _monitorSleepTime{15000};
+    std::chrono::milliseconds _monitorSleepTime{15'000};  // &&& config

     /// Keeps track of all workers (alive or otherwise) that this czar
     /// may communicate with. Once created, the pointer never changes.
@@ -236,7 +242,7 @@ class Czar {

     /// A combined priority queue and thread pool to regulate czar communications
     /// with workers. Once created, the pointer never changes.
-    /// TODO:UJ - It would be better to have a pool for each worker as it
+    /// TODO:UJ - It may be better to have a pool for each worker as it
     ///           may be possible for a worker to have communications
     ///           problems in a way that would wedge the pool. This can
     ///           probably be done fairly easily by having pools
     ///           This was not possible in xrootd as the czar had
     ///           no reasonable way to know where Jobs were going.
     std::shared_ptr<util::QdispPool> _qdispPool;
+
+    /// Pool of http client connections for sending commands (UberJobs
+    /// and worker status requests).
+    std::shared_ptr<http::ClientConnPool> _commandHttpPool;
 };

 }  // namespace lsst::qserv::czar
diff --git a/src/czar/CzarChunkMap.cc b/src/czar/CzarChunkMap.cc
index c064f60d1..17dc0c277 100644
--- a/src/czar/CzarChunkMap.cc
+++ b/src/czar/CzarChunkMap.cc
@@ -35,6 +35,8 @@
 #include "czar/CzarRegistry.h"
 #include "qmeta/Exceptions.h"
 #include "util/Bug.h"
+#include "util/InstanceCount.h"  //&&&
+#include "util/Histogram.h"  //&&&
 #include "util/TimeUtils.h"

 using namespace std;
@@ -357,9 +359,13 @@ bool CzarFamilyMap::_read() {
     return true;
 }

+util::HistogramRolling histoMakeNewMaps("&&&uj histoMakeNewMaps", {0.1, 1.0, 10.0, 100.0, 1000.0}, 1h, 10000);
+
 std::shared_ptr<CzarFamilyMap::FamilyMapType> CzarFamilyMap::makeNewMaps(
         qmeta::QMetaChunkMap const& qChunkMap) {
     // Create new maps.
+    util::InstanceCount ic("CzarFamilyMap::makeNewMaps&&&");
+    auto startMakeMaps = CLOCK::now();  //&&&
     std::shared_ptr<FamilyMapType> newFamilyMap = make_shared<FamilyMapType>();

     // Workers -> Databases map
@@ -413,6 +419,10 @@ std::shared_ptr<CzarFamilyMap::FamilyMapType> CzarFamilyMap::makeNewMaps(
         }
     }

+    auto endMakeMaps = CLOCK::now();  //&&&
+    std::chrono::duration<double> secsMakeMaps = endMakeMaps - startMakeMaps;  // &&&
+    histoMakeNewMaps.addEntry(endMakeMaps, secsMakeMaps.count());  //&&&
+    LOGS(_log, LOG_LVL_INFO, "&&&uj histo " << histoMakeNewMaps.getString(""));
     return newFamilyMap;
 }

diff --git a/src/protojson/UberJobMsg.cc b/src/protojson/UberJobMsg.cc
index e92631417..8f617103c 100644
--- a/src/protojson/UberJobMsg.cc
+++ b/src/protojson/UberJobMsg.cc
@@ -61,11 +61,9 @@ UberJobMsg::UberJobMsg(unsigned int metaVersion, std::string const& replicationI
           _rowLimit(rowLimit),
           _maxTableSizeMB(maxTableSizeMB),
           _scanInfo(scanInfo_) {
-    //&&&_jobs(jobs) {
     LOGS(_log, LOG_LVL_WARN, "&&& UberJobMsg::UberJobMsg start");
     for (auto& jobPtr : jobs) {
-        LOGS(_log, LOG_LVL_WARN, "&&& UberJobMsg::UberJobMsg loop");
         // This creates the JobMsg objects for all related jobs and their fragments.
auto jobMsg = JobMsg::create(jobPtr, _jobSubQueryTempMap, _jobDbTablesMap); _jobMsgVect->push_back(jobMsg); @@ -89,24 +87,19 @@ json UberJobMsg::serializeJson() const { {"maxtablesizemb", _maxTableSizeMB}, {"scaninfo", _scanInfo->serializeJson()}, {"jobs", json::array()}}; - LOGS(_log, LOG_LVL_WARN, "&&& UberJobMsg::serializeJson b"); auto& jsJobs = ujmJson["jobs"]; - LOGS(_log, LOG_LVL_WARN, "&&& UberJobMsg::serializeJson c"); for (auto const& jbMsg : *_jobMsgVect) { - LOGS(_log, LOG_LVL_WARN, "&&& UberJobMsg::serializeJson c1"); - json jsJob = jbMsg->serializeJson(); - jsJobs.push_back(jsJob); + //&&&json jsJob = jbMsg->serializeJson(); + //&&&jsJobs.push_back(jsJob); + jsJobs.emplace_back(jbMsg->serializeJson()); } - LOGS(_log, LOG_LVL_WARN, "&&& UberJobMsg::serializeJson d"); - - LOGS(_log, LOG_LVL_WARN, cName(__func__) << " &&& ujmJson=" << ujmJson); + LOGS(_log, LOG_LVL_TRACE, cName(__func__) << " &&& ujmJson=" << ujmJson); return ujmJson; } UberJobMsg::Ptr UberJobMsg::createFromJson(nlohmann::json const& ujmJson) { - LOGS(_log, LOG_LVL_WARN, "&&& UberJobMsg::createFromJson a"); LOGS(_log, LOG_LVL_WARN, "&&& UberJobMsg::createFromJson ujmJson=" << ujmJson); try { if (ujmJson["version"] != http::MetaModule::version) { @@ -114,14 +107,13 @@ UberJobMsg::Ptr UberJobMsg::createFromJson(nlohmann::json const& ujmJson) { return nullptr; } - LOGS(_log, LOG_LVL_WARN, "&&& UberJobMsg::createFromJson b"); auto czInfo_ = CzarContactInfo::createFromJson(ujmJson["czarinfo"]); if (czInfo_ == nullptr) { LOGS(_log, LOG_LVL_ERROR, "UberJobMsg::createFromJson czar could not be parsed in " << ujmJson); return nullptr; } - LOGS(_log, LOG_LVL_WARN, "&&& UberJobMsg::createFromJson b-b"); + LOGS(_log, LOG_LVL_WARN, "&&& UberJobMsg::createFromJson b"); auto scanInfo_ = ScanInfo::createFromJson(ujmJson["scaninfo"]); if (scanInfo_ == nullptr) { LOGS(_log, LOG_LVL_ERROR, @@ -129,28 +121,18 @@ UberJobMsg::Ptr UberJobMsg::createFromJson(nlohmann::json const& ujmJson) { return nullptr; } - LOGS(_log, LOG_LVL_WARN, "&&& UberJobMsg::createFromJson c"); auto metaVersion = http::RequestBodyJSON::required(ujmJson, "version"); - LOGS(_log, LOG_LVL_WARN, "&&& UberJobMsg::createFromJson d"); auto replicationInstanceId = http::RequestBodyJSON::required(ujmJson, "instance_id"); - LOGS(_log, LOG_LVL_WARN, "&&& UberJobMsg::createFromJson e"); auto replicationAuthKey = http::RequestBodyJSON::required(ujmJson, "auth_key"); - LOGS(_log, LOG_LVL_WARN, "&&& UberJobMsg::createFromJson f"); auto workerId = http::RequestBodyJSON::required(ujmJson, "worker"); - LOGS(_log, LOG_LVL_WARN, "&&& UberJobMsg::createFromJson g"); auto qId = http::RequestBodyJSON::required(ujmJson, "queryid"); - LOGS(_log, LOG_LVL_WARN, "&&& UberJobMsg::createFromJson h"); auto ujId = http::RequestBodyJSON::required(ujmJson, "uberjobid"); - LOGS(_log, LOG_LVL_WARN, "&&& UberJobMsg::createFromJson i"); auto rowLimit = http::RequestBodyJSON::required(ujmJson, "rowlimit"); - LOGS(_log, LOG_LVL_WARN, "&&& UberJobMsg::createFromJson j"); auto maxTableSizeMB = http::RequestBodyJSON::required(ujmJson, "maxtablesizemb"); - LOGS(_log, LOG_LVL_WARN, "&&& UberJobMsg::createFromJson k"); auto czInfo = CzarContactInfo::createFromJson(ujmJson["czarinfo"]); - LOGS(_log, LOG_LVL_WARN, "&&& UberJobMsg::createFromJson l"); auto jsUjJobs = http::RequestBodyJSON::required(ujmJson, "jobs"); - LOGS(_log, LOG_LVL_INFO, + LOGS(_log, LOG_LVL_TRACE, " &&& " << metaVersion << replicationInstanceId << replicationAuthKey << workerId << qId << ujId << rowLimit << jsUjJobs); @@ -161,17 
+143,12 @@ UberJobMsg::Ptr UberJobMsg::createFromJson(nlohmann::json const& ujmJson) { LOGS(_log, LOG_LVL_WARN, "&&& UberJobMsg::createFromJson m"); auto const& jsSubQueriesMap = http::RequestBodyJSON::required(ujmJson, "subqueries_map"); - LOGS(_log, LOG_LVL_WARN, "&&& UberJobMsg::createFromJson n"); ujmPtr->_jobSubQueryTempMap = JobSubQueryTempMap::createFromJson(jsSubQueriesMap); - LOGS(_log, LOG_LVL_WARN, "&&& UberJobMsg::createFromJson o"); auto jsDbTablesMap = http::RequestBodyJSON::required(ujmJson, "dbtables_map"); - LOGS(_log, LOG_LVL_WARN, "&&& UberJobMsg::createFromJson p"); ujmPtr->_jobDbTablesMap = JobDbTablesMap::createFromJson(jsDbTablesMap); - LOGS(_log, LOG_LVL_WARN, "&&& UberJobMsg::createFromJson q"); for (auto const& jsUjJob : jsUjJobs) { - LOGS(_log, LOG_LVL_WARN, "&&& UberJobMsg::createFromJson q1"); JobMsg::Ptr jobMsgPtr = JobMsg::createFromJson(jsUjJob, ujmPtr->_jobSubQueryTempMap, ujmPtr->_jobDbTablesMap); ujmPtr->_jobMsgVect->push_back(jobMsgPtr); @@ -207,91 +184,51 @@ JobMsg::JobMsg(std::shared_ptr const& jobPtr, if (descr == nullptr) { throw util::Bug(ERR_LOC, cName(__func__) + " description=null for job=" + jobPtr->getIdStr()); } - LOGS(_log, LOG_LVL_WARN, "&&& JobMsg::JobMsg a"); auto chunkQuerySpec = descr->getChunkQuerySpec(); _jobId = descr->id(); - //&&&{"attemptCount", attemptCount}, - LOGS(_log, LOG_LVL_WARN, "&&& JobMsg::JobMsg b"); _attemptCount = descr->getAttemptCount(); // &&& may need to increment descr->AttemptCount at this time. - //&&&{"querySpecDb", chunkQuerySpec.db}, - LOGS(_log, LOG_LVL_WARN, "&&& JobMsg::JobMsg c"); _chunkQuerySpecDb = chunkQuerySpec->db; - //&&&{"scanPriority", chunkQuerySpec.scanInfo.scanRating}, - LOGS(_log, LOG_LVL_WARN, "&&& JobMsg::JobMsg d"); _scanRating = chunkQuerySpec->scanInfo->scanRating; - //&&&{"scanInteractive", chunkQuerySpec.scanInteractive}, - LOGS(_log, LOG_LVL_WARN, "&&& JobMsg::JobMsg e"); _scanInteractive = chunkQuerySpec->scanInteractive; - //&&&{"maxTableSize", (cconfig::CzarConfig::instance()->getMaxTableSizeMB())}, - //_maxTableSizeMB; // &&& move up to UberJob - //&&&{"chunkScanTables", nlohmann::json::array()}, - //&&&{"chunkId", chunkQuerySpec.chunkId}, - LOGS(_log, LOG_LVL_WARN, "&&& JobMsg::JobMsg f"); _chunkId = chunkQuerySpec->chunkId; - //&&&{"queryFragments", nlohmann::json::array()}})); - LOGS(_log, LOG_LVL_WARN, "&&& JobMsg::JobMsg g"); _chunkResultName = descr->getChunkResultName(); // Add scan tables (&&& not sure is this is the same for all jobs or not) - LOGS(_log, LOG_LVL_WARN, "&&& JobMsg::JobMsg h"); for (auto const& sTbl : chunkQuerySpec->scanInfo->infoTables) { - LOGS(_log, LOG_LVL_WARN, "&&& JobMsg::JobMsg h1"); - /* &&& - nlohmann::json cst = {{"db", sTbl.db}, - {"table", sTbl.table}, - {"lockInMemory", sTbl.lockInMemory}, - {"tblScanRating", sTbl.scanRating}}; - chunkScanTables.push_back(move(cst)); - */ int index = jobDbTablesMap->findDbTable(make_pair(sTbl.db, sTbl.table)); - LOGS(_log, LOG_LVL_WARN, "&&& JobMsg::JobMsg h2"); jobDbTablesMap->setScanRating(index, sTbl.scanRating, sTbl.lockInMemory); - LOGS(_log, LOG_LVL_WARN, "&&& JobMsg::JobMsg h3"); _chunkScanTableIndexes.push_back(index); - LOGS(_log, LOG_LVL_WARN, "&&& JobMsg::JobMsg h4"); } // Add fragments - LOGS(_log, LOG_LVL_WARN, "&&& JobMsg::JobMsg i"); - _jobFragments = - JobFragment::createVect(*chunkQuerySpec, jobSubQueryTempMap, jobDbTablesMap, _chunkResultName); + _jobFragments = JobFragment::createVect(*chunkQuerySpec, jobSubQueryTempMap, jobDbTablesMap); LOGS(_log, LOG_LVL_WARN, "&&& JobMsg::JobMsg end"); } 
nlohmann::json JobMsg::serializeJson() const { LOGS(_log, LOG_LVL_WARN, "&&& JobMsg::serializeJson a"); - auto jsJobMsg = - nlohmann::json({//&&&{"czarId", czarId}, - //&&&{"queryId", queryId}, - {"jobId", _jobId}, - {"attemptCount", _attemptCount}, - {"querySpecDb", _chunkQuerySpecDb}, - {"scanPriority", _scanRating}, - {"scanInteractive", _scanInteractive}, - //&&&{"maxTableSize", (cconfig::CzarConfig::instance()->getMaxTableSizeMB())}, - //&&&{"chunkScanTables", nlohmann::json::array()}, - {"chunkId", _chunkId}, - {"chunkresultname", _chunkResultName}, - {"chunkscantables_indexes", nlohmann::json::array()}, - {"queryFragments", json::array()}}); + auto jsJobMsg = nlohmann::json({{"jobId", _jobId}, + {"attemptCount", _attemptCount}, + {"querySpecDb", _chunkQuerySpecDb}, + {"scanPriority", _scanRating}, + {"scanInteractive", _scanInteractive}, + {"chunkId", _chunkId}, + {"chunkresultname", _chunkResultName}, + {"chunkscantables_indexes", nlohmann::json::array()}, + {"queryFragments", json::array()}}); // These are indexes into _jobDbTablesMap, which is shared between all JobMsg in this UberJobMsg. - LOGS(_log, LOG_LVL_WARN, "&&& JobMsg::serializeJson b"); + // &&& TODO:UJ queries appear to work even when "chunkscantables_indexes" is wrong auto& jsqCstIndexes = jsJobMsg["chunkscantables_indexes"]; - LOGS(_log, LOG_LVL_WARN, "&&& JobMsg::serializeJson c"); for (auto const& index : _chunkScanTableIndexes) { - LOGS(_log, LOG_LVL_WARN, "&&& JobMsg::serializeJson c1"); jsqCstIndexes.push_back(index); } - LOGS(_log, LOG_LVL_WARN, "&&& JobMsg::serializeJson d"); auto& jsqFrags = jsJobMsg["queryFragments"]; - LOGS(_log, LOG_LVL_WARN, "&&& JobMsg::serializeJson e"); for (auto& jFrag : *_jobFragments) { - LOGS(_log, LOG_LVL_WARN, "&&& JobMsg::serializeJson e1"); - auto jsFrag = jFrag->serializeJson(); - LOGS(_log, LOG_LVL_WARN, "&&& JobMsg::serializeJson e2"); - jsqFrags.push_back(jsFrag); + //&&&auto jsFrag = jFrag->serializeJson(); + //&&&jsqFrags.push_back(jsFrag); + jsqFrags.emplace_back(jFrag->serializeJson()); } LOGS(_log, LOG_LVL_WARN, "&&& JobMsg::serializeJson end"); @@ -314,31 +251,23 @@ JobMsg::JobMsg(JobSubQueryTempMap::Ptr const& jobSubQueryTempMap, JobDbTablesMap JobMsg::Ptr JobMsg::createFromJson(nlohmann::json const& ujJson, JobSubQueryTempMap::Ptr const& jobSubQueryTempMap, JobDbTablesMap::Ptr const& jobDbTablesMap) { - LOGS(_log, LOG_LVL_WARN, "&&& JobMsg::createFromJson a"); LOGS(_log, LOG_LVL_WARN, "&&& JobMsg::createFromJson ujJson=" << ujJson); JobId jobId = http::RequestBodyJSON::required(ujJson, "jobId"); - LOGS(_log, LOG_LVL_WARN, "&&& JobMsg::createFromJson b"); int attemptCount = http::RequestBodyJSON::required(ujJson, "attemptCount"); - LOGS(_log, LOG_LVL_WARN, "&&& JobMsg::createFromJson c"); string chunkQuerySpecDb = http::RequestBodyJSON::required(ujJson, "querySpecDb"); - LOGS(_log, LOG_LVL_WARN, "&&& JobMsg::createFromJson d"); int scanRating = http::RequestBodyJSON::required(ujJson, "scanPriority"); - LOGS(_log, LOG_LVL_WARN, "&&& JobMsg::createFromJson e"); bool scanInteractive = http::RequestBodyJSON::required(ujJson, "scanInteractive"); - LOGS(_log, LOG_LVL_WARN, "&&& JobMsg::createFromJson f"); int chunkId = http::RequestBodyJSON::required(ujJson, "chunkId"); - LOGS(_log, LOG_LVL_WARN, "&&& JobMsg::createFromJson g"); string chunkResultName = http::RequestBodyJSON::required(ujJson, "chunkresultname"); - LOGS(_log, LOG_LVL_WARN, "&&& JobMsg::createFromJson h"); json jsQFrags = http::RequestBodyJSON::required(ujJson, "queryFragments"); - LOGS(_log, LOG_LVL_WARN, 
"&&& JobMsg::createFromJson i"); Ptr jMsgPtr = Ptr(new JobMsg(jobSubQueryTempMap, jobDbTablesMap, jobId, attemptCount, chunkQuerySpecDb, scanRating, scanInteractive, chunkId, chunkResultName)); - LOGS(_log, LOG_LVL_WARN, "&&& JobMsg::createFromJson j"); - jMsgPtr->_jobFragments = JobFragment::createVectFromJson( - jsQFrags, jMsgPtr->_jobSubQueryTempMap, jMsgPtr->_jobDbTablesMap, jMsgPtr->_chunkResultName); + json jsChunkTblIndexes = http::RequestBodyJSON::required(ujJson, "chunkscantables_indexes"); + jMsgPtr->_chunkScanTableIndexes = jsChunkTblIndexes.get>(); + jMsgPtr->_jobFragments = + JobFragment::createVectFromJson(jsQFrags, jMsgPtr->_jobSubQueryTempMap, jMsgPtr->_jobDbTablesMap); LOGS(_log, LOG_LVL_WARN, "&&& JobMsg::createFromJson end"); return jMsgPtr; @@ -350,20 +279,15 @@ json JobSubQueryTempMap::serializeJson() const { // std::map _qTemplateMap; json jsSubQueryTemplateMap = {{"subquerytemplate_map", json::array()}}; - LOGS(_log, LOG_LVL_WARN, "&&& JobSubQueryTempMap::serializeJson b"); - LOGS(_log, LOG_LVL_WARN, + LOGS(_log, LOG_LVL_TRACE, "&&& JobSubQueryTempMap::serializeJson jsSubQueryTemplateMap=" << jsSubQueryTemplateMap); auto& jsSqtMap = jsSubQueryTemplateMap["subquerytemplate_map"]; - LOGS(_log, LOG_LVL_WARN, "&&& JobSubQueryTempMap::serializeJson c"); for (auto const& [key, templ] : _qTemplateMap) { - LOGS(_log, LOG_LVL_WARN, "&&& JobSubQueryTempMap::serializeJson c1"); json jsElem = {{"index", key}, {"template", templ}}; - LOGS(_log, LOG_LVL_WARN, "&&& JobSubQueryTempMap::serializeJson c2"); jsSqtMap.push_back(jsElem); } - LOGS(_log, LOG_LVL_WARN, "&&& JobSubQueryTempMap::serializeJson e"); - LOGS(_log, LOG_LVL_WARN, cName(__func__) << " &&& " << jsSqtMap); + LOGS(_log, LOG_LVL_TRACE, cName(__func__) << " &&& " << jsSqtMap); LOGS(_log, LOG_LVL_WARN, "&&& JobSubQueryTempMap::serializeJson end"); return jsSubQueryTemplateMap; @@ -399,24 +323,19 @@ JobSubQueryTempMap::Ptr JobSubQueryTempMap::createFromJson(nlohmann::json const& } int JobSubQueryTempMap::findSubQueryTemp(string const& qTemp) { - LOGS(_log, LOG_LVL_WARN, "&&& JobSubQueryTempMap::findSubQueryTemp start"); // The expected number of templates is expected to be small, less than 4, // so this shouldn't be horribly expensive. 
- LOGS(_log, LOG_LVL_WARN, "&&& JobSubQueryTempMap::findSubQueryTemp qTemp=" << qTemp); for (auto const& [key, temp] : _qTemplateMap) { - LOGS(_log, LOG_LVL_WARN, "&&& JobSubQueryTempMap::findSubQueryTemp key=" << key << " t=" << temp); if (temp == qTemp) { LOGS(_log, LOG_LVL_WARN, "&&& JobSubQueryTempMap::findSubQueryTemp end key=" << key); return key; } } - LOGS(_log, LOG_LVL_WARN, "&&& JobSubQueryTempMap::findSubQueryTemp endloop"); // Need to insert int index = _qTemplateMap.size(); - LOGS(_log, LOG_LVL_WARN, "&&& JobSubQueryTempMap::findSubQueryTemp index=" << index); _qTemplateMap[index] = qTemp; - LOGS(_log, LOG_LVL_WARN, "&&& JobSubQueryTempMap::findSubQueryTemp end"); + LOGS(_log, LOG_LVL_WARN, "&&& JobSubQueryTempMap::findSubQueryTemp end index=" << index); return index; } @@ -521,107 +440,75 @@ void JobDbTablesMap::setScanRating(int index, int scanRating, bool lockInMemory) } JobFragment::JobFragment(JobSubQueryTempMap::Ptr const& jobSubQueryTempMap, - JobDbTablesMap::Ptr const& jobDbTablesMap, std::string const& resultTblName) - : _jobSubQueryTempMap(jobSubQueryTempMap), - _jobDbTablesMap(jobDbTablesMap), - _resultTblName(resultTblName) { - LOGS(_log, LOG_LVL_WARN, - "&&& JobFragment::JobFragment _jobSubQueryTempMap!=nullptr=" << (_jobSubQueryTempMap != nullptr)); - LOGS(_log, LOG_LVL_WARN, - "&&& JobFragment::JobFragment _jobDbTablesMap!=nullptr=" << (_jobDbTablesMap != nullptr)); - LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::JobFragment resultTblName=" << resultTblName); -} + JobDbTablesMap::Ptr const& jobDbTablesMap) + : _jobSubQueryTempMap(jobSubQueryTempMap), _jobDbTablesMap(jobDbTablesMap) {} JobFragment::VectPtr JobFragment::createVect(qproc::ChunkQuerySpec const& chunkQuerySpec, JobSubQueryTempMap::Ptr const& jobSubQueryTempMap, - JobDbTablesMap::Ptr const& jobDbTablesMap, - string const& resultTable) { + JobDbTablesMap::Ptr const& jobDbTablesMap) { LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::createVect start"); - VectPtr jFragments{new Vect()}; - LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::createVect a"); if (chunkQuerySpec.nextFragment.get()) { - LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::createVect a1"); qproc::ChunkQuerySpec const* sPtr = &chunkQuerySpec; while (sPtr) { - LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::createVect a1a"); LOGS(_log, LOG_LVL_TRACE, "nextFragment"); for (unsigned int t = 0; t < (sPtr->queries).size(); t++) { // &&& del loop - LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::createVect a1a1"); LOGS(_log, LOG_LVL_DEBUG, __func__ << " q=" << (sPtr->queries).at(t)); } - LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::createVect a2"); + /* &&& for (auto const& sbi : sPtr->subChunkIds) { // &&& del loop LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::createVect a2a"); LOGS(_log, LOG_LVL_DEBUG, __func__ << " sbi=" << sbi); } + */ // Linked fragments will not have valid subChunkTables vectors, // So, we reuse the root fragment's vector. 
- LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::createVect a3"); - _addFragment(*jFragments, resultTable, chunkQuerySpec.subChunkTables, sPtr->subChunkIds, - sPtr->queries, jobSubQueryTempMap, jobDbTablesMap); + _addFragment(*jFragments, chunkQuerySpec.subChunkTables, sPtr->subChunkIds, sPtr->queries, + jobSubQueryTempMap, jobDbTablesMap); sPtr = sPtr->nextFragment.get(); } - LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::createVect a4"); } else { LOGS(_log, LOG_LVL_TRACE, "no nextFragment"); - LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::createVect b1"); for (unsigned int t = 0; t < (chunkQuerySpec.queries).size(); t++) { // &&& del loop - LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::createVect b1a"); LOGS(_log, LOG_LVL_TRACE, (chunkQuerySpec.queries).at(t)); } - LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::createVect b2"); - _addFragment(*jFragments, resultTable, chunkQuerySpec.subChunkTables, chunkQuerySpec.subChunkIds, + _addFragment(*jFragments, chunkQuerySpec.subChunkTables, chunkQuerySpec.subChunkIds, chunkQuerySpec.queries, jobSubQueryTempMap, jobDbTablesMap); - LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::createVect b3"); } LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::createVect end"); return jFragments; } -void JobFragment::_addFragment(std::vector& jFragments, std::string const& resultTblName, - DbTableSet const& subChunkTables, std::vector const& subchunkIds, - std::vector const& queries, +//&&&void JobFragment::_addFragment(std::vector& jFragments, std::string const& resultTblName, +void JobFragment::_addFragment(std::vector& jFragments, DbTableSet const& subChunkTables, + std::vector const& subchunkIds, std::vector const& queries, JobSubQueryTempMap::Ptr const& subQueryTemplates, JobDbTablesMap::Ptr const& dbTablesMap) { LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::_addFragment a"); - Ptr jFrag = Ptr(new JobFragment(subQueryTemplates, dbTablesMap, resultTblName)); + Ptr jFrag = Ptr(new JobFragment(subQueryTemplates, dbTablesMap)); // queries: The query string is stored in `_jobSubQueryTempMap` and the list of // integer indexes, `_subQueryTempIndexes`, points back to the specific template. - LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::_addFragment b"); for (auto& qry : queries) { - LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::_addFragment b1"); int index = jFrag->_jobSubQueryTempMap->findSubQueryTemp(qry); - LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::_addFragment b2"); jFrag->_jobSubQueryTempIndexes.push_back(index); - LOGS(_log, LOG_LVL_INFO, jFrag->cName(__func__) << "&&& added frag=" << qry << " index=" << index); - LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::_addFragment b4"); + LOGS(_log, LOG_LVL_TRACE, jFrag->cName(__func__) << "&&& added frag=" << qry << " index=" << index); } // Add the db+table pairs to the subchunks for the fragment. - LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::_addFragment c"); for (auto& tbl : subChunkTables) { - LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::_addFragment c1"); int index = jFrag->_jobDbTablesMap->findDbTable(make_pair(tbl.db, tbl.table)); - LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::_addFragment c2"); jFrag->_jobDbTablesIndexes.push_back(index); - LOGS(_log, LOG_LVL_INFO, - jFrag->cName(__func__) << "&&& added dbtbl=" << tbl.db << "." << tbl.table - << " index=" << index); + LOGS(_log, LOG_LVL_TRACE, + jFrag->cName(__func__) << " added dbtbl=" << tbl.db << "." 
<< tbl.table << " index=" << index); } - LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::_addFragment d"); // Add subchunk id numbers for (auto& subchunkId : subchunkIds) { - LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::_addFragment d1"); jFrag->_subchunkIds.push_back(subchunkId); - LOGS(_log, LOG_LVL_INFO, jFrag->cName(__func__) << "&&& added subchunkId=" << subchunkId); + LOGS(_log, LOG_LVL_TRACE, jFrag->cName(__func__) << " added subchunkId=" << subchunkId); } - LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::_addFragment e"); - LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::_addFragment " << jFrag->dump()); - LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::_addFragment ee"); jFragments.push_back(move(jFrag)); LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::_addFragment end"); @@ -629,7 +516,7 @@ void JobFragment::_addFragment(std::vector& jFragments, std::string const& string JobFragment::dump() const { stringstream os; - os << "JobFragment resultTbl=" << _resultTblName << " templateIndexes={"; + os << " templateIndexes={"; for (int j : _jobSubQueryTempIndexes) { os << j << ", "; } @@ -646,44 +533,24 @@ string JobFragment::dump() const { } nlohmann::json JobFragment::serializeJson() const { - LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::serializeJson a"); - - json jsFragment = {{"resulttblname", _resultTblName}, - {"subquerytemplate_indexes", _jobSubQueryTempIndexes}, + json jsFragment = {{"subquerytemplate_indexes", _jobSubQueryTempIndexes}, {"dbtables_indexes", _jobDbTablesIndexes}, {"subchunkids", _subchunkIds}}; - LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::serializeJson b"); - - LOGS(_log, LOG_LVL_WARN, cName(__func__) << " &&& " << jsFragment); - LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::serializeJson end"); + LOGS(_log, LOG_LVL_TRACE, cName(__func__) << " " << jsFragment); return jsFragment; } JobFragment::VectPtr JobFragment::createVectFromJson(nlohmann::json const& jsFrags, JobSubQueryTempMap::Ptr const& jobSubQueryTempMap, - JobDbTablesMap::Ptr const& dbTablesMap, - std::string const& resultTblName) { + JobDbTablesMap::Ptr const& dbTablesMap) { LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::createVectFromJson " << jsFrags); - LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::createVectFromJson a"); JobFragment::VectPtr jobFragments{new JobFragment::Vect()}; for (auto const& jsFrag : jsFrags) { - LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::createVectFromJson b"); - Ptr jobFrag = Ptr(new JobFragment(jobSubQueryTempMap, dbTablesMap, resultTblName)); - - jobFrag->_resultTblName = http::RequestBodyJSON::required(jsFrag, "resulttblname"); - if (jobFrag->_resultTblName != resultTblName) { - // &&& hoping to remove _resultTblName from JobFragment. 
- LOGS(_log, LOG_LVL_ERROR, - jobFrag->cName(__func__) + " _resultTblName != resultTblName for " + to_string(jsFrag)); - throw util::Bug(ERR_LOC, jobFrag->cName(__func__) + " _resultTblName != resultTblName for " + - to_string(jsFrag)); - } + Ptr jobFrag = Ptr(new JobFragment(jobSubQueryTempMap, dbTablesMap)); - LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::createVectFromJson c"); - //&&&std::vector _jobSubQueryTempIndexes; ///< &&& doc jobFrag->_jobSubQueryTempIndexes = jsFrag["subquerytemplate_indexes"].get>(); for (int j : jobFrag->_jobSubQueryTempIndexes) { try { @@ -698,7 +565,6 @@ JobFragment::VectPtr JobFragment::createVectFromJson(nlohmann::json const& jsFra } } - LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::createVectFromJson d"); jobFrag->_jobDbTablesIndexes = jsFrag["dbtables_indexes"].get>(); for (int j : jobFrag->_jobDbTablesIndexes) { try { @@ -715,7 +581,6 @@ JobFragment::VectPtr JobFragment::createVectFromJson(nlohmann::json const& jsFra } } - LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::createVectFromJson e"); jobFrag->_subchunkIds = jsFrag["subchunkids"].get>(); jobFragments->push_back(jobFrag); } diff --git a/src/protojson/UberJobMsg.h b/src/protojson/UberJobMsg.h index 51dbc24c6..9203e74ae 100644 --- a/src/protojson/UberJobMsg.h +++ b/src/protojson/UberJobMsg.h @@ -143,6 +143,7 @@ class JobFragment { JobFragment() = delete; JobFragment(JobFragment const&) = delete; + /* &&& static VectPtr createVect(qproc::ChunkQuerySpec const& chunkQuerySpec, JobSubQueryTempMap::Ptr const& jobSubQueryTempMap, JobDbTablesMap::Ptr const& dbTablesMap, std::string const& resultTblName); @@ -152,6 +153,16 @@ class JobFragment { JobSubQueryTempMap::Ptr const& jobSubQueryTempMap, JobDbTablesMap::Ptr const& dbTablesMap, std::string const& resultTblName); + */ + + static VectPtr createVect(qproc::ChunkQuerySpec const& chunkQuerySpec, + JobSubQueryTempMap::Ptr const& jobSubQueryTempMap, + JobDbTablesMap::Ptr const& dbTablesMap); + + /// &&& doc + static VectPtr createVectFromJson(nlohmann::json const& ujJson, + JobSubQueryTempMap::Ptr const& jobSubQueryTempMap, + JobDbTablesMap::Ptr const& dbTablesMap); /// Return a json version of the contents of this class. 
nlohmann::json serializeJson() const; @@ -159,20 +170,30 @@ class JobFragment { std::vector const& getJobSubQueryTempIndexes() const { return _jobSubQueryTempIndexes; } std::vector const& getJobDbTablesIndexes() const { return _jobDbTablesIndexes; } std::vector const& getSubchunkIds() const { return _subchunkIds; } - std::string const& getResultTblName() const { return _resultTblName; } + //&&&std::string const& getResultTblName() const { return _resultTblName; } std::string dump() const; private: + /* &&& JobFragment(JobSubQueryTempMap::Ptr const& subQueryTemplates, JobDbTablesMap::Ptr const& dbTablesMap, std::string const& resultTblName); + */ + JobFragment(JobSubQueryTempMap::Ptr const& subQueryTemplates, JobDbTablesMap::Ptr const& dbTablesMap); /// &&& doc + static void _addFragment(std::vector& jFragments, DbTableSet const& subChunkTables, + std::vector const& subchunkIds, std::vector const& queries, + JobSubQueryTempMap::Ptr const& subQueryTemplates, + JobDbTablesMap::Ptr const& dbTablesMap); + + /* &&& static void _addFragment(std::vector& jFragments, std::string const& resultTblName, DbTableSet const& subChunkTables, std::vector const& subchunkIds, std::vector const& queries, JobSubQueryTempMap::Ptr const& subQueryTemplates, JobDbTablesMap::Ptr const& dbTablesMap); + */ JobSubQueryTempMap::Ptr _jobSubQueryTempMap; ///< &&& doc std::vector _jobSubQueryTempIndexes; ///< &&& doc @@ -182,8 +203,10 @@ class JobFragment { std::vector _subchunkIds; ///< &&& doc + /* &&& std::string _resultTblName; ///< &&& doc &&& probably not needed here. Replace with ///< JobMsg::_chunkResultName field. + */ }; /// This class is used to store the information for a single Job (the queries and metadata diff --git a/src/protojson/testUberJobMsg.cc b/src/protojson/testUberJobMsg.cc index a56c77175..e0b056422 100644 --- a/src/protojson/testUberJobMsg.cc +++ b/src/protojson/testUberJobMsg.cc @@ -54,26 +54,65 @@ std::string testA() { } #endif // &&& -std::string testA() { - std::string ta = +string testA() { + string ta = R"({"maxtablesizemb":5432,"auth_key":"replauthkey","czarinfo":{"czar-startup-time":1732658208085,"id":1,"management-host-name":"3a8b68cf9b67","management-port":40865,"name":"proxy"},"dbtables_map":{"dbtable_map":[],"scanrating_map":[]},"scaninfo":{"infoscanrating":0,"infotables":[]},"instance_id":"qserv_proj","jobs":[{"attemptCount":0,"chunkId":1234567890,"chunkresultname":"r_1_a0d45001254932466b784acf90323565_1234567890_0","chunkscantables_indexes":[],"jobId":0,"queryFragments":[{"dbtables_indexes":[],"resulttblname":"r_1_a0d45001254932466b784acf90323565_1234567890_0","subchunkids":[],"subquerytemplate_indexes":[0]}],"querySpecDb":"qcase01","scanInteractive":true,"scanPriority":0}],"queryid":1,"rowlimit":0,"subqueries_map":{"subquerytemplate_map":[{"index":0,"template":"SELECT `qcase01.Filter`.`filterId` AS `filterId`,`qcase01.Filter`.`filterName` AS `filterName`,`qcase01.Filter`.`photClam` AS `photClam`,`qcase01.Filter`.`photBW` AS `photBW` FROM `qcase01`.`Filter`AS`qcase01.Filter` WHERE (`qcase01.Filter`.`filterId`<<1)=2"}]},"uberjobid":2,"version":39,"worker":"6c56ba9b-ac40-11ef-acb7-0242c0a8030a"})"; return ta; } +string testB() { + string tb = + 
R"({"auth_key":"slac6dev:kukara4a","czarinfo":{"czar-startup-time":1733499789161,"id":7,"management-host-name":"sdfqserv001.sdf.slac.stanford.edu","management-port":41923,"name":"proxy"},"dbtables_map":{"dbtable_map":[{"db":"dp02_dc2_catalogs","index":0,"table":"Object"}],"scanrating_map":[{"index":0,"lockinmem":true,"scanrating":1}]},"instance_id":"slac6dev","jobs":[{"attemptCount":0,"chunkId":79680,"chunkresultname":"r_280607_e6eac6bb53b0f8505ed36bf82a4d93f1_79680_0","chunkscantables_indexes":[0],"jobId":1398,"queryFragments":[{"dbtables_indexes":[],"resulttblname":"r_280607_e6eac6bb53b0f8505ed36bf82a4d93f1_79680_0","subchunkids":[],"subquerytemplate_indexes":[0]}],"querySpecDb":"dp02_dc2_catalogs","scanInteractive":false,"scanPriority":1},{"attemptCount":0,"chunkId":80358,"chunkresultname":"r_280607_e6eac6bb53b0f8505ed36bf82a4d93f1_80358_0","chunkscantables_indexes":[0],"jobId":1435,"queryFragments":[{"dbtables_indexes":[],"resulttblname":"r_280607_e6eac6bb53b0f8505ed36bf82a4d93f1_80358_0","subchunkids":[],"subquerytemplate_indexes":[1]}],"querySpecDb":"dp02_dc2_catalogs","scanInteractive":false,"scanPriority":1},{"attemptCount":0,"chunkId":81017,"chunkresultname":"r_280607_e6eac6bb53b0f8505ed36bf82a4d93f1_81017_0","chunkscantables_indexes":[0],"jobId":1452,"queryFragments":[{"dbtables_indexes":[],"resulttblname":"r_280607_e6eac6bb53b0f8505ed36bf82a4d93f1_81017_0","subchunkids":[],"subquerytemplate_indexes":[2]}],"querySpecDb":"dp02_dc2_catalogs","scanInteractive":false,"scanPriority":1}],"maxtablesizemb":5100,"queryid":280607,"rowlimit":0,"scaninfo":{"infoscanrating":1,"infotables":[{"sidb":"dp02_dc2_catalogs","silockinmem":true,"sirating":1,"sitable":"Object"}]},"subqueries_map":{"subquerytemplate_map":[{"index":0,"template":"SELECT COUNT(`obj`.`g_ap12Flux`) AS `QS1_COUNT`,SUM(`obj`.`g_ap12Flux`) AS `QS2_SUM`,MIN(`obj`.`g_ap12Flux`) AS `QS3_MIN`,MAX(`obj`.`g_ap12Flux`) AS `QS4_MAX`,COUNT(`obj`.`g_ap12FluxErr`) AS `QS5_COUNT`,SUM(`obj`.`g_ap12FluxErr`) AS `QS6_SUM`,MIN(`obj`.`g_ap12FluxErr`) AS `QS7_MIN`,MAX(`obj`.`g_ap12FluxErr`) AS `QS8_MAX`,COUNT(`obj`.`g_ap25Flux`) AS `QS9_COUNT`,SUM(`obj`.`g_ap25Flux`) AS `QS10_SUM`,MIN(`obj`.`g_ap25Flux`) AS `QS11_MIN`,MAX(`obj`.`g_ap25Flux`) AS `QS12_MAX`,COUNT(`obj`.`g_ap25FluxErr`) AS `QS13_COUNT`,SUM(`obj`.`g_ap25FluxErr`) AS `QS14_SUM`,MIN(`obj`.`g_ap25FluxErr`) AS `QS15_MIN`,MAX(`obj`.`g_ap25FluxErr`) AS `QS16_MAX` FROM `dp02_dc2_catalogs`.`Object_79680` AS `obj`"},{"index":1,"template":"SELECT COUNT(`obj`.`g_ap12Flux`) AS `QS1_COUNT`,SUM(`obj`.`g_ap12Flux`) AS `QS2_SUM`,MIN(`obj`.`g_ap12Flux`) AS `QS3_MIN`,MAX(`obj`.`g_ap12Flux`) AS `QS4_MAX`,COUNT(`obj`.`g_ap12FluxErr`) AS `QS5_COUNT`,SUM(`obj`.`g_ap12FluxErr`) AS `QS6_SUM`,MIN(`obj`.`g_ap12FluxErr`) AS `QS7_MIN`,MAX(`obj`.`g_ap12FluxErr`) AS `QS8_MAX`,COUNT(`obj`.`g_ap25Flux`) AS `QS9_COUNT`,SUM(`obj`.`g_ap25Flux`) AS `QS10_SUM`,MIN(`obj`.`g_ap25Flux`) AS `QS11_MIN`,MAX(`obj`.`g_ap25Flux`) AS `QS12_MAX`,COUNT(`obj`.`g_ap25FluxErr`) AS `QS13_COUNT`,SUM(`obj`.`g_ap25FluxErr`) AS `QS14_SUM`,MIN(`obj`.`g_ap25FluxErr`) AS `QS15_MIN`,MAX(`obj`.`g_ap25FluxErr`) AS `QS16_MAX` FROM `dp02_dc2_catalogs`.`Object_80358` AS `obj`"},{"index":2,"template":"SELECT COUNT(`obj`.`g_ap12Flux`) AS `QS1_COUNT`,SUM(`obj`.`g_ap12Flux`) AS `QS2_SUM`,MIN(`obj`.`g_ap12Flux`) AS `QS3_MIN`,MAX(`obj`.`g_ap12Flux`) AS `QS4_MAX`,COUNT(`obj`.`g_ap12FluxErr`) AS `QS5_COUNT`,SUM(`obj`.`g_ap12FluxErr`) AS `QS6_SUM`,MIN(`obj`.`g_ap12FluxErr`) AS `QS7_MIN`,MAX(`obj`.`g_ap12FluxErr`) AS `QS8_MAX`,COUNT(`obj`.`g_ap25Flux`) AS 
`QS9_COUNT`,SUM(`obj`.`g_ap25Flux`) AS `QS10_SUM`,MIN(`obj`.`g_ap25Flux`) AS `QS11_MIN`,MAX(`obj`.`g_ap25Flux`) AS `QS12_MAX`,COUNT(`obj`.`g_ap25FluxErr`) AS `QS13_COUNT`,SUM(`obj`.`g_ap25FluxErr`) AS `QS14_SUM`,MIN(`obj`.`g_ap25FluxErr`) AS `QS15_MIN`,MAX(`obj`.`g_ap25FluxErr`) AS `QS16_MAX` FROM `dp02_dc2_catalogs`.`Object_81017` AS `obj`"}]},"uberjobid":147,"version":39,"worker":"db04"})"; + return tb; +} + +bool parseSerializeReparseCheck(string const& jsStr, string const& note) { + string fName("parseSerialize "); + fName += note + " "; + LOGS(_log, LOG_LVL_INFO, fName << " start " << jsStr); + nlohmann::json js = nlohmann::json::parse(jsStr); + LOGS(_log, LOG_LVL_INFO, fName << " parse 1"); + + UberJobMsg::Ptr ujm = UberJobMsg::createFromJson(js); + BOOST_REQUIRE(ujm != nullptr); + + nlohmann::json jsUjm = ujm->serializeJson(); + LOGS(_log, LOG_LVL_INFO, fName << " serialized jsUjm=" << jsUjm); + + UberJobMsg::Ptr ujmCreated = UberJobMsg::createFromJson(jsUjm); + LOGS(_log, LOG_LVL_INFO, fName << " created"); + nlohmann::json jsUjmCreated = ujmCreated->serializeJson(); + LOGS(_log, LOG_LVL_INFO, fName << " created->serialized"); + + bool createdMatchesOriginal = jsUjm == jsUjmCreated; + if (createdMatchesOriginal) { + LOGS(_log, LOG_LVL_INFO, fName << "created matches original"); + } else { + LOGS(_log, LOG_LVL_ERROR, "jsUjm != jsUjmCreated"); + LOGS(_log, LOG_LVL_ERROR, "jsUjm=" << jsUjm); + LOGS(_log, LOG_LVL_ERROR, "jsUjmCreated=" << jsUjmCreated); + } + BOOST_REQUIRE(createdMatchesOriginal); + return createdMatchesOriginal; +} + BOOST_AUTO_TEST_CASE(WorkerQueryStatusData) { string const replicationInstanceId = "repliInstId"; string const replicationAuthKey = "repliIAuthKey"; - uint64_t cxrStartTime = lsst::qserv::millisecSinceEpoch(lsst::qserv::CLOCK::now() - 5s); + //&&&uint64_t cxrStartTime = lsst::qserv::millisecSinceEpoch(lsst::qserv::CLOCK::now() - 5s); //&&&uint64_t wkrStartTime = lsst::qserv::millisecSinceEpoch(lsst::qserv::CLOCK::now() - 10s); - string const czrName("czar_name"); - lsst::qserv::CzarIdType const czrId = 32; - int czrPort = 2022; - string const czrHost("cz_host"); - LOGS(_log, LOG_LVL_WARN, "&&& testUJM a"); - auto czarA = - lsst::qserv::protojson::CzarContactInfo::create(czrName, czrId, czrPort, czrHost, cxrStartTime); + /* &&& + string const czrName("czar_name"); + lsst::qserv::CzarIdType const czrId = 32; + int czrPort = 2022; + string const czrHost("cz_host"); + LOGS(_log, LOG_LVL_WARN, "&&& testUJM a"); + auto czarA = + lsst::qserv::protojson::CzarContactInfo::create(czrName, czrId, czrPort, czrHost, + cxrStartTime); + LOGS(_log, LOG_LVL_WARN, "&&& testUJM b"); string jsStr = testA(); @@ -99,6 +138,10 @@ BOOST_AUTO_TEST_CASE(WorkerQueryStatusData) { LOGS(_log, LOG_LVL_ERROR, "jsUjmCreated=" << jsUjmCreated); } BOOST_REQUIRE(createdMatchesOriginal); + */ + + BOOST_REQUIRE(parseSerializeReparseCheck(testA(), "A")); + BOOST_REQUIRE(parseSerializeReparseCheck(testB(), "B")); } BOOST_AUTO_TEST_SUITE_END() diff --git a/src/qana/QueryMapping.h b/src/qana/QueryMapping.h index 2e8dca319..585971f97 100644 --- a/src/qana/QueryMapping.h +++ b/src/qana/QueryMapping.h @@ -92,6 +92,8 @@ class QueryMapping { bool hasParameter(Parameter p) const; DbTableSet const& getSubChunkTables() const { return _subChunkTables; } + std::string dump() const { return std::string("&&& NEED CODE"); } + private: ParameterMap _subs; DbTableSet _subChunkTables; diff --git a/src/qdisp/Executive.cc b/src/qdisp/Executive.cc index 9df2bbf08..3a709d5b2 100644 --- a/src/qdisp/Executive.cc +++ 
b/src/qdisp/Executive.cc @@ -236,6 +236,7 @@ void Executive::queueFileCollect(util::PriorityCommand::Ptr const& cmd) { } void Executive::queueUberJob(std::shared_ptr const& uberJob) { + LOGS(_log, LOG_LVL_WARN, cName(__func__) << " &&&uj queueUberJob"); auto runUberJobFunc = [uberJob](util::CmdData*) { uberJob->runUberJob(); }; auto cmd = util::PriorityCommand::Ptr(new util::PriorityCommand(runUberJobFunc)); diff --git a/src/qdisp/JobDescription.h b/src/qdisp/JobDescription.h index 10a9f13ba..635540297 100644 --- a/src/qdisp/JobDescription.h +++ b/src/qdisp/JobDescription.h @@ -59,8 +59,7 @@ namespace qdisp { class Executive; class ResponseHandler; -/** Description of a job managed by the executive - */ +/// Description of a job managed by the executive class JobDescription { public: using Ptr = std::shared_ptr; diff --git a/src/qdisp/UberJob.cc b/src/qdisp/UberJob.cc index 990154803..0765a14ce 100644 --- a/src/qdisp/UberJob.cc +++ b/src/qdisp/UberJob.cc @@ -43,6 +43,7 @@ #include "qproc/ChunkQuerySpec.h" #include "util/Bug.h" #include "util/common.h" +#include "util/Histogram.h" //&&& #include "util/QdispPool.h" // LSST headers @@ -98,9 +99,12 @@ bool UberJob::addJob(JobQuery::Ptr const& job) { return success; } -void UberJob::runUberJob() { +util::HistogramRolling histoRunUberJob("&&&uj histoRunUberJob", {0.1, 1.0, 10.0, 100.0, 1000.0}, 1h, 10000); +util::HistogramRolling histoUJSerialize("&&&uj histoUJSerialize", {0.1, 1.0, 10.0, 100.0, 1000.0}, 1h, 10000); + +void UberJob::runUberJob() { // &&& TODO:UJ this should probably check cancelled LOGS(_log, LOG_LVL_DEBUG, cName(__func__) << " start"); - LOGS(_log, LOG_LVL_ERROR, "&&& jsonTESTrequest start"); + LOGS(_log, LOG_LVL_ERROR, cName(__func__) << "&&&uj runuj start"); // Build the uberjob payload for each job. nlohmann::json uj; unique_lock jobsLock(_jobsMtx); @@ -149,8 +153,8 @@ void UberJob::runUberJob() { jsJobs.push_back(jsJob); jbPtr->getDescription()->resetJsForWorker(); // no longer needed. } -#else // &&& - LOGS(_log, LOG_LVL_ERROR, "&&& jsonTESTrequest a"); +#else // &&& + //&&&LOGS(_log, LOG_LVL_ERROR, cName(__func__) << "&&&uj runuj a"); // Send the uberjob to the worker auto const method = http::Method::POST; auto [ciwId, ciwHost, ciwManagment, ciwPort] = _wContactInfo->getAll(); @@ -167,34 +171,51 @@ void UberJob::runUberJob() { auto uberJobMsg = protojson::UberJobMsg::create( http::MetaModule::version, czarConfig->replicationInstanceId(), czarConfig->replicationAuthKey(), czInfo, _wContactInfo, _queryId, _uberJobId, _rowLimit, maxTableSizeMB, scanInfoPtr, _jobs); + auto startserialize = CLOCK::now(); //&&& json request = uberJobMsg->serializeJson(); - - LOGS(_log, LOG_LVL_ERROR, "&&& jsonTESTrequest=" << request); + auto endserialize = CLOCK::now(); //&&& + std::chrono::duration secsserialize = endserialize - startserialize; // &&& + histoUJSerialize.addEntry(endserialize, secsserialize.count()); //&&& + LOGS(_log, LOG_LVL_INFO, "&&&uj histo " << histoUJSerialize.getString("")); +#endif // &&& + jobsLock.unlock(); // unlock so other _jobsMtx threads can advance while this waits for transmit + LOGS(_log, LOG_LVL_ERROR, cName(__func__) << "&&&uj runuj c"); + /* &&& { // &&& testing only, delete auto parsedReq = protojson::UberJobMsg::createFromJson(request); json jsParsedReq = parsedReq->serializeJson(); if (request == jsParsedReq) { - LOGS(_log, LOG_LVL_ERROR, cName(__func__) << " &&& YAY!!! "); + LOGS(_log, LOG_LVL_ERROR, cName(__func__) << " &&&uj YAY!!! 
"); } else { - LOGS(_log, LOG_LVL_ERROR, cName(__func__) << " &&& request != jsParsedReq"); - LOGS(_log, LOG_LVL_ERROR, "&&& request=" << request); - LOGS(_log, LOG_LVL_ERROR, "&&& jsParsedReq=" << jsParsedReq); + LOGS(_log, LOG_LVL_ERROR, cName(__func__) << " &&&uj noYAY request != jsParsedReq"); + LOGS(_log, LOG_LVL_ERROR, cName(__func__) << " &&&uj request=" << request); + LOGS(_log, LOG_LVL_ERROR, cName(__func__) << " &&&uj jsParsedReq=" << jsParsedReq); } } - -#endif // &&& - jobsLock.unlock(); // unlock so other _jobsMtx threads can advance while this waits for transmit + */ LOGS(_log, LOG_LVL_DEBUG, cName(__func__) << " REQ " << request); string const requestContext = "Czar: '" + http::method2string(method) + "' request to '" + url + "'"; LOGS(_log, LOG_LVL_TRACE, cName(__func__) << " czarPost url=" << url << " request=" << request.dump() << " headers=" << headers[0]); - http::Client client(method, url, request.dump(), headers); + auto startclient = CLOCK::now(); //&&& + + auto commandHttpPool = czar::Czar::getCzar()->getCommandHttpPool(); + http::ClientConfig clientConfig; + clientConfig.httpVersion = CURL_HTTP_VERSION_1_1; // same as in qhttp + clientConfig.bufferSize = CURL_MAX_READ_SIZE; // 10 MB in the current version of libcurl + clientConfig.tcpKeepAlive = true; + clientConfig.tcpKeepIdle = 30; // the default is 60 sec + clientConfig.tcpKeepIntvl = 5; // the default is 60 sec + http::Client client(method, url, request.dump(), headers, clientConfig, commandHttpPool); bool transmitSuccess = false; string exceptionWhat; try { + //&&&util::InstanceCount ic{"runUberJob&&&"}; + LOGS(_log, LOG_LVL_ERROR, cName(__func__) << "&&&uj runuj d"); json const response = client.readAsJson(); + LOGS(_log, LOG_LVL_ERROR, cName(__func__) << "&&&uj runuj d1"); if (0 != response.at("success").get()) { transmitSuccess = true; } else { @@ -204,6 +225,10 @@ void UberJob::runUberJob() { LOGS(_log, LOG_LVL_WARN, requestContext + " ujresponse failed, ex: " + ex.what()); exceptionWhat = ex.what(); } + auto endclient = CLOCK::now(); //&&& + std::chrono::duration secsclient = endclient - startclient; // &&& + histoRunUberJob.addEntry(endclient, secsclient.count()); //&&& + LOGS(_log, LOG_LVL_INFO, "&&&uj histo " << histoRunUberJob.getString("")); if (!transmitSuccess) { LOGS(_log, LOG_LVL_ERROR, cName(__func__) << " transmit failure, try to send jobs elsewhere"); _unassignJobs(); // locks _jobsMtx @@ -213,6 +238,7 @@ void UberJob::runUberJob() { } else { setStatusIfOk(qmeta::JobStatus::REQUEST, cName(__func__) + " transmitSuccess"); // locks _jobsMtx } + LOGS(_log, LOG_LVL_ERROR, cName(__func__) << " &&&uj runuj end"); return; } diff --git a/src/qproc/ChunkQuerySpec.h b/src/qproc/ChunkQuerySpec.h index d7ad75984..41582368f 100644 --- a/src/qproc/ChunkQuerySpec.h +++ b/src/qproc/ChunkQuerySpec.h @@ -67,7 +67,8 @@ class ChunkQuerySpec { bool scanInteractive{false}; DbTableSet subChunkTables; std::vector subChunkIds; - std::vector queries; + std::vector queries; // &&& remove if possible + std::vector queryTemplates; // Consider promoting the concept of container of ChunkQuerySpec // in the hopes of increased code cleanliness. 
std::shared_ptr nextFragment; ///< ad-hoc linked list (consider removal) diff --git a/src/qproc/ChunkSpec.cc b/src/qproc/ChunkSpec.cc index 1bd36261f..0d1d0dba5 100644 --- a/src/qproc/ChunkSpec.cc +++ b/src/qproc/ChunkSpec.cc @@ -121,7 +121,9 @@ void normalize(ChunkSpecVector& specs) { //////////////////////////////////////////////////////////////////////// // ChunkSpec //////////////////////////////////////////////////////////////////////// -bool ChunkSpec::shouldSplit() const { return subChunks.size() > (unsigned)GOOD_SUBCHUNK_COUNT; } +//&&&bool ChunkSpec::shouldSplit() const { return subChunks.size() > (unsigned)GOOD_SUBCHUNK_COUNT; } +//&&& subchunks are handled in their own tasks now, so there's no point in splitting anymore. +bool ChunkSpec::shouldSplit() const { return false; } ChunkSpec ChunkSpec::intersect(ChunkSpec const& cs) const { ChunkSpec output(*this); diff --git a/src/qproc/QuerySession.cc b/src/qproc/QuerySession.cc index 969409a4d..7099e2647 100644 --- a/src/qproc/QuerySession.cc +++ b/src/qproc/QuerySession.cc @@ -391,6 +391,7 @@ std::vector QuerySession::_buildChunkQueries(query::QueryTemplate:: } for (auto&& queryTemplate : queryTemplates) { + LOGS(_log, LOG_LVL_WARN, "&&&uj QuerySession::_buildChunkQueries qt=" << queryTemplate.dump()); std::string str = _context->queryMapping->apply(chunkSpec, queryTemplate); chunkQueries.push_back(std::move(str)); } @@ -417,7 +418,7 @@ ChunkQuerySpec::Ptr QuerySession::buildChunkQuerySpec(query::QueryTemplate::Vect if (!_context->hasSubChunks()) { cQSpec->queries = _buildChunkQueries(queryTemplates, chunkSpec); } else { - if (chunkSpec.shouldSplit()) { + if (chunkSpec.shouldSplit()) { //&&& remove case ChunkSpecFragmenter frag(chunkSpec); ChunkSpec s = frag.get(); cQSpec->queries = _buildChunkQueries(queryTemplates, s); diff --git a/src/query/QueryTemplate.cc b/src/query/QueryTemplate.cc index 699a6faab..32e628e90 100644 --- a/src/query/QueryTemplate.cc +++ b/src/query/QueryTemplate.cc @@ -43,6 +43,8 @@ #include "query/ColumnRef.h" #include "query/TableRef.h" +using namespace std; + namespace lsst::qserv::query { //////////////////////////////////////////////////////////////////////// @@ -204,4 +206,18 @@ QueryTemplate::GetAliasMode QueryTemplate::getTableAliasMode() const { return DONT_USE; // should never get here but to satisfy the compiler. 
} +string QueryTemplate::dump() const { + ostringstream os; + os << "QueryTemplate quoteIdents=" << _quoteIdentifiers; + os << " useColOnly=" << _useColumnOnly; + os << " aliasMode=" << _aliasMode; + os << " entries={"; + for (auto const& entry : _entries) { + os << "(dynamic=" << entry->isDynamic(); + os << ":val=" << entry->getValue() << ")"; + } + os << "}"; + return os.str(); +} + } // namespace lsst::qserv::query diff --git a/src/query/QueryTemplate.h b/src/query/QueryTemplate.h index 5be5e3ac0..b0ffad8ba 100644 --- a/src/query/QueryTemplate.h +++ b/src/query/QueryTemplate.h @@ -208,6 +208,8 @@ class QueryTemplate { return os << qt.sqlFragment(); } + std::string dump() const; + private: EntryPtrVector _entries; SetAliasMode _aliasMode{USE_ALIAS}; diff --git a/src/util/InstanceCount.cc b/src/util/InstanceCount.cc index 9940523f3..895698d63 100644 --- a/src/util/InstanceCount.cc +++ b/src/util/InstanceCount.cc @@ -31,7 +31,8 @@ void InstanceCount::_increment(std::string const& source) { auto ret = _instances.insert(entry); auto iter = ret.first; iter->second += 1; - LOGS(_log, LOG_LVL_DEBUG, "InstanceCount " << source << " " << iter->first << "=" << iter->second); + LOGS(_log, LOG_LVL_WARN, + "InstanceCount " << source << " " << iter->first << "=" << iter->second); //&&&DEBUG } InstanceCount::~InstanceCount() { @@ -39,7 +40,8 @@ InstanceCount::~InstanceCount() { auto iter = _instances.find(_className); if (iter != _instances.end()) { iter->second -= 1; - LOGS(_log, LOG_LVL_DEBUG, "~InstanceCount " << iter->first << "=" << iter->second << " : " << *this); + LOGS(_log, LOG_LVL_WARN, + "~InstanceCount " << iter->first << "=" << iter->second << " : " << *this); //&&&DEBUG if (iter->second == 0) { _instances.erase(_className); } diff --git a/src/wsched/BlendScheduler.cc b/src/wsched/BlendScheduler.cc index ccb335b97..b5b37346f 100644 --- a/src/wsched/BlendScheduler.cc +++ b/src/wsched/BlendScheduler.cc @@ -259,6 +259,7 @@ void BlendScheduler::commandStart(util::Command::Ptr const& cmd) { LOGS(_log, LOG_LVL_ERROR, "BlendScheduler::commandStart scheduler not found"); } _infoChanged = true; + LOGS(_log, LOG_LVL_DEBUG, "BlendScheduler::commandStart &&& end"); } void BlendScheduler::commandFinish(util::Command::Ptr const& cmd) { From 8ad73c1bf3148b84bd52d1c7d32d4c2ba694a660 Mon Sep 17 00:00:00 2001 From: John Gates Date: Fri, 13 Dec 2024 13:39:36 -0800 Subject: [PATCH 19/22] Rearranged UberJob building and removed chunkResultName. 
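The core rearrangement: Jobs are now batched into a per-worker UberJob as they are assigned, and each UberJob is queued for transmission as soon as it reaches _uberJobMaxChunks, instead of collecting all UberJobs first and attaching worker contact info in a second pass. A minimal sketch of the batching loop, using simplified stand-in types (Job, Batch, and buildBatches are illustrative only, not code from this patch):

#include <cstddef>
#include <functional>
#include <map>
#include <string>
#include <utility>
#include <vector>

// Hypothetical stand-ins for JobQuery and UberJob.
struct Job { int chunkId; };
struct Batch { std::string workerId; std::vector<Job> jobs; };

// Walk jobs in chunk order, keep one open batch per worker, and hand a
// batch off as soon as it fills; leftovers are flushed at the end.
void buildBatches(std::vector<std::pair<std::string, Job>> const& assignments,
                  std::size_t maxChunks, std::function<void(Batch&&)> const& send) {
    std::map<std::string, Batch> open;
    for (auto const& [workerId, job] : assignments) {
        Batch& b = open[workerId];
        b.workerId = workerId;
        b.jobs.push_back(job);
        if (b.jobs.size() >= maxChunks) {
            send(std::move(b));    // queue the full batch immediately
            open.erase(workerId);  // next job for this worker opens a new batch
        }
    }
    for (auto& [workerId, batch] : open) {
        if (!batch.jobs.empty()) send(std::move(batch));  // partially filled batches
    }
}

Queuing full batches immediately lets workers start shared-scan work sooner and keeps at most one open UberJob per worker in memory while assignment runs.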
--- src/ccontrol/UserQuerySelect.cc | 89 ++++++++++++++++++--------------- src/http/CMakeLists.txt | 2 - src/protojson/ScanTableInfo.cc | 6 --- src/protojson/UberJobMsg.cc | 66 +++--------------------- src/protojson/UberJobMsg.h | 66 ++++-------------------- src/protojson/testUberJobMsg.cc | 19 +------ src/qdisp/Executive.cc | 31 ++++++++---- src/qdisp/Executive.h | 9 ++-- src/qdisp/testQDisp.cc | 4 +- src/wbase/Task.cc | 87 +-------------------------------- 10 files changed, 93 insertions(+), 286 deletions(-) diff --git a/src/ccontrol/UserQuerySelect.cc b/src/ccontrol/UserQuerySelect.cc index c5e2aef35..e4394c725 100644 --- a/src/ccontrol/UserQuerySelect.cc +++ b/src/ccontrol/UserQuerySelect.cc @@ -105,7 +105,6 @@ #include "rproc/InfileMerger.h" #include "sql/Schema.h" #include "util/Bug.h" -#include "util/InstanceCount.h" //&&& #include "util/IterableFormatter.h" #include "util/Histogram.h" //&&& #include "util/QdispPool.h" @@ -275,6 +274,9 @@ void UserQuerySelect::submit() { // Make the JobQuery now QSERV_LOGCONTEXT_QUERY(_qMetaQueryId); + // TODO:UJ The templates are generated here and later compared to other + // templates. It would be better to create the list of query templates here + // and just store the index into the list of templates in the `cs`. qproc::ChunkQuerySpec::Ptr cs; { std::lock_guard lock(chunksMtx); @@ -328,10 +330,9 @@ util::HistogramRolling histoBuildAndS("&&&uj histoBuildAndS", {0.1, 1.0, 10.0, 1 util::HistogramRolling histoBuildAndS1("&&&uj histoBuildAndS1", {0.1, 1.0, 10.0, 100.0, 1000.0}, 1h, 10000); void UserQuerySelect::buildAndSendUberJobs() { - util::InstanceCount ic("UserQuerySelect::buildAndSendUberJobs&&&"); // TODO:UJ Is special handling needed for the dummy chunk, 1234567890 ? string const funcN("UserQuerySelect::" + string(__func__) + " QID=" + to_string(_qMetaQueryId)); - LOGS(_log, LOG_LVL_DEBUG, funcN << " start"); + LOGS(_log, LOG_LVL_DEBUG, funcN << " start " << _uberJobMaxChunks); LOGS(_log, LOG_LVL_WARN, funcN << " &&&uj start " << _uberJobMaxChunks); // Ensure `_monitor()` doesn't do anything until everything is ready. @@ -341,9 +342,9 @@ void UserQuerySelect::buildAndSendUberJobs() { } // Only one thread should be generating UberJobs for this user query at any given time. - LOGS(_log, LOG_LVL_WARN, funcN << " &&&uj lock before"); + util::InstanceCount ica("UserQuerySelect::buildAndSendUberJobs&&&_beforelock"); lock_guard fcLock(_buildUberJobMtx); - LOGS(_log, LOG_LVL_WARN, funcN << " &&&uj lock after"); + util::InstanceCount icb("UserQuerySelect::buildAndSendUberJobs&&&_afterlock"); LOGS(_log, LOG_LVL_DEBUG, "UserQuerySelect::" << __func__ << " totalJobs=" << _executive->getTotalJobs()); vector uberJobs; @@ -355,10 +356,12 @@ void UserQuerySelect::buildAndSendUberJobs() { return; } + // Get czar info and the worker contactMap. auto czarPtr = czar::Czar::getCzar(); auto czFamilyMap = czarPtr->getCzarFamilyMap(); auto czChunkMap = czFamilyMap->getChunkMap(_queryDbName); auto czRegistry = czarPtr->getCzarRegistry(); + auto const wContactMap = czRegistry->waitForWorkerContactMap(); if (czChunkMap == nullptr) { LOGS(_log, LOG_LVL_ERROR, funcN << " no map found for queryDbName=" << _queryDbName); @@ -388,12 +391,18 @@ void UserQuerySelect::buildAndSendUberJobs() { // again to put those Jobs in new UberJobs. Correctly re-assigning the // Jobs requires accurate information from the registry about which workers // are alive or dead.
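// [Editorial sketch, not part of this diff: how the attempt counter bounds
// re-assignment when chunks cannot be placed. MAX_ATTEMPTS and JobState are
// hypothetical stand-ins; the real bookkeeping lives in JobDescription.]
constexpr int MAX_ATTEMPTS = 5;  // assumed limit, illustration only
struct JobState {
    int attempts = 0;  // bumped on every assignment pass over unassigned jobs
};
// Either a worker is eventually found for the chunk, or the count passes the
// limit and the whole user query is cancelled instead of retrying forever.
bool retryAllowed(JobState& js) { return ++js.attempts <= MAX_ATTEMPTS; }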
- map> workerJobMap; + struct WInfoAndUJPtr { + using Ptr = shared_ptr; + qdisp::UberJob::Ptr uberJobPtr; + protojson::WorkerContactInfo::Ptr wInf; + }; + //&&& map> workerJobMap; + map workerJobMap; vector missingChunks; auto startassign = CLOCK::now(); //&&& // unassignedChunksInQuery needs to be in numerical order so that UberJobs contain chunk numbers in - // numerical order. The workers run shared scans in numerical order of chunk id numbers. + // numerical order. The workers run shared scans in numerical order of chunkId numbers. // Numerical order keeps the number of partially complete UberJobs running on a worker to a minimum, // and should minimize the time for the first UberJob on the worker to complete. for (auto const& [chunkId, jqPtr] : unassignedChunksInQuery) { @@ -442,26 +451,43 @@ } // Add this job to the appropriate UberJob, making the UberJob if needed. string workerId = targetWorker->getWorkerId(); - auto& ujVect = workerJobMap[workerId]; - if (ujVect.empty() || ujVect.back()->getJobCount() >= _uberJobMaxChunks) { + WInfoAndUJPtr::Ptr& wInfUJ = workerJobMap[workerId]; + if (wInfUJ == nullptr) { + wInfUJ = make_shared(); + auto iter = wContactMap->find(workerId); //&&&auto iter = wContactMap->find(wIdKey); + if (iter == wContactMap->end()) { + // TODO:UJ Not appropriate to throw for this. Need to re-direct all jobs to different workers. + // Also, this really shouldn't happen, but crashing the czar is probably a bad idea, + // so maybe return internal error to the user? + throw util::Bug(ERR_LOC, funcN + " TODO:UJ no contact information for " + workerId); + } + wInfUJ->wInf = iter->second; + } + + if (wInfUJ->uberJobPtr == nullptr) { auto ujId = _uberJobIdSeq++; // keep ujId consistent string uberResultName = _ttn->make(ujId); auto respHandler = make_shared(_infileMerger, uberResultName); auto uJob = qdisp::UberJob::create(_executive, respHandler, _executive->getId(), ujId, _qMetaCzarId, targetWorker); - ujVect.push_back(uJob); + uJob->setWorkerContactInfo(wInfUJ->wInf); + wInfUJ->uberJobPtr = uJob; + }; + + wInfUJ->uberJobPtr->addJob(jqPtr); + + if (wInfUJ->uberJobPtr->getJobCount() >= _uberJobMaxChunks) { + // Queue the UberJob to be sent to a worker + _executive->addAndQueueUberJob(wInfUJ->uberJobPtr); + + // Clear the pointer so a new UberJob is created later if needed. + wInfUJ->uberJobPtr = nullptr; } - auto& ujVectBack = ujVect.back(); - ujVectBack->addJob(jqPtr); - LOGS(_log, LOG_LVL_TRACE, - funcN << " ujVectBack{" << ujVectBack->getIdStr() << " jobCnt=" << ujVectBack->getJobCount() - << "}"); } auto endassign = CLOCK::now(); //&&& std::chrono::duration secsassign = endassign - startassign; // &&& histoBuildAndS.addEntry(endassign, secsassign.count()); //&&& LOGS(_log, LOG_LVL_INFO, "&&&uj histo " << histoBuildAndS.getString("")); - auto startwcont = CLOCK::now(); //&&& if (!missingChunks.empty()) { string errStr = funcN + " a worker could not be found for these chunks "; @@ -471,32 +497,17 @@ errStr += " they will be retried later."; LOGS(_log, LOG_LVL_ERROR, errStr); } - LOGS(_log, LOG_LVL_WARN, funcN << " &&&uj waitForWorkerContactMap"); - // Add worker contact info to UberJobs. The czar can't do anything without - // the contact map, so it will wait. This should only ever be an issue at startup.
- auto const wContactMap = czRegistry->waitForWorkerContactMap(); - for (auto const& [wIdKey, ujVect] : workerJobMap) { - auto iter = wContactMap->find(wIdKey); - if (iter == wContactMap->end()) { - // TODO:UJ Not appropriate to throw for this. Need to re-direct all jobs to different workers. - // Also, this really shouldn't happen, but crashing the czar is probably a bad idea, - // so maybe return internal error to the user? - throw util::Bug(ERR_LOC, funcN + " TODO:UJ no contact information for " + wIdKey); - } - auto const& wContactInfo = iter->second; - for (auto const& ujPtr : ujVect) { - ujPtr->setWorkerContactInfo(wContactInfo); - } - _executive->addUberJobs(ujVect); - for (auto const& ujPtr : ujVect) { - _executive->queueUberJob(ujPtr); + // Queue unqueued UberJobs; these have fewer than the max number of jobs. + for (auto const& [wIdKey, winfUjPtr] : workerJobMap) { + if (winfUjPtr != nullptr) { + auto& ujPtr = winfUjPtr->uberJobPtr; + if (ujPtr != nullptr) { + _executive->addAndQueueUberJob(ujPtr); + } } } - auto endwcont = CLOCK::now(); //&&& - std::chrono::duration secswcont = endwcont - startwcont; // &&& - histoBuildAndS1.addEntry(endwcont, secswcont.count()); //&&& - LOGS(_log, LOG_LVL_INFO, "&&&uj histo " << histoBuildAndS1.getString("")); + LOGS(_log, LOG_LVL_DEBUG, funcN << " " << _executive->dumpUberJobCounts()); LOGS(_log, LOG_LVL_WARN, funcN << " &&&uj " << _executive->dumpUberJobCounts()); } diff --git a/src/http/CMakeLists.txt b/src/http/CMakeLists.txt index f2ea0e0a8..454d4ab88 100644 --- a/src/http/CMakeLists.txt +++ b/src/http/CMakeLists.txt @@ -19,7 +19,6 @@ target_sources(http PRIVATE RequestBodyJSON.cc RequestQuery.cc Url.cc -# &&& WorkerQueryStatusData.cc ) target_link_libraries(http PUBLIC @@ -52,6 +51,5 @@ http_tests( testAsyncReq testRequestBodyJSON testRequestQuery -# &&& testStatusData testUrl ) diff --git a/src/protojson/ScanTableInfo.cc b/src/protojson/ScanTableInfo.cc index ae7deb254..32da583bf 100644 --- a/src/protojson/ScanTableInfo.cc +++ b/src/protojson/ScanTableInfo.cc @@ -99,14 +99,10 @@ void ScanInfo::sortTablesSlowestFirst() { } nlohmann::json ScanInfo::serializeJson() const { - LOGS(_log, LOG_LVL_WARN, "&&& ScanInfo::serializeJson a"); auto jsScanInfo = json({{"infoscanrating", scanRating}, {"infotables", json::array()}}); - LOGS(_log, LOG_LVL_WARN, "&&& ScanInfo::serializeJson b"); auto& jsInfoTables = jsScanInfo["infotables"]; - LOGS(_log, LOG_LVL_WARN, "&&& ScanInfo::serializeJson c"); for (auto const& tInfo : infoTables) { - LOGS(_log, LOG_LVL_WARN, "&&& ScanInfo::serializeJson c1"); json jsTInfo = json({{"sidb", tInfo.db}, {"sitable", tInfo.table}, {"sirating", tInfo.scanRating}, @@ -115,12 +111,10 @@ nlohmann::json ScanInfo::serializeJson() const { jsInfoTables.push_back(jsTInfo); } - LOGS(_log, LOG_LVL_WARN, "&&& ScanInfo::serializeJson end " << jsScanInfo); return jsScanInfo; } ScanInfo::Ptr ScanInfo::createFromJson(nlohmann::json const& siJson) { - LOGS(_log, LOG_LVL_WARN, "&&& ScanInfo::createFromJson " << siJson); Ptr siPtr = create(); auto& iTbls = siPtr->infoTables; diff --git a/src/protojson/UberJobMsg.cc b/src/protojson/UberJobMsg.cc index 8f617103c..f12f37fe2 100644 --- a/src/protojson/UberJobMsg.cc +++ b/src/protojson/UberJobMsg.cc @@ -90,8 +90,6 @@ json UberJobMsg::serializeJson() const { auto& jsJobs = ujmJson["jobs"]; for (auto const& jbMsg : *_jobMsgVect) { - //&&&json jsJob = jbMsg->serializeJson(); - //&&&jsJobs.push_back(jsJob); jsJobs.emplace_back(jbMsg->serializeJson()); } @@ -100,7 +98,7 @@ json
UberJobMsg::serializeJson() const { } UberJobMsg::Ptr UberJobMsg::createFromJson(nlohmann::json const& ujmJson) { - LOGS(_log, LOG_LVL_WARN, "&&& UberJobMsg::createFromJson ujmJson=" << ujmJson); + LOGS(_log, LOG_LVL_DEBUG, "UberJobMsg::createFromJson ujmJson=" << ujmJson); try { if (ujmJson["version"] != http::MetaModule::version) { LOGS(_log, LOG_LVL_ERROR, "UberJobMsg::createFromJson bad version " << ujmJson["version"]); @@ -113,7 +111,6 @@ UberJobMsg::Ptr UberJobMsg::createFromJson(nlohmann::json const& ujmJson) { return nullptr; } - LOGS(_log, LOG_LVL_WARN, "&&& UberJobMsg::createFromJson b"); auto scanInfo_ = ScanInfo::createFromJson(ujmJson["scaninfo"]); if (scanInfo_ == nullptr) { LOGS(_log, LOG_LVL_ERROR, @@ -132,16 +129,11 @@ UberJobMsg::Ptr UberJobMsg::createFromJson(nlohmann::json const& ujmJson) { auto czInfo = CzarContactInfo::createFromJson(ujmJson["czarinfo"]); auto jsUjJobs = http::RequestBodyJSON::required(ujmJson, "jobs"); - LOGS(_log, LOG_LVL_TRACE, - " &&& " << metaVersion << replicationInstanceId << replicationAuthKey << workerId << qId << ujId - << rowLimit << jsUjJobs); - std::vector> emptyJobs; Ptr ujmPtr = Ptr(new UberJobMsg(metaVersion, replicationInstanceId, replicationAuthKey, czInfo, workerId, qId, ujId, rowLimit, maxTableSizeMB, scanInfo_, emptyJobs)); - LOGS(_log, LOG_LVL_WARN, "&&& UberJobMsg::createFromJson m"); auto const& jsSubQueriesMap = http::RequestBodyJSON::required(ujmJson, "subqueries_map"); ujmPtr->_jobSubQueryTempMap = JobSubQueryTempMap::createFromJson(jsSubQueriesMap); @@ -153,22 +145,16 @@ UberJobMsg::Ptr UberJobMsg::createFromJson(nlohmann::json const& ujmJson) { JobMsg::createFromJson(jsUjJob, ujmPtr->_jobSubQueryTempMap, ujmPtr->_jobDbTablesMap); ujmPtr->_jobMsgVect->push_back(jobMsgPtr); } - LOGS(_log, LOG_LVL_WARN, "&&& UberJobMsg::createFromJson end"); + LOGS(_log, LOG_LVL_DEBUG, "UberJobMsg::createFromJson end"); return ujmPtr; } catch (invalid_argument const& exc) { LOGS(_log, LOG_LVL_ERROR, "UberJobMsg::createFromJson invalid " << exc.what() << " json=" << ujmJson); } - LOGS(_log, LOG_LVL_WARN, "&&& UberJobMsg::createFromJson end error"); + LOGS(_log, LOG_LVL_DEBUG, "UberJobMsg::createFromJson end error"); return nullptr; } -std::string UberJobMsg::dump() const { - stringstream os; - os << "&&& NEEDS CODE"; - return os.str(); -} - JobMsg::Ptr JobMsg::create(std::shared_ptr const& jobPtr, JobSubQueryTempMap::Ptr const& jobSubQueryTempMap, JobDbTablesMap::Ptr const& jobDbTablesMap) { @@ -179,7 +165,6 @@ JobMsg::Ptr JobMsg::create(std::shared_ptr const& jobPtr, JobMsg::JobMsg(std::shared_ptr const& jobPtr, JobSubQueryTempMap::Ptr const& jobSubQueryTempMap, JobDbTablesMap::Ptr const& jobDbTablesMap) : _jobSubQueryTempMap(jobSubQueryTempMap), _jobDbTablesMap(jobDbTablesMap) { - LOGS(_log, LOG_LVL_WARN, "&&& JobMsg::JobMsg start"); auto const descr = jobPtr->getDescription(); if (descr == nullptr) { throw util::Bug(ERR_LOC, cName(__func__) + " description=null for job=" + jobPtr->getIdStr()); @@ -191,7 +176,6 @@ JobMsg::JobMsg(std::shared_ptr const& jobPtr, _scanRating = chunkQuerySpec->scanInfo->scanRating; _scanInteractive = chunkQuerySpec->scanInteractive; _chunkId = chunkQuerySpec->chunkId; - _chunkResultName = descr->getChunkResultName(); // Add scan tables (&&& not sure is this is the same for all jobs or not) for (auto const& sTbl : chunkQuerySpec->scanInfo->infoTables) { @@ -213,7 +197,6 @@ nlohmann::json JobMsg::serializeJson() const { {"scanPriority", _scanRating}, {"scanInteractive", _scanInteractive}, {"chunkId", _chunkId}, - 
{"chunkresultname", _chunkResultName}, {"chunkscantables_indexes", nlohmann::json::array()}, {"queryFragments", json::array()}}); @@ -226,8 +209,6 @@ nlohmann::json JobMsg::serializeJson() const { auto& jsqFrags = jsJobMsg["queryFragments"]; for (auto& jFrag : *_jobFragments) { - //&&&auto jsFrag = jFrag->serializeJson(); - //&&&jsqFrags.push_back(jsFrag); jsqFrags.emplace_back(jFrag->serializeJson()); } @@ -237,33 +218,31 @@ nlohmann::json JobMsg::serializeJson() const { JobMsg::JobMsg(JobSubQueryTempMap::Ptr const& jobSubQueryTempMap, JobDbTablesMap::Ptr const& jobDbTablesMap, JobId jobId, int attemptCount, std::string const& chunkQuerySpecDb, int scanRating, - bool scanInteractive, int chunkId, std::string const& chunkResultName) + bool scanInteractive, int chunkId) : _jobId(jobId), _attemptCount(attemptCount), _chunkQuerySpecDb(chunkQuerySpecDb), _scanRating(scanRating), _scanInteractive(scanInteractive), _chunkId(chunkId), - _chunkResultName(chunkResultName), _jobSubQueryTempMap(jobSubQueryTempMap), _jobDbTablesMap(jobDbTablesMap) {} JobMsg::Ptr JobMsg::createFromJson(nlohmann::json const& ujJson, JobSubQueryTempMap::Ptr const& jobSubQueryTempMap, JobDbTablesMap::Ptr const& jobDbTablesMap) { - LOGS(_log, LOG_LVL_WARN, "&&& JobMsg::createFromJson ujJson=" << ujJson); + LOGS(_log, LOG_LVL_WARN, "&&& JobMsg::createFromJson"); JobId jobId = http::RequestBodyJSON::required(ujJson, "jobId"); int attemptCount = http::RequestBodyJSON::required(ujJson, "attemptCount"); string chunkQuerySpecDb = http::RequestBodyJSON::required(ujJson, "querySpecDb"); int scanRating = http::RequestBodyJSON::required(ujJson, "scanPriority"); bool scanInteractive = http::RequestBodyJSON::required(ujJson, "scanInteractive"); int chunkId = http::RequestBodyJSON::required(ujJson, "chunkId"); - string chunkResultName = http::RequestBodyJSON::required(ujJson, "chunkresultname"); json jsQFrags = http::RequestBodyJSON::required(ujJson, "queryFragments"); Ptr jMsgPtr = Ptr(new JobMsg(jobSubQueryTempMap, jobDbTablesMap, jobId, attemptCount, chunkQuerySpecDb, - scanRating, scanInteractive, chunkId, chunkResultName)); + scanRating, scanInteractive, chunkId)); json jsChunkTblIndexes = http::RequestBodyJSON::required(ujJson, "chunkscantables_indexes"); jMsgPtr->_chunkScanTableIndexes = jsChunkTblIndexes.get>(); jMsgPtr->_jobFragments = @@ -296,27 +275,17 @@ json JobSubQueryTempMap::serializeJson() const { JobSubQueryTempMap::Ptr JobSubQueryTempMap::createFromJson(nlohmann::json const& ujJson) { LOGS(_log, LOG_LVL_WARN, "JobSubQueryTempMap::createFromJson a"); Ptr sqtMapPtr = create(); - LOGS(_log, LOG_LVL_WARN, "JobSubQueryTempMap::createFromJson b"); auto& sqtMap = sqtMapPtr->_qTemplateMap; LOGS(_log, LOG_LVL_WARN, "&&& JobSubQueryTempMap::createFromJson " << ujJson); auto const& jsElements = ujJson["subquerytemplate_map"]; - LOGS(_log, LOG_LVL_WARN, "JobSubQueryTempMap::createFromJson c"); for (auto const& jsElem : jsElements) { - LOGS(_log, LOG_LVL_WARN, "JobSubQueryTempMap::createFromJson c1"); - LOGS(_log, LOG_LVL_WARN, "JobSubQueryTempMap::createFromJson jsElem=" << jsElem); - //&&&int index = jsElem["index"]; int index = http::RequestBodyJSON::required(jsElem, "index"); - LOGS(_log, LOG_LVL_WARN, "JobSubQueryTempMap::createFromJson c2"); - //&&&string templ = jsElem["template"]; string templ = http::RequestBodyJSON::required(jsElem, "template"); - LOGS(_log, LOG_LVL_WARN, "JobSubQueryTempMap::createFromJson c3"); auto res = sqtMap.insert(make_pair(index, templ)); - LOGS(_log, LOG_LVL_WARN, 
"JobSubQueryTempMap::createFromJson c4"); if (!res.second) { throw invalid_argument(sqtMapPtr->cName(__func__) + "index=" + to_string(index) + "=" + templ + " index already found in " + to_string(ujJson)); } - LOGS(_log, LOG_LVL_WARN, "JobSubQueryTempMap::createFromJson c5"); } LOGS(_log, LOG_LVL_WARN, "JobSubQueryTempMap::createFromJson end"); return sqtMapPtr; @@ -327,7 +296,6 @@ int JobSubQueryTempMap::findSubQueryTemp(string const& qTemp) { // so this shouldn't be horribly expensive. for (auto const& [key, temp] : _qTemplateMap) { if (temp == qTemp) { - LOGS(_log, LOG_LVL_WARN, "&&& JobSubQueryTempMap::findSubQueryTemp end key=" << key); return key; } } @@ -335,7 +303,6 @@ int JobSubQueryTempMap::findSubQueryTemp(string const& qTemp) { // Need to insert int index = _qTemplateMap.size(); _qTemplateMap[index] = qTemp; - LOGS(_log, LOG_LVL_WARN, "&&& JobSubQueryTempMap::findSubQueryTemp end index=" << index); return index; } @@ -382,13 +349,9 @@ JobDbTablesMap::Ptr JobDbTablesMap::createFromJson(nlohmann::json const& ujJson) LOGS(_log, LOG_LVL_WARN, "&&& JobDbTablesMap::createFromJson " << ujJson); json const& jsDbTbl = ujJson["dbtable_map"]; - LOGS(_log, LOG_LVL_WARN, "&&& JobDbTablesMap::createFromJson dbtbl=" << jsDbTbl); for (auto const& jsElem : jsDbTbl) { - //&&&int index = jsElem["index"]; int index = http::RequestBodyJSON::required(jsElem, "index"); - //&&&string db = jsElem["db"]; string db = http::RequestBodyJSON::required(jsElem, "db"); - //&&&string tbl = jsElem["table"]; string tbl = http::RequestBodyJSON::required(jsElem, "table"); auto res = dbTblMap.insert(make_pair(index, make_pair(db, tbl))); if (!res.second) { @@ -398,13 +361,9 @@ JobDbTablesMap::Ptr JobDbTablesMap::createFromJson(nlohmann::json const& ujJson) } json const& jsScanR = ujJson["scanrating_map"]; - LOGS(_log, LOG_LVL_WARN, "&&& JobDbTablesMap::createFromJson jsScanR=" << jsScanR); for (auto const& jsElem : jsScanR) { - //&&&int index = jsElem["index"]; int index = http::RequestBodyJSON::required(jsElem, "index"); - //&&&int scanR = jsElem["scanrating"]; int scanR = http::RequestBodyJSON::required(jsElem, "scanrating"); - //&&&bool lockInMem = jsElem["lockinmem"]; bool lockInMem = http::RequestBodyJSON::required(jsElem, "lockinmem"); auto res = scanRMap.insert(make_pair(index, make_pair(scanR, lockInMem))); if (!res.second) { @@ -452,15 +411,6 @@ JobFragment::VectPtr JobFragment::createVect(qproc::ChunkQuerySpec const& chunkQ qproc::ChunkQuerySpec const* sPtr = &chunkQuerySpec; while (sPtr) { LOGS(_log, LOG_LVL_TRACE, "nextFragment"); - for (unsigned int t = 0; t < (sPtr->queries).size(); t++) { // &&& del loop - LOGS(_log, LOG_LVL_DEBUG, __func__ << " q=" << (sPtr->queries).at(t)); - } - /* &&& - for (auto const& sbi : sPtr->subChunkIds) { // &&& del loop - LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::createVect a2a"); - LOGS(_log, LOG_LVL_DEBUG, __func__ << " sbi=" << sbi); - } - */ // Linked fragments will not have valid subChunkTables vectors, // So, we reuse the root fragment's vector. 
_addFragment(*jFragments, chunkQuerySpec.subChunkTables, sPtr->subChunkIds, sPtr->queries, @@ -469,9 +419,6 @@ JobFragment::VectPtr JobFragment::createVect(qproc::ChunkQuerySpec const& chunkQ } } else { LOGS(_log, LOG_LVL_TRACE, "no nextFragment"); - for (unsigned int t = 0; t < (chunkQuerySpec.queries).size(); t++) { // &&& del loop - LOGS(_log, LOG_LVL_TRACE, (chunkQuerySpec.queries).at(t)); - } _addFragment(*jFragments, chunkQuerySpec.subChunkTables, chunkQuerySpec.subChunkIds, chunkQuerySpec.queries, jobSubQueryTempMap, jobDbTablesMap); } @@ -480,7 +427,6 @@ JobFragment::VectPtr JobFragment::createVect(qproc::ChunkQuerySpec const& chunkQ return jFragments; } -//&&&void JobFragment::_addFragment(std::vector& jFragments, std::string const& resultTblName, void JobFragment::_addFragment(std::vector& jFragments, DbTableSet const& subChunkTables, std::vector const& subchunkIds, std::vector const& queries, JobSubQueryTempMap::Ptr const& subQueryTemplates, diff --git a/src/protojson/UberJobMsg.h b/src/protojson/UberJobMsg.h index 9203e74ae..73c6276d9 100644 --- a/src/protojson/UberJobMsg.h +++ b/src/protojson/UberJobMsg.h @@ -143,18 +143,6 @@ class JobFragment { JobFragment() = delete; JobFragment(JobFragment const&) = delete; - /* &&& - static VectPtr createVect(qproc::ChunkQuerySpec const& chunkQuerySpec, - JobSubQueryTempMap::Ptr const& jobSubQueryTempMap, - JobDbTablesMap::Ptr const& dbTablesMap, std::string const& resultTblName); - - /// &&& doc - static VectPtr createVectFromJson(nlohmann::json const& ujJson, - JobSubQueryTempMap::Ptr const& jobSubQueryTempMap, - JobDbTablesMap::Ptr const& dbTablesMap, - std::string const& resultTblName); - */ - static VectPtr createVect(qproc::ChunkQuerySpec const& chunkQuerySpec, JobSubQueryTempMap::Ptr const& jobSubQueryTempMap, JobDbTablesMap::Ptr const& dbTablesMap); @@ -170,15 +158,10 @@ class JobFragment { std::vector const& getJobSubQueryTempIndexes() const { return _jobSubQueryTempIndexes; } std::vector const& getJobDbTablesIndexes() const { return _jobDbTablesIndexes; } std::vector const& getSubchunkIds() const { return _subchunkIds; } - //&&&std::string const& getResultTblName() const { return _resultTblName; } std::string dump() const; private: - /* &&& - JobFragment(JobSubQueryTempMap::Ptr const& subQueryTemplates, JobDbTablesMap::Ptr const& dbTablesMap, - std::string const& resultTblName); - */ JobFragment(JobSubQueryTempMap::Ptr const& subQueryTemplates, JobDbTablesMap::Ptr const& dbTablesMap); /// &&& doc @@ -187,14 +170,6 @@ class JobFragment { JobSubQueryTempMap::Ptr const& subQueryTemplates, JobDbTablesMap::Ptr const& dbTablesMap); - /* &&& - static void _addFragment(std::vector& jFragments, std::string const& resultTblName, - DbTableSet const& subChunkTables, std::vector const& subchunkIds, - std::vector const& queries, - JobSubQueryTempMap::Ptr const& subQueryTemplates, - JobDbTablesMap::Ptr const& dbTablesMap); - */ - JobSubQueryTempMap::Ptr _jobSubQueryTempMap; ///< &&& doc std::vector _jobSubQueryTempIndexes; ///< &&& doc @@ -202,11 +177,6 @@ class JobFragment { std::vector _jobDbTablesIndexes; ///< &&& doc std::vector _subchunkIds; ///< &&& doc - - /* &&& - std::string _resultTblName; ///< &&& doc &&& probably not needed here. Replace with - ///< JobMsg::_chunkResultName field. 
- */ }; /// This class is used to store the information for a single Job (the queries and metadata @@ -239,7 +209,6 @@ class JobMsg { int getScanRating() const { return _scanRating; } bool getScanInteractive() const { return _scanInteractive; } int getChunkId() const { return _chunkId; } - std::string getChunkResultName() const { return _chunkResultName; } std::vector const& getChunkScanTableIndexes() const { return _chunkScanTableIndexes; } @@ -251,7 +220,7 @@ class JobMsg { JobMsg(JobSubQueryTempMap::Ptr const& jobSubQueryTempMap, JobDbTablesMap::Ptr const& jobDbTablesMap, JobId jobId, int attemptCount, std::string const& chunkQuerySpecDb, int scanRating, - bool scanInteractive, int chunkId, std::string const& chunkResultName); + bool scanInteractive, int chunkId); JobId _jobId; int _attemptCount; @@ -259,7 +228,6 @@ class JobMsg { int _scanRating; bool _scanInteractive; int _chunkId; - std::string _chunkResultName; JobFragment::VectPtr _jobFragments{new JobFragment::Vect()}; JobSubQueryTempMap::Ptr _jobSubQueryTempMap; ///< Map of all query templates related to this UberJob. @@ -311,15 +279,11 @@ class UberJobMsg : public std::enable_shared_from_this { ScanInfo::Ptr getScanInfo() const { return _scanInfo; } - std::string dump() const; - private: UberJobMsg(unsigned int metaVersion, std::string const& replicationInstanceId, - std::string const& replicationAuthKey, - //&&&CzarContactInfo::Ptr const& czInfo, WorkerContactInfo::Ptr const& wInfo, - CzarContactInfo::Ptr const& czInfo, std::string const& workerId, QueryId qId, UberJobId ujId, - int rowLimit, int maxTableSizeMB, ScanInfo::Ptr const& scanInfo_, - std::vector> const& jobs); + std::string const& replicationAuthKey, CzarContactInfo::Ptr const& czInfo, + std::string const& workerId, QueryId qId, UberJobId ujId, int rowLimit, int maxTableSizeMB, + ScanInfo::Ptr const& scanInfo_, std::vector> const& jobs); unsigned int _metaVersion; // "version", http::MetaModule::version // czar @@ -327,22 +291,10 @@ class UberJobMsg : public std::enable_shared_from_this { std::string _replicationAuthKey; //"auth_key", czarConfig->replicationAuthKey() CzarContactInfo::Ptr _czInfo; std::string _workerId; // "worker", ciwId - //&&&WorkerContactInfo::Ptr _wInfo; // &&& probably not needed - // &&& {"czarinfo", - //&&&std::string _czarName; // "name", czarConfig->name() - //&&&qmeta::czarId _czarId; // "id", czarConfig->id() - //&&&uint16_t _czarManagementPort; // "management-port", czarConfig->replicationHttpPort() - //&&&std::string _czarManagementHostName; // "management-host-name", util::get_current_host_fqdn() - // &&& } - // &&&{"uberjob", - QueryId _qId; // "queryid", _queryId - UberJobId _ujId; // "uberjobid", _uberJobId - //&&& CzarIdType _czarId; // "czarid", _czarId - int _rowLimit; // "rowlimit", _rowLimit - int _maxTableSizeMB; // - - //&&&std::vector> _jobs; // &&& needs to be replaced with jobData - // &&& }; + QueryId _qId; // "queryid", _queryId + UberJobId _ujId; // "uberjobid", _uberJobId + int _rowLimit; // "rowlimit", _rowLimit + int _maxTableSizeMB; // /// Map of all query templates related to this UberJob. JobSubQueryTempMap::Ptr _jobSubQueryTempMap{JobSubQueryTempMap::create()}; @@ -353,7 +305,7 @@ class UberJobMsg : public std::enable_shared_from_this { /// List of all job data in this UberJob. 
"jobs", json::array() JobMsg::VectPtr _jobMsgVect{new JobMsg::Vect()}; - ScanInfo::Ptr _scanInfo{ScanInfo::create()}; ///< &&& NEED to add to serialize and createFromJson + ScanInfo::Ptr _scanInfo{ScanInfo::create()}; ///< &&& doc }; } // namespace lsst::qserv::protojson diff --git a/src/protojson/testUberJobMsg.cc b/src/protojson/testUberJobMsg.cc index e0b056422..32412e865 100644 --- a/src/protojson/testUberJobMsg.cc +++ b/src/protojson/testUberJobMsg.cc @@ -100,29 +100,13 @@ BOOST_AUTO_TEST_CASE(WorkerQueryStatusData) { string const replicationInstanceId = "repliInstId"; string const replicationAuthKey = "repliIAuthKey"; - //&&&uint64_t cxrStartTime = lsst::qserv::millisecSinceEpoch(lsst::qserv::CLOCK::now() - 5s); - //&&&uint64_t wkrStartTime = lsst::qserv::millisecSinceEpoch(lsst::qserv::CLOCK::now() - 10s); - - /* &&& - string const czrName("czar_name"); - lsst::qserv::CzarIdType const czrId = 32; - int czrPort = 2022; - string const czrHost("cz_host"); - LOGS(_log, LOG_LVL_WARN, "&&& testUJM a"); - auto czarA = - lsst::qserv::protojson::CzarContactInfo::create(czrName, czrId, czrPort, czrHost, - cxrStartTime); - - - LOGS(_log, LOG_LVL_WARN, "&&& testUJM b"); + LOGS(_log, LOG_LVL_INFO, "testUJM start"); string jsStr = testA(); nlohmann::json js = nlohmann::json::parse(jsStr); UberJobMsg::Ptr ujm = UberJobMsg::createFromJson(js); BOOST_REQUIRE(ujm != nullptr); - LOGS(_log, LOG_LVL_WARN, "&&& testUJM c"); nlohmann::json jsUjm = ujm->serializeJson(); - LOGS(_log, LOG_LVL_WARN, "&&& testUJM d"); LOGS(_log, LOG_LVL_INFO, "js=" << js); LOGS(_log, LOG_LVL_INFO, "jsUjm=" << jsUjm); @@ -138,7 +122,6 @@ BOOST_AUTO_TEST_CASE(WorkerQueryStatusData) { LOGS(_log, LOG_LVL_ERROR, "jsUjmCreated=" << jsUjmCreated); } BOOST_REQUIRE(createdMatchesOriginal); - */ BOOST_REQUIRE(parseSerializeReparseCheck(testA(), "A")); BOOST_REQUIRE(parseSerializeReparseCheck(testB(), "B")); diff --git a/src/qdisp/Executive.cc b/src/qdisp/Executive.cc index 3a709d5b2..983a0bf94 100644 --- a/src/qdisp/Executive.cc +++ b/src/qdisp/Executive.cc @@ -235,6 +235,7 @@ void Executive::queueFileCollect(util::PriorityCommand::Ptr const& cmd) { } } +/* &&& void Executive::queueUberJob(std::shared_ptr const& uberJob) { LOGS(_log, LOG_LVL_WARN, cName(__func__) << " &&&uj queueUberJob"); auto runUberJobFunc = [uberJob](util::CmdData*) { uberJob->runUberJob(); }; @@ -247,6 +248,27 @@ void Executive::queueUberJob(std::shared_ptr const& uberJob) { _qdispPool->queCmd(cmd, 1); } } +*/ + +void Executive::addAndQueueUberJob(shared_ptr const& uj) { + { + lock_guard lck(_uberJobsMapMtx); + UberJobId ujId = uj->getJobId(); + _uberJobsMap[ujId] = uj; + //&&&uj->setAdded(); + LOGS(_log, LOG_LVL_DEBUG, cName(__func__) << " ujId=" << ujId << " uj.sz=" << uj->getJobCount()); + } + + auto runUberJobFunc = [uj](util::CmdData*) { uj->runUberJob(); }; + + auto cmd = util::PriorityCommand::Ptr(new util::PriorityCommand(runUberJobFunc)); + _jobStartCmdList.push_back(cmd); + if (_scanInteractive) { + _qdispPool->queCmd(cmd, 0); + } else { + _qdispPool->queCmd(cmd, 1); + } +} void Executive::waitForAllJobsToStart() { LOGS(_log, LOG_LVL_INFO, "waitForAllJobsToStart"); @@ -273,15 +295,6 @@ Executive::ChunkIdJobMapType Executive::unassignedChunksInQuery() { return unassignedMap; } -void Executive::addUberJobs(std::vector> const& uJobsToAdd) { - lock_guard lck(_uberJobsMapMtx); - for (auto const& uJob : uJobsToAdd) { - UberJobId ujId = uJob->getJobId(); - _uberJobsMap[ujId] = uJob; - LOGS(_log, LOG_LVL_DEBUG, cName(__func__) << " ujId=" << ujId << " uj.sz=" << 
uJob->getJobCount()); - } -} - string Executive::dumpUberJobCounts() const { stringstream os; os << "exec=" << getIdStr(); diff --git a/src/qdisp/Executive.h b/src/qdisp/Executive.h index c9bcba0ff..c2cef1a34 100644 --- a/src/qdisp/Executive.h +++ b/src/qdisp/Executive.h @@ -129,8 +129,8 @@ class Executive : public std::enable_shared_from_this { /// Add an item with a reference number std::shared_ptr add(JobDescription::Ptr const& s); - // Queue `uberJob` to be run using the QDispPool. - void queueUberJob(std::shared_ptr const& uberJob); + /// Add the UberJob `uj` to the list and queue it to be sent to a worker. + void addAndQueueUberJob(std::shared_ptr const& uj); /// Queue `cmd`, using the QDispPool, so it can be used to collect the result file. void queueFileCollect(std::shared_ptr const& cmd); @@ -197,9 +197,6 @@ class Executive : public std::enable_shared_from_this { /// @see python module lsst.qserv.czar.proxy.unlock() void updateProxyMessages(); - /// Add UbjerJobs to this user query. - void addUberJobs(std::vector> const& jobsToAdd); - /// Call UserQuerySelect::buildAndSendUberJobs make new UberJobs for /// unassigned jobs. virtual void assignJobsToUberJobs(); @@ -345,7 +342,7 @@ class Executive : public std::enable_shared_from_this { /// Flag that is set to true when ready to create and run UberJobs. std::atomic _readyToExecute{false}; - protojson::ScanInfo::Ptr _scanInfo; ///< &&& doc + protojson::ScanInfo::Ptr _scanInfo; ///< Scan rating and tables. }; } // namespace qdisp diff --git a/src/qdisp/testQDisp.cc b/src/qdisp/testQDisp.cc index c0759ce60..005e7d934 100644 --- a/src/qdisp/testQDisp.cc +++ b/src/qdisp/testQDisp.cc @@ -182,10 +182,8 @@ class ExecutiveUT : public Executive { ujVect.push_back(uJob); } - // Queue up the jobs to be run. - addUberJobs(ujVect); for (auto const& ujPtr : ujVect) { - queueUberJob(ujPtr); + addAndQueueUberJob(ujPtr); } LOGS(_log, LOG_LVL_INFO, "assignJobsToUberJobs() end"); } diff --git a/src/wbase/Task.cc b/src/wbase/Task.cc index c7efe8272..0ea23c602 100644 --- a/src/wbase/Task.cc +++ b/src/wbase/Task.cc @@ -272,33 +272,6 @@ std::vector Task::createTasksForChunk( if (fragSubchunkIds.empty()) { bool const noSubchunks = false; int const subchunkId = -1; - { - ostringstream os; - os << "&&&TEST00 "; - os << " &&&TEST"; - os << "; ujData?"; - os << "; jobId=" << jdJobId; - os << "; attemptCount=" << jdAttemptCount; - os << "; chunkId=" << jdChunkId; - os << "; fragmentNumber=" << fragmentNumber; - os << "; templateId=" << templateId; - os << "; noSubchunks=" << noSubchunks; - os << "; subchunkId=" << subchunkId; - os << "; chunkQuerySpecDb?"; - os << "; scanInfo=" << *scanInfo; - os << "; scanInteractive=" << scanInteractive; - os << "; maxTableSizeMb=" << maxTableSizeMb; - os << "; fragSubTables={"; - for (auto const& fsTbl : fragSubTables) { - os << fsTbl.db << "." 
<< fsTbl.tbl << ", "; - } - os << "}"; - os << "; fragSubchunkIds=" << util::printable(fragSubchunkIds); - os << "; sendChannel?"; - os << "; queryStats?"; - os << "; resultsHttpPort=" << resultsHttpPort; - LOGS(_log, LOG_LVL_WARN, "&&&" << os.str()); - } auto task = Task::Ptr(new Task( ujData, jdJobId, jdAttemptCount, jdChunkId, fragmentNumber, templateId, noSubchunks, subchunkId, jdQuerySpecDb, scanInfo, scanInteractive, maxTableSizeMb, @@ -308,33 +281,6 @@ std::vector Task::createTasksForChunk( } else { for (auto subchunkId : fragSubchunkIds) { bool const hasSubchunks = true; - { - ostringstream os; - os << "&&&TEST01 "; - os << " &&&TEST"; - os << "; ujData?"; - os << "; jobId=" << jdJobId; - os << "; attemptCount=" << jdAttemptCount; - os << "; chunkId=" << jdChunkId; - os << "; fragmentNumber=" << fragmentNumber; - os << "; templateId=" << templateId; - os << "; noSubchunks=" << hasSubchunks; - os << "; subchunkId=" << subchunkId; - os << "; chunkQuerySpecDb?"; - os << "; scanInfo=" << *scanInfo; - os << "; scanInteractive=" << scanInteractive; - os << "; maxTableSizeMb=" << maxTableSizeMb; - os << "; fragSubTables={"; - for (auto const& fsTbl : fragSubTables) { - os << fsTbl.db << "." << fsTbl.tbl << ", "; - } - os << "}"; - os << "; fragSubchunkIds=" << util::printable(fragSubchunkIds); - os << "; sendChannel?"; - os << "; queryStats?"; - os << "; resultsHttpPort=" << resultsHttpPort; - LOGS(_log, LOG_LVL_WARN, "&&&" << os.str()); - } auto task = Task::Ptr(new Task(ujData, jdJobId, jdAttemptCount, jdChunkId, fragmentNumber, templateId, hasSubchunks, subchunkId, jdQuerySpecDb, scanInfo, scanInteractive, @@ -398,7 +344,6 @@ std::vector Task::createTasksFromUberJobMsg( std::string chunkQuerySpecDb = jobMsg->getChunkQuerySpecDb(); bool scanInteractive = jobMsg->getScanInteractive(); int chunkId = jobMsg->getChunkId(); - std::string chunkResultName = jobMsg->getChunkResultName(); std::vector chunkScanTableIndexes = jobMsg->getChunkScanTableIndexes(); auto jobFragments = jobMsg->getJobFragments(); @@ -459,23 +404,14 @@ std::vector Task::createTasksFromUberJobMsg( return vect; } -//&&& std::vector Task::createTasksForUnitTest( std::shared_ptr const& ujData, nlohmann::json const& jsJobs, std::shared_ptr const& sendChannel, protojson::ScanInfo::Ptr const& scanInfo, bool scanInteractive, int maxTableSizeMb, - std::shared_ptr const& chunkResourceMgr - //&&&mysql::MySqlConfig const& mySqlConfig, std::shared_ptr const& sqlConnMgr, - //&&&std::shared_ptr const& queriesAndChunks, - //&&&uint16_t resultsHttpPort = 8080) { -) { + std::shared_ptr const& chunkResourceMgr) { QueryId qId = ujData->getQueryId(); UberJobId ujId = ujData->getUberJobId(); CzarIdType czId = ujData->getCzarId(); - - //&&& wpublish::QueryStatistics::Ptr queryStats = queriesAndChunks->addQueryId(qId, czId); - //&&& UserQueryInfo::Ptr userQueryInfo = queryStats->getUserQueryInfo(); - string funcN(__func__); funcN += " QID=" + to_string(qId) + " "; @@ -535,16 +471,9 @@ std::vector Task::createTasksForUnitTest( } for (string const& fragSubQ : fragSubQueries) { - //&&&size_t templateId = userQueryInfo->addTemplate(fragSubQ); if (fragSubchunkIds.empty()) { bool const noSubchunks = false; int const subchunkId = -1; - /* &&& - auto task = Task::Ptr(new Task( - ujData, jdJobId, jdAttemptCount, jdChunkId, fragmentNumber, templateId, - noSubchunks, subchunkId, jdQuerySpecDb, scanInfo, scanInteractive, maxTableSizeMb, - fragSubTables, fragSubchunkIds, sendChannel, queryStats, resultsHttpPort)); - */ auto task = Task::Ptr(new Task(ujData, 
jdJobId, jdAttemptCount, jdChunkId, fragmentNumber, 0, noSubchunks, subchunkId, jdQuerySpecDb, scanInfo, scanInteractive, maxTableSizeMb, fragSubTables, @@ -554,13 +483,6 @@ std::vector Task::createTasksForUnitTest( } else { for (auto subchunkId : fragSubchunkIds) { bool const hasSubchunks = true; - /* &&& - auto task = Task::Ptr(new Task(ujData, jdJobId, jdAttemptCount, jdChunkId, - fragmentNumber, templateId, hasSubchunks, subchunkId, - jdQuerySpecDb, scanInfo, scanInteractive, - maxTableSizeMb, fragSubTables, fragSubchunkIds, - sendChannel, queryStats, resultsHttpPort)); - */ auto task = Task::Ptr(new Task( ujData, jdJobId, jdAttemptCount, jdChunkId, fragmentNumber, 0, hasSubchunks, subchunkId, jdQuerySpecDb, scanInfo, scanInteractive, maxTableSizeMb, @@ -574,13 +496,6 @@ std::vector Task::createTasksForUnitTest( } } - /* &&& shouldn't need this - for (auto taskPtr : vect) { - // newQueryRunner sets the `_taskQueryRunner` pointer in `task`. - taskPtr->setTaskQueryRunner(wdb::QueryRunner::newQueryRunner(taskPtr, chunkResourceMgr, mySqlConfig, - sqlConnMgr, queriesAndChunks)); - } - */ return vect; } From 816da2530323ed6305f62e94781a0506df6cc156 Mon Sep 17 00:00:00 2001 From: John Gates Date: Mon, 16 Dec 2024 12:20:50 -0800 Subject: [PATCH 20/22] Removed TaskMsgFactory. --- .../templates/http/etc/qserv-czar.cnf.jinja | 2 - .../templates/proxy/etc/qserv-czar.cnf.jinja | 2 - src/cconfig/CzarConfig.h | 5 + src/ccontrol/UserQuerySelect.cc | 24 +-- src/czar/Czar.cc | 3 +- src/czar/Czar.h | 4 +- src/czar/CzarChunkMap.cc | 5 +- src/protojson/UberJobMsg.cc | 53 ++---- src/protojson/UberJobMsg.h | 4 +- src/qdisp/JobDescription.cc | 12 +- src/qdisp/JobDescription.h | 13 +- src/qdisp/JobQuery.cc | 2 +- src/qdisp/UberJob.cc | 47 +---- src/qdisp/testQDisp.cc | 25 +-- src/qproc/CMakeLists.txt | 1 - src/qproc/ChunkSpec.cc | 14 +- src/qproc/ChunkSpec.h | 2 + src/qproc/QuerySession.cc | 2 +- src/qproc/TaskMsgFactory.cc | 160 ------------------ src/qproc/TaskMsgFactory.h | 74 -------- src/wbase/Task.cc | 2 + src/wbase/Task.h | 2 + src/wdb/testQueryRunner.cc | 10 +- src/xrdsvc/HttpWorkerCzarModule.cc | 113 ------------- 24 files changed, 62 insertions(+), 519 deletions(-) delete mode 100644 src/qproc/TaskMsgFactory.cc delete mode 100644 src/qproc/TaskMsgFactory.h diff --git a/src/admin/templates/http/etc/qserv-czar.cnf.jinja b/src/admin/templates/http/etc/qserv-czar.cnf.jinja index 4f70f5b9c..92bd36c47 100644 --- a/src/admin/templates/http/etc/qserv-czar.cnf.jinja +++ b/src/admin/templates/http/etc/qserv-czar.cnf.jinja @@ -109,8 +109,6 @@ largestPriority = 3 vectRunSizes = 50:50:50:50 # Minimum number of threads running for each queue. No spaces. Values separated by ':' vectMinRunningSizes = 0:1:3:3 -# Maximum number of QueryRequests allowed to be running at one time. -qReqPseudoFifoMaxRunning = 299 [replication] diff --git a/src/admin/templates/proxy/etc/qserv-czar.cnf.jinja b/src/admin/templates/proxy/etc/qserv-czar.cnf.jinja index 26e13346f..d2cfd205e 100644 --- a/src/admin/templates/proxy/etc/qserv-czar.cnf.jinja +++ b/src/admin/templates/proxy/etc/qserv-czar.cnf.jinja @@ -111,8 +111,6 @@ largestPriority = 3 vectRunSizes = 800:800:500:500 # Minimum number of threads running for each queue. No spaces. Values separated by ':' vectMinRunningSizes = 0:3:3:3 -# Maximum number of QueryRequests allowed to be running at one time. &&& unused?? 
-qReqPseudoFifoMaxRunning = 299 [replication] diff --git a/src/cconfig/CzarConfig.h b/src/cconfig/CzarConfig.h index 9b6096531..b4d51fc7c 100644 --- a/src/cconfig/CzarConfig.h +++ b/src/cconfig/CzarConfig.h @@ -218,6 +218,9 @@ class CzarConfig { /// Return the maximum number of http connections to use for czar commands. int getCommandMaxHttpConnections() const { return _commandMaxHttpConnections->getVal(); } + /// Return the sleep time (in milliseconds) between messages sent to active workers. + int getMonitorSleepTimeMilliSec() const { return _monitorSleepTimeMilliSec->getVal(); } + // Parameters of the Czar management service std::string const& replicationInstanceId() const { return _replicationInstanceId->getVal(); } @@ -413,6 +416,8 @@ class CzarConfig { util::ConfigValTInt::create(_configValMap, "activeworker", "timeoutDeadSecs", notReq, 60 * 10); CVTIntPtr _activeWorkerMaxLifetimeSecs = // 1hr util::ConfigValTInt::create(_configValMap, "activeworker", "maxLifetimeSecs", notReq, 60 * 60); + CVTIntPtr _monitorSleepTimeMilliSec = + util::ConfigValTInt::create(_configValMap, "activeworker", "monitorSleepTimeMilliSec", notReq, 15'000); // UberJobs CVTIntPtr _uberJobMaxChunks = diff --git a/src/ccontrol/UserQuerySelect.cc b/src/ccontrol/UserQuerySelect.cc index e4394c725..02ac3cebd 100644 --- a/src/ccontrol/UserQuerySelect.cc +++ b/src/ccontrol/UserQuerySelect.cc @@ -93,7 +93,6 @@ #include "qproc/geomAdapter.h" #include "qproc/IndexMap.h" #include "qproc/QuerySession.h" -#include "qproc/TaskMsgFactory.h" #include "query/ColumnRef.h" #include "query/FromList.h" #include "query/JoinRef.h" @@ -241,7 +240,6 @@ void UserQuerySelect::submit() { LOGS(_log, LOG_LVL_DEBUG, "UserQuerySelect beginning submission"); assert(_infileMerger); - auto taskMsgFactory = std::make_shared(); _ttn = std::make_shared(_qMetaQueryId, _qSession->getOriginal()); std::vector chunks; std::mutex chunksMtx; @@ -300,8 +298,7 @@ void UserQuerySelect::submit() { ru.setAsDbChunk(cs->db, cs->chunkId); qdisp::JobDescription::Ptr jobDesc = qdisp::JobDescription::create( _qMetaCzarId, _executive->getId(), sequence, ru, - std::make_shared(_infileMerger, chunkResultName), taskMsgFactory, cs, - chunkResultName); + std::make_shared(_infileMerger, chunkResultName), cs, chunkResultName); auto job = _executive->add(jobDesc); ++sequence; } @@ -327,13 +324,11 @@ void UserQuerySelect::submit() { } util::HistogramRolling histoBuildAndS("&&&uj histoBuildAndS", {0.1, 1.0, 10.0, 100.0, 1000.0}, 1h, 10000); -util::HistogramRolling histoBuildAndS1("&&&uj histoBuildAndS1", {0.1, 1.0, 10.0, 100.0, 1000.0}, 1h, 10000); void UserQuerySelect::buildAndSendUberJobs() { // TODO:UJ Is special handling needed for the dummy chunk, 1234567890 ? string const funcN("UserQuerySelect::" + string(__func__) + " QID=" + to_string(_qMetaQueryId)); LOGS(_log, LOG_LVL_DEBUG, funcN << " start " << _uberJobMaxChunks); - LOGS(_log, LOG_LVL_WARN, funcN << " &&&uj start " << _uberJobMaxChunks); // Ensure `_monitor()` doesn't do anything until everything is ready. if (!_executive->isReadyToExecute()) { @@ -342,17 +337,14 @@ void UserQuerySelect::buildAndSendUberJobs() { } // Only one thread should be generating UberJobs for this user query at any given time. 
- util::InstanceCount ica("UserQuerySelect::buildAndSendUberJobs&&&_beforelock"); lock_guard fcLock(_buildUberJobMtx); - util::InstanceCount icb("UserQuerySelect::buildAndSendUberJobs&&&_afterlock"); LOGS(_log, LOG_LVL_DEBUG, "UserQuerySelect::" << __func__ << " totalJobs=" << _executive->getTotalJobs()); vector uberJobs; qdisp::Executive::ChunkIdJobMapType unassignedChunksInQuery = _executive->unassignedChunksInQuery(); if (unassignedChunksInQuery.empty()) { - LOGS(_log, LOG_LVL_TRACE, funcN << " no unassigned Jobs"); - LOGS(_log, LOG_LVL_WARN, funcN << " &&&uj no unassigned Jobs"); + LOGS(_log, LOG_LVL_DEBUG, funcN << " no unassigned Jobs"); return; } @@ -396,7 +388,6 @@ void UserQuerySelect::buildAndSendUberJobs() { qdisp::UberJob::Ptr uberJobPtr; protojson::WorkerContactInfo::Ptr wInf; }; - //&&& map> workerJobMap; map workerJobMap; vector missingChunks; @@ -406,14 +397,18 @@ void UserQuerySelect::buildAndSendUberJobs() { // Numerical order keeps the number of partially complete UberJobs running on a worker to a minimum, // and should minimize the time for the first UberJob on the worker to complete. for (auto const& [chunkId, jqPtr] : unassignedChunksInQuery) { + + bool const increaseAttemptCount = true; + jqPtr->getDescription()->incrAttemptCount(_executive, increaseAttemptCount); + // If too many workers are down, there will be a chunk that cannot be found. // Just continuing should leave jobs `unassigned` with their attempt count // increased. Either the chunk will be found and jobs assigned, or the jobs' // attempt count will reach max and the query will be cancelled auto lambdaMissingChunk = [&](string const& msg) { missingChunks.push_back(chunkId); - bool const increaseAttemptCount = true; - jqPtr->getDescription()->incrAttemptCountScrubResultsJson(_executive, increaseAttemptCount); + //&&&bool const increaseAttemptCount = true; + //&&&jqPtr->getDescription()->incrAttemptCountScrubResultsJson(_executive, increaseAttemptCount); LOGS(_log, LOG_LVL_ERROR, msg); }; @@ -454,7 +449,7 @@ void UserQuerySelect::buildAndSendUberJobs() { WInfoAndUJPtr::Ptr& wInfUJ = workerJobMap[workerId]; if (wInfUJ == nullptr) { wInfUJ = make_shared(); - auto iter = wContactMap->find(workerId); //&&&auto iter = wContactMap->find(wIdKey); + auto iter = wContactMap->find(workerId); if (iter == wContactMap->end()) { // TODO:UJ Not appropriate to throw for this. Need to re-direct all jobs to different workers. // Also, this really shouldn't happen, but crashing the czar is probably a bad idea, @@ -509,7 +504,6 @@ void UserQuerySelect::buildAndSendUberJobs() { } LOGS(_log, LOG_LVL_DEBUG, funcN << " " << _executive->dumpUberJobCounts()); - LOGS(_log, LOG_LVL_WARN, funcN << " &&&uj " << _executive->dumpUberJobCounts()); } /// Block until a submit()'ed query completes. diff --git a/src/czar/Czar.cc b/src/czar/Czar.cc index fe62f34e3..0ec244993 100644 --- a/src/czar/Czar.cc +++ b/src/czar/Czar.cc @@ -159,6 +159,7 @@ Czar::Czar(string const& configFilePath, string const& czarName) _idCounter(), _uqFactory(), _clientToQuery(), + _monitorSleepTime (_czarConfig->getMonitorSleepTimeMilliSec()), _activeWorkerMap(new ActiveWorkerMap(_czarConfig)) { // set id counter to milliseconds since the epoch, mod 1 year. struct timeval tv; @@ -177,7 +178,7 @@ Czar::Czar(string const& configFilePath, string const& czarName) _czarConfig->setId(_uqFactory->userQuerySharedResources()->qMetaCzarId); // Tell workers to cancel any queries that were submitted before this restart of Czar. 
- // Figure out which query (if any) was recorded in Czar database before the restart. + // Figure out which query (if any) was recorded in Czar databases before the restart. // The id will be used as the high-watermark for queries that need to be cancelled. // All queries that have identifiers that are strictly less than this one will // be affected by the operation. diff --git a/src/czar/Czar.h b/src/czar/Czar.h index 408df5b10..b563c03f6 100644 --- a/src/czar/Czar.h +++ b/src/czar/Czar.h @@ -233,8 +233,8 @@ class Czar { /// Set to false on system shutdown to stop _monitorThrd. std::atomic _monitorLoop{true}; - /// Wait time between checks. TODO:UJ set from config - std::chrono::milliseconds _monitorSleepTime{15'000}; // &&& config + /// Wait time between checks. + std::chrono::milliseconds _monitorSleepTime; /// Keeps track of all workers (alive or otherwise) that this czar /// may communicate with. Once created, the pointer never changes. diff --git a/src/czar/CzarChunkMap.cc b/src/czar/CzarChunkMap.cc index 17dc0c277..82d8fd1e8 100644 --- a/src/czar/CzarChunkMap.cc +++ b/src/czar/CzarChunkMap.cc @@ -333,16 +333,13 @@ bool CzarFamilyMap::_read() { LOGS(_log, LOG_LVL_TRACE, "CzarFamilyMap::_read() start"); // If replacing the map, this may take a bit of time, but it's probably // better to wait for new maps if something changed. - std::lock_guard gLock(_familyMapMtx); + std::lock_guard gLock(_familyMapMtx); // &&& check waiting is really needed qmeta::QMetaChunkMap qChunkMap = _qmeta->getChunkMap(_lastUpdateTime); if (_lastUpdateTime == qChunkMap.updateTime) { LOGS(_log, LOG_LVL_DEBUG, cName(__func__) << " no need to read " << util::TimeUtils::timePointToDateTimeString(_lastUpdateTime) << " db=" << util::TimeUtils::timePointToDateTimeString(qChunkMap.updateTime)); - // &&& Should a flag be set here to alter worker aliveness check as nothing has changed? TODO:UJ - // &&& Reason being that a brief loss of the registry could leave all workers marked as dead, when - // &&& they are still alive. return false; } diff --git a/src/protojson/UberJobMsg.cc b/src/protojson/UberJobMsg.cc index f12f37fe2..65564cdf4 100644 --- a/src/protojson/UberJobMsg.cc +++ b/src/protojson/UberJobMsg.cc @@ -61,19 +61,15 @@ UberJobMsg::UberJobMsg(unsigned int metaVersion, std::string const& replicationI _rowLimit(rowLimit), _maxTableSizeMB(maxTableSizeMB), _scanInfo(scanInfo_) { - LOGS(_log, LOG_LVL_WARN, "&&& UberJobMsg::UberJobMsg start"); for (auto& jobPtr : jobs) { // This creates the JobMsg objects for all related jobs and their fragments.
auto jobMsg = JobMsg::create(jobPtr, _jobSubQueryTempMap, _jobDbTablesMap); _jobMsgVect->push_back(jobMsg); } - LOGS(_log, LOG_LVL_WARN, "&&& UberJobMsg::UberJobMsg end"); } json UberJobMsg::serializeJson() const { - LOGS(_log, LOG_LVL_WARN, "&&& UberJobMsg::serializeJson a"); - json ujmJson = {{"version", _metaVersion}, {"instance_id", _replicationInstanceId}, {"auth_key", _replicationAuthKey}, @@ -98,7 +94,7 @@ json UberJobMsg::serializeJson() const { } UberJobMsg::Ptr UberJobMsg::createFromJson(nlohmann::json const& ujmJson) { - LOGS(_log, LOG_LVL_DEBUG, "UberJobMsg::createFromJson ujmJson=" << ujmJson); + LOGS(_log, LOG_LVL_TRACE, "UberJobMsg::createFromJson ujmJson=" << ujmJson); try { if (ujmJson["version"] != http::MetaModule::version) { LOGS(_log, LOG_LVL_ERROR, "UberJobMsg::createFromJson bad version " << ujmJson["version"]); @@ -145,13 +141,10 @@ UberJobMsg::Ptr UberJobMsg::createFromJson(nlohmann::json const& ujmJson) { JobMsg::createFromJson(jsUjJob, ujmPtr->_jobSubQueryTempMap, ujmPtr->_jobDbTablesMap); ujmPtr->_jobMsgVect->push_back(jobMsgPtr); } - LOGS(_log, LOG_LVL_DEBUG, "UberJobMsg::createFromJson end"); - return ujmPtr; } catch (invalid_argument const& exc) { LOGS(_log, LOG_LVL_ERROR, "UberJobMsg::createFromJson invalid " << exc.what() << " json=" << ujmJson); } - LOGS(_log, LOG_LVL_DEBUG, "UberJobMsg::createFromJson end error"); return nullptr; } @@ -171,13 +164,13 @@ JobMsg::JobMsg(std::shared_ptr const& jobPtr, } auto chunkQuerySpec = descr->getChunkQuerySpec(); _jobId = descr->id(); - _attemptCount = descr->getAttemptCount(); // &&& may need to increment descr->AttemptCount at this time. + _attemptCount = descr->getAttemptCount(); _chunkQuerySpecDb = chunkQuerySpec->db; _scanRating = chunkQuerySpec->scanInfo->scanRating; _scanInteractive = chunkQuerySpec->scanInteractive; _chunkId = chunkQuerySpec->chunkId; - // Add scan tables (&&& not sure is this is the same for all jobs or not) + // Add scan tables (TODO:UJ Verify this is the same for all jobs.) 
for (auto const& sTbl : chunkQuerySpec->scanInfo->infoTables) { int index = jobDbTablesMap->findDbTable(make_pair(sTbl.db, sTbl.table)); jobDbTablesMap->setScanRating(index, sTbl.scanRating, sTbl.lockInMemory); @@ -186,11 +179,9 @@ JobMsg::JobMsg(std::shared_ptr const& jobPtr, // Add fragments _jobFragments = JobFragment::createVect(*chunkQuerySpec, jobSubQueryTempMap, jobDbTablesMap); - LOGS(_log, LOG_LVL_WARN, "&&& JobMsg::JobMsg end"); } nlohmann::json JobMsg::serializeJson() const { - LOGS(_log, LOG_LVL_WARN, "&&& JobMsg::serializeJson a"); auto jsJobMsg = nlohmann::json({{"jobId", _jobId}, {"attemptCount", _attemptCount}, {"querySpecDb", _chunkQuerySpecDb}, @@ -212,7 +203,6 @@ nlohmann::json JobMsg::serializeJson() const { jsqFrags.emplace_back(jFrag->serializeJson()); } - LOGS(_log, LOG_LVL_WARN, "&&& JobMsg::serializeJson end"); return jsJobMsg; } @@ -231,7 +221,6 @@ JobMsg::JobMsg(JobSubQueryTempMap::Ptr const& jobSubQueryTempMap, JobDbTablesMap JobMsg::Ptr JobMsg::createFromJson(nlohmann::json const& ujJson, JobSubQueryTempMap::Ptr const& jobSubQueryTempMap, JobDbTablesMap::Ptr const& jobDbTablesMap) { - LOGS(_log, LOG_LVL_WARN, "&&& JobMsg::createFromJson"); JobId jobId = http::RequestBodyJSON::required(ujJson, "jobId"); int attemptCount = http::RequestBodyJSON::required(ujJson, "attemptCount"); string chunkQuerySpecDb = http::RequestBodyJSON::required(ujJson, "querySpecDb"); @@ -248,35 +237,26 @@ JobMsg::Ptr JobMsg::createFromJson(nlohmann::json const& ujJson, jMsgPtr->_jobFragments = JobFragment::createVectFromJson(jsQFrags, jMsgPtr->_jobSubQueryTempMap, jMsgPtr->_jobDbTablesMap); - LOGS(_log, LOG_LVL_WARN, "&&& JobMsg::createFromJson end"); return jMsgPtr; } json JobSubQueryTempMap::serializeJson() const { - LOGS(_log, LOG_LVL_WARN, "&&& JobSubQueryTempMap::serializeJson a"); - // std::map _qTemplateMap; json jsSubQueryTemplateMap = {{"subquerytemplate_map", json::array()}}; - - LOGS(_log, LOG_LVL_TRACE, - "&&& JobSubQueryTempMap::serializeJson jsSubQueryTemplateMap=" << jsSubQueryTemplateMap); auto& jsSqtMap = jsSubQueryTemplateMap["subquerytemplate_map"]; for (auto const& [key, templ] : _qTemplateMap) { json jsElem = {{"index", key}, {"template", templ}}; jsSqtMap.push_back(jsElem); } - LOGS(_log, LOG_LVL_TRACE, cName(__func__) << " &&& " << jsSqtMap); - - LOGS(_log, LOG_LVL_WARN, "&&& JobSubQueryTempMap::serializeJson end"); + LOGS(_log, LOG_LVL_TRACE, cName(__func__) << " " << jsSqtMap); return jsSubQueryTemplateMap; } JobSubQueryTempMap::Ptr JobSubQueryTempMap::createFromJson(nlohmann::json const& ujJson) { - LOGS(_log, LOG_LVL_WARN, "JobSubQueryTempMap::createFromJson a"); Ptr sqtMapPtr = create(); auto& sqtMap = sqtMapPtr->_qTemplateMap; - LOGS(_log, LOG_LVL_WARN, "&&& JobSubQueryTempMap::createFromJson " << ujJson); + LOGS(_log, LOG_LVL_TRACE, "JobSubQueryTempMap::createFromJson " << ujJson); auto const& jsElements = ujJson["subquerytemplate_map"]; for (auto const& jsElem : jsElements) { int index = http::RequestBodyJSON::required(jsElem, "index"); @@ -287,7 +267,6 @@ JobSubQueryTempMap::Ptr JobSubQueryTempMap::createFromJson(nlohmann::json const& " index already found in " + to_string(ujJson)); } } - LOGS(_log, LOG_LVL_WARN, "JobSubQueryTempMap::createFromJson end"); return sqtMapPtr; } @@ -336,8 +315,7 @@ json JobDbTablesMap::serializeJson() const { jsScanRatingMap.push_back(jsScanR); } - LOGS(_log, LOG_LVL_WARN, cName(__func__) << " &&& " << jsDbTablesMap); - + LOGS(_log, LOG_LVL_TRACE, cName(__func__) << " " << jsDbTablesMap); return jsDbTablesMap; } @@ -346,7 
+324,7 @@ JobDbTablesMap::Ptr JobDbTablesMap::createFromJson(nlohmann::json const& ujJson) auto& dbTblMap = dbTablesMapPtr->_dbTableMap; auto& scanRMap = dbTablesMapPtr->_scanRatingMap; - LOGS(_log, LOG_LVL_WARN, "&&& JobDbTablesMap::createFromJson " << ujJson); + LOGS(_log, LOG_LVL_TRACE, "JobDbTablesMap::createFromJson " << ujJson); json const& jsDbTbl = ujJson["dbtable_map"]; for (auto const& jsElem : jsDbTbl) { @@ -405,7 +383,6 @@ JobFragment::JobFragment(JobSubQueryTempMap::Ptr const& jobSubQueryTempMap, JobFragment::VectPtr JobFragment::createVect(qproc::ChunkQuerySpec const& chunkQuerySpec, JobSubQueryTempMap::Ptr const& jobSubQueryTempMap, JobDbTablesMap::Ptr const& jobDbTablesMap) { - LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::createVect start"); VectPtr jFragments{new Vect()}; if (chunkQuerySpec.nextFragment.get()) { qproc::ChunkQuerySpec const* sPtr = &chunkQuerySpec; @@ -423,7 +400,6 @@ JobFragment::VectPtr JobFragment::createVect(qproc::ChunkQuerySpec const& chunkQ chunkQuerySpec.queries, jobSubQueryTempMap, jobDbTablesMap); } - LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::createVect end"); return jFragments; } @@ -431,7 +407,7 @@ void JobFragment::_addFragment(std::vector& jFragments, DbTableSet const& s std::vector const& subchunkIds, std::vector const& queries, JobSubQueryTempMap::Ptr const& subQueryTemplates, JobDbTablesMap::Ptr const& dbTablesMap) { - LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::_addFragment a"); + LOGS(_log, LOG_LVL_TRACE, "JobFragment::_addFragment start"); Ptr jFrag = Ptr(new JobFragment(subQueryTemplates, dbTablesMap)); // queries: The query string is stored in `_jobSubQueryTempMap` and the list of @@ -439,7 +415,7 @@ void JobFragment::_addFragment(std::vector& jFragments, DbTableSet const& s for (auto& qry : queries) { int index = jFrag->_jobSubQueryTempMap->findSubQueryTemp(qry); jFrag->_jobSubQueryTempIndexes.push_back(index); - LOGS(_log, LOG_LVL_TRACE, jFrag->cName(__func__) << "&&& added frag=" << qry << " index=" << index); + LOGS(_log, LOG_LVL_TRACE, jFrag->cName(__func__) << " added frag=" << qry << " index=" << index); } // Add the db+table pairs to the subchunks for the fragment. 
@@ -457,7 +433,6 @@ void JobFragment::_addFragment(std::vector& jFragments, DbTableSet const& s } jFragments.push_back(move(jFrag)); - LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::_addFragment end"); } string JobFragment::dump() const { @@ -490,7 +465,7 @@ nlohmann::json JobFragment::serializeJson() const { JobFragment::VectPtr JobFragment::createVectFromJson(nlohmann::json const& jsFrags, JobSubQueryTempMap::Ptr const& jobSubQueryTempMap, JobDbTablesMap::Ptr const& dbTablesMap) { - LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::createVectFromJson " << jsFrags); + LOGS(_log, LOG_LVL_TRACE, "JobFragment::createVectFromJson " << jsFrags); JobFragment::VectPtr jobFragments{new JobFragment::Vect()}; @@ -501,7 +476,7 @@ JobFragment::VectPtr JobFragment::createVectFromJson(nlohmann::json const& jsFra for (int j : jobFrag->_jobSubQueryTempIndexes) { try { string tem = jobSubQueryTempMap->getSubQueryTemp(j); - LOGS(_log, LOG_LVL_WARN, jobFrag->cName(__func__) << " &&&T j=" << j << " =" << tem); + LOGS(_log, LOG_LVL_TRACE, jobFrag->cName(__func__) << " j=" << j << " =" << tem); } catch (std::out_of_range const& ex) { LOGS(_log, LOG_LVL_ERROR, jobFrag->cName(__func__) << " index=" << j << " not found in template map " << jsFrag); @@ -515,9 +490,9 @@ JobFragment::VectPtr JobFragment::createVectFromJson(nlohmann::json const& jsFra for (int j : jobFrag->_jobDbTablesIndexes) { try { auto dbTblPr = dbTablesMap->getDbTable(j); - LOGS(_log, LOG_LVL_WARN, + LOGS(_log, LOG_LVL_TRACE, jobFrag->cName(__func__) - << " &&&T j=" << j << " =" << dbTblPr.first << "." << dbTblPr.second); + << " j=" << j << " =" << dbTblPr.first << "." << dbTblPr.second); } catch (std::out_of_range const& ex) { LOGS(_log, LOG_LVL_ERROR, jobFrag->cName(__func__) << " index=" << j << " not found in dbTable map " << jsFrag); @@ -530,8 +505,6 @@ JobFragment::VectPtr JobFragment::createVectFromJson(nlohmann::json const& jsFra jobFrag->_subchunkIds = jsFrag["subchunkids"].get>(); jobFragments->push_back(jobFrag); } - - LOGS(_log, LOG_LVL_WARN, "&&& JobFragment::createVectFromJson end"); return jobFragments; } diff --git a/src/protojson/UberJobMsg.h b/src/protojson/UberJobMsg.h index 73c6276d9..d5f6ade9e 100644 --- a/src/protojson/UberJobMsg.h +++ b/src/protojson/UberJobMsg.h @@ -21,8 +21,6 @@ #ifndef LSST_QSERV_PROTOJSON_UBERJOBMSG_H #define LSST_QSERV_PROTOJSON_UBERJOBMSG_H -#define NEWMSGUJ 0 // &&& delete - // System headers #include #include @@ -109,7 +107,7 @@ class JobDbTablesMap { // &&& this class can probably be deleted /// @throws std::out_of_range std::pair getDbTable(int index) { return _dbTableMap.at(index); } - /// &&& doc + /// &&& TODO:UJ compare with scan rating for entire UberJob void setScanRating(int index, int scanRating, bool lockInMemory); /// Return scanRating(int) and lockInMemory(bool) for the dbTable at `index`. 
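The two index maps above exist to deduplicate the UberJob message: each distinct subquery template and each db.table pair is stored once, every JobMsg refers to entries by integer index, and the map itself is serialized a single time, so the JSON payload grows with the number of distinct strings rather than with the number of jobs. Below is a minimal sketch of the pattern; StringIndexMap is a hypothetical stand-in for JobSubQueryTempMap/JobDbTablesMap, not the actual classes.

    #include <map>
    #include <string>
    #include "nlohmann/json.hpp"

    // Sketch: store each distinct string once, hand out integer indexes,
    // and serialize the whole map a single time per UberJob message.
    class StringIndexMap {
    public:
        // Return the index for `val`, adding it if it has not been seen before.
        int findOrAdd(std::string const& val) {
            for (auto const& [index, existing] : _map) {
                if (existing == val) return index;
            }
            int const index = _map.size();
            _map.emplace(index, val);
            return index;
        }

        // @throws std::out_of_range if `index` was never issued.
        std::string const& at(int index) const { return _map.at(index); }

        // Serialize as an array of {index, value} objects, mirroring the
        // "subquerytemplate_map" layout produced by JobSubQueryTempMap above.
        nlohmann::json serializeJson() const {
            nlohmann::json js = nlohmann::json::array();
            for (auto const& [index, val] : _map) {
                js.push_back({{"index", index}, {"value", val}});
            }
            return js;
        }

    private:
        std::map<int, std::string> _map;  // index -> unique string
    };

On the receiving side the map is rebuilt by createFromJson() before any JobMsg or JobFragment resolves its indexes, which is why an out_of_range index in the fragment loops above is logged as a malformed message rather than treated as a recoverable condition.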
diff --git a/src/qdisp/JobDescription.cc b/src/qdisp/JobDescription.cc index 42b7cbfa6..660e57330 100644 --- a/src/qdisp/JobDescription.cc +++ b/src/qdisp/JobDescription.cc @@ -38,7 +38,6 @@ #include "qdisp/Executive.h" #include "qdisp/ResponseHandler.h" #include "qproc/ChunkQuerySpec.h" -#include "qproc/TaskMsgFactory.h" using namespace std; @@ -50,7 +49,6 @@ namespace lsst::qserv::qdisp { JobDescription::JobDescription(qmeta::CzarId czarId, QueryId qId, JobId jobId, ResourceUnit const& resource, shared_ptr const& respHandler, - shared_ptr const& taskMsgFactory, shared_ptr const& chunkQuerySpec, string const& chunkResultName, bool mock) : _czarId(czarId), @@ -59,12 +57,11 @@ JobDescription::JobDescription(qmeta::CzarId czarId, QueryId qId, JobId jobId, R _qIdStr(QueryIdHelper::makeIdStr(_queryId, _jobId)), _resource(resource), _respHandler(respHandler), - _taskMsgFactory(taskMsgFactory), _chunkQuerySpec(chunkQuerySpec), _chunkResultName(chunkResultName), _mock(mock) {} -bool JobDescription::incrAttemptCountScrubResultsJson(std::shared_ptr const& exec, bool increase) { +bool JobDescription::incrAttemptCount(std::shared_ptr const& exec, bool increase) { if (increase) { ++_attemptCount; } @@ -87,13 +84,6 @@ bool JobDescription::incrAttemptCountScrubResultsJson(std::shared_ptr return false; } } - - // build the request - auto js = _taskMsgFactory->makeMsgJson(*_chunkQuerySpec, _chunkResultName, _queryId, - _jobId, // &&& should be able to delete this - _attemptCount, _czarId); - LOGS(_log, LOG_LVL_DEBUG, "JobDescription::" << __func__ << " js=" << (*js)); - _jsForWorker = js; // &&& should be able to delete _jsForWorker return true; } diff --git a/src/qdisp/JobDescription.h b/src/qdisp/JobDescription.h index 635540297..75ca4a33b 100644 --- a/src/qdisp/JobDescription.h +++ b/src/qdisp/JobDescription.h @@ -45,13 +45,8 @@ namespace lsst::qserv { -namespace proto { -class TaskMsg; -} - namespace qproc { class ChunkQuerySpec; -class TaskMsgFactory; } // namespace qproc namespace qdisp { @@ -66,10 +61,9 @@ class JobDescription { static JobDescription::Ptr create(qmeta::CzarId czarId, QueryId qId, JobId jobId, ResourceUnit const& resource, std::shared_ptr const& respHandler, - std::shared_ptr const& taskMsgFactory, std::shared_ptr const& chunkQuerySpec, std::string const& chunkResultName, bool mock = false) { - JobDescription::Ptr jd(new JobDescription(czarId, qId, jobId, resource, respHandler, taskMsgFactory, + JobDescription::Ptr jd(new JobDescription(czarId, qId, jobId, resource, respHandler, chunkQuerySpec, chunkResultName, mock)); return jd; } @@ -88,8 +82,7 @@ class JobDescription { int getScanRating() const; /// Increase the attempt count by 1 and return false if that puts it over the limit. - /// TODO:UJ scrubbing results unneeded with uj. This should be renamed. - bool incrAttemptCountScrubResultsJson(std::shared_ptr const& exec, bool increase); + bool incrAttemptCount(std::shared_ptr const& exec, bool increase); std::shared_ptr getJsForWorker() { return _jsForWorker; } @@ -100,7 +93,6 @@ class JobDescription { private: JobDescription(qmeta::CzarId czarId, QueryId qId, JobId jobId, ResourceUnit const& resource, std::shared_ptr const& respHandler, - std::shared_ptr const& taskMsgFactory, std::shared_ptr const& chunkQuerySpec, std::string const& chunkResultName, bool mock = false); @@ -112,7 +104,6 @@ class JobDescription { ResourceUnit _resource; ///< path, e.g. 
/q/LSST/23125 std::shared_ptr _respHandler; // probably MergingHandler - std::shared_ptr _taskMsgFactory; std::shared_ptr _chunkQuerySpec; std::string _chunkResultName; diff --git a/src/qdisp/JobQuery.cc b/src/qdisp/JobQuery.cc index 62e281d59..71d9f19ec 100644 --- a/src/qdisp/JobQuery.cc +++ b/src/qdisp/JobQuery.cc @@ -130,7 +130,7 @@ bool JobQuery::unassignFromUberJob(UberJobId ujId) { auto exec = _executive.lock(); // Do not increase the count as it should have been increased when the job was started. - _jobDescription->incrAttemptCountScrubResultsJson(exec, false); + _jobDescription->incrAttemptCount(exec, false); return true; } diff --git a/src/qdisp/UberJob.cc b/src/qdisp/UberJob.cc index 0765a14ce..10f535ff1 100644 --- a/src/qdisp/UberJob.cc +++ b/src/qdisp/UberJob.cc @@ -109,52 +109,7 @@ void UberJob::runUberJob() { // &&& TODO:UJ this should probably check cancelle nlohmann::json uj; unique_lock jobsLock(_jobsMtx); auto exec = _executive.lock(); -#if NEWMSGUJ // &&& - for (auto const& jqPtr : _jobs) { - jqPtr->getDescription()->incrAttemptCountScrubResultsJson(exec, true); - } - - // Send the uberjob to the worker - auto const method = http::Method::POST; - auto [ciwId, ciwHost, ciwManagment, ciwPort] = _wContactInfo->getAll(); - string const url = "http://" + ciwHost + ":" + to_string(ciwPort) + "/queryjob"; - vector const headers = {"Content-Type: application/json"}; - auto const& czarConfig = cconfig::CzarConfig::instance(); - // See xrdsvc::httpWorkerCzarModule::_handleQueryJob for json message parsing. - json request = {{"version", http::MetaModule::version}, - {"instance_id", czarConfig->replicationInstanceId()}, - {"auth_key", czarConfig->replicationAuthKey()}, - {"worker", ciwId}, - {"czarinfo", - {{"name", czarConfig->name()}, - {"id", czarConfig->id()}, - {"management-port", czarConfig->replicationHttpPort()}, - {"management-host-name", util::get_current_host_fqdn()}}}, - {"uberjob", - {{"queryid", _queryId}, - {"uberjobid", _uberJobId}, - {"czarid", _czarId}, - {"rowlimit", _rowLimit}, - {"jobs", json::array()}}}}; - - auto& jsUberJob = request["uberjob"]; - auto& jsJobs = jsUberJob["jobs"]; - for (auto const& jbPtr : _jobs) { - auto const description = jbPtr->getDescription(); - if (description == nullptr) { - throw util::Bug(ERR_LOC, cName(__func__) + " description=null for job=" + jbPtr->getIdStr()); - } - auto const jsForWorker = jbPtr->getDescription()->getJsForWorker(); - if (jsForWorker == nullptr) { - throw util::Bug(ERR_LOC, cName(__func__) + " jsForWorker=null for job=" + jbPtr->getIdStr()); - } - json jsJob = {{"jobdesc", *jsForWorker}}; - jsJobs.push_back(jsJob); - jbPtr->getDescription()->resetJsForWorker(); // no longer needed. 
- } -#else // &&& - //&&&LOGS(_log, LOG_LVL_ERROR, cName(__func__) << "&&&uj runuj a"); // Send the uberjob to the worker auto const method = http::Method::POST; auto [ciwId, ciwHost, ciwManagment, ciwPort] = _wContactInfo->getAll(); @@ -177,7 +132,7 @@ void UberJob::runUberJob() { // &&& TODO:UJ this should probably check cancelle std::chrono::duration secsserialize = endserialize - startserialize; // &&& histoUJSerialize.addEntry(endserialize, secsserialize.count()); //&&& LOGS(_log, LOG_LVL_INFO, "&&&uj histo " << histoUJSerialize.getString("")); -#endif // &&& + jobsLock.unlock(); // unlock so other _jobsMtx threads can advance while this waits for transmit LOGS(_log, LOG_LVL_ERROR, cName(__func__) << "&&&uj runuj c"); /* &&& diff --git a/src/qdisp/testQDisp.cc b/src/qdisp/testQDisp.cc index 005e7d934..bda0a020f 100644 --- a/src/qdisp/testQDisp.cc +++ b/src/qdisp/testQDisp.cc @@ -43,7 +43,6 @@ #include "qdisp/JobQuery.h" #include "qmeta/MessageStore.h" #include "qproc/ChunkQuerySpec.h" -#include "qproc/TaskMsgFactory.h" #include "util/QdispPool.h" #include "util/threadSafe.h" @@ -58,27 +57,6 @@ LOG_LOGGER _log = LOG_GET("lsst.qserv.qdisp.testQDisp"); typedef util::Sequential SequentialInt; typedef vector RequesterVector; -namespace lsst::qserv::qproc { - -// Normally, there's one TaskMsgFactory that all jobs in a user query share. -// In this case, there's one MockTaskMsgFactory per job with a payload specific -// for that job. -class MockTaskMsgFactory : public TaskMsgFactory { -public: - MockTaskMsgFactory(std::string const& mockPayload_) : TaskMsgFactory(), mockPayload(mockPayload_) {} - - shared_ptr makeMsgJson(ChunkQuerySpec const& s, std::string const& chunkResultName, - QueryId queryId, int jobId, int attemptCount, - qmeta::CzarId czarId) override { - return jsPtr; - } - - string mockPayload; - shared_ptr jsPtr; -}; - -} // namespace lsst::qserv::qproc - namespace lsst::qserv::qdisp { class ExecutiveUT; @@ -201,11 +179,10 @@ class ExecutiveUT : public Executive { qdisp::JobDescription::Ptr makeMockJobDescription(qdisp::Executive::Ptr const& ex, int sequence, ResourceUnit const& ru, std::string msg, std::shared_ptr const& mHandler) { - auto mockTaskMsgFactory = std::make_shared(msg); auto cqs = std::make_shared(); // dummy, unused in this case. std::string chunkResultName = "dummyResultTableName"; qmeta::CzarId const czarId = 1; - auto job = qdisp::JobDescription::create(czarId, ex->getId(), sequence, ru, mHandler, mockTaskMsgFactory, + auto job = qdisp::JobDescription::create(czarId, ex->getId(), sequence, ru, mHandler, cqs, chunkResultName, true); return job; } diff --git a/src/qproc/CMakeLists.txt b/src/qproc/CMakeLists.txt index db311c4ab..9aecaafca 100644 --- a/src/qproc/CMakeLists.txt +++ b/src/qproc/CMakeLists.txt @@ -8,7 +8,6 @@ target_sources(qproc PRIVATE IndexMap.cc QuerySession.cc SecondaryIndex.cc - TaskMsgFactory.cc ) target_link_libraries(qproc PRIVATE diff --git a/src/qproc/ChunkSpec.cc b/src/qproc/ChunkSpec.cc index 0d1d0dba5..fa9a8132f 100644 --- a/src/qproc/ChunkSpec.cc +++ b/src/qproc/ChunkSpec.cc @@ -44,7 +44,15 @@ namespace { // File-scope helpers /// A "good" number of subchunks to include in a chunk query. This is /// a guess. The best value is an open question -int const GOOD_SUBCHUNK_COUNT = 20; +// TODO:UJ `ChunkSpecFragmenter` has the purpose of limiting the +// number of subchunks per ChunkSpec (which works out to +// subchunkids per Job). +// Each subchunk gets its own task on the worker, so this +// is probably no longer helpful. 
Making the limit absurdly +// high should have the effect of disabling the code +// while checking if there are unexpected side effects. +// int const GOOD_SUBCHUNK_COUNT = 20; +int const GOOD_SUBCHUNK_COUNT = 2'000'000; } // namespace namespace lsst::qserv::qproc { @@ -121,9 +129,7 @@ void normalize(ChunkSpecVector& specs) { //////////////////////////////////////////////////////////////////////// // ChunkSpec //////////////////////////////////////////////////////////////////////// -//&&&bool ChunkSpec::shouldSplit() const { return subChunks.size() > (unsigned)GOOD_SUBCHUNK_COUNT; } -//&&& subchunks are handled in their own tasks now, so there's no point in splitting anymore. -bool ChunkSpec::shouldSplit() const { return false; } +bool ChunkSpec::shouldSplit() const { return subChunks.size() > (unsigned)GOOD_SUBCHUNK_COUNT; } ChunkSpec ChunkSpec::intersect(ChunkSpec const& cs) const { ChunkSpec output(*this); diff --git a/src/qproc/ChunkSpec.h b/src/qproc/ChunkSpec.h index 9bf31053e..777cd9d87 100644 --- a/src/qproc/ChunkSpec.h +++ b/src/qproc/ChunkSpec.h @@ -93,6 +93,8 @@ ChunkSpecVector intersect(ChunkSpecVector const& a, ChunkSpecVector const& b); void normalize(ChunkSpecVector& specs); /// An iterating fragmenter to reduce the number of subChunkIds per ChunkSpec +/// TODO:UJ Fragmenting the Jobs probably no longer makes sense, see +/// `GOOD_SUBCHUNK_COUNT` definition. class ChunkSpecFragmenter { public: ChunkSpecFragmenter(ChunkSpec const& s); diff --git a/src/qproc/QuerySession.cc b/src/qproc/QuerySession.cc index 7099e2647..9bd643265 100644 --- a/src/qproc/QuerySession.cc +++ b/src/qproc/QuerySession.cc @@ -418,7 +418,7 @@ ChunkQuerySpec::Ptr QuerySession::buildChunkQuerySpec(query::QueryTemplate::Vect if (!_context->hasSubChunks()) { cQSpec->queries = _buildChunkQueries(queryTemplates, chunkSpec); } else { - if (chunkSpec.shouldSplit()) { //&&& remove case + if (chunkSpec.shouldSplit()) { ChunkSpecFragmenter frag(chunkSpec); ChunkSpec s = frag.get(); cQSpec->queries = _buildChunkQueries(queryTemplates, s); diff --git a/src/qproc/TaskMsgFactory.cc b/src/qproc/TaskMsgFactory.cc deleted file mode 100644 index a5dd4a97a..000000000 --- a/src/qproc/TaskMsgFactory.cc +++ /dev/null @@ -1,160 +0,0 @@ -// -*- LSST-C++ -*- -/* - * LSST Data Management System - * Copyright 2013-2017 AURA/LSST. - * - * This product includes software developed by the - * LSST Project (http://www.lsst.org/). - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the LSST License Statement and - * the GNU General Public License along with this program. If not, - * see . - */ - -/** - * @file - * - * @brief TaskMsgFactory is a factory for TaskMsg (protobuf) objects. - * - * @author Daniel L.
Wang, SLAC - */ - -// Class header -#include "qproc/TaskMsgFactory.h" - -// System headers -#include - -// Third-party headers -#include "nlohmann/json.hpp" - -// LSST headers -#include "lsst/log/Log.h" - -// Qserv headers -#include "cconfig/CzarConfig.h" -#include "global/intTypes.h" -#include "qmeta/types.h" -#include "qproc/ChunkQuerySpec.h" -#include "util/common.h" - -namespace { -LOG_LOGGER _log = LOG_GET("lsst.qserv.qproc.TaskMsgFactory"); -} - -using namespace std; - -namespace lsst::qserv::qproc { - -std::shared_ptr TaskMsgFactory::makeMsgJson(ChunkQuerySpec const& chunkQuerySpec, - std::string const& chunkResultName, - QueryId queryId, int jobId, int attemptCount, - qmeta::CzarId czarId) { - // TODO:UJ DM-45384 &&& remove duplicate elements from the json message - // TODO:UJ &&& see: JobDescription::incrAttemptCountScrubResultsJson - // TODO:UJ &&& see: wbase::UberJobData::create - // TODO:UJ &&& see: Task::createTasksForChunk - // TODO:UJ &&& see: wdb/testQueryRunner.cc - // TODO:UJ &&& see: wsched/testSchedulers.cc - std::string resultTable("Asdfasfd"); - if (!chunkResultName.empty()) { - resultTable = chunkResultName; - } - - // TODO:UJ verify that these can be put in the uberjob to reduce duplicates - // and the size of the message. - auto jsJobMsgPtr = std::shared_ptr( - new nlohmann::json({{"czarId", czarId}, - {"queryId", queryId}, - {"jobId", jobId}, - {"attemptCount", attemptCount}, - {"querySpecDb", chunkQuerySpec.db}, - {"scanPriority", chunkQuerySpec.scanInfo->scanRating}, //&&& del ??? - {"scanInteractive", chunkQuerySpec.scanInteractive}, //&&& del ??? - {"maxTableSize", (cconfig::CzarConfig::instance()->getMaxTableSizeMB())}, - {"chunkScanTables", nlohmann::json::array()}, - {"chunkId", chunkQuerySpec.chunkId}, - {"queryFragments", nlohmann::json::array()}})); - - auto& jsJobMsg = *jsJobMsgPtr; - - auto& chunkScanTables = jsJobMsg["chunkScanTables"]; - for (auto const& sTbl : chunkQuerySpec.scanInfo->infoTables) { //&&& probably redundant - nlohmann::json cst = {{"db", sTbl.db}, - {"table", sTbl.table}, - {"lockInMemory", sTbl.lockInMemory}, - {"tblScanRating", sTbl.scanRating}}; - chunkScanTables.push_back(move(cst)); - } - - auto& jsFragments = jsJobMsg["queryFragments"]; - if (chunkQuerySpec.nextFragment.get()) { - ChunkQuerySpec const* sPtr = &chunkQuerySpec; - while (sPtr) { - LOGS(_log, LOG_LVL_TRACE, "nextFragment"); - for (unsigned int t = 0; t < (sPtr->queries).size(); t++) { - LOGS(_log, LOG_LVL_DEBUG, __func__ << " q=" << (sPtr->queries).at(t)); - } - for (auto const& sbi : sPtr->subChunkIds) { - LOGS(_log, LOG_LVL_DEBUG, __func__ << " sbi=" << sbi); - } - // Linked fragments will not have valid subChunkTables vectors, - // So, we reuse the root fragment's vector. 
- _addFragmentJson(jsFragments, resultTable, chunkQuerySpec.subChunkTables, sPtr->subChunkIds, - sPtr->queries); - sPtr = sPtr->nextFragment.get(); - } - } else { - LOGS(_log, LOG_LVL_TRACE, "no nextFragment"); - for (unsigned int t = 0; t < (chunkQuerySpec.queries).size(); t++) { - LOGS(_log, LOG_LVL_TRACE, (chunkQuerySpec.queries).at(t)); - } - _addFragmentJson(jsFragments, resultTable, chunkQuerySpec.subChunkTables, chunkQuerySpec.subChunkIds, - chunkQuerySpec.queries); - } - - return jsJobMsgPtr; -} - -void TaskMsgFactory::_addFragmentJson(nlohmann::json& jsFragments, std::string const& resultName, - DbTableSet const& subChunkTables, std::vector const& subchunkIds, - std::vector const& queries) { - nlohmann::json jsFrag = {{"resultTable", resultName}, - {"queries", nlohmann::json::array()}, - {"subchunkTables", nlohmann::json::array()}, - {"subchunkIds", nlohmann::json::array()}}; - - auto& jsQueries = jsFrag["queries"]; - for (auto& qry : queries) { - nlohmann::json jsQry = {{"subQuery", qry}}; - jsQueries.push_back(move(jsQry)); - } - - // Add the db+table pairs to the subchunk. - auto& jsSubchunkTables = jsFrag["subchunkTables"]; - for (auto& tbl : subChunkTables) { - nlohmann::json jsSubchunkTbl = {{"scDb", tbl.db}, {"scTable", tbl.table}}; - jsSubchunkTables.push_back(move(jsSubchunkTbl)); - LOGS(_log, LOG_LVL_TRACE, "added dbtbl=" << tbl.db << "." << tbl.table); - } - - // Add subchunk id numbers - auto& jsSubchunkIds = jsFrag["subchunkIds"]; - for (auto& subchunkId : subchunkIds) { - jsSubchunkIds.push_back(subchunkId); - } - - jsFragments.push_back(move(jsFrag)); -} - -} // namespace lsst::qserv::qproc diff --git a/src/qproc/TaskMsgFactory.h b/src/qproc/TaskMsgFactory.h deleted file mode 100644 index fe1f921f8..000000000 --- a/src/qproc/TaskMsgFactory.h +++ /dev/null @@ -1,74 +0,0 @@ -// -*- LSST-C++ -*- -/* - * LSST Data Management System - * Copyright 2013-2017 LSST Corporation. - * - * This product includes software developed by the - * LSST Project (http://www.lsst.org/). - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the LSST License Statement and - * the GNU General Public License along with this program. If not, - * see . - */ - -#ifndef LSST_QSERV_QPROC_TASKMSGFACTORY_H -#define LSST_QSERV_QPROC_TASKMSGFACTORY_H -/** - * @file - * - * @brief TaskMsgFactory is a factory for TaskMsg (protobuf) objects. - * - * @author Daniel L. Wang, SLAC - */ - -// System headers -#include -#include - -// Third party headers -#include "nlohmann/json.hpp" - -// Qserv headers -#include "global/DbTable.h" -#include "global/intTypes.h" -#include "proto/worker.pb.h" -#include "qmeta/types.h" - -namespace lsst::qserv::qproc { - -class ChunkQuerySpec; - -/// TaskMsgFactory makes json messages for the jobs to be sent to the workers, where -/// they will be used to create Tasks. -class TaskMsgFactory { -public: - using Ptr = std::shared_ptr; - - TaskMsgFactory() = default; - virtual ~TaskMsgFactory() {} - - /// Make and return the json message for a single Job. 
- virtual std::shared_ptr makeMsgJson(ChunkQuerySpec const& s, - std::string const& chunkResultName, QueryId queryId, - int jobId, int attemptCount, qmeta::CzarId czarId); - -private: - /// Make a json message for a single fragment. - void _addFragmentJson(nlohmann::json& jsFragments, std::string const& resultName, - DbTableSet const& subChunkTables, std::vector const& subChunkIds, - std::vector const& queries); -}; - -} // namespace lsst::qserv::qproc - -#endif // LSST_QSERV_QPROC_TASKMSGFACTORY_H diff --git a/src/wbase/Task.cc b/src/wbase/Task.cc index 0ea23c602..ca2dbf8e7 100644 --- a/src/wbase/Task.cc +++ b/src/wbase/Task.cc @@ -195,6 +195,7 @@ Task::Task(UberJobData::Ptr const& ujData, int jobId, int attemptCount, int chun Task::~Task() {} +/* &&& std::vector Task::createTasksForChunk( std::shared_ptr const& ujData, nlohmann::json const& jsJobs, std::shared_ptr const& sendChannel, @@ -301,6 +302,7 @@ std::vector Task::createTasksForChunk( } return vect; } +*/ std::vector Task::createTasksFromUberJobMsg( std::shared_ptr const& ujMsg, std::shared_ptr const& ujData, diff --git a/src/wbase/Task.h b/src/wbase/Task.h index 8cd661d53..0711cfe9f 100644 --- a/src/wbase/Task.h +++ b/src/wbase/Task.h @@ -168,6 +168,7 @@ class Task : public util::CommandForThreadPool { Task(const Task&) = delete; virtual ~Task(); +/* &&& /// Read json to generate a vector of one or more task for a chunk. static std::vector createTasksForChunk( /// &&& delete std::shared_ptr const& ujData, nlohmann::json const& jsJobs, @@ -177,6 +178,7 @@ class Task : public util::CommandForThreadPool { mysql::MySqlConfig const& mySqlConfig, std::shared_ptr const& sqlConnMgr, std::shared_ptr const& queriesAndChunks, uint16_t resultsHttpPort = 8080); +*/ /// &&& static std::vector createTasksFromUberJobMsg( diff --git a/src/wdb/testQueryRunner.cc b/src/wdb/testQueryRunner.cc index a3da9eb5b..c67acf74a 100644 --- a/src/wdb/testQueryRunner.cc +++ b/src/wdb/testQueryRunner.cc @@ -168,8 +168,9 @@ BOOST_AUTO_TEST_CASE(Simple) { scanInfo->scanRating = mInfo.scanRating; scanInfo->infoTables.emplace_back(mInfo.db, mInfo.table, mInfo.lockInMemory, mInfo.scanRating); vector taskVect = - Task::createTasksForChunk(ujData, *msgJson, sChannel, scanInfo, mInfo.scanInteractive, - mInfo.maxTableSize, crm, newMySqlConfig(), sqlConnMgr, queries); + Task::createTasksForUnitTest(ujData, *msgJson, sChannel, scanInfo, mInfo.scanInteractive, + mInfo.maxTableSize, crm); + Task::Ptr task = taskVect[0]; QueryRunner::Ptr a(QueryRunner::newQueryRunner(task, crm, newMySqlConfig(), sqlConnMgr, queries)); BOOST_CHECK(a->runQuery()); @@ -193,8 +194,9 @@ BOOST_AUTO_TEST_CASE(Output) { scanInfo->scanRating = mInfo.scanRating; scanInfo->infoTables.emplace_back(mInfo.db, mInfo.table, mInfo.lockInMemory, mInfo.scanRating); vector taskVect = - Task::createTasksForChunk(ujData, *msgJson, sc, scanInfo, mInfo.scanInteractive, - mInfo.maxTableSize, crm, newMySqlConfig(), sqlConnMgr, queries); + Task::createTasksForUnitTest(ujData, *msgJson, sc, scanInfo, mInfo.scanInteractive, + mInfo.maxTableSize, crm); + Task::Ptr task = taskVect[0]; QueryRunner::Ptr a(QueryRunner::newQueryRunner(task, crm, newMySqlConfig(), sqlConnMgr, queries)); BOOST_CHECK(a->runQuery()); diff --git a/src/xrdsvc/HttpWorkerCzarModule.cc b/src/xrdsvc/HttpWorkerCzarModule.cc index 652876ed5..8a4aa910b 100644 --- a/src/xrdsvc/HttpWorkerCzarModule.cc +++ b/src/xrdsvc/HttpWorkerCzarModule.cc @@ -110,109 +110,6 @@ json HttpWorkerCzarModule::_handleQueryJob(string const& func) { json jsRet; vector ujTasks; try 
{ -#if NEWMSGUJ // &&& - // See qdisp::UberJob::runUberJob() for json message construction. - auto const& jsReq = body().objJson; - string const targetWorkerId = body().required("worker"); - - http::RequestBodyJSON rbCzar(body().required("czarinfo")); - auto czarName = rbCzar.required("name"); - auto czarId = rbCzar.required("id"); - auto czarPort = rbCzar.required("management-port"); - auto czarHostName = rbCzar.required("management-host-name"); - LOGS(_log, LOG_LVL_TRACE, - __func__ << " czar n=" << czarName << " id=" << czarId << " p=" << czarPort - << " h=" << czarHostName); - http::RequestBodyJSON rbUberJob(body().required("uberjob")); - auto ujQueryId = rbUberJob.required("queryid"); - auto ujId = rbUberJob.required("uberjobid"); - auto ujCzarId = rbUberJob.required("czarid"); - auto ujRowLimit = rbUberJob.required("rowlimit"); - auto ujJobs = rbUberJob.required("jobs"); - LOGS(_log, LOG_LVL_TRACE, - __func__ << " uj qid=" << ujQueryId << " ujid=" << ujId << " czid=" << ujCzarId - << " rowlimit=" << ujRowLimit); - - // Get or create QueryStatistics and UserQueryInfo instances. - auto queryStats = foreman()->getQueriesAndChunks()->addQueryId(ujQueryId, ujCzarId); - auto userQueryInfo = queryStats->getUserQueryInfo(); - - if (userQueryInfo->getCancelledByCzar()) { - throw wbase::TaskException( - ERR_LOC, string("Already cancelled by czar. ujQueryId=") + to_string(ujQueryId)); - } - if (userQueryInfo->isUberJobDead(ujId)) { - throw wbase::TaskException(ERR_LOC, string("UberJob already dead. ujQueryId=") + - to_string(ujQueryId) + " ujId=" + to_string(ujId)); - } - - auto ujData = wbase::UberJobData::create(ujId, czarName, czarId, czarHostName, czarPort, ujQueryId, - ujRowLimit, targetWorkerId, foreman(), authKey()); - - // Find the entry for this queryId, creat a new one if needed. - userQueryInfo->addUberJob(ujData); - auto channelShared = - wbase::FileChannelShared::create(ujData, czarId, czarHostName, czarPort, targetWorkerId); - ujData->setFileChannelShared(channelShared); - - // TODO:UJ These items should be stored higher in the message structure as they get - // duplicated and should always be the same within an UberJob. - QueryId jdQueryId = 0; - auto scanInfo = protojson::ScanInfo::create(); - bool scanInfoSet = false; - bool jdScanInteractive = false; - int jdMaxTableSize = 0; - - for (auto const& job : ujJobs) { - json const& jsJobDesc = job["jobdesc"]; - http::RequestBodyJSON rbJobDesc(jsJobDesc); - // See qproc::TaskMsgFactory::makeMsgJson for message construction. 
- auto const jdCzarId = rbJobDesc.required("czarId"); - jdQueryId = rbJobDesc.required("queryId"); - auto const jdJobId = rbJobDesc.required("jobId"); - auto const jdAttemptCount = rbJobDesc.required("attemptCount"); - auto const jdQuerySpecDb = rbJobDesc.required("querySpecDb"); - auto const jdScanPriority = rbJobDesc.required("scanPriority"); - jdScanInteractive = rbJobDesc.required("scanInteractive"); - jdMaxTableSize = rbJobDesc.required("maxTableSize"); - auto const jdChunkId = rbJobDesc.required("chunkId"); - LOGS(_log, LOG_LVL_TRACE, - __func__ << " jd cid=" << jdCzarId << " jdQId=" << jdQueryId << " jdJobId=" << jdJobId - << " jdAtt=" << jdAttemptCount << " jdQDb=" << jdQuerySpecDb - << " jdScanPri=" << jdScanPriority << " interactive=" << jdScanInteractive - << " maxTblSz=" << jdMaxTableSize << " chunkId=" << jdChunkId); - - auto const jdChunkScanTables = rbJobDesc.required("chunkScanTables"); - if (!scanInfoSet) { - for (auto const& tbl : jdChunkScanTables) { - http::RequestBodyJSON rbTbl(tbl); - auto const& chunkScanDb = rbTbl.required("db"); - auto lockInMemory = rbTbl.required("lockInMemory"); - auto const& chunkScanTable = rbTbl.required("table"); - auto tblScanRating = rbTbl.required("tblScanRating"); - LOGS(_log, LOG_LVL_TRACE, - __func__ << " chunkSDb=" << chunkScanDb << " lockinmem=" << lockInMemory - << " csTble=" << chunkScanTable << " tblScanRating=" << tblScanRating); - scanInfo->infoTables.emplace_back(chunkScanDb, chunkScanTable, lockInMemory, - tblScanRating); - scanInfoSet = true; - } - } - scanInfo->scanRating = jdScanPriority; - } - - ujData->setScanInteractive(jdScanInteractive); - - // create tasks and add them to ujData - auto chunkTasks = wbase::Task::createTasksForChunk( - ujData, ujJobs, channelShared, scanInfo, jdScanInteractive, jdMaxTableSize, - foreman()->chunkResourceMgr(), foreman()->mySqlConfig(), foreman()->sqlConnMgr(), - foreman()->queriesAndChunks(), foreman()->httpPort()); - ujTasks.insert(ujTasks.end(), chunkTasks.begin(), chunkTasks.end()); - - channelShared->setTaskCount(ujTasks.size()); - ujData->addTasks(ujTasks); -#else // &&& auto const& jsReq = body().objJson; auto uberJobMsg = protojson::UberJobMsg::createFromJson(jsReq); @@ -236,20 +133,12 @@ json HttpWorkerCzarModule::_handleQueryJob(string const& func) { to_string(ujQueryId) + " ujId=" + to_string(ujId)); } - /* &&& - auto ujData = wbase::UberJobData::create(ujId, czarName, czarId, czarHostName, czarPort, ujQueryId, - ujRowLimit, targetWorkerId, foreman(), authKey()); - */ auto ujData = wbase::UberJobData::create(ujId, ujCzInfo->czName, ujCzInfo->czId, ujCzInfo->czHostName, ujCzInfo->czPort, ujQueryId, ujRowLimit, targetWorkerId, foreman(), authKey()); // Find the entry for this queryId, create a new one if needed. userQueryInfo->addUberJob(ujData); - /* &&& - auto channelShared = - wbase::FileChannelShared::create(ujData, czarId, czarHostName, czarPort, targetWorkerId); - */ auto channelShared = wbase::FileChannelShared::create(ujData, ujCzInfo->czId, ujCzInfo->czHostName, ujCzInfo->czPort, targetWorkerId); @@ -261,8 +150,6 @@ json HttpWorkerCzarModule::_handleQueryJob(string const& func) { channelShared->setTaskCount(ujTasks.size()); ujData->addTasks(ujTasks); -#endif //&&& - // At this point, it looks like the message was sent successfully, update // czar touched time. 
wcontrol::WCzarInfoMap::Ptr wCzarMap = foreman()->getWCzarInfoMap(); From d4bf9e953cca8d78f65c564d2629304384a5f4ff Mon Sep 17 00:00:00 2001 From: John Gates Date: Wed, 18 Dec 2024 09:37:50 -0800 Subject: [PATCH 21/22] Changed Czar to catch 5GB limit. --- src/cconfig/CzarConfig.h | 7 +- src/ccontrol/MergingHandler.cc | 22 +++-- src/ccontrol/MergingHandler.h | 1 + src/ccontrol/UserQuerySelect.cc | 79 +++++++++++------- src/czar/Czar.cc | 24 +++--- src/czar/CzarChunkMap.cc | 2 +- src/czar/HttpCzarWorkerModule.cc | 4 + src/protojson/UberJobMsg.cc | 6 +- src/protojson/UberJobMsg.h | 11 ++- src/qana/QueryMapping.h | 2 - src/qdisp/Executive.cc | 54 ++++++++----- src/qdisp/Executive.h | 9 +++ src/qdisp/JobDescription.h | 4 +- src/qdisp/UberJob.cc | 36 ++++++--- src/qdisp/UberJob.h | 10 ++- src/qdisp/testQDisp.cc | 4 +- src/qproc/ChunkQuerySpec.h | 3 +- src/rproc/InfileMerger.cc | 27 ++++++- src/rproc/InfileMerger.h | 1 + src/wbase/FileChannelShared.cc | 5 +- src/wbase/Task.cc | 124 ++++------------------------- src/wbase/Task.h | 24 ++---- src/wbase/UberJobData.cc | 15 ++-- src/wbase/UberJobData.h | 2 +- src/wdb/QueryRunner.cc | 7 +- src/wdb/QueryRunner.h | 3 +- src/wdb/testQueryRunner.cc | 10 +-- src/xrdsvc/HttpWorkerCzarModule.cc | 6 +- 28 files changed, 251 insertions(+), 251 deletions(-) diff --git a/src/cconfig/CzarConfig.h b/src/cconfig/CzarConfig.h index b4d51fc7c..4b0c1cde3 100644 --- a/src/cconfig/CzarConfig.h +++ b/src/cconfig/CzarConfig.h @@ -118,7 +118,7 @@ class CzarConfig { */ std::string const& getXrootdFrontendUrl() const { return _xrootdFrontendUrl->getVal(); } - /* Get the maximum number of threads for xrootd to use. + /* Get the maximum number of threads for xrootd to use. // TODO:UJ delete * * @return the maximum number of threads for xrootd to use. */ @@ -371,6 +371,7 @@ class CzarConfig { CVTStrPtr _qdispVectMinRunningSizes = util::ConfigValTStr::create(_configValMap, "qdisppool", "vectMinRunningSizes", notReq, "0:3:3:3"); + // TODO:UJ delete xrootd specific entries. 
CVTIntPtr _xrootdSpread = util::ConfigValTInt::create(_configValMap, "tuning", "xrootdSpread", notReq, 4); CVTIntPtr _qMetaSecsBetweenChunkCompletionUpdates = util::ConfigValTInt::create( _configValMap, "tuning", "qMetaSecsBetweenChunkCompletionUpdates", notReq, 60); @@ -416,8 +417,8 @@ class CzarConfig { util::ConfigValTInt::create(_configValMap, "activeworker", "timeoutDeadSecs", notReq, 60 * 10); CVTIntPtr _activeWorkerMaxLifetimeSecs = // 1hr util::ConfigValTInt::create(_configValMap, "activeworker", "maxLifetimeSecs", notReq, 60 * 60); - CVTIntPtr _monitorSleepTimeMilliSec = - util::ConfigValTInt::create(_configValMap, "activeworker", "monitorSleepTimeMilliSec", notReq, 15'000); + CVTIntPtr _monitorSleepTimeMilliSec = util::ConfigValTInt::create( + _configValMap, "activeworker", "monitorSleepTimeMilliSec", notReq, 15'000); // UberJobs CVTIntPtr _uberJobMaxChunks = diff --git a/src/ccontrol/MergingHandler.cc b/src/ccontrol/MergingHandler.cc index 9a6ee5b5c..aaa940049 100644 --- a/src/ccontrol/MergingHandler.cc +++ b/src/ccontrol/MergingHandler.cc @@ -123,6 +123,10 @@ std::tuple readHttpFileAndMergeHttp( int headerCount = 0; uint64_t totalBytesRead = 0; try { + auto exec = uberJob->getExecutive(); + if (exec == nullptr || exec->getCancelled()) { + throw runtime_error(context + " query was cancelled"); + } string const noClientData; vector const noClientHeaders; http::ClientConfig clientConfig; @@ -139,10 +143,12 @@ std::tuple readHttpFileAndMergeHttp( bool last = false; char const* next = inBuf; char const* const end = inBuf + inBufSize; + LOGS(_log, LOG_LVL_INFO, + context << " next=" << (uint64_t)next << " end=" << (uint64_t)end); // &&& DEBUG while ((next < end) && !last) { - LOGS(_log, LOG_LVL_WARN, - context << "TODO:UJ next=" << (uint64_t)next << " end=" << (uint64_t)end - << " last=" << last); + if (exec->getCancelled()) { + throw runtime_error(context + " query was cancelled"); + } if (msgSizeBytes == 0) { // Continue or finish reading the frame header. size_t const bytes2read = @@ -210,15 +216,15 @@ std::tuple readHttpFileAndMergeHttp( msgSizeBytes = 0; } else { LOGS(_log, LOG_LVL_WARN, - context << " headerCount=" << headerCount - << " incomplete read diff=" << (msgSizeBytes - msgBufNext)); + context << " headerCount=" << headerCount << " incomplete read diff=" + << (msgSizeBytes - msgBufNext)); // &&& DEBUG } } } }); - LOGS(_log, LOG_LVL_DEBUG, + LOGS(_log, LOG_LVL_WARN, context << " headerCount=" << headerCount << " msgSizeBytes=" << msgSizeBytes - << " totalBytesRead=" << totalBytesRead); + << " totalBytesRead=" << totalBytesRead); // &&& if (msgSizeBufNext != 0) { throw runtime_error("short read of the message header at offset " + to_string(offset - msgSizeBytes) + ", file: " + httpUrl); @@ -366,7 +372,7 @@ tuple MergingHandler::flushHttp(string const& fileUrl, uint64_t expe } if (success) { - _infileMerger->mergeCompleteFor(uberJob->getJobId()); + _infileMerger->mergeCompleteFor(uberJob->getUjId()); } return {success, shouldCancel}; } diff --git a/src/ccontrol/MergingHandler.h b/src/ccontrol/MergingHandler.h index aa4e06dd0..6868abb16 100644 --- a/src/ccontrol/MergingHandler.h +++ b/src/ccontrol/MergingHandler.h @@ -98,6 +98,7 @@ class MergingHandler : public qdisp::ResponseHandler { /// Prepare for first call to flush(). 
void _initState(); + // &&& delete bool _merge(proto::ResponseSummary const& responseSummary, proto::ResponseData const& responseData, std::shared_ptr const& jobQuery); diff --git a/src/ccontrol/UserQuerySelect.cc b/src/ccontrol/UserQuerySelect.cc index 02ac3cebd..186359768 100644 --- a/src/ccontrol/UserQuerySelect.cc +++ b/src/ccontrol/UserQuerySelect.cc @@ -156,16 +156,16 @@ std::string UserQuerySelect::getError() const { /// Attempt to kill in progress. void UserQuerySelect::kill() { - LOGS(_log, LOG_LVL_DEBUG, "UserQuerySelect kill"); + LOGS(_log, LOG_LVL_INFO, "UserQuerySelect KILL"); std::lock_guard lock(_killMutex); if (!_killed) { _killed = true; - int64_t collectedRows = _executive->getTotalResultRows(); + auto exec = _executive; + int64_t collectedRows = (exec) ? exec->getTotalResultRows() : -1; size_t collectedBytes = _infileMerger->getTotalResultSize(); try { // make a copy of executive pointer to keep it alive and avoid race // with pointer being reset in discard() method - std::shared_ptr exec = _executive; if (exec != nullptr) { exec->squash(); } @@ -233,6 +233,11 @@ std::string UserQuerySelect::getResultQuery() const { /// Begin running on all chunks added so far. void UserQuerySelect::submit() { + auto exec = _executive; + if (exec == nullptr) { + LOGS(_log, LOG_LVL_ERROR, "UserQuerySelect::submit() executive is null at start"); + return; + } _qSession->finalize(); // Using the QuerySession, generate query specs (text, db, chunkId) and then @@ -259,13 +264,13 @@ void UserQuerySelect::submit() { LOGS(_log, LOG_LVL_WARN, "Failed queryStatsTmpRegister " << e.what()); } - _executive->setScanInteractive(_qSession->getScanInteractive()); - _executive->setScanInfo(_qSession->getScanInfo()); + exec->setScanInteractive(_qSession->getScanInteractive()); + exec->setScanInfo(_qSession->getScanInfo()); string dbName(""); bool dbNameSet = false; - for (auto i = _qSession->cQueryBegin(), e = _qSession->cQueryEnd(); i != e && !_executive->getCancelled(); + for (auto i = _qSession->cQueryBegin(), e = _qSession->cQueryEnd(); i != e && !exec->getCancelled(); ++i) { auto& chunkSpec = *i; @@ -297,9 +302,9 @@ void UserQuerySelect::submit() { ResourceUnit ru; ru.setAsDbChunk(cs->db, cs->chunkId); qdisp::JobDescription::Ptr jobDesc = qdisp::JobDescription::create( - _qMetaCzarId, _executive->getId(), sequence, ru, + _qMetaCzarId, exec->getId(), sequence, ru, std::make_shared(_infileMerger, chunkResultName), cs, chunkResultName); - auto job = _executive->add(jobDesc); + auto job = exec->add(jobDesc); ++sequence; } @@ -309,12 +314,12 @@ void UserQuerySelect::submit() { /// At this point the executive has a map of all jobs with the chunkIds as the key. // This is needed to prevent Czar::_monitor from starting things before they are ready. - _executive->setReadyToExecute(); + exec->setReadyToExecute(); buildAndSendUberJobs(); LOGS(_log, LOG_LVL_DEBUG, "total jobs in query=" << sequence); // TODO:UJ Waiting for all jobs to start may not be needed anymore? - _executive->waitForAllJobsToStart(); + exec->waitForAllJobsToStart(); // we only care about per-chunk info for ASYNC queries if (_async) { @@ -331,18 +336,23 @@ void UserQuerySelect::buildAndSendUberJobs() { LOGS(_log, LOG_LVL_DEBUG, funcN << " start " << _uberJobMaxChunks); // Ensure `_monitor()` doesn't do anything until everything is ready. 
- if (!_executive->isReadyToExecute()) { + auto exec = _executive; + if (exec == nullptr) { + LOGS(_log, LOG_LVL_ERROR, funcN << " called with null exec " << getQueryIdString()); + return; + } + if (!exec->isReadyToExecute()) { LOGS(_log, LOG_LVL_INFO, funcN << " executive isn't ready to generate UberJobs."); return; } // Only one thread should be generating UberJobs for this user query at any given time. lock_guard fcLock(_buildUberJobMtx); - LOGS(_log, LOG_LVL_DEBUG, "UserQuerySelect::" << __func__ << " totalJobs=" << _executive->getTotalJobs()); + LOGS(_log, LOG_LVL_DEBUG, "UserQuerySelect::" << __func__ << " totalJobs=" << exec->getTotalJobs()); vector uberJobs; - qdisp::Executive::ChunkIdJobMapType unassignedChunksInQuery = _executive->unassignedChunksInQuery(); + qdisp::Executive::ChunkIdJobMapType unassignedChunksInQuery = exec->unassignedChunksInQuery(); if (unassignedChunksInQuery.empty()) { LOGS(_log, LOG_LVL_DEBUG, funcN << " no unassigned Jobs"); return; @@ -397,9 +407,8 @@ void UserQuerySelect::buildAndSendUberJobs() { // Numerical order keeps the number of partially complete UberJobs running on a worker to a minimum, // and should minimize the time for the first UberJob on the worker to complete. for (auto const& [chunkId, jqPtr] : unassignedChunksInQuery) { - bool const increaseAttemptCount = true; - jqPtr->getDescription()->incrAttemptCount(_executive, increaseAttemptCount); + jqPtr->getDescription()->incrAttemptCount(exec, increaseAttemptCount); // If too many workers are down, there will be a chunk that cannot be found. // Just continuing should leave jobs `unassigned` with their attempt count increased. Eventually the // attempt count will reach max and the query will be cancelled auto lambdaMissingChunk = [&](string const& msg) { missingChunks.push_back(chunkId); - //&&&bool const increaseAttemptCount = true; - //&&&jqPtr->getDescription()->incrAttemptCountScrubResultsJson(_executive, increaseAttemptCount); - LOGS(_log, LOG_LVL_ERROR, msg); + LOGS(_log, LOG_LVL_WARN, msg); }; auto iter = chunkMapPtr->find(chunkId); @@ -463,8 +470,8 @@ void UserQuerySelect::buildAndSendUberJobs() { auto ujId = _uberJobIdSeq++; // keep ujId consistent string uberResultName = _ttn->make(ujId); auto respHandler = make_shared(_infileMerger, uberResultName); - auto uJob = qdisp::UberJob::create(_executive, respHandler, _executive->getId(), ujId, - _qMetaCzarId, targetWorker); + auto uJob = qdisp::UberJob::create(exec, respHandler, exec->getId(), ujId, _qMetaCzarId, + targetWorker); uJob->setWorkerContactInfo(wInfUJ->wInf); wInfUJ->uberJobPtr = uJob; }; @@ -473,7 +480,7 @@ if (wInfUJ->uberJobPtr->getJobCount() >= _uberJobMaxChunks) { // Queue the UberJob to be sent to a worker - _executive->addAndQueueUberJob(wInfUJ->uberJobPtr); + exec->addAndQueueUberJob(wInfUJ->uberJobPtr); // Clear the pointer so a new UberJob is created later if needed. wInfUJ->uberJobPtr = nullptr; @@ -498,18 +505,23 @@ if (winfUjPtr != nullptr) { auto& ujPtr = winfUjPtr->uberJobPtr; if (ujPtr != nullptr) { - _executive->addAndQueueUberJob(ujPtr); + exec->addAndQueueUberJob(ujPtr); } } } - LOGS(_log, LOG_LVL_DEBUG, funcN << " " << _executive->dumpUberJobCounts()); + LOGS(_log, LOG_LVL_DEBUG, funcN << " " << exec->dumpUberJobCounts()); } /// Block until a submit()'ed query completes.
/// @return the QueryState indicating success or failure QueryState UserQuerySelect::join() { - bool successful = _executive->join(); // Wait for all data + auto exec = _executive; + if (exec == nullptr) { + LOGS(_log, LOG_LVL_ERROR, "UserQuerySelect::join() called with null exec " << getQueryIdString()); + return ERROR; + } + bool successful = exec->join(); // Wait for all data // Since all data are in, run final SQL commands like GROUP BY. size_t collectedBytes = 0; int64_t finalRows = 0; @@ -520,7 +532,7 @@ QueryState UserQuerySelect::join() { _messageStore->addMessage(-1, "MERGE", 1105, "Failure while merging result", MessageSeverity::MSG_ERROR); } - _executive->updateProxyMessages(); + exec->updateProxyMessages(); try { _discardMerger(); @@ -533,7 +545,7 @@ QueryState UserQuerySelect::join() { // Update the permanent message table. _qMetaUpdateMessages(); - int64_t collectedRows = _executive->getTotalResultRows(); + int64_t collectedRows = exec->getTotalResultRows(); // finalRows < 0 indicates there was no postprocessing, so collected rows and final rows should be the // same. if (finalRows < 0) finalRows = collectedRows; @@ -555,7 +567,7 @@ QueryState UserQuerySelect::join() { // Notify workers on the query completion/cancellation to ensure // resources are properly cleaned over there as well. - czar::Czar::getCzar()->getActiveWorkerMap()->addToDoneDeleteFiles(_executive->getId()); + czar::Czar::getCzar()->getActiveWorkerMap()->addToDoneDeleteFiles(exec->getId()); return state; } @@ -577,8 +589,14 @@ void UserQuerySelect::discard() { } } + auto exec = _executive; + if (exec == nullptr) { + LOGS(_log, LOG_LVL_ERROR, "UserQuerySelect::discard called with null exec " << getQueryIdString()); + return; + } + // Make sure resources are released. - if (_executive && _executive->getNumInflight() > 0) { + if (exec->getNumInflight() > 0) { throw UserQueryError(getQueryIdString() + " Executive unfinished, cannot discard"); } @@ -777,8 +795,9 @@ void UserQuerySelect::qMetaRegister(std::string const& resultLocation, std::stri throw UserQueryError(getQueryIdString() + _errorExtra); } - if (_executive != nullptr) { - _executive->setQueryId(_qMetaQueryId); + auto exec = _executive; + if (exec != nullptr) { + exec->setQueryId(_qMetaQueryId); } else { LOGS(_log, LOG_LVL_WARN, "No Executive, assuming invalid query"); } diff --git a/src/czar/Czar.cc b/src/czar/Czar.cc index 0ec244993..3061b4f7e 100644 --- a/src/czar/Czar.cc +++ b/src/czar/Czar.cc @@ -159,7 +159,7 @@ Czar::Czar(string const& configFilePath, string const& czarName) _idCounter(), _uqFactory(), _clientToQuery(), - _monitorSleepTime (_czarConfig->getMonitorSleepTimeMilliSec()), + _monitorSleepTime(_czarConfig->getMonitorSleepTimeMilliSec()), _activeWorkerMap(new ActiveWorkerMap(_czarConfig)) { // set id counter to milliseconds since the epoch, mod 1 year. 
struct timeval tv; @@ -402,45 +402,45 @@ void Czar::killQuery(string const& query, string const& clientId) { int threadId; QueryId queryId; if (ccontrol::UserQueryType::isKill(query, threadId)) { - LOGS(_log, LOG_LVL_DEBUG, "thread ID: " << threadId); + LOGS(_log, LOG_LVL_INFO, "KILL thread ID: " << threadId); lock_guard lock(_mutex); // find it in the client map based on client/thread id ClientThreadId ctId(clientId, threadId); auto iter = _clientToQuery.find(ctId); if (iter == _clientToQuery.end()) { - LOGS(_log, LOG_LVL_INFO, "Cannot find client thread id: " << threadId); - throw std::runtime_error("Unknown thread ID: " + query); + LOGS(_log, LOG_LVL_INFO, "KILL Cannot find client thread id: " << threadId); + throw std::runtime_error("KILL Unknown thread ID: " + query); } uq = iter->second.lock(); } else if (ccontrol::UserQueryType::isCancel(query, queryId)) { - LOGS(_log, LOG_LVL_DEBUG, "query ID: " << queryId); + LOGS(_log, LOG_LVL_INFO, "KILL query ID: " << queryId); lock_guard lock(_mutex); // find it in the client map based on client/thread id auto iter = _idToQuery.find(queryId); if (iter == _idToQuery.end()) { - LOGS(_log, LOG_LVL_INFO, "Cannot find query id: " << queryId); - throw std::runtime_error("Unknown or finished query ID: " + query); + LOGS(_log, LOG_LVL_INFO, "KILL Cannot find query id: " << queryId); + throw std::runtime_error("KILL unknown or finished query ID: " + query); } uq = iter->second.lock(); } else { - throw std::runtime_error("Failed to parse query: " + query); + throw std::runtime_error("KILL failed to parse query: " + query); } // assume this cannot fail or throw if (uq) { - LOGS(_log, LOG_LVL_DEBUG, "Killing query: " << uq->getQueryId()); + LOGS(_log, LOG_LVL_INFO, "KILLing query: " << uq->getQueryId()); // query killing can potentially take very long and we do not want to block // proxy from serving other requests so run it in a detached thread thread killThread([uq]() { uq->kill(); - LOGS(_log, LOG_LVL_DEBUG, "Finished killing query: " << uq->getQueryId()); + LOGS(_log, LOG_LVL_INFO, "Finished KILLing query: " << uq->getQueryId()); }); killThread.detach(); } else { - LOGS(_log, LOG_LVL_DEBUG, "Query has expired/finished: " << query); - throw std::runtime_error("Query has already finished: " + query); + LOGS(_log, LOG_LVL_INFO, "KILL query has expired/finished: " << query); + throw std::runtime_error("KILL query has already finished: " + query); } } diff --git a/src/czar/CzarChunkMap.cc b/src/czar/CzarChunkMap.cc index 82d8fd1e8..6df9936c9 100644 --- a/src/czar/CzarChunkMap.cc +++ b/src/czar/CzarChunkMap.cc @@ -333,7 +333,7 @@ bool CzarFamilyMap::_read() { LOGS(_log, LOG_LVL_TRACE, "CzarFamilyMap::_read() start"); // If replacing the map, this may take a bit of time, but it's probably // better to wait for new maps if something changed.
- std::lock_guard gLock(_familyMapMtx); // &&& check waiting is really needed + std::lock_guard gLock(_familyMapMtx); // &&& check waiting is really needed qmeta::QMetaChunkMap qChunkMap = _qmeta->getChunkMap(_lastUpdateTime); if (_lastUpdateTime == qChunkMap.updateTime) { LOGS(_log, LOG_LVL_DEBUG, diff --git a/src/czar/HttpCzarWorkerModule.cc b/src/czar/HttpCzarWorkerModule.cc index 266fdbdbe..f0a05388b 100644 --- a/src/czar/HttpCzarWorkerModule.cc +++ b/src/czar/HttpCzarWorkerModule.cc @@ -163,6 +163,7 @@ json HttpCzarWorkerModule::_handleJobReady(string const& func) { throw invalid_argument(string("HttpCzarWorkerModule::_handleJobReady No executive for qid=") + to_string(queryId) + " czar=" + to_string(czarId)); } + qdisp::UberJob::Ptr uj = exec->findUberJob(uberJobId); if (uj == nullptr) { throw invalid_argument(string("HttpCzarWorkerModule::_handleJobReady No UberJob for qid=") + @@ -170,6 +171,9 @@ " czar=" + to_string(czarId)); } + uj->setResultFileSize(fileSize); + exec->checkResultFileSize(fileSize); + auto importRes = uj->importResultFile(fileUrl, rowCount, fileSize); jsRet = importRes; diff --git a/src/protojson/UberJobMsg.cc b/src/protojson/UberJobMsg.cc index 65564cdf4..7ac1a89ad 100644 --- a/src/protojson/UberJobMsg.cc +++ b/src/protojson/UberJobMsg.cc @@ -60,8 +60,8 @@ UberJobMsg::UberJobMsg(unsigned int metaVersion, std::string const& replicationI _ujId(ujId), _rowLimit(rowLimit), _maxTableSizeMB(maxTableSizeMB), - _scanInfo(scanInfo_) { - + _scanInfo(scanInfo_), + _idStr("QID=" + to_string(_qId) + "_ujId=" + to_string(_ujId)) { for (auto& jobPtr : jobs) { // This creates the JobMsg objects for all related jobs and their fragments. auto jobMsg = JobMsg::create(jobPtr, _jobSubQueryTempMap, _jobDbTablesMap); @@ -192,7 +192,7 @@ nlohmann::json JobMsg::serializeJson() const { {"queryFragments", json::array()}}); // These are indexes into _jobDbTablesMap, which is shared between all JobMsg in this UberJobMsg. - // &&& TODO:UJ queries appear to work even when "chunkscantables_indexes" is wrong + // &&& TODO:UJ "chunkscantables_indexes" may be unused. auto& jsqCstIndexes = jsJobMsg["chunkscantables_indexes"]; for (auto const& index : _chunkScanTableIndexes) { jsqCstIndexes.push_back(index); diff --git a/src/protojson/UberJobMsg.h b/src/protojson/UberJobMsg.h index d5f6ade9e..c06a3735d 100644 --- a/src/protojson/UberJobMsg.h +++ b/src/protojson/UberJobMsg.h @@ -222,15 +222,16 @@ class JobMsg { JobId _jobId; int _attemptCount; - std::string _chunkQuerySpecDb; - int _scanRating; - bool _scanInteractive; + std::string _chunkQuerySpecDb; // &&& remove, use value for UJ + int _scanRating; // &&& remove, use value for UJ + bool _scanInteractive; // &&& remove, use value for UJ int _chunkId; JobFragment::VectPtr _jobFragments{new JobFragment::Vect()}; JobSubQueryTempMap::Ptr _jobSubQueryTempMap; ///< Map of all query templates related to this UberJob. JobDbTablesMap::Ptr _jobDbTablesMap; ///< Map of all db.tables related to this UberJob. + // &&& remove, use value for UJ std::vector _chunkScanTableIndexes; ///< list of indexes into _jobDbTablesMap.
}; @@ -277,6 +278,8 @@ class UberJobMsg : public std::enable_shared_from_this { ScanInfo::Ptr getScanInfo() const { return _scanInfo; } + std::string const& getIdStr() const { return _idStr; } + private: UberJobMsg(unsigned int metaVersion, std::string const& replicationInstanceId, std::string const& replicationAuthKey, CzarContactInfo::Ptr const& czInfo, @@ -304,6 +307,8 @@ class UberJobMsg : public std::enable_shared_from_this { JobMsg::VectPtr _jobMsgVect{new JobMsg::Vect()}; ScanInfo::Ptr _scanInfo{ScanInfo::create()}; ///< &&& doc + + std::string const _idStr; }; } // namespace lsst::qserv::protojson diff --git a/src/qana/QueryMapping.h b/src/qana/QueryMapping.h index 585971f97..2e8dca319 100644 --- a/src/qana/QueryMapping.h +++ b/src/qana/QueryMapping.h @@ -92,8 +92,6 @@ class QueryMapping { bool hasParameter(Parameter p) const; DbTableSet const& getSubChunkTables() const { return _subChunkTables; } - std::string dump() const { return std::string("&&& NEED CODE"); } - private: ParameterMap _subs; DbTableSet _subChunkTables; diff --git a/src/qdisp/Executive.cc b/src/qdisp/Executive.cc index 983a0bf94..41754bc00 100644 --- a/src/qdisp/Executive.cc +++ b/src/qdisp/Executive.cc @@ -235,27 +235,11 @@ void Executive::queueFileCollect(util::PriorityCommand::Ptr const& cmd) { } } -/* &&& -void Executive::queueUberJob(std::shared_ptr const& uberJob) { - LOGS(_log, LOG_LVL_WARN, cName(__func__) << " &&&uj queueUberJob"); - auto runUberJobFunc = [uberJob](util::CmdData*) { uberJob->runUberJob(); }; - - auto cmd = util::PriorityCommand::Ptr(new util::PriorityCommand(runUberJobFunc)); - _jobStartCmdList.push_back(cmd); - if (_scanInteractive) { - _qdispPool->queCmd(cmd, 0); - } else { - _qdispPool->queCmd(cmd, 1); - } -} -*/ - void Executive::addAndQueueUberJob(shared_ptr const& uj) { { lock_guard lck(_uberJobsMapMtx); - UberJobId ujId = uj->getJobId(); + UberJobId ujId = uj->getUjId(); _uberJobsMap[ujId] = uj; - //&&&uj->setAdded(); LOGS(_log, LOG_LVL_DEBUG, cName(__func__) << " ujId=" << ujId << " uj.sz=" << uj->getJobCount()); } @@ -427,7 +411,8 @@ void Executive::markCompleted(JobId jobId, bool success) { } _unTrack(jobId); if (!success && !isRowLimitComplete()) { - LOGS(_log, LOG_LVL_ERROR, + auto logLvl = (_cancelled) ? LOG_LVL_ERROR : LOG_LVL_TRACE; + LOGS(_log, logLvl, "Executive: requesting squash, cause: " << " failed (code=" << err.getCode() << " " << err.getMsg() << ")"); squash(); // ask to squash @@ -758,6 +743,39 @@ void Executive::checkLimitRowComplete() { _squashSuperfluous(); } +void Executive::checkResultFileSize(uint64_t fileSize) { + _totalResultFileSize += fileSize; + if (_cancelled) return; + + size_t const MB_SIZE_BYTES = 1024 * 1024; + uint64_t maxResultTableSizeBytes = cconfig::CzarConfig::instance()->getMaxTableSizeMB() * MB_SIZE_BYTES; + LOGS(_log, LOG_LVL_TRACE, + cName(__func__) << " sz=" << fileSize << " total=" << _totalResultFileSize + << " max=" << maxResultTableSizeBytes); + if (_totalResultFileSize > maxResultTableSizeBytes) { + LOGS(_log, LOG_LVL_WARN, + cName(__func__) << " total=" << _totalResultFileSize << " max=" << maxResultTableSizeBytes); + // _totalResultFileSize may include non zero values from dead UberJobs, + // so recalculate it to verify. 
+        uint64_t total = 0;
+        {
+            lock_guard lck(_uberJobsMapMtx);
+            for (auto const& [ujId, ujPtr] : _uberJobsMap) {
+                total += ujPtr->getResultFileSize();
+            }
+            _totalResultFileSize = total;
+        }
+        LOGS(_log, LOG_LVL_WARN,
+             cName(__func__) << " recheck total=" << total << " max=" << maxResultTableSizeBytes);
+        if (total > maxResultTableSizeBytes) {
+            LOGS(_log, LOG_LVL_ERROR, "Executive: requesting squash, result file size too large " << total);
+            ResponseHandler::Error err(0, string("Incomplete result already too large ") + to_string(total));
+            _multiError.push_back(err);
+            squash();
+        }
+    }
+}
+
 ostream& operator<<(ostream& os, Executive::JobMap::value_type const& v) {
     auto const& status = v.second->getStatus();
     os << v.first << ": " << *status;
diff --git a/src/qdisp/Executive.h b/src/qdisp/Executive.h
index c2cef1a34..e72216474 100644
--- a/src/qdisp/Executive.h
+++ b/src/qdisp/Executive.h
@@ -232,6 +232,13 @@ class Executive : public std::enable_shared_from_this {
     /// Return a pointer to _scanInfo.
     protojson::ScanInfo::Ptr getScanInfo() { return _scanInfo; }
 
+    /// Add fileSize to `_totalResultFileSize` and check if it exceeds limits.
+    /// If it is too large, check the value against existing UberJob result
+    /// sizes as `_totalResultFileSize` may include failed UberJobs.
+    /// If the sum of all UberJob result file sizes is too large,
+    /// cancel this user query.
+    void checkResultFileSize(uint64_t fileSize = 0);
+
 protected:
     Executive(ExecutiveConfig const& cfg, std::shared_ptr const& ms,
               std::shared_ptr const& sharedResources,
@@ -343,6 +350,8 @@ class Executive : public std::enable_shared_from_this {
     std::atomic _readyToExecute{false};
 
     protojson::ScanInfo::Ptr _scanInfo;  ///< Scan rating and tables.
+
+    std::atomic _totalResultFileSize{0};  ///< Total size of all UberJob result files.
}; } // namespace qdisp diff --git a/src/qdisp/JobDescription.h b/src/qdisp/JobDescription.h index 75ca4a33b..9ad0ffe62 100644 --- a/src/qdisp/JobDescription.h +++ b/src/qdisp/JobDescription.h @@ -63,8 +63,8 @@ class JobDescription { std::shared_ptr const& respHandler, std::shared_ptr const& chunkQuerySpec, std::string const& chunkResultName, bool mock = false) { - JobDescription::Ptr jd(new JobDescription(czarId, qId, jobId, resource, respHandler, - chunkQuerySpec, chunkResultName, mock)); + JobDescription::Ptr jd(new JobDescription(czarId, qId, jobId, resource, respHandler, chunkQuerySpec, + chunkResultName, mock)); return jd; } diff --git a/src/qdisp/UberJob.cc b/src/qdisp/UberJob.cc index 10f535ff1..07ccd6875 100644 --- a/src/qdisp/UberJob.cc +++ b/src/qdisp/UberJob.cc @@ -77,8 +77,10 @@ UberJob::UberJob(Executive::Ptr const& executive, std::shared_ptrsetUberJobId(getJobId())) { + if (job->setUberJobId(getUjId())) { lock_guard lck(_jobsMtx); _jobs.push_back(job); success = true; @@ -167,10 +169,9 @@ void UberJob::runUberJob() { // &&& TODO:UJ this should probably check cancelle bool transmitSuccess = false; string exceptionWhat; try { - //&&&util::InstanceCount ic{"runUberJob&&&"}; - LOGS(_log, LOG_LVL_ERROR, cName(__func__) << "&&&uj runuj d"); + LOGS(_log, LOG_LVL_ERROR, cName(__func__) << "&&&uj sending"); json const response = client.readAsJson(); - LOGS(_log, LOG_LVL_ERROR, cName(__func__) << "&&&uj runuj d1"); + LOGS(_log, LOG_LVL_ERROR, cName(__func__) << "&&&uj worker recv"); if (0 != response.at("success").get()) { transmitSuccess = true; } else { @@ -206,6 +207,7 @@ void UberJob::prepScrubResults() { } void UberJob::_unassignJobs() { + LOGS(_log, LOG_LVL_INFO, cName(__func__)); lock_guard lck(_jobsMtx); auto exec = _executive.lock(); if (exec == nullptr) { @@ -214,7 +216,7 @@ void UberJob::_unassignJobs() { } for (auto&& job : _jobs) { string jid = job->getIdStr(); - if (!job->unassignFromUberJob(getJobId())) { + if (!job->unassignFromUberJob(getUjId())) { LOGS(_log, LOG_LVL_ERROR, cName(__func__) << " could not unassign job=" << jid << " cancelling"); exec->addMultiError(qmeta::JobStatus::RETRY_ERROR, "unable to re-assign " + jid, util::ErrorCode::INTERNAL); @@ -265,6 +267,7 @@ bool UberJob::_setStatusIfOk(qmeta::JobStatus::State newState, string const& msg void UberJob::callMarkCompleteFunc(bool success) { LOGS(_log, LOG_LVL_DEBUG, cName(__func__) << " success=" << success); + LOGS(_log, LOG_LVL_WARN, cName(__func__) << " &&& success=" << success); lock_guard lck(_jobsMtx); // Need to set this uberJob's status, however exec->markCompleted will set @@ -287,11 +290,16 @@ void UberJob::callMarkCompleteFunc(bool success) { _jobs.clear(); } +util::HistogramRolling histoQueImp("&&&uj histoQueImp", {0.1, 1.0, 10.0, 100.0, 1000.0}, 1h, 10000); + /// Retrieve and process a result file using the file-based protocol /// Uses a copy of JobQuery::Ptr instead of _jobQuery as a call to cancel() would reset _jobQuery. 
json UberJob::importResultFile(string const& fileUrl, uint64_t rowCount, uint64_t fileSize) { LOGS(_log, LOG_LVL_DEBUG, cName(__func__) << " fileUrl=" << fileUrl << " rowCount=" << rowCount << " fileSize=" << fileSize); + LOGS(_log, LOG_LVL_WARN, + cName(__func__) << "&&& fileUrl=" << fileUrl << " rowCount=" << rowCount + << " fileSize=" << fileSize); if (isQueryCancelled()) { LOGS(_log, LOG_LVL_WARN, cName(__func__) << " import job was cancelled."); @@ -313,7 +321,7 @@ json UberJob::importResultFile(string const& fileUrl, uint64_t rowCount, uint64_ return _importResultError(false, "rowLimited", "Enough rows already"); } - LOGS(_log, LOG_LVL_DEBUG, cName(__func__) << " fileSize=" << fileSize); + LOGS(_log, LOG_LVL_TRACE, cName(__func__) << " fileSize=" << fileSize); bool const statusSet = setStatusIfOk(qmeta::JobStatus::RESPONSE_READY, getIdStr() + " " + fileUrl); if (!statusSet) { @@ -322,10 +330,15 @@ json UberJob::importResultFile(string const& fileUrl, uint64_t rowCount, uint64_ } weak_ptr ujThis = weak_from_this(); - // TODO:UJ lambda may not be the best way to do this, alsocheck synchronization - may need a mutex for - // merging. + auto startQImp = CLOCK::now(); // &&& + + // fileCollectFunc will be put on the queue to run later. string const idStr = _idStr; - auto fileCollectFunc = [ujThis, fileUrl, rowCount, idStr](util::CmdData*) { + auto fileCollectFunc = [ujThis, fileUrl, rowCount, idStr, startQImp](util::CmdData*) { + auto endQImp = CLOCK::now(); //&&& + std::chrono::duration secsQImp = endQImp - startQImp; // &&& + histoQueImp.addEntry(endQImp, secsQImp.count()); //&&& + LOGS(_log, LOG_LVL_INFO, "&&&uj histo " << histoQueImp.getString("")); auto ujPtr = ujThis.lock(); if (ujPtr == nullptr) { LOGS(_log, LOG_LVL_DEBUG, @@ -435,6 +448,7 @@ json UberJob::_importResultError(bool shouldCancel, string const& errorType, str void UberJob::_importResultFinish(uint64_t resultRows) { LOGS(_log, LOG_LVL_DEBUG, cName(__func__) << " start"); + LOGS(_log, LOG_LVL_INFO, cName(__func__) << " &&& start"); auto exec = _executive.lock(); if (exec == nullptr) { diff --git a/src/qdisp/UberJob.h b/src/qdisp/UberJob.h index ce719d50d..c1ead8b24 100644 --- a/src/qdisp/UberJob.h +++ b/src/qdisp/UberJob.h @@ -71,9 +71,7 @@ class UberJob : public std::enable_shared_from_this { void killUberJob(); QueryId getQueryId() const { return _queryId; } - UberJobId getJobId() const { - return _uberJobId; - } // &&& TODO:UJ change name when JobBase no longer needed. + UberJobId getUjId() const { return _uberJobId; } std::string const& getIdStr() const { return _idStr; } std::shared_ptr getRespHandler() { return _respHandler; } std::shared_ptr getStatus() { return _jobStatus; } @@ -107,12 +105,15 @@ class UberJob : public std::enable_shared_from_this { /// Get the data for the worker that should handle this UberJob. czar::CzarChunkMap::WorkerChunksData::Ptr getWorkerData() { return _workerData; } - /// Collect and merge the results from the worker. + /// Queue the lambda function to collect and merge the results from the worker. nlohmann::json importResultFile(std::string const& fileUrl, uint64_t rowCount, uint64_t fileSize); /// Handle an error from the worker. 
nlohmann::json workerError(int errorCode, std::string const& errorMsg); + void setResultFileSize(uint64_t fileSize) { _resultFileSize = fileSize; } + uint64_t getResultFileSize() { return _resultFileSize; } + std::ostream& dumpOS(std::ostream& os) const; std::string dump() const; friend std::ostream& operator<<(std::ostream& os, UberJob const& uj); @@ -160,6 +161,7 @@ class UberJob : public std::enable_shared_from_this { UberJobId const _uberJobId; qmeta::CzarId const _czarId; int const _rowLimit; + uint64_t _resultFileSize = 0; std::string const _idStr; diff --git a/src/qdisp/testQDisp.cc b/src/qdisp/testQDisp.cc index bda0a020f..59299d1c5 100644 --- a/src/qdisp/testQDisp.cc +++ b/src/qdisp/testQDisp.cc @@ -182,8 +182,8 @@ qdisp::JobDescription::Ptr makeMockJobDescription(qdisp::Executive::Ptr const& e auto cqs = std::make_shared(); // dummy, unused in this case. std::string chunkResultName = "dummyResultTableName"; qmeta::CzarId const czarId = 1; - auto job = qdisp::JobDescription::create(czarId, ex->getId(), sequence, ru, mHandler, - cqs, chunkResultName, true); + auto job = qdisp::JobDescription::create(czarId, ex->getId(), sequence, ru, mHandler, cqs, + chunkResultName, true); return job; } diff --git a/src/qproc/ChunkQuerySpec.h b/src/qproc/ChunkQuerySpec.h index 41582368f..d7ad75984 100644 --- a/src/qproc/ChunkQuerySpec.h +++ b/src/qproc/ChunkQuerySpec.h @@ -67,8 +67,7 @@ class ChunkQuerySpec { bool scanInteractive{false}; DbTableSet subChunkTables; std::vector subChunkIds; - std::vector queries; // &&& remove if possible - std::vector queryTemplates; + std::vector queries; // Consider promoting the concept of container of ChunkQuerySpec // in the hopes of increased code cleanliness. std::shared_ptr nextFragment; ///< ad-hoc linked list (consider removal) diff --git a/src/rproc/InfileMerger.cc b/src/rproc/InfileMerger.cc index bf0f88d7c..b192f6c0f 100644 --- a/src/rproc/InfileMerger.cc +++ b/src/rproc/InfileMerger.cc @@ -332,8 +332,13 @@ bool InfileMerger::merge(proto::ResponseSummary const& responseSummary, return ret; } +uint32_t histLimitCount = 0; +util::HistogramRolling histoInfileBuild("&&&uj histoInfileBuild", {0.1, 1.0, 10.0, 100.0, 1000.0}, 1h, 10000); +util::HistogramRolling histoMergeSecs("&&&uj histoMergeSecs", {0.1, 1.0, 10.0, 100.0, 1000.0}, 1h, 10000); +util::HistogramRolling histoMergeSzB("&&&uj histoMergeSzB", {0.1, 1.0, 10.0, 100.0, 1000.0}, 1h, 10000); + bool InfileMerger::mergeHttp(qdisp::UberJob::Ptr const& uberJob, proto::ResponseData const& responseData) { - UberJobId const uJobId = uberJob->getJobId(); + UberJobId const uJobId = uberJob->getUjId(); std::string queryIdJobStr = uberJob->getIdStr(); if (!_queryIdStrSet) { _setQueryIdStr(QueryIdHelper::makeIdStr(uberJob->getQueryId())); @@ -372,13 +377,17 @@ bool InfileMerger::mergeHttp(qdisp::UberJob::Ptr const& uberJob, proto::Response // Add columns to rows in virtFile. 
util::Timer virtFileT; virtFileT.start(); + auto startInfileBuild = CLOCK::now(); //&&& // UberJobs only get one attempt - int resultJobId = makeJobIdAttempt(uberJob->getJobId(), 0); + int resultJobId = makeJobIdAttempt(uberJob->getUjId(), 0); ProtoRowBuffer::Ptr pRowBuffer = std::make_shared( responseData, resultJobId, _jobIdColName, _jobIdSqlType, _jobIdMysqlType); std::string const virtFile = _infileMgr.prepareSrc(pRowBuffer); std::string const infileStatement = sql::formLoadInfile(_mergeTable, virtFile); virtFileT.stop(); + auto endInfileBuild = CLOCK::now(); //&&& + std::chrono::duration secsInfileBuild = endInfileBuild - startInfileBuild; // &&& + histoInfileBuild.addEntry(endInfileBuild, secsInfileBuild.count()); //&&& // If the job attempt is invalid, exit without adding rows. // It will wait here if rows need to be deleted. @@ -416,7 +425,8 @@ bool InfileMerger::mergeHttp(qdisp::UberJob::Ptr const& uberJob, proto::Response return true; } - auto start = std::chrono::system_clock::now(); + //&&&auto start = std::chrono::system_clock::now(); + auto start = CLOCK::now(); switch (_dbEngine) { case MYISAM: ret = _applyMysqlMyIsam(infileStatement, resultSize); @@ -428,11 +438,20 @@ bool InfileMerger::mergeHttp(qdisp::UberJob::Ptr const& uberJob, proto::Response default: throw std::invalid_argument("InfileMerger::_dbEngine is unknown =" + engineToStr(_dbEngine)); } - auto end = std::chrono::system_clock::now(); + auto end = CLOCK::now(); auto mergeDur = std::chrono::duration_cast(end - start); LOGS(_log, LOG_LVL_DEBUG, "mergeDur=" << mergeDur.count() << " sema(total=" << _semaMgrConn->getTotalCount() << " used=" << _semaMgrConn->getUsedCount() << ")"); + std::chrono::duration secs = end - start; // &&& + histoMergeSecs.addEntry(end, secs.count()); //&&& + histoMergeSzB.addEntry(end, resultSize); // &&& + if ((++histLimitCount) % 1000 == 0) { + LOGS(_log, LOG_LVL_INFO, "&&&uj histo " << histoInfileBuild.getString("")); + LOGS(_log, LOG_LVL_INFO, "&&&uj histo " << histoMergeSecs.getString("")); + LOGS(_log, LOG_LVL_INFO, "&&&uj histo " << histoMergeSzB.getString("")); + } + if (not ret) { LOGS(_log, LOG_LVL_ERROR, "InfileMerger::merge mysql applyMysql failure"); } diff --git a/src/rproc/InfileMerger.h b/src/rproc/InfileMerger.h index d8e472c54..3091246ca 100644 --- a/src/rproc/InfileMerger.h +++ b/src/rproc/InfileMerger.h @@ -165,6 +165,7 @@ class InfileMerger { /// Merge a worker response, which contains a single ResponseData message /// Using job query info for early termination of the merge if needed. /// @return true if merge was successfully imported. 
+ // &&& delete bool merge(proto::ResponseSummary const& responseSummary, proto::ResponseData const& responseData, std::shared_ptr const& jq); diff --git a/src/wbase/FileChannelShared.cc b/src/wbase/FileChannelShared.cc index 030163d60..f51052a1b 100644 --- a/src/wbase/FileChannelShared.cc +++ b/src/wbase/FileChannelShared.cc @@ -455,12 +455,15 @@ bool FileChannelShared::buildAndTransmitResult(MYSQL_RES* mResult, shared_ptrcName(__func__) << " sending start"); //&&& TRACE if (!_sendResponse(tMtxLockA, task, cancelled, multiErr, rowLimitComplete)) { LOGS(_log, LOG_LVL_ERROR, "Could not transmit the request completion message to Czar."); erred = true; break; } - LOGS(_log, LOG_LVL_TRACE, __func__ << " " << task->getIdStr() << " sending done!!!"); + LOGS(_log, LOG_LVL_WARN, + "FileChannelShared " << task->cName(__func__) << " sending done!!!"); //&&& TRACE } } transmitT.stop(); diff --git a/src/wbase/Task.cc b/src/wbase/Task.cc index ca2dbf8e7..33b24f39e 100644 --- a/src/wbase/Task.cc +++ b/src/wbase/Task.cc @@ -144,7 +144,9 @@ Task::Task(UberJobData::Ptr const& ujData, int jobId, int attemptCount, int chun _scanInteractive(scanInteractive), _queryStats(queryStats_), _maxTableSize(maxTableSize * ::MB_SIZE_BYTES), - _rowLimit(ujData->getRowLimit()) { + _rowLimit(ujData->getRowLimit()), + _ujData(ujData), + _idStr(ujData->getIdStr() + " jId=" + to_string(_jId) + " sc=" + to_string(_subchunkId)) { // These attributes will be passed back to Czar in the Protobuf response // to advice which result delivery channel to use. auto const workerConfig = wconfig::WorkerConfig::instance(); @@ -191,118 +193,13 @@ Task::Task(UberJobData::Ptr const& ujData, int jobId, int attemptCount, int chun } _dbTblsAndSubchunks = make_unique(dbTbls_, subchunksVect_); + LOGS(_log, LOG_LVL_INFO, cName(__func__) << " created"); //&&& } Task::~Task() {} -/* &&& -std::vector Task::createTasksForChunk( - std::shared_ptr const& ujData, nlohmann::json const& jsJobs, - std::shared_ptr const& sendChannel, - protojson::ScanInfo::Ptr const& scanInfo, bool scanInteractive, int maxTableSizeMb, - std::shared_ptr const& chunkResourceMgr, mysql::MySqlConfig const& mySqlConfig, - std::shared_ptr const& sqlConnMgr, - std::shared_ptr const& queriesAndChunks, uint16_t resultsHttpPort) { - QueryId qId = ujData->getQueryId(); - UberJobId ujId = ujData->getUberJobId(); - CzarIdType czId = ujData->getCzarId(); - - wpublish::QueryStatistics::Ptr queryStats = queriesAndChunks->addQueryId(qId, czId); - UserQueryInfo::Ptr userQueryInfo = queryStats->getUserQueryInfo(); - - string funcN(__func__); - funcN += " QID=" + to_string(qId) + " "; - - vector vect; - for (auto const& job : jsJobs) { - json const& jsJobDesc = job["jobdesc"]; - http::RequestBodyJSON rbJobDesc(jsJobDesc); - // See qproc::TaskMsgFactory::makeMsgJson for message construction. 
- auto const jdCzarId = rbJobDesc.required("czarId"); - auto const jdQueryId = rbJobDesc.required("queryId"); - if (jdQueryId != qId) { - throw TaskException(ERR_LOC, string("ujId=") + to_string(ujId) + " qId=" + to_string(qId) + - " QueryId mismatch Job qId=" + to_string(jdQueryId)); - } - auto const jdJobId = rbJobDesc.required("jobId"); - auto const jdAttemptCount = rbJobDesc.required("attemptCount"); - auto const jdQuerySpecDb = rbJobDesc.required("querySpecDb"); - auto const jdScanPriority = rbJobDesc.required("scanPriority"); - auto const jdScanInteractive = rbJobDesc.required("scanInteractive"); - auto const jdMaxTableSizeMb = rbJobDesc.required("maxTableSize"); - auto const jdChunkId = rbJobDesc.required("chunkId"); - LOGS(_log, LOG_LVL_TRACE, - funcN << " jd cid=" << jdCzarId << " jdQId=" << jdQueryId << " jdJobId=" << jdJobId - << " jdAtt=" << jdAttemptCount << " jdQDb=" << jdQuerySpecDb - << " jdScanPri=" << jdScanPriority << " interactive=" << jdScanInteractive - << " maxTblSz=" << jdMaxTableSizeMb << " chunkId=" << jdChunkId); - - auto const jdQueryFragments = rbJobDesc.required("queryFragments"); - int fragmentNumber = 0; - for (auto const& frag : jdQueryFragments) { - vector fragSubQueries; - vector fragSubchunkIds; - vector fragSubTables; - LOGS(_log, LOG_LVL_DEBUG, funcN << " frag=" << frag); - http::RequestBodyJSON rbFrag(frag); - auto const& jsQueries = rbFrag.required("queries"); - // TODO:UJ move to uberjob???, these should be the same for all jobs - for (auto const& subQ : jsQueries) { - http::RequestBodyJSON rbSubQ(subQ); - auto const subQuery = rbSubQ.required("subQuery"); - LOGS(_log, LOG_LVL_DEBUG, funcN << " subQuery=" << subQuery); - fragSubQueries.push_back(subQuery); - } - auto const& resultTable = rbFrag.required("resultTable"); - auto const& jsSubIds = rbFrag.required("subchunkIds"); - for (auto const& scId : jsSubIds) { - fragSubchunkIds.push_back(scId); - } - auto const& jsSubTables = rbFrag.required("subchunkTables"); - - for (auto const& scDbTable : jsSubTables) { // TODO:UJ are these the same for all jobs? - http::RequestBodyJSON rbScDbTable(scDbTable); - string scDb = rbScDbTable.required("scDb"); - string scTable = rbScDbTable.required("scTable"); - TaskDbTbl scDbTbl(scDb, scTable); - fragSubTables.push_back(scDbTbl); - } - - for (string const& fragSubQ : fragSubQueries) { - size_t templateId = userQueryInfo->addTemplate(fragSubQ); - if (fragSubchunkIds.empty()) { - bool const noSubchunks = false; - int const subchunkId = -1; - auto task = Task::Ptr(new Task( - ujData, jdJobId, jdAttemptCount, jdChunkId, fragmentNumber, templateId, - noSubchunks, subchunkId, jdQuerySpecDb, scanInfo, scanInteractive, maxTableSizeMb, - fragSubTables, fragSubchunkIds, sendChannel, queryStats, resultsHttpPort)); - - vect.push_back(task); - } else { - for (auto subchunkId : fragSubchunkIds) { - bool const hasSubchunks = true; - auto task = Task::Ptr(new Task(ujData, jdJobId, jdAttemptCount, jdChunkId, - fragmentNumber, templateId, hasSubchunks, subchunkId, - jdQuerySpecDb, scanInfo, scanInteractive, - maxTableSizeMb, fragSubTables, fragSubchunkIds, - sendChannel, queryStats, resultsHttpPort)); - vect.push_back(task); - } - } - } - ++fragmentNumber; - } - } - - for (auto taskPtr : vect) { - // newQueryRunner sets the `_taskQueryRunner` pointer in `task`. 
- taskPtr->setTaskQueryRunner(wdb::QueryRunner::newQueryRunner(taskPtr, chunkResourceMgr, mySqlConfig, - sqlConnMgr, queriesAndChunks)); - } - return vect; -} -*/ +util::HistogramRolling histoBuildTasks("&&&uj histoBuildTasks", {0.1, 1.0, 10.0, 100.0, 1000.0}, 1h, 10000); +util::HistogramRolling histoTaskCount("&&&uj histoTasksCount", {0.1, 1.0, 10.0, 100.0, 1000.0}, 1h, 10000); std::vector Task::createTasksFromUberJobMsg( std::shared_ptr const& ujMsg, std::shared_ptr const& ujData, @@ -314,6 +211,7 @@ std::vector Task::createTasksFromUberJobMsg( UberJobId ujId = ujData->getUberJobId(); CzarIdType czId = ujData->getCzarId(); + auto startBuildTasks = CLOCK::now(); vector vect; // List of created tasks to be returned. wpublish::QueryStatistics::Ptr queryStats = queriesAndChunks->addQueryId(qId, czId); @@ -403,6 +301,14 @@ std::vector Task::createTasksFromUberJobMsg( taskPtr->setTaskQueryRunner(wdb::QueryRunner::newQueryRunner(taskPtr, chunkResourceMgr, mySqlConfig, sqlConnMgr, queriesAndChunks)); } + + auto endBuildTasks = CLOCK::now(); //&&& + std::chrono::duration secsBuildTasks = endBuildTasks - startBuildTasks; // &&& + histoBuildTasks.addEntry(endBuildTasks, secsBuildTasks.count()); //&&& + LOGS(_log, LOG_LVL_INFO, "&&&uj histo " << histoBuildTasks.getString("")); + histoTaskCount.addEntry(endBuildTasks, vect.size()); //&&& + LOGS(_log, LOG_LVL_INFO, "&&&uj histo " << histoTaskCount.getString("")); + return vect; } diff --git a/src/wbase/Task.h b/src/wbase/Task.h index 0711cfe9f..9f9d30b88 100644 --- a/src/wbase/Task.h +++ b/src/wbase/Task.h @@ -149,7 +149,7 @@ class Task : public util::CommandForThreadPool { bool operator()(Ptr const& x, Ptr const& y); }; - std::string cName(const char* func) const { return std::string("Task::") + func; } + std::string cName(const char* func) const { return std::string("Task::") + func + " " + _idStr; } // TODO:UJ too many parameters. // - fragmentNumber seems pointless @@ -168,18 +168,6 @@ class Task : public util::CommandForThreadPool { Task(const Task&) = delete; virtual ~Task(); -/* &&& - /// Read json to generate a vector of one or more task for a chunk. - static std::vector createTasksForChunk( /// &&& delete - std::shared_ptr const& ujData, nlohmann::json const& jsJobs, - std::shared_ptr const& sendChannel, - protojson::ScanInfo::Ptr const& scanInfo, bool scanInteractive, int maxTableSizeMb, - std::shared_ptr const& chunkResourceMgr, - mysql::MySqlConfig const& mySqlConfig, std::shared_ptr const& sqlConnMgr, - std::shared_ptr const& queriesAndChunks, - uint16_t resultsHttpPort = 8080); -*/ - /// &&& static std::vector createTasksFromUberJobMsg( std::shared_ptr const& uberJobMsg, @@ -195,12 +183,7 @@ class Task : public util::CommandForThreadPool { std::shared_ptr const& ujData, nlohmann::json const& jsJobs, std::shared_ptr const& sendChannel, protojson::ScanInfo::Ptr const& scanInfo, bool scanInteractive, int maxTableSizeMb, - std::shared_ptr const& chunkResourceMgr - //&&&mysql::MySqlConfig const& mySqlConfig, std::shared_ptr const& - // sqlConnMgr, - //&&&std::shared_ptr const& queriesAndChunks, - //&&&uint16_t resultsHttpPort = 8080); - ); + std::shared_ptr const& chunkResourceMgr); std::shared_ptr getSendChannel() const { return _sendChannel; } void resetSendChannel() { _sendChannel.reset(); } ///< reset the shared pointer for FileChannelShared @@ -408,6 +391,9 @@ class Task : public util::CommandForThreadPool { /// When > 0, indicates maximum number of rows needed for a result. 
int const _rowLimit; + std::shared_ptr _ujData; + std::string const _idStr; + bool _unitTest = false; ///< }; diff --git a/src/wbase/UberJobData.cc b/src/wbase/UberJobData.cc index a70793f2a..b782e645a 100644 --- a/src/wbase/UberJobData.cc +++ b/src/wbase/UberJobData.cc @@ -68,7 +68,7 @@ UberJobData::UberJobData(UberJobId uberJobId, std::string const& czarName, qmeta _workerId(workerId), _authKey(authKey), _foreman(foreman), - _idStr(string("QID=") + to_string(_queryId) + ":ujId=" + to_string(_uberJobId)) {} + _idStr(string("QID=") + to_string(_queryId) + "_ujId=" + to_string(_uberJobId)) {} void UberJobData::setFileChannelShared(std::shared_ptr const& fileChannelShared) { if (_fileChannelShared != nullptr && _fileChannelShared != fileChannelShared) { @@ -79,7 +79,8 @@ void UberJobData::setFileChannelShared(std::shared_ptr const& void UberJobData::responseFileReady(string const& httpFileUrl, uint64_t rowCount, uint64_t fileSize, uint64_t headerCount) { - LOGS(_log, LOG_LVL_TRACE, + //&&&LOGS(_log, LOG_LVL_TRACE, + LOGS(_log, LOG_LVL_INFO, cName(__func__) << " httpFileUrl=" << httpFileUrl << " rows=" << rowCount << " fSize=" << fileSize << " headerCount=" << headerCount); @@ -152,6 +153,7 @@ bool UberJobData::responseError(util::MultiError& multiErr, std::shared_ptr const& headers_, std::string const& url_, std::string const& requestContext_, std::string const& requestStr_) { + LOGS(_log, LOG_LVL_INFO, cName(__func__)); // &&& util::QdispPool::Ptr wPool; if (_foreman != nullptr) { wPool = _foreman->getWPool(); @@ -183,11 +185,12 @@ void UberJobData::cancelAllTasks() { string UJTransmitCmd::cName(const char* funcN) const { stringstream os; - os << "UJTransmitCmd::" << funcN << " czId=" << _czarId << " qId=" << _queryId << " ujId=" << _uberJobId; + os << "UJTransmitCmd::" << funcN << " czId=" << _czarId << " QID=" << _queryId << "_ujId=" << _uberJobId; return os.str(); } void UJTransmitCmd::action(util::CmdData* data) { + LOGS(_log, LOG_LVL_INFO, cName(__func__)); //&&& // Make certain _selfPtr is reset before leaving this function. // If a retry is needed, duplicate() is called. class ResetSelf { @@ -218,6 +221,7 @@ void UJTransmitCmd::action(util::CmdData* data) { } catch (exception const& ex) { LOGS(_log, LOG_LVL_WARN, cName(__func__) + " " + _requestContext + " failed, ex: " + ex.what()); } + LOGS(_log, LOG_LVL_INFO, cName(__func__) << " &&& transmit finished"); if (!transmitSuccess) { auto sPtr = _selfPtr; @@ -256,8 +260,8 @@ void UJTransmitCmd::action(util::CmdData* data) { } void UJTransmitCmd::kill() { - string const funcN("UJTransmitCmd::kill"); - LOGS(_log, LOG_LVL_WARN, funcN); + //&&&string const funcN("UJTransmitCmd::kill"); + LOGS(_log, LOG_LVL_WARN, cName(__func__)); auto sPtr = _selfPtr; _selfPtr.reset(); if (sPtr == nullptr) { @@ -266,6 +270,7 @@ void UJTransmitCmd::kill() { } UJTransmitCmd::Ptr UJTransmitCmd::duplicate() { + LOGS(_log, LOG_LVL_INFO, cName(__func__)); //&&& auto ujD = _ujData.lock(); if (ujD == nullptr) { return nullptr; diff --git a/src/wbase/UberJobData.h b/src/wbase/UberJobData.h index d4765fbbe..2634e0325 100644 --- a/src/wbase/UberJobData.h +++ b/src/wbase/UberJobData.h @@ -96,7 +96,7 @@ class UberJobData : public std::enable_shared_from_this { /// Let the Czar know there's been a problem. 
bool responseError(util::MultiError& multiErr, std::shared_ptr const& task, bool cancelled); - std::string getIdStr() const { return _idStr; } + std::string const& getIdStr() const { return _idStr; } std::string cName(std::string const& funcName) { return "UberJobData::" + funcName + " " + getIdStr(); } bool getCancelled() const { return _cancelled; } diff --git a/src/wdb/QueryRunner.cc b/src/wdb/QueryRunner.cc index 8fdd5194c..5774de042 100644 --- a/src/wdb/QueryRunner.cc +++ b/src/wdb/QueryRunner.cc @@ -136,8 +136,9 @@ util::TimerHistogram memWaitHisto("memWait Hist", {1, 5, 10, 20, 40}); bool QueryRunner::runQuery() { util::HoldTrack::Mark runQueryMarkA(ERR_LOC, "runQuery " + to_string(_task->getQueryId())); QSERV_LOGCONTEXT_QUERY_JOB(_task->getQueryId(), _task->getJobId()); - LOGS(_log, LOG_LVL_TRACE, - __func__ << " tid=" << _task->getIdStr() << " scsId=" << _task->getSendChannel()->getScsId()); + LOGS(_log, LOG_LVL_WARN, + "QueryRunner " << _task->cName(__func__) //&&& TRACE + << " scsId=" << _task->getSendChannel()->getScsId()); // Start tracking the task. auto now = chrono::system_clock::now(); @@ -261,7 +262,9 @@ bool QueryRunner::_dispatchChannel() { util::Timer primeT; primeT.start(); _task->queryExecutionStarted(); + LOGS(_log, LOG_LVL_WARN, "QueryRunner " << _task->cName(__func__) << " sql start"); //&&& TRACE MYSQL_RES* res = _primeResult(query); // This runs the SQL query, throws SqlErrorObj on failure. + LOGS(_log, LOG_LVL_WARN, "QueryRunner " << _task->cName(__func__) << " sql end"); //&&& TRACE primeT.stop(); needToFreeRes = true; if (taskSched != nullptr) { diff --git a/src/wdb/QueryRunner.h b/src/wdb/QueryRunner.h index a881075f0..639a8f569 100644 --- a/src/wdb/QueryRunner.h +++ b/src/wdb/QueryRunner.h @@ -55,7 +55,8 @@ class QueriesAndChunks; namespace lsst::qserv::wdb { -/// On the worker, run a query related to a Task, writing the results to a table or supplied SendChannel. +/// On the worker, run a query related to a Task, hold the resources needed to run the query, +/// and write the results to the supplied SendChannel. 
/// class QueryRunner : public wbase::TaskQueryRunner, public std::enable_shared_from_this { public: diff --git a/src/wdb/testQueryRunner.cc b/src/wdb/testQueryRunner.cc index c67acf74a..de9aebaab 100644 --- a/src/wdb/testQueryRunner.cc +++ b/src/wdb/testQueryRunner.cc @@ -167,9 +167,8 @@ BOOST_AUTO_TEST_CASE(Simple) { auto scanInfo = lsst::qserv::protojson::ScanInfo::create(); scanInfo->scanRating = mInfo.scanRating; scanInfo->infoTables.emplace_back(mInfo.db, mInfo.table, mInfo.lockInMemory, mInfo.scanRating); - vector taskVect = - Task::createTasksForUnitTest(ujData, *msgJson, sChannel, scanInfo, mInfo.scanInteractive, - mInfo.maxTableSize, crm); + vector taskVect = Task::createTasksForUnitTest(ujData, *msgJson, sChannel, scanInfo, + mInfo.scanInteractive, mInfo.maxTableSize, crm); Task::Ptr task = taskVect[0]; QueryRunner::Ptr a(QueryRunner::newQueryRunner(task, crm, newMySqlConfig(), sqlConnMgr, queries)); @@ -193,9 +192,8 @@ BOOST_AUTO_TEST_CASE(Output) { auto scanInfo = lsst::qserv::protojson::ScanInfo::create(); scanInfo->scanRating = mInfo.scanRating; scanInfo->infoTables.emplace_back(mInfo.db, mInfo.table, mInfo.lockInMemory, mInfo.scanRating); - vector taskVect = - Task::createTasksForUnitTest(ujData, *msgJson, sc, scanInfo, mInfo.scanInteractive, - mInfo.maxTableSize, crm); + vector taskVect = Task::createTasksForUnitTest(ujData, *msgJson, sc, scanInfo, + mInfo.scanInteractive, mInfo.maxTableSize, crm); Task::Ptr task = taskVect[0]; QueryRunner::Ptr a(QueryRunner::newQueryRunner(task, crm, newMySqlConfig(), sqlConnMgr, queries)); diff --git a/src/xrdsvc/HttpWorkerCzarModule.cc b/src/xrdsvc/HttpWorkerCzarModule.cc index 8a4aa910b..a672b740a 100644 --- a/src/xrdsvc/HttpWorkerCzarModule.cc +++ b/src/xrdsvc/HttpWorkerCzarModule.cc @@ -112,6 +112,7 @@ json HttpWorkerCzarModule::_handleQueryJob(string const& func) { try { auto const& jsReq = body().objJson; auto uberJobMsg = protojson::UberJobMsg::createFromJson(jsReq); + LOGS(_log, LOG_LVL_WARN, uberJobMsg->getIdStr() << " &&& parsed msg"); UberJobId ujId = uberJobMsg->getUberJobId(); auto ujCzInfo = uberJobMsg->getCzarContactInfo(); @@ -123,6 +124,7 @@ json HttpWorkerCzarModule::_handleQueryJob(string const& func) { // Get or create QueryStatistics and UserQueryInfo instances. auto queryStats = foreman()->getQueriesAndChunks()->addQueryId(ujQueryId, ujCzInfo->czId); auto userQueryInfo = queryStats->getUserQueryInfo(); + LOGS(_log, LOG_LVL_WARN, uberJobMsg->getIdStr() << " &&& added to stats"); if (userQueryInfo->getCancelledByCzar()) { throw wbase::TaskException( @@ -136,6 +138,7 @@ json HttpWorkerCzarModule::_handleQueryJob(string const& func) { auto ujData = wbase::UberJobData::create(ujId, ujCzInfo->czName, ujCzInfo->czId, ujCzInfo->czHostName, ujCzInfo->czPort, ujQueryId, ujRowLimit, targetWorkerId, foreman(), authKey()); + LOGS(_log, LOG_LVL_WARN, uberJobMsg->getIdStr() << " &&& ujData created"); // Find the entry for this queryId, create a new one if needed. userQueryInfo->addUberJob(ujData); @@ -150,8 +153,7 @@ json HttpWorkerCzarModule::_handleQueryJob(string const& func) { channelShared->setTaskCount(ujTasks.size()); ujData->addTasks(ujTasks); - // At this point, it looks like the message was sent successfully, update - // czar touched time. + // At this point, it looks like the message was sent successfully. 
wcontrol::WCzarInfoMap::Ptr wCzarMap = foreman()->getWCzarInfoMap(); wcontrol::WCzarInfo::Ptr wCzarInfo = wCzarMap->getWCzarInfo(czarId); wCzarInfo->czarMsgReceived(CLOCK::now()); From 055702f8a69de455cf889abc88db722c00cc6907 Mon Sep 17 00:00:00 2001 From: John Gates Date: Thu, 9 Jan 2025 14:39:06 -0800 Subject: [PATCH 22/22] Improved Job creation performance. --- src/ccontrol/MergingHandler.cc | 23 +----- src/ccontrol/MergingHandler.h | 7 -- src/ccontrol/UserQuerySelect.cc | 13 ++-- src/czar/CzarChunkMap.cc | 2 +- src/czar/HttpCzarWorkerModule.cc | 5 +- src/qdisp/Executive.cc | 35 +++++---- src/qdisp/Executive.h | 16 ++-- src/qdisp/JobDescription.cc | 16 ++-- src/qdisp/JobQuery.cc | 3 +- src/qdisp/UberJob.cc | 19 ++--- src/qdisp/UberJob.h | 2 + src/qdisp/testQDisp.cc | 9 ++- src/qproc/QuerySession.cc | 1 - src/rproc/InfileMerger.cc | 116 +---------------------------- src/rproc/InfileMerger.h | 7 -- src/util/Error.h | 1 + src/wbase/FileChannelShared.cc | 21 +++--- src/wbase/FileChannelShared.h | 2 + src/wbase/Task.cc | 22 +++--- src/wbase/Task.h | 8 +- src/wbase/UberJobData.cc | 5 +- src/wbase/UberJobData.h | 13 ++-- src/wdb/testQueryRunner.cc | 4 +- src/xrdsvc/HttpWorkerCzarModule.cc | 10 ++- 24 files changed, 122 insertions(+), 238 deletions(-) diff --git a/src/ccontrol/MergingHandler.cc b/src/ccontrol/MergingHandler.cc index aaa940049..637ff1b32 100644 --- a/src/ccontrol/MergingHandler.cc +++ b/src/ccontrol/MergingHandler.cc @@ -269,9 +269,7 @@ shared_ptr const& MergingHandler::_getHttpConnPool() { } MergingHandler::MergingHandler(std::shared_ptr merger, std::string const& tableName) - : _infileMerger{merger}, _tableName{tableName} { - _initState(); -} + : _infileMerger{merger}, _tableName{tableName} {} MergingHandler::~MergingHandler() { LOGS(_log, LOG_LVL_DEBUG, __func__ << " " << _tableName); } @@ -293,23 +291,6 @@ std::ostream& MergingHandler::print(std::ostream& os) const { return os << "MergingRequester(" << _tableName << ", flushed=" << (_flushed ? "true)" : "false)"); } -void MergingHandler::_initState() { _setError(0, ""); } - -bool MergingHandler::_merge(proto::ResponseSummary const& responseSummary, - proto::ResponseData const& responseData, - shared_ptr const& jobQuery) { - if (_flushed) { - throw util::Bug(ERR_LOC, "already flushed"); - } - bool success = _infileMerger->merge(responseSummary, responseData, jobQuery); - if (!success) { - LOGS(_log, LOG_LVL_WARN, __func__ << " failed"); - util::Error const& err = _infileMerger->getError(); - _setError(ccontrol::MSG_RESULT_ERROR, err.getMsg()); - } - return success; -} - bool MergingHandler::_mergeHttp(shared_ptr const& uberJob, proto::ResponseData const& responseData) { if (_flushed) { @@ -325,7 +306,7 @@ bool MergingHandler::_mergeHttp(shared_ptr const& uberJob, } void MergingHandler::_setError(int code, std::string const& msg) { - LOGS(_log, LOG_LVL_DEBUG, "_setErr: code: " << code << ", message: " << msg); + LOGS(_log, LOG_LVL_DEBUG, "_setError: code: " << code << ", message: " << msg); std::lock_guard lock(_errorMutex); _error = Error(code, msg); } diff --git a/src/ccontrol/MergingHandler.h b/src/ccontrol/MergingHandler.h index 6868abb16..a34a547ae 100644 --- a/src/ccontrol/MergingHandler.h +++ b/src/ccontrol/MergingHandler.h @@ -95,13 +95,6 @@ class MergingHandler : public qdisp::ResponseHandler { void prepScrubResults(int jobId, int attempt) override; private: - /// Prepare for first call to flush(). 
- void _initState(); - - // &&& delete - bool _merge(proto::ResponseSummary const& responseSummary, proto::ResponseData const& responseData, - std::shared_ptr const& jobQuery); - /// Call InfileMerger to do the work of merging this data to the result. bool _mergeHttp(std::shared_ptr const& uberJob, proto::ResponseData const& responseData); diff --git a/src/ccontrol/UserQuerySelect.cc b/src/ccontrol/UserQuerySelect.cc index 186359768..46264c210 100644 --- a/src/ccontrol/UserQuerySelect.cc +++ b/src/ccontrol/UserQuerySelect.cc @@ -167,7 +167,7 @@ void UserQuerySelect::kill() { // make a copy of executive pointer to keep it alive and avoid race // with pointer being reset in discard() method if (exec != nullptr) { - exec->squash(); + exec->squash("UserQuerySelect::kill"); } } catch (UserQueryError const& e) { // Silence merger discarding errors, because this object is being @@ -296,6 +296,7 @@ void UserQuerySelect::submit() { return; } dbName = cs->db; + _queryDbName = dbName; dbNameSet = true; } @@ -308,13 +309,9 @@ void UserQuerySelect::submit() { ++sequence; } - if (dbNameSet) { - _queryDbName = dbName; - } - /// At this point the executive has a map of all jobs with the chunkIds as the key. // This is needed to prevent Czar::_monitor from starting things before they are ready. - exec->setReadyToExecute(); + exec->setAllJobsCreated(); buildAndSendUberJobs(); LOGS(_log, LOG_LVL_DEBUG, "total jobs in query=" << sequence); @@ -341,7 +338,8 @@ void UserQuerySelect::buildAndSendUberJobs() { LOGS(_log, LOG_LVL_ERROR, funcN << " called with null exec " << getQueryIdString()); return; } - if (!exec->isReadyToExecute()) { + + if (!exec->isAllJobsCreated()) { LOGS(_log, LOG_LVL_INFO, funcN << " executive isn't ready to generate UberJobs."); return; } @@ -406,6 +404,7 @@ void UserQuerySelect::buildAndSendUberJobs() { // numerical order. The workers run shared scans in numerical order of chunkId numbers. // Numerical order keeps the number of partially complete UberJobs running on a worker to a minimum, // and should minimize the time for the first UberJob on the worker to complete. 
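The chunk-ordering rule described in the preceding comment can be pictured with a small self-contained sketch. Everything below is invented for illustration (JobInfo, UberJobSketch, groupIntoUberJobs, and maxJobsPerUberJob are not the actual Qserv types; the real code works on JobQuery and UberJob objects): std::map already iterates chunkIds in ascending key order, so the grouping falls out of a single pass.

#include <cstddef>
#include <map>
#include <string>
#include <vector>

// Illustrative only: stand-ins for the real JobQuery/UberJob machinery.
struct JobInfo {
    std::string workerId;  // worker chosen to handle this chunk's job
};
using UberJobSketch = std::vector<int>;  // chunkIds grouped into one UberJob

// Walk the unassigned jobs in ascending chunkId order and start a new
// UberJob for a worker whenever the current one is full, so every worker
// receives its chunks in numerical order.
std::map<std::string, std::vector<UberJobSketch>> groupIntoUberJobs(
        std::map<int, JobInfo> const& unassigned, std::size_t maxJobsPerUberJob) {
    std::map<std::string, std::vector<UberJobSketch>> perWorker;
    for (auto const& [chunkId, job] : unassigned) {  // ascending chunkId
        auto& ujList = perWorker[job.workerId];
        if (ujList.empty() || ujList.back().size() >= maxJobsPerUberJob) {
            ujList.emplace_back();  // cut a new UberJob for this worker
        }
        ujList.back().push_back(chunkId);
    }
    return perWorker;
}

Cutting a new UberJob only when the current one fills keeps each worker's list contiguous in chunkId, which is what lets the shared scans stay aligned.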
+ LOGS(_log, LOG_LVL_WARN, " &&&d " << funcN << " start assigning"); for (auto const& [chunkId, jqPtr] : unassignedChunksInQuery) { bool const increaseAttemptCount = true; jqPtr->getDescription()->incrAttemptCount(exec, increaseAttemptCount); diff --git a/src/czar/CzarChunkMap.cc b/src/czar/CzarChunkMap.cc index 6df9936c9..23c5aa816 100644 --- a/src/czar/CzarChunkMap.cc +++ b/src/czar/CzarChunkMap.cc @@ -428,7 +428,7 @@ void CzarFamilyMap::insertIntoMaps(std::shared_ptr const& newFami CzarChunkMap::SizeT sz) { // Get the CzarChunkMap for this family auto familyName = getFamilyNameFromDbName(dbName); - LOGS(_log, LOG_LVL_INFO, + LOGS(_log, LOG_LVL_TRACE, cName(__func__) << " familyInsrt{w=" << workerId << " fN=" << familyName << " dbN=" << dbName << " tblN=" << tableName << " chunk=" << chunkIdNum << " sz=" << sz << "}"); auto& nfMap = *newFamilyMap; diff --git a/src/czar/HttpCzarWorkerModule.cc b/src/czar/HttpCzarWorkerModule.cc index f0a05388b..a833e8f2b 100644 --- a/src/czar/HttpCzarWorkerModule.cc +++ b/src/czar/HttpCzarWorkerModule.cc @@ -99,12 +99,13 @@ json HttpCzarWorkerModule::_workerCzarComIssue() { json HttpCzarWorkerModule::_handleJobError(string const& func) { LOGS(_log, LOG_LVL_DEBUG, "HttpCzarWorkerModule::_handleJobError start"); + LOGS(_log, LOG_LVL_WARN, "&&& HttpCzarWorkerModule::_handleJobError start " << body().objJson); // Metadata-only responses for the file-based protocol should not have any data // Parse and verify the json message and then kill the UberJob. json jsRet = {{"success", 0}, {"errortype", "unknown"}, {"note", "initialized"}}; try { - // See qdisp::UberJob::runUberJob() for json message construction. &&& + // TODO:UJ see wbase::UberJobData::responseError for message construction string const targetWorkerId = body().required("workerid"); string const czarName = body().required("czar"); qmeta::CzarId const czarId = body().required("czarid"); @@ -147,7 +148,7 @@ json HttpCzarWorkerModule::_handleJobReady(string const& func) { try { // &&& TODO:UJ file response - move construction and parsing // &&& TODO:UJ to a class so it can be added to WorkerCzarComIssue - // See qdisp::UberJob::runUberJob() for json message construction. 
&&& + // See wbase::UberJobData::responseFileReady string const targetWorkerId = body().required("workerid"); string const czarName = body().required("czar"); qmeta::CzarId const czarId = body().required("czarid"); diff --git a/src/qdisp/Executive.cc b/src/qdisp/Executive.cc index 41754bc00..e414e986a 100644 --- a/src/qdisp/Executive.cc +++ b/src/qdisp/Executive.cc @@ -196,20 +196,22 @@ JobQuery::Ptr Executive::add(JobDescription::Ptr const& jobDesc) { QSERV_LOGCONTEXT_QUERY_JOB(jobQuery->getQueryId(), jobQuery->getJobId()); { - lock_guard lock(_cancelled.getMutex()); - if (_cancelled) { - LOGS(_log, LOG_LVL_DEBUG, - "Executive already cancelled, ignoring add(" << jobDesc->id() << ")"); - return nullptr; + { + lock_guard lock(_cancelled.getMutex()); + if (_cancelled) { + LOGS(_log, LOG_LVL_DEBUG, + "Executive already cancelled, ignoring add(" << jobDesc->id() << ")"); + return nullptr; + } } - if (!_addJobToMap(jobQuery)) { - LOGS(_log, LOG_LVL_ERROR, "Executive ignoring duplicate job add"); + if (!_track(jobQuery->getJobId(), jobQuery)) { + LOGS(_log, LOG_LVL_ERROR, "Executive ignoring duplicate track add"); return jobQuery; } - if (!_track(jobQuery->getJobId(), jobQuery)) { - LOGS(_log, LOG_LVL_ERROR, "Executive ignoring duplicate track add"); + if (!_addJobToMap(jobQuery)) { + LOGS(_log, LOG_LVL_ERROR, "Executive ignoring duplicate job add"); return jobQuery; } @@ -240,7 +242,7 @@ void Executive::addAndQueueUberJob(shared_ptr const& uj) { lock_guard lck(_uberJobsMapMtx); UberJobId ujId = uj->getUjId(); _uberJobsMap[ujId] = uj; - LOGS(_log, LOG_LVL_DEBUG, cName(__func__) << " ujId=" << ujId << " uj.sz=" << uj->getJobCount()); + LOGS(_log, LOG_LVL_TRACE, cName(__func__) << " ujId=" << ujId << " uj.sz=" << uj->getJobCount()); } auto runUberJobFunc = [uj](util::CmdData*) { uj->runUberJob(); }; @@ -415,18 +417,19 @@ void Executive::markCompleted(JobId jobId, bool success) { LOGS(_log, logLvl, "Executive: requesting squash, cause: " << " failed (code=" << err.getCode() << " " << err.getMsg() << ")"); - squash(); // ask to squash + squash(string("markComplete error ") + err.getMsg()); // ask to squash } } -void Executive::squash() { +void Executive::squash(string const& note) { bool alreadyCancelled = _cancelled.exchange(true); if (alreadyCancelled) { LOGS(_log, LOG_LVL_DEBUG, "Executive::squash() already cancelled! refusing. qid=" << getId()); return; } - LOGS(_log, LOG_LVL_INFO, "Executive::squash Trying to cancel all queries... qid=" << getId()); + LOGS(_log, LOG_LVL_WARN, + "Executive::squash Trying to cancel all queries... qid=" << getId() << " " << note); deque jobsToCancel; { lock_guard lockJobMap(_jobMapMtx); @@ -670,6 +673,7 @@ void Executive::_waitAllUntilEmpty() { int moreDetailThreshold = 10; int complainCount = 0; const chrono::seconds statePrintDelay(5); + // Loop until all jobs have completed and all jobs have been created. 
while (!_incompleteJobs.empty()) { count = _incompleteJobs.size(); if (count != lastCount) { @@ -769,9 +773,10 @@ void Executive::checkResultFileSize(uint64_t fileSize) { cName(__func__) << "recheck total=" << total << " max=" << maxResultTableSizeBytes); if (total > maxResultTableSizeBytes) { LOGS(_log, LOG_LVL_ERROR, "Executive: requesting squash, result file size too large " << total); - ResponseHandler::Error err(0, string("Incomplete result already too large ") + to_string(total)); + ResponseHandler::Error err(util::ErrorCode::CZAR_RESULT_TOO_LARGE, + string("Incomplete result already too large ") + to_string(total)); _multiError.push_back(err); - squash(); + squash("czar, file too large"); } } } diff --git a/src/qdisp/Executive.h b/src/qdisp/Executive.h index e72216474..48e64e3dd 100644 --- a/src/qdisp/Executive.h +++ b/src/qdisp/Executive.h @@ -147,7 +147,7 @@ class Executive : public std::enable_shared_from_this { void markCompleted(JobId refNum, bool success); /// Squash all the jobs. - void squash(); + void squash(std::string const& note); bool getEmpty() { return _empty; } @@ -210,13 +210,13 @@ class Executive : public std::enable_shared_from_this { // The below value should probably be based on the user query, with longer sleeps for slower queries. int getAttemptSleepSeconds() const { return 15; } // As above or until added to config file. - int getMaxAttempts() const { return 5; } // Should be set by config + int getMaxAttempts() const { return 50; } // TODO:UJ Should be set by config - /// Calling this indicates the executive is ready to create and execute UberJobs. - void setReadyToExecute() { _readyToExecute = true; } + /// Calling this indicates all Jobs for this user query have been created. + void setAllJobsCreated() { _allJobsCreated = true; } - /// Returns true if the executive is ready to create and execute UberJobs. - bool isReadyToExecute() { return _readyToExecute; } + /// Returns true if all jobs have been created. + bool isAllJobsCreated() { return _allJobsCreated; } /// Send a message to all workers to cancel this query. /// @param deleteResults - If true, delete all result files for this query on the workers. @@ -346,8 +346,8 @@ class Executive : public std::enable_shared_from_this { /// Weak pointer to the UserQuerySelect object for this query. std::weak_ptr _userQuerySelect; - /// Flag that is set to true when ready to create and run UberJobs. - std::atomic _readyToExecute{false}; + /// Flag that is set to true when all jobs have been created. + std::atomic _allJobsCreated{false}; protojson::ScanInfo::Ptr _scanInfo; ///< Scan rating and tables. 
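Taken together, the checkResultFileSize() changes above amount to a two-stage size guard: an optimistic atomic add on every file report, then a recount over live UberJobs before deciding to cancel. A minimal sketch of that pattern, assuming invented stand-ins (UberJobStub, ResultSizeGuard) rather than the real Executive/UberJob API:

#include <atomic>
#include <cstdint>
#include <map>
#include <memory>
#include <mutex>

// Illustrative stand-in for qdisp::UberJob; only the size getter matters here.
struct UberJobStub {
    uint64_t resultFileSize = 0;
    uint64_t getResultFileSize() const { return resultFileSize; }
};

class ResultSizeGuard {
public:
    explicit ResultSizeGuard(uint64_t maxBytes) : _maxBytes(maxBytes) {}

    // Stage 1: cheap optimistic add. Stage 2: on apparent overflow, recount
    // from the live UberJob map, since the running total may still include
    // contributions from UberJobs that have since failed.
    bool exceedsLimit(uint64_t fileSize) {
        _total += fileSize;
        if (_total <= _maxBytes) return false;
        std::lock_guard<std::mutex> lock(_mapMtx);
        uint64_t total = 0;
        for (auto const& [id, uj] : _uberJobs) total += uj->getResultFileSize();
        _total = total;
        return total > _maxBytes;  // caller squashes the user query when true
    }

    std::mutex _mapMtx;
    std::map<int, std::shared_ptr<UberJobStub>> _uberJobs;

private:
    uint64_t const _maxBytes;
    std::atomic<uint64_t> _total{0};
};

The recount matters because the optimistic total can still include files from dead UberJobs; only a recomputed total over live UberJobs should be allowed to kill the query.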
diff --git a/src/qdisp/JobDescription.cc b/src/qdisp/JobDescription.cc
index 660e57330..fdd29f3d9 100644
--- a/src/qdisp/JobDescription.cc
+++ b/src/qdisp/JobDescription.cc
@@ -65,14 +65,12 @@ bool JobDescription::incrAttemptCount(std::shared_ptr const& exec, bo
     if (increase) {
         ++_attemptCount;
     }
-    if (_attemptCount >= MAX_JOB_ATTEMPTS) {
-        LOGS(_log, LOG_LVL_ERROR, "attemptCount greater than maximum number of retries " << _attemptCount);
-        return false;
-    }
 
     if (exec != nullptr) {
         int maxAttempts = exec->getMaxAttempts();
-        LOGS(_log, LOG_LVL_INFO, "JoQDescription::" << __func__ << " attempts=" << _attemptCount);
+        if (_attemptCount > 0) {
+            LOGS(_log, LOG_LVL_INFO, "JobDescription::" << __func__ << " attempts=" << _attemptCount);
+        }
         if (_attemptCount > maxAttempts) {
             LOGS(_log, LOG_LVL_ERROR,
                  "JobDescription::" << __func__ << " attempts(" << _attemptCount << ") > maxAttempts("
@@ -80,10 +78,16 @@ bool JobDescription::incrAttemptCount(std::shared_ptr const& exec, bo
             exec->addMultiError(qmeta::JobStatus::RETRY_ERROR,
                                 "max attempts reached " + to_string(_attemptCount) + " " + _qIdStr,
                                 util::ErrorCode::INTERNAL);
-            exec->squash();
+            exec->squash(string("incrAttemptCount ") + to_string(_attemptCount));
             return false;
         }
     }
+
+    if (_attemptCount >= MAX_JOB_ATTEMPTS) {
+        LOGS(_log, LOG_LVL_ERROR, "attemptCount greater than maximum number of retries " << _attemptCount);
+        return false;
+    }
+
     return true;
 }
diff --git a/src/qdisp/JobQuery.cc b/src/qdisp/JobQuery.cc
index 71d9f19ec..b8f05034d 100644
--- a/src/qdisp/JobQuery.cc
+++ b/src/qdisp/JobQuery.cc
@@ -61,7 +61,8 @@ JobQuery::~JobQuery() {
 /// Cancel response handling. Return true if this is the first time cancel has been called.
 bool JobQuery::cancel(bool superfluous) {
     QSERV_LOGCONTEXT_QUERY_JOB(getQueryId(), getJobId());
-    LOGS(_log, LOG_LVL_DEBUG, "JobQuery::cancel()");
+    LOGS(_log, LOG_LVL_DEBUG, "JobQuery::cancel() " << superfluous);
+    LOGS(_log, LOG_LVL_WARN, "&&&JobQuery::cancel() " << superfluous);
     if (_cancelled.exchange(true) == false) {
         VMUTEX_NOT_HELD(_jqMtx);
         lock_guard lock(_jqMtx);
diff --git a/src/qdisp/UberJob.cc b/src/qdisp/UberJob.cc
index 07ccd6875..00c4d11bd 100644
--- a/src/qdisp/UberJob.cc
+++ b/src/qdisp/UberJob.cc
@@ -106,7 +106,7 @@ util::HistogramRolling histoUJSerialize("&&&uj histoUJSerialize", {0.1, 1.0, 10.
 void UberJob::runUberJob() {  // &&& TODO:UJ this should probably check cancelled
     LOGS(_log, LOG_LVL_DEBUG, cName(__func__) << " start");
-    LOGS(_log, LOG_LVL_ERROR, cName(__func__) << "&&&uj runuj start");
+    LOGS(_log, LOG_LVL_ERROR, cName(__func__) << " &&&uj runuj start");
 
     // Build the uberjob payload for each job.
     nlohmann::json uj;
     unique_lock jobsLock(_jobsMtx);
@@ -119,7 +119,7 @@ void UberJob::runUberJob() {  // &&& TODO:UJ this should probably check cancelle
     vector const headers = {"Content-Type: application/json"};
     auto const& czarConfig = cconfig::CzarConfig::instance();
-    int maxTableSizeMB = czarConfig->getMaxTableSizeMB();
+    uint64_t maxTableSizeMB = czarConfig->getMaxTableSizeMB();
     auto czInfo = protojson::CzarContactInfo::create(
             czarConfig->name(), czarConfig->id(), czarConfig->replicationHttpPort(),
             util::get_current_host_fqdn(), czar::Czar::czarStartupTime);
@@ -220,7 +220,7 @@ void UberJob::_unassignJobs() {
             LOGS(_log, LOG_LVL_ERROR, cName(__func__) << " could not unassign job=" << jid << " cancelling");
             exec->addMultiError(qmeta::JobStatus::RETRY_ERROR, "unable to re-assign " + jid,
                                 util::ErrorCode::INTERNAL);
-            exec->squash();
+            exec->squash("_unassignJobs failure");
             return;
         }
         LOGS(_log, LOG_LVL_DEBUG,
@@ -292,14 +292,9 @@ void UberJob::callMarkCompleteFunc(bool success) {
 
 util::HistogramRolling histoQueImp("&&&uj histoQueImp", {0.1, 1.0, 10.0, 100.0, 1000.0}, 1h, 10000);
 
-/// Retrieve and process a result file using the file-based protocol
-/// Uses a copy of JobQuery::Ptr instead of _jobQuery as a call to cancel() would reset _jobQuery.
 json UberJob::importResultFile(string const& fileUrl, uint64_t rowCount, uint64_t fileSize) {
     LOGS(_log, LOG_LVL_DEBUG,
          cName(__func__) << " fileUrl=" << fileUrl << " rowCount=" << rowCount << " fileSize=" << fileSize);
-    LOGS(_log, LOG_LVL_WARN,
-         cName(__func__) << "&&& fileUrl=" << fileUrl << " rowCount=" << rowCount
-                         << " fileSize=" << fileSize);
 
     if (isQueryCancelled()) {
         LOGS(_log, LOG_LVL_WARN, cName(__func__) << " import job was cancelled.");
@@ -398,7 +393,7 @@ json UberJob::workerError(int errorCode, string const& errorMsg) {
     // TODO:UJ see if recoverable errors can be detected on the workers, or
     // maybe allow a single retry before sending the error back to the user?
     bool recoverableError = false;
-    recoverableError = true;  // TODO:UJ delete after testing
+
     if (recoverableError) {
         // The czar should have new maps before the new UberJob(s) for
         // these Jobs are created.
(see Czar::_monitor) @@ -408,7 +403,7 @@ json UberJob::workerError(int errorCode, string const& errorMsg) { int errState = util::ErrorCode::MYSQLEXEC; getRespHandler()->flushHttpError(errorCode, errorMsg, errState); exec->addMultiError(errorCode, errorMsg, errState); - exec->squash(); + exec->squash(string("UberJob::workerError ") + errorMsg); } string errType = to_string(errorCode) + ":" + errorMsg; @@ -427,7 +422,7 @@ json UberJob::_importResultError(bool shouldCancel, string const& errorType, str if (shouldCancel) { LOGS(_log, LOG_LVL_ERROR, cName(__func__) << " failing jobs"); callMarkCompleteFunc(false); // all jobs failed, no retry - exec->squash(); + exec->squash(string("_importResultError shouldCancel")); } else { /// - each JobQuery in _jobs needs to be flagged as needing to be /// put in an UberJob and it's attempt count increased and checked @@ -465,7 +460,7 @@ void UberJob::_importResultFinish(uint64_t resultRows) { if (!statusSet) { LOGS(_log, LOG_LVL_ERROR, cName(__func__) << " failed to set status, squashing " << getIdStr()); // Something has gone very wrong - exec->squash(); + exec->squash("UberJob::_importResultFinish couldn't set status"); return; } diff --git a/src/qdisp/UberJob.h b/src/qdisp/UberJob.h index c1ead8b24..cc2a32a31 100644 --- a/src/qdisp/UberJob.h +++ b/src/qdisp/UberJob.h @@ -106,6 +106,8 @@ class UberJob : public std::enable_shared_from_this { czar::CzarChunkMap::WorkerChunksData::Ptr getWorkerData() { return _workerData; } /// Queue the lambda function to collect and merge the results from the worker. + /// @return a json message indicating success unless the query has been + /// cancelled, limit row complete, or similar. nlohmann::json importResultFile(std::string const& fileUrl, uint64_t rowCount, uint64_t fileSize); /// Handle an error from the worker. diff --git a/src/qdisp/testQDisp.cc b/src/qdisp/testQDisp.cc index 59299d1c5..deee865d0 100644 --- a/src/qdisp/testQDisp.cc +++ b/src/qdisp/testQDisp.cc @@ -204,6 +204,7 @@ std::shared_ptr addMockRequests(qdisp::Executive::Ptr const& ex qdisp::JobDescription::Ptr job = makeMockJobDescription(ex, sequence.incr(), ru, msg, rv[j]); jobQuery = ex->add(job); } + ex->setAllJobsCreated(); return jobQuery; } @@ -377,7 +378,7 @@ BOOST_AUTO_TEST_CASE(ExecutiveCancel) { // squash SequentialInt sequence(0); tEnv.jqTest = executiveTest(tEnv.ex, sequence, chunkId, tEnv.qrMsg, 1); - tEnv.ex->squash(); + tEnv.ex->squash("test"); usleep(250000); // Give mock threads a quarter second to complete. tEnv.ex->join(); BOOST_CHECK(tEnv.jqTest->isQueryCancelled() == true); @@ -390,9 +391,9 @@ BOOST_AUTO_TEST_CASE(ExecutiveCancel) { // squash SequentialInt sequence(0); executiveTest(tEnv.ex, sequence, chunkId, tEnv.qrMsg, 20); - tEnv.ex->squash(); - tEnv.ex->squash(); // check that squashing twice doesn't cause issues. - usleep(250000); // Give mock threads a quarter second to complete. + tEnv.ex->squash("test"); + tEnv.ex->squash("test"); // check that squashing twice doesn't cause issues. + usleep(250000); // Give mock threads a quarter second to complete. 
tEnv.ex->join(); } } diff --git a/src/qproc/QuerySession.cc b/src/qproc/QuerySession.cc index 9bd643265..969409a4d 100644 --- a/src/qproc/QuerySession.cc +++ b/src/qproc/QuerySession.cc @@ -391,7 +391,6 @@ std::vector QuerySession::_buildChunkQueries(query::QueryTemplate:: } for (auto&& queryTemplate : queryTemplates) { - LOGS(_log, LOG_LVL_WARN, "&&&uj QuerySession::_buildChunkQueries qt=" << queryTemplate.dump()); std::string str = _context->queryMapping->apply(chunkSpec, queryTemplate); chunkQueries.push_back(std::move(str)); } diff --git a/src/rproc/InfileMerger.cc b/src/rproc/InfileMerger.cc index b192f6c0f..e44383b71 100644 --- a/src/rproc/InfileMerger.cc +++ b/src/rproc/InfileMerger.cc @@ -220,118 +220,6 @@ void InfileMerger::mergeCompleteFor(int jobId) { _totalResultSize += _perJobResultSize[jobId]; // TODO:UJ this can probably be simplified } -bool InfileMerger::merge(proto::ResponseSummary const& responseSummary, - proto::ResponseData const& responseData, - std::shared_ptr const& jq) { - JobId const jobId = responseSummary.jobid(); - std::string queryIdJobStr = QueryIdHelper::makeIdStr(responseSummary.queryid(), jobId); - if (!_queryIdStrSet) { - _setQueryIdStr(QueryIdHelper::makeIdStr(responseSummary.queryid())); - } - - // Nothing to do if size is zero. - if (responseData.row_size() == 0) { - return true; - } - - // Do nothing if the query got cancelled for any reason. - if (jq->isQueryCancelled()) { - return true; - } - auto executive = jq->getExecutive(); - if (executive == nullptr || executive->getCancelled() || executive->isRowLimitComplete()) { - return true; - } - - std::unique_ptr semaLock; - if (_dbEngine != MYISAM) { - // needed for parallel merging with INNODB and MEMORY - semaLock.reset(new util::SemaLock(*_semaMgrConn)); - } - - TimeCountTracker::CALLBACKFUNC cbf = [](TIMEPOINT start, TIMEPOINT end, double bytes, - bool success) { - if (!success) return; - if (std::chrono::duration const seconds = end - start; seconds.count() > 0) { - qdisp::CzarStats::get()->addXRootDSSIRecvRate(bytes / seconds.count()); - } - }; - auto tct = make_shared>(cbf); - - bool ret = false; - // Add columns to rows in virtFile. - util::Timer virtFileT; - virtFileT.start(); - int resultJobId = makeJobIdAttempt(responseSummary.jobid(), responseSummary.attemptcount()); - ProtoRowBuffer::Ptr pRowBuffer = std::make_shared( - responseData, resultJobId, _jobIdColName, _jobIdSqlType, _jobIdMysqlType); - std::string const virtFile = _infileMgr.prepareSrc(pRowBuffer); - std::string const infileStatement = sql::formLoadInfile(_mergeTable, virtFile); - virtFileT.stop(); - - // If the job attempt is invalid, exit without adding rows. - // It will wait here if rows need to be deleted. - if (_invalidJobAttemptMgr.incrConcurrentMergeCount(resultJobId)) { - return true; - } - - size_t const resultSize = responseData.transmitsize(); - size_t tResultSize; - { - std::lock_guard resultSzLock(_mtxResultSizeMtx); - _perJobResultSize[jobId] += resultSize; - tResultSize = _totalResultSize + _perJobResultSize[jobId]; - } - if (tResultSize > _maxResultTableSizeBytes) { - std::ostringstream os; - os << queryIdJobStr << " cancelling the query, queryResult table " << _mergeTable - << " is too large at " << tResultSize << " bytes, max allowed size is " << _maxResultTableSizeBytes - << " bytes"; - LOGS(_log, LOG_LVL_ERROR, os.str()); - _error = util::Error(-1, os.str(), -1); - return false; - } - - tct->addToValue(resultSize); - tct->setSuccess(); - tct.reset(); // stop transmit recieve timer before merging happens. 
- - qdisp::CzarStats::get()->addTotalBytesRecv(resultSize); - qdisp::CzarStats::get()->addTotalRowsRecv(responseData.rowcount()); - - // Stop here (if requested) after collecting stats on the amount of data collected - // from workers. - if (_config.debugNoMerge) { - return true; - } - - auto start = std::chrono::system_clock::now(); - switch (_dbEngine) { - case MYISAM: - ret = _applyMysqlMyIsam(infileStatement, resultSize); - break; - case INNODB: // Fallthrough - case MEMORY: - ret = _applyMysqlInnoDb(infileStatement, resultSize); - break; - default: - throw std::invalid_argument("InfileMerger::_dbEngine is unknown =" + engineToStr(_dbEngine)); - } - auto end = std::chrono::system_clock::now(); - auto mergeDur = std::chrono::duration_cast(end - start); - LOGS(_log, LOG_LVL_DEBUG, - "mergeDur=" << mergeDur.count() << " sema(total=" << _semaMgrConn->getTotalCount() - << " used=" << _semaMgrConn->getUsedCount() << ")"); - if (not ret) { - LOGS(_log, LOG_LVL_ERROR, "InfileMerger::merge mysql applyMysql failure"); - } - _invalidJobAttemptMgr.decrConcurrentMergeCount(); - - LOGS(_log, LOG_LVL_DEBUG, "virtFileT=" << virtFileT.getElapsed() << " mergeDur=" << mergeDur.count()); - - return ret; -} - uint32_t histLimitCount = 0; util::HistogramRolling histoInfileBuild("&&&uj histoInfileBuild", {0.1, 1.0, 10.0, 100.0, 1000.0}, 1h, 10000); util::HistogramRolling histoMergeSecs("&&&uj histoMergeSecs", {0.1, 1.0, 10.0, 100.0, 1000.0}, 1h, 10000); @@ -440,7 +328,7 @@ bool InfileMerger::mergeHttp(qdisp::UberJob::Ptr const& uberJob, proto::Response } auto end = CLOCK::now(); auto mergeDur = std::chrono::duration_cast(end - start); - LOGS(_log, LOG_LVL_DEBUG, + LOGS(_log, LOG_LVL_TRACE, "mergeDur=" << mergeDur.count() << " sema(total=" << _semaMgrConn->getTotalCount() << " used=" << _semaMgrConn->getUsedCount() << ")"); std::chrono::duration secs = end - start; // &&& @@ -457,7 +345,7 @@ bool InfileMerger::mergeHttp(qdisp::UberJob::Ptr const& uberJob, proto::Response } _invalidJobAttemptMgr.decrConcurrentMergeCount(); - LOGS(_log, LOG_LVL_DEBUG, "virtFileT=" << virtFileT.getElapsed() << " mergeDur=" << mergeDur.count()); + LOGS(_log, LOG_LVL_TRACE, "virtFileT=" << virtFileT.getElapsed() << " mergeDur=" << mergeDur.count()); return ret; } diff --git a/src/rproc/InfileMerger.h b/src/rproc/InfileMerger.h index 3091246ca..14ab9b395 100644 --- a/src/rproc/InfileMerger.h +++ b/src/rproc/InfileMerger.h @@ -162,13 +162,6 @@ class InfileMerger { std::string engineToStr(InfileMerger::DbEngine engine); - /// Merge a worker response, which contains a single ResponseData message - /// Using job query info for early termination of the merge if needed. - /// @return true if merge was successfully imported. - // &&& delete - bool merge(proto::ResponseSummary const& responseSummary, proto::ResponseData const& responseData, - std::shared_ptr const& jq); - /// Merge the result data collected over Http. 
bool mergeHttp(std::shared_ptr const& uberJob, proto::ResponseData const& responseData); diff --git a/src/util/Error.h b/src/util/Error.h index c95ec76b0..825594ce6 100644 --- a/src/util/Error.h +++ b/src/util/Error.h @@ -61,6 +61,7 @@ struct ErrorCode { MYSQLCONNECT, MYSQLEXEC, INTERNAL, + CZAR_RESULT_TOO_LARGE, // Worker errors: WORKER_RESULT_TOO_LARGE }; diff --git a/src/wbase/FileChannelShared.cc b/src/wbase/FileChannelShared.cc index f51052a1b..338771488 100644 --- a/src/wbase/FileChannelShared.cc +++ b/src/wbase/FileChannelShared.cc @@ -381,7 +381,7 @@ bool FileChannelShared::buildAndTransmitResult(MYSQL_RES* mResult, shared_ptrgetIdStr() << " bytesT=" << bytesTransmitted - << " _tsz=" << _transmitsize); + __func__ << " " << task->getIdStr() << " bytesT=" << _bytesWritten << " _tsz=" << _transmitsize); bufferFillT.stop(); bufferFillSecs += bufferFillT.getElapsed(); - int64_t const maxTableSize = task->getMaxTableSize(); + uint64_t const maxTableSize = task->getMaxTableSize(); // Fail the operation if the amount of data in the result set exceeds the requested // "large result" limit (in case one was specified). - if (maxTableSize > 0 && bytesTransmitted > maxTableSize) { - string const err = "The result set size " + to_string(bytesTransmitted) + + LOGS(_log, LOG_LVL_TRACE, "bytesWritten=" << _bytesWritten << " max=" << maxTableSize); + if (maxTableSize > 0 && _bytesWritten > maxTableSize) { + string const err = "The result set size " + to_string(_bytesWritten) + " of a job exceeds the requested limit of " + to_string(maxTableSize) + " bytes, task: " + task->getIdStr(); multiErr.push_back(util::Error(util::ErrorCode::WORKER_RESULT_TOO_LARGE, err)); LOGS(_log, LOG_LVL_ERROR, err); erred = true; - break; + //&&&task->cancel(); + //&&&buildAndTransmitError(multiErr, task, cancelled); + return erred; } int const ujRowLimit = task->getRowLimit(); @@ -472,7 +475,7 @@ bool FileChannelShared::buildAndTransmitResult(MYSQL_RES* mResult, shared_ptrgetIdStr()); } else { - qStats->addTaskTransmit(timeSeconds, bytesTransmitted, rowsTransmitted, bufferFillSecs); + qStats->addTaskTransmit(timeSeconds, taskBytesWritten, rowsTransmitted, bufferFillSecs); LOGS(_log, LOG_LVL_TRACE, "TaskTransmit time=" << timeSeconds << " bufferFillSecs=" << bufferFillSecs); } diff --git a/src/wbase/FileChannelShared.h b/src/wbase/FileChannelShared.h index 348eb3cb3..b1fb26a1a 100644 --- a/src/wbase/FileChannelShared.h +++ b/src/wbase/FileChannelShared.h @@ -305,6 +305,8 @@ class FileChannelShared { /// much faster to answer the query without scanning all 1000 chunks. std::atomic _rowLimitComplete; std::atomic _dead{false}; ///< Set to true when the contents of the file are no longer useful. + + std::atomic _bytesWritten{0}; ///< Total bytes written. }; } // namespace lsst::qserv::wbase diff --git a/src/wbase/Task.cc b/src/wbase/Task.cc index 33b24f39e..2fa6b3ce0 100644 --- a/src/wbase/Task.cc +++ b/src/wbase/Task.cc @@ -123,7 +123,7 @@ atomic taskSequence{0}; ///< Unique identifier source for Task. /// the util::CommandThreadPool is not called here. 
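The FileChannelShared changes above switch the overflow check from the per-call bytesTransmitted value to _bytesWritten, an atomic running total for the whole channel, and return from buildAndTransmitResult as soon as the limit is crossed rather than breaking out of the loop. A minimal sketch of that guard pattern, using hypothetical names (ResultFile, write) rather than the real class:

```cpp
// Sketch only -- hypothetical names, not part of this patch.
#include <atomic>
#include <cstdint>
#include <stdexcept>
#include <string>

class ResultFile {
public:
    explicit ResultFile(uint64_t maxBytes) : _maxBytes(maxBytes) {}

    // Compare the running total, not the size of the current buffer,
    // against the limit; operator+= on std::atomic returns the new value.
    void write(char const* data, uint64_t count) {
        uint64_t const total = (_bytesWritten += count);
        if (_maxBytes > 0 && total > _maxBytes) {
            throw std::runtime_error("result set size " + std::to_string(total) +
                                     " exceeds the limit of " + std::to_string(_maxBytes) +
                                     " bytes");
        }
        // ... append `data` to the result file here ...
        (void)data;
    }

private:
    uint64_t const _maxBytes;                ///< 0 means no limit.
    std::atomic<uint64_t> _bytesWritten{0};  ///< Total bytes written so far.
};
```

The real code reports the overflow through multiErr with WORKER_RESULT_TOO_LARGE instead of throwing; the atomic counter, presumably, is what lets transmits from more than one thread share a single per-channel limit.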
Task::Task(UberJobData::Ptr const& ujData, int jobId, int attemptCount, int chunkId, int fragmentNumber, size_t templateId, bool hasSubchunks, int subchunkId, string const& db, - protojson::ScanInfo::Ptr const& scanInfo, bool scanInteractive, int maxTableSize, + protojson::ScanInfo::Ptr const& scanInfo, bool scanInteractive, //&&& int maxTableSize, vector const& fragSubTables, vector const& fragSubchunkIds, shared_ptr const& sc, std::shared_ptr const& queryStats_, uint16_t resultsHttpPort) @@ -143,7 +143,7 @@ Task::Task(UberJobData::Ptr const& ujData, int jobId, int attemptCount, int chun _scanInfo(scanInfo), _scanInteractive(scanInteractive), _queryStats(queryStats_), - _maxTableSize(maxTableSize * ::MB_SIZE_BYTES), + //&&&_maxTableSize(maxTableSize * ::MB_SIZE_BYTES), _rowLimit(ujData->getRowLimit()), _ujData(ujData), _idStr(ujData->getIdStr() + " jId=" + to_string(_jId) + " sc=" + to_string(_subchunkId)) { @@ -235,7 +235,7 @@ std::vector Task::createTasksFromUberJobMsg( auto jobSubQueryTempMap = ujMsg->getJobSubQueryTempMap(); auto jobDbTablesMap = ujMsg->getJobDbTablesMap(); auto jobMsgVect = ujMsg->getJobMsgVect(); - int maxTableSizeMb = ujMsg->getMaxTableSizeMb(); + //&&& int maxTableSizeMb = ujMsg->getMaxTableSizeMb(); auto scanInfo = ujMsg->getScanInfo(); for (auto const& jobMsg : *jobMsgVect) { @@ -276,7 +276,7 @@ std::vector Task::createTasksFromUberJobMsg( int const subchunkId = -1; auto task = Task::Ptr(new Task( ujData, jobId, attemptCount, chunkId, fragmentNumber, templateId, noSubchunks, - subchunkId, chunkQuerySpecDb, scanInfo, scanInteractive, maxTableSizeMb, + subchunkId, chunkQuerySpecDb, scanInfo, scanInteractive, //&&& maxTableSizeMb, fragSubTables, fragSubchunkIds, sendChannel, queryStats, resultsHttpPort)); vect.push_back(task); @@ -285,7 +285,7 @@ std::vector Task::createTasksFromUberJobMsg( bool const hasSubchunks = true; auto task = Task::Ptr(new Task(ujData, jobId, attemptCount, chunkId, fragmentNumber, templateId, hasSubchunks, subchunkId, chunkQuerySpecDb, - scanInfo, scanInteractive, maxTableSizeMb, + scanInfo, scanInteractive, //&&&maxTableSizeMb, fragSubTables, fragSubchunkIds, sendChannel, queryStats, resultsHttpPort)); vect.push_back(task); @@ -384,8 +384,9 @@ std::vector Task::createTasksForUnitTest( int const subchunkId = -1; auto task = Task::Ptr(new Task(ujData, jdJobId, jdAttemptCount, jdChunkId, fragmentNumber, 0, noSubchunks, subchunkId, jdQuerySpecDb, scanInfo, - scanInteractive, maxTableSizeMb, fragSubTables, - fragSubchunkIds, sendChannel, nullptr, 0)); + //&&&scanInteractive, maxTableSizeMb, fragSubTables, + scanInteractive, fragSubTables, fragSubchunkIds, + sendChannel, nullptr, 0)); vect.push_back(task); } else { @@ -393,7 +394,7 @@ std::vector Task::createTasksForUnitTest( bool const hasSubchunks = true; auto task = Task::Ptr(new Task( ujData, jdJobId, jdAttemptCount, jdChunkId, fragmentNumber, 0, hasSubchunks, - subchunkId, jdQuerySpecDb, scanInfo, scanInteractive, maxTableSizeMb, + subchunkId, jdQuerySpecDb, scanInfo, scanInteractive, //&&& maxTableSizeMb, fragSubTables, fragSubchunkIds, sendChannel, nullptr, 0)); vect.push_back(task); @@ -637,7 +638,8 @@ nlohmann::json Task::getJson() const { js["attemptId"] = _attemptCount; js["sequenceId"] = _tSeq; js["scanInteractive"] = _scanInteractive; - js["maxTableSize"] = _maxTableSize; + //&&&js["maxTableSize"] = _maxTableSize; + js["maxTableSize"] = _ujData->getMaxTableSizeBytes(); js["cancelled"] = to_string(_cancelled); js["state"] = static_cast(_state.load()); js["createTime_msec"] = 
util::TimeUtils::tp2ms(_createTime); @@ -655,6 +657,8 @@ nlohmann::json Task::getJson() const { return js; } +int64_t Task::getMaxTableSize() const { return _ujData->getMaxTableSizeBytes(); } + ostream& operator<<(ostream& os, Task const& t) { os << "Task: " << "msg: " << t.getIdStr() << " chunk=" << t._chunkId << " db=" << t._db << " " << t.getQueryString(); diff --git a/src/wbase/Task.h b/src/wbase/Task.h index 9f9d30b88..b6586f5d2 100644 --- a/src/wbase/Task.h +++ b/src/wbase/Task.h @@ -159,7 +159,7 @@ class Task : public util::CommandForThreadPool { // Unfortunately, this will be much easier if it is done after xrootd method is removed. Task(std::shared_ptr const& ujData, int jobId, int attemptCount, int chunkId, int fragmentNumber, size_t templateId, bool hasSubchunks, int subchunkId, std::string const& db, - protojson::ScanInfo::Ptr const& scanInfo, bool scanInteractive, int maxTableSizeMb, + protojson::ScanInfo::Ptr const& scanInfo, bool scanInteractive, //&&&int maxTableSizeMb, std::vector const& fragSubTables, std::vector const& fragSubchunkIds, std::shared_ptr const& sc, std::shared_ptr const& queryStats_, uint16_t resultsHttpPort = 8080); @@ -237,7 +237,9 @@ class Task : public util::CommandForThreadPool { int getJobId() const { return _jId; } int getAttemptCount() const { return _attemptCount; } bool getScanInteractive() { return _scanInteractive; } - int64_t getMaxTableSize() const { return _maxTableSize; } + //&&&int64_t getMaxTableSize() const { return _maxTableSize; } + int64_t getMaxTableSize() const; + protojson::ScanInfo::Ptr getScanInfo() { return _scanInfo; } void setOnInteractive(bool val) { _onInteractive = val; } bool getOnInteractive() { return _onInteractive; } @@ -366,7 +368,7 @@ class Task : public util::CommandForThreadPool { /// Stores information on the query's resource usage. 
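With Task::getMaxTableSize() above now delegating to UberJobData::getMaxTableSizeBytes(), the limit is stored once per UberJob instead of being converted from MB and copied into every Task. A stripped-down sketch of the delegation, with simplified stand-ins for both classes:

```cpp
// Sketch only -- simplified stand-ins, not the real Task/UberJobData.
#include <cstdint>
#include <memory>

class UberJobData {
public:
    explicit UberJobData(uint64_t maxTableSizeBytes) : _maxTableSizeBytes(maxTableSizeBytes) {}
    uint64_t getMaxTableSizeBytes() const { return _maxTableSizeBytes; }

private:
    uint64_t const _maxTableSizeBytes;  ///< One copy, shared by every Task in the UberJob.
};

class Task {
public:
    explicit Task(std::shared_ptr<UberJobData> const& ujData) : _ujData(ujData) {}

    // Tasks read the shared limit on demand instead of caching their own
    // _maxTableSize member (the member removed above).
    int64_t getMaxTableSize() const { return _ujData->getMaxTableSizeBytes(); }

private:
    std::shared_ptr<UberJobData> const _ujData;
};
```

One wrinkle worth noting: the getter still returns int64_t while the stored value is uint64_t, so the implicit conversion stays benign only while the configured limit is below 2^63 bytes.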
std::weak_ptr const _queryStats; - int64_t _maxTableSize = 0; + //&&&int64_t _maxTableSize = 0; std::atomic _memHandle{memman::MemMan::HandleType::INVALID}; memman::MemMan::Ptr _memMan; diff --git a/src/wbase/UberJobData.cc b/src/wbase/UberJobData.cc index b782e645a..5743354fb 100644 --- a/src/wbase/UberJobData.cc +++ b/src/wbase/UberJobData.cc @@ -56,8 +56,8 @@ namespace lsst::qserv::wbase { UberJobData::UberJobData(UberJobId uberJobId, std::string const& czarName, qmeta::CzarId czarId, std::string czarHost, int czarPort, uint64_t queryId, int rowLimit, - std::string const& workerId, std::shared_ptr const& foreman, - std::string const& authKey) + uint64_t maxTableSizeBytes, std::string const& workerId, + std::shared_ptr const& foreman, std::string const& authKey) : _uberJobId(uberJobId), _czarName(czarName), _czarId(czarId), @@ -65,6 +65,7 @@ UberJobData::UberJobData(UberJobId uberJobId, std::string const& czarName, qmeta _czarPort(czarPort), _queryId(queryId), _rowLimit(rowLimit), + _maxTableSizeBytes(maxTableSizeBytes), _workerId(workerId), _authKey(authKey), _foreman(foreman), diff --git a/src/wbase/UberJobData.h b/src/wbase/UberJobData.h index 2634e0325..a16960311 100644 --- a/src/wbase/UberJobData.h +++ b/src/wbase/UberJobData.h @@ -66,10 +66,10 @@ class UberJobData : public std::enable_shared_from_this { static Ptr create(UberJobId uberJobId, std::string const& czarName, qmeta::CzarId czarId, std::string const& czarHost, int czarPort, uint64_t queryId, int rowLimit, - std::string const& workerId, std::shared_ptr const& foreman, - std::string const& authKey) { + uint64_t maxTableSizeBytes, std::string const& workerId, + std::shared_ptr const& foreman, std::string const& authKey) { return Ptr(new UberJobData(uberJobId, czarName, czarId, czarHost, czarPort, queryId, rowLimit, - workerId, foreman, authKey)); + maxTableSizeBytes, workerId, foreman, authKey)); } /// Set file channel for this UberJob void setFileChannelShared(std::shared_ptr const& fileChannelShared); @@ -82,6 +82,7 @@ class UberJobData : public std::enable_shared_from_this { int getCzarPort() const { return _czarPort; } uint64_t getQueryId() const { return _queryId; } std::string getWorkerId() const { return _workerId; } + uint64_t getMaxTableSizeBytes() const { return _maxTableSizeBytes; } /// Add the tasks defined in the UberJob to this UberJobData object. void addTasks(std::vector> const& tasks) { @@ -112,8 +113,9 @@ class UberJobData : public std::enable_shared_from_this { private: UberJobData(UberJobId uberJobId, std::string const& czarName, qmeta::CzarId czarId, std::string czarHost, - int czarPort, uint64_t queryId, int rowLimit, std::string const& workerId, - std::shared_ptr const& foreman, std::string const& authKey); + int czarPort, uint64_t queryId, int rowLimit, uint64_t maxTableSizeBytes, + std::string const& workerId, std::shared_ptr const& foreman, + std::string const& authKey); /// Queue the response to be sent to the originating czar. void _queueUJResponse(http::Method method_, std::vector const& headers_, @@ -127,6 +129,7 @@ class UberJobData : public std::enable_shared_from_this { int const _czarPort; QueryId const _queryId; int const _rowLimit; ///< If > 0, only read this many rows before return the results. 
+ uint64_t const _maxTableSizeBytes; std::string const _workerId; std::string const _authKey; diff --git a/src/wdb/testQueryRunner.cc b/src/wdb/testQueryRunner.cc index de9aebaab..5f7612dab 100644 --- a/src/wdb/testQueryRunner.cc +++ b/src/wdb/testQueryRunner.cc @@ -163,7 +163,7 @@ BOOST_AUTO_TEST_CASE(Simple) { auto const queries = queriesAndChunks(); auto ujData = lsst::qserv::wbase::UberJobData::create( mInfo.uberJobId, mInfo.czarName, mInfo.czarId, mInfo.czarHostName, mInfo.czarPort, mInfo.queryId, - mInfo.rowLimit, mInfo.targWorkerId, mInfo.foreman, mInfo.authKey); + mInfo.rowLimit, mInfo.maxTableSize, mInfo.targWorkerId, mInfo.foreman, mInfo.authKey); auto scanInfo = lsst::qserv::protojson::ScanInfo::create(); scanInfo->scanRating = mInfo.scanRating; scanInfo->infoTables.emplace_back(mInfo.db, mInfo.table, mInfo.lockInMemory, mInfo.scanRating); @@ -188,7 +188,7 @@ BOOST_AUTO_TEST_CASE(Output) { auto const queries = queriesAndChunks(); auto ujData = lsst::qserv::wbase::UberJobData::create( mInfo.uberJobId, mInfo.czarName, mInfo.czarId, mInfo.czarHostName, mInfo.czarPort, mInfo.queryId, - mInfo.rowLimit, mInfo.targWorkerId, mInfo.foreman, mInfo.authKey); + mInfo.rowLimit, mInfo.maxTableSize, mInfo.targWorkerId, mInfo.foreman, mInfo.authKey); auto scanInfo = lsst::qserv::protojson::ScanInfo::create(); scanInfo->scanRating = mInfo.scanRating; scanInfo->infoTables.emplace_back(mInfo.db, mInfo.table, mInfo.lockInMemory, mInfo.scanRating); diff --git a/src/xrdsvc/HttpWorkerCzarModule.cc b/src/xrdsvc/HttpWorkerCzarModule.cc index a672b740a..0e915a673 100644 --- a/src/xrdsvc/HttpWorkerCzarModule.cc +++ b/src/xrdsvc/HttpWorkerCzarModule.cc @@ -120,11 +120,17 @@ json HttpWorkerCzarModule::_handleQueryJob(string const& func) { QueryId ujQueryId = uberJobMsg->getQueryId(); int ujRowLimit = uberJobMsg->getRowLimit(); auto targetWorkerId = uberJobMsg->getWorkerId(); + uint64_t maxTableSizeMb = uberJobMsg->getMaxTableSizeMb(); + uint64_t const MB_SIZE_BYTES = 1024 * 1024; + uint64_t maxTableSizeBytes = maxTableSizeMb * MB_SIZE_BYTES; // Get or create QueryStatistics and UserQueryInfo instances. auto queryStats = foreman()->getQueriesAndChunks()->addQueryId(ujQueryId, ujCzInfo->czId); auto userQueryInfo = queryStats->getUserQueryInfo(); LOGS(_log, LOG_LVL_WARN, uberJobMsg->getIdStr() << " &&& added to stats"); + LOGS(_log, LOG_LVL_WARN, + uberJobMsg->getIdStr() << " &&& bytesWritten added to stats maxTableSizeMb=" << maxTableSizeMb + << " maxTableSizeBytes=" << maxTableSizeBytes); if (userQueryInfo->getCancelledByCzar()) { throw wbase::TaskException( @@ -136,8 +142,8 @@ json HttpWorkerCzarModule::_handleQueryJob(string const& func) { } auto ujData = wbase::UberJobData::create(ujId, ujCzInfo->czName, ujCzInfo->czId, ujCzInfo->czHostName, - ujCzInfo->czPort, ujQueryId, ujRowLimit, targetWorkerId, - foreman(), authKey()); + ujCzInfo->czPort, ujQueryId, ujRowLimit, maxTableSizeBytes, + targetWorkerId, foreman(), authKey()); LOGS(_log, LOG_LVL_WARN, uberJobMsg->getIdStr() << " &&& ujData created"); // Find the entry for this queryId, create a new one if needed.
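On the worker side, _handleQueryJob above converts the czar-supplied limit from MB to bytes exactly once, when the UberJobData is created; note that the code treats "MB" as MiB (1024 * 1024 bytes). The arithmetic as a self-contained sketch, with a hypothetical helper function (the real code computes this inline with a local MB_SIZE_BYTES constant):

```cpp
// Sketch only -- maxTableSizeToBytes is a hypothetical helper.
#include <cstdint>

constexpr uint64_t MB_SIZE_BYTES = 1024 * 1024;  // MiB, matching the patch.

constexpr uint64_t maxTableSizeToBytes(uint64_t maxTableSizeMb) {
    return maxTableSizeMb * MB_SIZE_BYTES;
}

// e.g. a 5000 MiB limit becomes ~5.2 GB in bytes.
static_assert(maxTableSizeToBytes(5000) == 5'242'880'000ULL,
              "MiB-to-bytes conversion");
```

Doing the conversion once at UberJobData creation means every Task and the FileChannelShared size guard compare against the same byte count, with no repeated unit conversions downstream.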