Skip to content

Commit

Permalink
Merge pull request #1 from IBM/master
Browse files Browse the repository at this point in the history
sync
  • Loading branch information
tgooding authored Jul 21, 2018
2 parents 94b8276 + 72fc659 commit 374e258
Show file tree
Hide file tree
Showing 166 changed files with 3,764 additions and 5,556 deletions.
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,7 @@ endforeach(LVL)

# csmd RPM packaging customization
include (csmd/setupRPM.cmake)
include (csm_big_data/setupRPM.cmake)

include (scripts/setupDoxygen.cmake)
include (scripts/setupTargets.cmake)
Expand Down
6 changes: 3 additions & 3 deletions bb/include/bbapi.h
Original file line number Diff line number Diff line change
Expand Up @@ -76,8 +76,8 @@ extern int BB_TerminateLibrary();
* specified by the caller.
* The last error details are _thread local_. Each thread has its separate and distinct copy of a "last error" string.
* A thread invoking a burst buffer API will get a different "last error" string than another thread invoking burst buffer APIs.
*
*
*
*
*
* Only details from the last bbAPI call performed on that software thread are returned. If the process
* is multi-threaded, the error information is tracked separately between the threads.
Expand Down Expand Up @@ -246,7 +246,7 @@ extern int BB_StartTransfer(BBTransferDef_t* transfer, BBTransferHandle_t handle
* The BB_CancelTransfer routine cancels the existing asynchronous file transfers specified by
* the transfer handle. When the call returns, the transfer has been stopped or an error has
* occurred. As part of the cancel, any parts of the files that have been transferred will
* have been deleted from the target location.
* have been deleted from the PFS target location.
*
* \param[in] handle Transfer handle from BB_StartTransfer. All transfers matching the tag will be canceled.
* \param[in] scope Specifies the scope of the cancel. (See BBCANCELSCOPE for possible values.)
Expand Down
Empty file modified bb/scripts/bbhealth.pl
100755 → 100644
Empty file.
53 changes: 53 additions & 0 deletions bb/scripts/fuseUmount.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
#!/bin/bash
###########################################################
# fuseUmount.sh
#
# Copyright IBM Corporation 2015,2016. All Rights Reserved
#
# This program is licensed under the terms of the Eclipse Public License
# v1.0 as published by the Eclipse Foundation and available at
# http://www.eclipse.org/legal/epl-v10.html
#
# U.S. Government Users Restricted Rights: Use, duplication or disclosure
# restricted by GSA ADP Schedule Contract with IBM Corp.
###########################################################

umask 0027

# Unmount a FUSE (BSCFS) mount point, retrying when fusermount fails.
#
# Usage: fuseUmount.sh [mount_path [retries [sleep_seconds]]]
#   mount_path     mount point to unmount              (default: /bscfs)
#   retries        maximum number of unmount attempts  (default: 12)
#   sleep_seconds  delay after each failed attempt     (default: 5)
#
# Exit status: 0 when the path is not mounted or the unmount succeeded;
# otherwise the exit status of the last fusermount attempt (29 when no
# attempt was made at all, e.g. retries is 0).
# Progress is recorded in /tmp/fuseUnmount.debug.

# exit on any script failure
#set -eo pipefail
BSCFS_MNT_PATH="/bscfs"
UNMOUNTRETRIES=12
SLEEPSEC=5
DEBUG_LOG="/tmp/fuseUnmount.debug"

if [ "$#" -ne 0 ]; then
    BSCFS_MNT_PATH=$1
fi

# Nothing to do when the path does not appear in the mount table.
# -F treats the path as a literal string rather than a regular expression.
if ! grep -qF -- "$BSCFS_MNT_PATH" /proc/mounts; then
    echo "$BSCFS_MNT_PATH not mounted $(date)" > "$DEBUG_LOG"
    exit 0
fi

if [ "$#" -gt 1 ]; then
    UNMOUNTRETRIES=$2
fi

if [ "$#" -gt 2 ]; then
    SLEEPSEC=$3
fi

echo "$0 BSCFS_MNT_PATH=$BSCFS_MNT_PATH UNMOUNTRETRIES=$UNMOUNTRETRIES SLEEPSEC=$SLEEPSEC" > "$DEBUG_LOG"
date >> "$DEBUG_LOG"

# Default status reported when the loop below never runs.
CMDrc=29
for (( i=1; i<=UNMOUNTRETRIES; i++ ))
do
    fusermount -u "$BSCFS_MNT_PATH" &>> "$DEBUG_LOG"
    CMDrc=$?
    # Compare the variable's value numerically (the literal text "CMDrc"
    # can never equal 0).
    if [ "$CMDrc" -eq 0 ]; then
        exit 0
    fi
    echo "CMDrc=$CMDrc" &>> "$DEBUG_LOG"
    sleep "$SLEEPSEC"
done
exit "$CMDrc"
2 changes: 1 addition & 1 deletion bb/src/BBLVKey_ExtentInfo.cc
Original file line number Diff line number Diff line change
Expand Up @@ -312,7 +312,7 @@ void BBLVKey_ExtentInfo::dumpInFlight(const char* pSev) const {
return;
}

Extent* BBLVKey_ExtentInfo::getAnyExtent(const uint64_t pHandle, const uint32_t pContribId, const uint32_t pSourceIndex)
Extent* BBLVKey_ExtentInfo::getAnySourceExtent(const uint64_t pHandle, const uint32_t pContribId, const uint32_t pSourceIndex)
{
Extent* l_Extent = 0;

Expand Down
2 changes: 1 addition & 1 deletion bb/src/BBLVKey_ExtentInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ class BBLVKey_ExtentInfo
void dump(const char* pSev, const char* pPrefix=0) const;
void dumpExtents(const char* pSev, const char* pPrefix=0) const;
void dumpInFlight(const char* pSev) const;
Extent* getAnyExtent(const uint64_t pHandle, const uint32_t pContribId, const uint32_t pSourceIndex);
Extent* getAnySourceExtent(const uint64_t pHandle, const uint32_t pContribId, const uint32_t pSourceIndex);
Extent* getMaxInFlightExtent();
Extent* getMinimumTrimExtent();
int moreExtentsToTransfer(const int64_t pHandle, const int32_t pContrib, uint32_t pNumberOfExpectedInFlight);
Expand Down
2 changes: 1 addition & 1 deletion bb/src/BBTagInfo.cc
Original file line number Diff line number Diff line change
Expand Up @@ -453,7 +453,7 @@ int BBTagInfo::prepareForRestart(const std::string& pConnectionName, const LVKey
LOG(bb,info) << ">>>>> DELAY <<<<< BBTagInfo::prepareForRestart: Attempting to restart a transfer definition for jobid " << pJob.getJobId() \
<< ", jobstepid " << pJob.getJobStepId() << ", handle " << pHandle << ", contribid " << pContribId \
<< ". Waiting for the handle to be marked as stopped. Delay of 1 second before retry. " << l_Continue \
<< " seconds remain before the original bbServer is declared dead.";
<< " seconds remain waiting for the original bbServer to act before an unconditional stop is performed.";
}
unlockTransferQueue(pLVKey, "BBTagInfo::prepareForRestart - Waiting for transfer definition to be marked as stopped");
{
Expand Down
7 changes: 7 additions & 0 deletions bb/src/BBTagInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,13 @@ class BBTagInfo
return transferHandle;
}

// Forwards the request to parts.removeTargetFiles() for the given LVKey and
// contributor id, but only when pHandle matches this object's transferHandle.
// A non-matching handle is a no-op.
inline void removeTargetFiles(const LVKey* pLVKey, const uint64_t pHandle, const uint32_t pContribId) {
    if (pHandle != transferHandle)
    {
        // Request is for a different transfer handle; nothing to do here.
        return;
    }

    parts.removeTargetFiles(pLVKey, pContribId);
}

// Forwards removal of the transfer definition for pContribId to parts.
inline void removeTransferDef(const uint32_t pContribId)
{
    parts.removeTransferDef(pContribId);
    return;
}
Expand Down
119 changes: 60 additions & 59 deletions bb/src/BBTagInfo2.cc
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ int BBTagInfo2::allExtentsTransferred(const BBTagID& pTagId)
return rc;
}

void BBTagInfo2::cancelExtents(const LVKey* pLVKey, uint64_t* pHandle, uint32_t* pContribId)
void BBTagInfo2::cancelExtents(const LVKey* pLVKey, uint64_t* pHandle, uint32_t* pContribId, const int pRemoveOption)
{
// Sort the extents, moving the canceled extents to the front of
// the work queue so they are immediately removed...
Expand All @@ -83,6 +83,30 @@ void BBTagInfo2::cancelExtents(const LVKey* pLVKey, uint64_t* pHandle, uint32_t*
// Indicate that next findWork() needs to look for canceled extents
wrkqmgr.setCheckForCanceledExtents(1);

// If we are to perform remove operations for target PFS files, do so now...
if (pRemoveOption == REMOVE_TARGET_PFS_FILES)
{
// Wait for the canceled extents to be processed
while (1)
{
if (wrkqmgr.getCheckForCanceledExtents())
{
unlockTransferQueue(pLVKey, "cancelExtents - Waiting for the canceled extents to be processed");
{
usleep((useconds_t)1000000); // Delay 1 second
}
lockTransferQueue(pLVKey, "cancelExtents - Waiting for the canceled extents to be processed");
}
else
{
break;
}
}

// Remove the target files
removeTargetFiles(pLVKey, *pHandle, *pContribId);
}

return;
}

Expand Down Expand Up @@ -795,7 +819,7 @@ void BBTagInfo2::setAllExtentsTransferred(const LVKey* pLVKey, const uint64_t pH
return;
}

void BBTagInfo2::setCanceled(const LVKey* pLVKey, const uint64_t pJobId, const uint64_t pJobStepId, uint64_t pHandle)
void BBTagInfo2::setCanceled(const LVKey* pLVKey, const uint64_t pJobId, const uint64_t pJobStepId, uint64_t pHandle, const int pRemoveOption)
{
if (jobid == pJobId)
{
Expand All @@ -804,7 +828,7 @@ void BBTagInfo2::setCanceled(const LVKey* pLVKey, const uint64_t pJobId, const u
// Sort the extents, moving the canceled extents to the front of
// the work queue so they are immediately removed...
uint32_t l_ContribId = UNDEFINED_CONTRIBID;
cancelExtents(pLVKey, &pHandle, &l_ContribId);
cancelExtents(pLVKey, &pHandle, &l_ContribId, pRemoveOption);
}

return;
Expand All @@ -813,75 +837,52 @@ void BBTagInfo2::setCanceled(const LVKey* pLVKey, const uint64_t pJobId, const u
int BBTagInfo2::setSuspended(const LVKey* pLVKey, const string& pHostName, const int pValue)
{
int rc = 0;
int l_Continue = 120;

if (pHostName == UNDEFINED_HOSTNAME || pHostName == hostname)
{
if (!stageOutStarted())
{
while ((!rc) && l_Continue--)
rc = wrkqmgr.setSuspended(pLVKey, pValue);
switch (rc)
{
rc = wrkqmgr.setSuspended(pLVKey, pValue);
switch (rc)
case 0:
{
case 0:
if ((((flags & BBTI2_Suspended) == 0) && pValue) || ((flags & BBTI2_Suspended) && (!pValue)))
{
if ((((flags & BBTI2_Suspended) == 0) && pValue) || ((flags & BBTI2_Suspended) && (!pValue)))
{
LOG(bb,info) << "BBTagInfo2::setSuspended(): For hostname " << pHostName << ", connection " \
<< connectionName << ", " << *pLVKey << ", jobid " << jobid \
<< " -> Changing from: " << ((flags & BBTI2_Suspended) ? "true" : "false") << " to " << (pValue ? "true" : "false");
}
SET_FLAG(BBTI2_Suspended, pValue);
l_Continue = 0;
LOG(bb,info) << "BBTagInfo2::setSuspended(): For hostname " << pHostName << ", connection " \
<< connectionName << ", " << *pLVKey << ", jobid " << jobid \
<< " -> Changing from: " << ((flags & BBTI2_Suspended) ? "true" : "false") << " to " << (pValue ? "true" : "false");
}
break;
SET_FLAG(BBTI2_Suspended, pValue);
}
break;

case -2:
case -2:
{
// NOTE: For failover cases, it is possible for a setSuspended() request to be issued to this bbServer before any request
// has 'used' the LVKey and required the work queue to be present. We simply tolerate the condition...
string l_Temp = "resume";
if (pValue)
{
// NOTE: For failover cases, it is possible for a setSuspended() request to be issued to this bbServer before any request
// has 'used' the LVKey and required the work queue to be present. For a resume request, we simply tolerate the
// situation as any work queue added later for the LVKey/CN hostname will automatically be in the resumed state.
// Otherwise, for a suspend request, we wait for a total of 2 minutes awaiting an LVKey/work queue to become present.
// \todo - Not sure if this is the right duration... @DLH
if (pValue)
{
// Connection being suspended
if (l_Continue)
{
rc = 0;
unlockTransferQueue(pLVKey, "setSuspended - Waiting for LVKey to be registered");
{
usleep((useconds_t)1000000); // Delay 1 second
}
lockTransferQueue(pLVKey, "setSuspended - Waiting for LVKey to be registered");
}
else
{
rc = -1;
LOG(bb,error) << "BBTagInfo2::setSuspended(): For hostname " << pHostName << ", connection " \
<< connectionName << ", jobid " << jobid << ", work queue not present for " << *pLVKey \
<< ". Failing condition for a suspend operation.";
}
}
else
{
// Connection being resumed
LOG(bb,info) << "BBTagInfo2::setSuspended(): For hostname " << pHostName << ", connection " \
<< connectionName << ", jobid " << jobid << ", work queue not present for " << *pLVKey \
<< ". Tolerated condition for a resume operation.";
break;
}
// Connection being suspended
l_Temp = "suspend";
}
break;
LOG(bb,info) << "BBTagInfo2::setSuspended(): For hostname " << pHostName << ", connection " \
<< connectionName << ", jobid " << jobid << ", work queue not present for " << *pLVKey \
<< ". Tolerated condition for a " << l_Temp << " operation.";
}
break;

case 2:
break;
case 2:
break;

default:
rc = -1;
break;
}
default:
LOG(bb,info) << "BBTagInfo2::setSuspended(): Unexpected return code " << rc \
<< " received for hostname " << pHostName << ", connection " \
<< connectionName << ", jobid " << jobid << ", " << *pLVKey \
<< " when attempting the suspend or resume operation on the work queue.";
rc = -1;
break;
}
}
else
Expand Down Expand Up @@ -943,7 +944,7 @@ int BBTagInfo2::stopTransfer(const LVKey* pLVKey, const string& pHostName, const
//
// Sort the extents, moving the canceled extents to the front of
// the work queue so they are immediately removed...
cancelExtents(pLVKey, &pHandle, &pContribId);
cancelExtents(pLVKey, &pHandle, &pContribId, DO_NOT_REMOVE_TARGET_PFS_FILES);
}
}
else
Expand Down
10 changes: 8 additions & 2 deletions bb/src/BBTagInfo2.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ class Extent;
/*******************************************************************************
| Constants
*******************************************************************************/
// Values for the pRemoveOption argument of cancelExtents() and setCanceled().
// DO_NOT_REMOVE_TARGET_PFS_FILES: cancel the extents only; the target files are left alone.
const int DO_NOT_REMOVE_TARGET_PFS_FILES = 0;
// REMOVE_TARGET_PFS_FILES: after the canceled extents have been processed,
// cancelExtents() also calls removeTargetFiles() for the handle/contributor.
const int REMOVE_TARGET_PFS_FILES = 1;

/*******************************************************************************
| Classes
Expand Down Expand Up @@ -74,7 +76,7 @@ class BBTagInfo2
void accumulateTotalLocalContributorInfo(const uint64_t pHandle, size_t& pTotalContributors, size_t& pTotalLocalReportingContributors);
int allContribsReported(const uint64_t pHandle, const BBTagID& pTagId);
int allExtentsTransferred(const BBTagID& pTagId);
void cancelExtents(const LVKey* pLVKey, uint64_t* pHandle, uint32_t* pContribId);
void cancelExtents(const LVKey* pLVKey, uint64_t* pHandle, uint32_t* pContribId, const int pRemoveOption=DO_NOT_REMOVE_TARGET_PFS_FILES);
void changeServer();
void cleanUpAll(const LVKey* pLVKey);
void dump(char* pSev, const char* pPrefix=0);
Expand All @@ -90,7 +92,7 @@ class BBTagInfo2
void sendTransferCompleteForFileMsg(const string& pConnectionName, const LVKey* pLVKey, ExtentInfo& pExtentInfo, BBTransferDef* pTransferDef);
void sendTransferCompleteForHandleMsg(const string& pHostName, const string& pCN_HostName, const string& pConnectionName, const LVKey* pLVKey, const BBTagID pTagId, const uint64_t pHandle, int& pAppendAsyncRequestFlag, const BBSTATUS pStatus=BBNONE);
void setAllExtentsTransferred(const LVKey* pLVKey, const uint64_t pHandle, const BBLVKey_ExtentInfo& pLVKey_ExtentInfo, const BBTagID pTagId, const int pValue=1);
void setCanceled(const LVKey* pLVKey, const uint64_t pJobId, const uint64_t pJobStepId, uint64_t pHandle);
void setCanceled(const LVKey* pLVKey, const uint64_t pJobId, const uint64_t pJobStepId, uint64_t pHandle, const int pRemoveOption);
int setSuspended(const LVKey* pLVKey, const string& pHostName, const int pValue);
int stopTransfer(const LVKey* pLVKey, const string& pHostName, const uint64_t pJobId, const uint64_t pJobStepId, uint64_t pHandle, uint32_t pContribId);
void updateAllContribsReported(const LVKey* pLVKey);
Expand Down Expand Up @@ -205,6 +207,10 @@ class BBTagInfo2
return extentInfo.removeExtent(pExtent);
}

// Delegates to tagInfoMap.removeTargetFiles() with the same LVKey, handle and
// contributor id.
inline void removeTargetFiles(const LVKey* pLVKey, const uint64_t pHandle, const uint32_t pContribId)
{
    tagInfoMap.removeTargetFiles(pLVKey, pHandle, pContribId);
    return;
}

// Delegates to extentInfo.resetMinTrimAnchorExtent().
inline void resetMinTrimAnchorExtent()
{
    extentInfo.resetMinTrimAnchorExtent();
    return;
}
Expand Down
Loading

0 comments on commit 374e258

Please sign in to comment.