Skip to content

Commit

Permalink
Fix for race condition from babysit cron job (#49)
Browse files Browse the repository at this point in the history
- Fix race condition from babysit cron job
  • Loading branch information
chris-gilmore authored and chaochenq committed Sep 6, 2016
1 parent 9c29857 commit 7d1429b
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 22 deletions.
21 changes: 8 additions & 13 deletions bin/aws-kinesis-agent-babysit
Original file line number Diff line number Diff line change
Expand Up @@ -4,24 +4,19 @@

# Search the system binary directories before anything already on PATH.
PATH=/sbin:/usr/sbin:/bin:/usr/bin:$PATH
# Service name; the PID file path and the service command below are built from it.
DAEMON_NAME=aws-kinesis-agent
PIDFILE=/var/run/$DAEMON_NAME.pid
# Command prefix: callers append an action, e.g. `$SERVICE restart` or `$SERVICE status`.
SERVICE="service $DAEMON_NAME"

#######################################
# Print the PIDs of the direct children of the process recorded in $PIDFILE,
# joined by single spaces on one line. Prints an empty line when the PID file
# is missing, unreadable or empty, or when ps reports no children.
# Globals:   PIDFILE (read)
# Arguments: none
# Outputs:   space-separated child PIDs on stdout
# Returns:   0
#######################################
function get_agent_pid() {
  local parent_pid='' child_pids

  # Take the first word of the PID file. "$PIDFILE" is quoted so that an unset
  # or empty value fails the redirection instead of falling back to stdin; a
  # missing file is an expected state, hence the silenced redirection error.
  { read -r parent_pid _ < "$PIDFILE"; } 2>/dev/null

  # Without a parent PID there is nothing to look up; skip ps entirely.
  if [[ -z $parent_pid ]]; then
    echo
    return 0
  fi

  # `h` suppresses the header, `o pid` limits the output to the PID column.
  child_pids=$(ps --ppid "$parent_pid" ho pid 2>/dev/null)

  # Deliberately unquoted: word splitting folds the one-PID-per-line output of
  # ps (and its padding) into a single space-separated line.
  echo $child_pids
}

#######################################
# Restart the agent through $SERVICE and verify that it came up.
# Exits the whole script with status 1 when the restart command fails, when
# get_agent_pid prints nothing after the pause, or when the status check fails.
# Globals:   SERVICE (read, expanded unquoted into command + arguments)
# Arguments: none
#######################################
function start_agent() {
  if ! $SERVICE restart; then
    exit 1
  fi

  # Fixed pause before probing (presumably to let the agent finish starting).
  sleep 3

  if [[ -z $(get_agent_pid) ]]; then
    exit 1
  fi

  if ! $SERVICE status >/dev/null 2>&1; then
    exit 1
  fi
}

# Check if the PID file exists.
# If it does not, either the agent was never started or it was stopped by the
# user; in both cases there is nothing to babysit, so exit successfully.
[[ -f $PIDFILE ]] || exit 0
# Check if the child Java process is alive. If it is not, start the agent.
[[ -n $(get_agent_pid) ]] || start_agent
# Query the service status quietly and keep its exit code.
$SERVICE status >/dev/null 2>&1
status=$?

# Restart only when the status code is 1 or 2.
# NOTE(review): presumably the "dead but PID file / lock file exists" codes of
# the init script's status action -- confirm against that script.
if [ "$status" -eq "1" -o "$status" -eq "2" ]; then
start_agent
fi

exit 0
# NOTE(review): unreachable -- the script has already exited on the line above.
exit 0
25 changes: 16 additions & 9 deletions bin/aws-kinesis-agent.RedHat
Original file line number Diff line number Diff line change
Expand Up @@ -68,8 +68,6 @@ do_start () {
export AWS_SECRET_ACCESS_KEY
export AWS_DEFAULT_REGION

(
flock -w 10 -x 9
DAEMON_NAME=$DAEMON_NAME nohup runuser $AGENT_USER -s /bin/sh -c "$DAEMON_EXEC -L $AGENT_LOG_LEVEL $AGENT_ARGS $@" > $INITLOGFILE 2>&1 &

pid=$!
Expand All @@ -90,9 +88,7 @@ do_start () {
# output status message
[[ $RETVAL == 0 ]] && success "$DAEMON_NAME startup" || failure "$DAEMON_NAME startup"

) 9>$MUTEXFILE
RETVAL=$?
rm -f $MUTEXFILE
return $RETVAL
}

Expand All @@ -104,8 +100,6 @@ get_pids() {
}

do_stop () {
(
flock -w 10 -x 9
ppids=`get_pids | awk '{print $1}'`
if [[ $? == 0 ]]; then
for pid in $ppids; do
Expand Down Expand Up @@ -142,15 +136,22 @@ do_stop () {

# print status message
[[ $RETVAL == 0 ]] && success "$DAEMON_NAME shutdown" || failure "$DAEMON_NAME shutdown"
) 9>$MUTEXFILE

RETVAL=$?
rm -f $MUTEXFILE
return $RETVAL
}

#######################################
# Print the PIDs of the direct children of the process recorded in $PIDFILE,
# joined by single spaces on one line. Prints an empty line when the PID file
# is missing, unreadable or empty, or when ps reports no children.
# Globals:   PIDFILE (read)
# Arguments: none
# Outputs:   space-separated child PIDs on stdout
# Returns:   0
#######################################
function get_agent_pid() {
  local parent_pid='' child_pids

  # Take the first word of the PID file. "$PIDFILE" is quoted so that an unset
  # or empty value fails the redirection instead of falling back to stdin; a
  # missing file is an expected state, hence the silenced redirection error.
  { read -r parent_pid _ < "$PIDFILE"; } 2>/dev/null

  # Without a parent PID there is nothing to look up; skip ps entirely.
  if [[ -z $parent_pid ]]; then
    echo
    return 0
  fi

  # `h` suppresses the header, `o pid` limits the output to the PID column.
  child_pids=$(ps --ppid "$parent_pid" ho pid 2>/dev/null)

  # Deliberately unquoted: word splitting folds the one-PID-per-line output of
  # ps (and its padding) into a single space-separated line.
  echo $child_pids
}

#######################################
# Report the daemon's status through the global RETVAL.
# RETVAL receives the exit code of `status -p $PIDFILE $DAEMON_NAME`, except
# that a 0 is replaced by 1 when get_agent_pid prints nothing.
# Globals:   PIDFILE, DAEMON_NAME (read); RETVAL (written)
# Arguments: none
# Returns:   0 (the result is carried in RETVAL, not in the return status)
#######################################
do_status () {
  status -p $PIDFILE $DAEMON_NAME
  RETVAL=$?

  # A non-zero status is reported unchanged; no child lookup is needed.
  if [[ $RETVAL != 0 ]]; then
    return 0
  fi

  # The PID-file process checks out, but it has no child process.
  if [[ -z $(get_agent_pid) ]]; then
    RETVAL=1
  fi
}

do_restart () {
Expand Down Expand Up @@ -191,6 +192,11 @@ do_install () {
echo "$DAEMON_NAME log file will be found at: $LOG_DIR"
}

# Open $MUTEXFILE for writing on file descriptor 200 for the rest of the
# script; the descriptor is what flock locks below.
exec 200>$MUTEXFILE
# Wait up to 50 seconds for the lock on descriptor 200; exit with status 1 if
# it cannot be obtained.
if ! flock -w 50 200; then
exit 1
fi
(
command=$1
shift
case "$command" in
Expand All @@ -217,4 +223,5 @@ case "$command" in
exit 1
;;
esac
exit $RETVAL
exit $RETVAL
) 200>&-

0 comments on commit 7d1429b

Please sign in to comment.