From 7f251877c910509940252b431992f3ad069ab46a Mon Sep 17 00:00:00 2001 From: Mark Vitale Date: Wed, 10 Oct 2018 23:17:13 -0400 Subject: [PATCH] bozo: mark failed bnodes as stopped/shutdown Commit 466e8cb15e794e13ef27a0aaa7be6abad499d66d ('bozo: retry start after error stops') introduced several problems for 'simple' bnodes that experience startup errors. After its error retries have been exhausted, the bnode continues to appear as if it is starting up. For instance, 'bos stop' is required before 'bos delete' will work. Also, if 'bos stop -wait' is issued for a different bnode, the command will hang due to BOZO_WaitAll waiting indefinitely for the bnode that has exhausted its error retries. Instead, introduce bnode_IsErrorRetrying and modify ez_getstat to call it. In this way ez_getstat will only return BSTAT_STARTINGUP if the error retries have not been exhausted yet. While here, also modify ez_procexit and SetNeedsClock to consolidate all equivalent logic in bnode_IsErrorRetrying. Change-Id: I29d419d76a889e13049116fa66d1a63d11c16b46 Reviewed-on: https://gerrit.openafs.org/13376 Tested-by: BuildBot Reviewed-by: Michael Meffie Reviewed-by: Cheyenne Wills --- src/bozo/bnode.c | 12 ++++++++++++ src/bozo/bnode_internal.h | 1 + src/bozo/ezbnodeops.c | 4 ++-- src/bozo/fsbnodeops.c | 2 +- 4 files changed, 16 insertions(+), 3 deletions(-) diff --git a/src/bozo/bnode.c b/src/bozo/bnode.c index 0341911f5e..c038fe3108 100644 --- a/src/bozo/bnode.c +++ b/src/bozo/bnode.c @@ -537,6 +537,18 @@ bnode_Delete(struct bnode *abnode) return code; } +/* Are we still doing error retries ? 
*/ +int +bnode_IsErrorRetrying(struct bnode *abnode) +{ + if ((abnode->flags & BNODE_ERRORSTOP) != 0 + && abnode->errorStopDelay != 0) { + return 1; /* still doing error retries */ + } else { + return 0; /* no error retries */ + } +} + /* function to tell if there's a timeout coming up */ int bnode_PendingTimeout(struct bnode *abnode) diff --git a/src/bozo/bnode_internal.h b/src/bozo/bnode_internal.h index 31d8c9f80a..b23d8667bc 100644 --- a/src/bozo/bnode_internal.h +++ b/src/bozo/bnode_internal.h @@ -158,3 +158,4 @@ extern int bnode_SetStat(struct bnode *abnode, int agoal); extern int bnode_CreatePidFile(struct bnode *abnode, struct bnode_proc *aproc, char *name); extern int bnode_DestroyPidFile(struct bnode *abnode, struct bnode_proc *aproc); extern int bnode_ResetErrorCount(struct bnode *abnode); +extern int bnode_IsErrorRetrying(struct bnode *abnode); diff --git a/src/bozo/ezbnodeops.c b/src/bozo/ezbnodeops.c index 87a64aebb5..835449a8ea 100644 --- a/src/bozo/ezbnodeops.c +++ b/src/bozo/ezbnodeops.c @@ -158,7 +158,7 @@ ez_getstat(struct bnode *bn, afs_int32 * astatus) temp = BSTAT_SHUTTINGDOWN; else if (abnode->running) temp = BSTAT_NORMAL; - else if (abnode->b.flags & BNODE_ERRORSTOP) + else if (bnode_IsErrorRetrying(bn)) temp = BSTAT_STARTINGUP; else temp = BSTAT_SHUTDOWN; @@ -225,7 +225,7 @@ ez_procexit(struct bnode *bn, struct bnode_proc *aproc) bnode_SetTimeout((struct bnode *) abnode, 0); /* clear timer */ if (abnode->b.goal) code = ez_setstat((struct bnode *) abnode, BSTAT_NORMAL); - else if (abnode->b.flags & BNODE_ERRORSTOP && abnode->b.errorStopDelay) { + else if (bnode_IsErrorRetrying(bn)) { ViceLog(0, ("%s will retry start in %d seconds\n", abnode->b.name, abnode->b.errorStopDelay)); bnode_SetTimeout(bn, abnode->b.errorStopDelay); diff --git a/src/bozo/fsbnodeops.c b/src/bozo/fsbnodeops.c index 8e1c15e6be..2155e775d7 100644 --- a/src/bozo/fsbnodeops.c +++ b/src/bozo/fsbnodeops.c @@ -881,7 +881,7 @@ SetNeedsClock(struct fsbnode *ab) } } else if 
((ab->b.goal == 0) && !ab->fileRunning && !ab->volRunning && !ab->salRunning && !ab->scanRunning && !ab->salsrvRunning) { - if (ab->b.flags & BNODE_ERRORSTOP && ab->b.errorStopDelay) { + if (bnode_IsErrorRetrying(&ab->b)) { ViceLog(0, ("%s will retry start in %d seconds\n", ab->b.name, ab->b.errorStopDelay)); ab->needsClock = 1; /* halted for errors, retry later */