mirror of
https://git.openafs.org/openafs.git
synced 2025-01-18 15:00:12 +00:00
bozo: retry start after error stops
After a bnode is stopped because of too many consecutive exits,
delay for some time and attempt to start the bnode again. Continue
to retry on each error stop, doubling the delay for each retry
attempt until a maximum number of attempts.
Reviewed-on: http://gerrit.openafs.org/5534
Tested-by: BuildBot <buildbot@rampaginggeek.com>
Reviewed-by: Derrick Brashear <shadow@dementix.org>
(cherry picked from commit 170ce3db8a
)
Change-Id: I4f3863c31318ba9c5950ef74ec0a04b31decfebb
Reviewed-on: http://gerrit.openafs.org/9571
Reviewed-by: Derrick Brashear <shadow@your-file-system.com>
Tested-by: BuildBot <buildbot@rampaginggeek.com>
Reviewed-by: Stephan Wiesand <stephan.wiesand@desy.de>
This commit is contained in:
parent
5ef139d769
commit
466e8cb15e
@ -46,6 +46,7 @@
|
||||
#endif
|
||||
|
||||
#define BNODE_LWP_STACKSIZE (16 * 1024)
|
||||
#define BNODE_ERROR_COUNT_MAX 16 /* maximum number of retries */
|
||||
|
||||
int bnode_waiting = 0;
|
||||
static PROCESS bproc_pid; /* pid of waker-upper */
|
||||
@ -273,6 +274,14 @@ bnode_WaitStatus(struct bnode *abnode, int astatus)
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Clear the error-stop retry state of a bnode: sets both errorStopCount
 * and errorStopDelay to zero.
 *
 * abnode: the bnode whose retry state is reset; dereferenced without a
 *         NULL check.
 *
 * Always returns 0.
 */
int
|
||||
bnode_ResetErrorCount(struct bnode *abnode)
|
||||
{
|
||||
abnode->errorStopCount = 0;
|
||||
abnode->errorStopDelay = 0;
|
||||
return 0;
|
||||
}
|
||||
|
||||
int
|
||||
bnode_SetStat(struct bnode *abnode, int agoal)
|
||||
{
|
||||
@ -620,13 +629,14 @@ bproc(void *unused)
|
||||
tb = tp->bnode;
|
||||
bnode_Hold(tb);
|
||||
|
||||
/* count restarts in last 10 seconds */
|
||||
/* count restarts in last 30 seconds */
|
||||
if (temp > tb->rsTime + 30) {
|
||||
/* it's been 10 seconds we've been counting */
|
||||
/* it's been 30 seconds we've been counting */
|
||||
tb->rsTime = temp;
|
||||
tb->rsCount = 0;
|
||||
}
|
||||
|
||||
|
||||
if (WIFSIGNALED(status) == 0) {
|
||||
/* exited, not signalled */
|
||||
tp->lastExit = WEXITSTATUS(status);
|
||||
@ -676,17 +686,27 @@ bproc(void *unused)
|
||||
tb->notifier);
|
||||
hdl_notifier(tp);
|
||||
}
|
||||
BOP_PROCEXIT(tb, tp);
|
||||
|
||||
bnode_Check(tb);
|
||||
if (tb->rsCount++ > 10) {
|
||||
/* 10 in 10 seconds */
|
||||
if (tb->goal && tb->rsCount++ > 10) {
|
||||
/* 10 in 30 seconds */
|
||||
if (tb->errorStopCount >= BNODE_ERROR_COUNT_MAX) {
|
||||
tb->errorStopDelay = 0; /* max reached, give up. */
|
||||
} else {
|
||||
tb->errorStopCount++;
|
||||
if (!tb->errorStopDelay) {
|
||||
tb->errorStopDelay = 1;
|
||||
} else {
|
||||
tb->errorStopDelay *= 2;
|
||||
}
|
||||
}
|
||||
tb->flags |= BNODE_ERRORSTOP;
|
||||
bnode_SetGoal(tb, BSTAT_SHUTDOWN);
|
||||
bozo_Log
|
||||
("BNODE '%s' repeatedly failed to start, perhaps missing executable.\n",
|
||||
tb->name);
|
||||
}
|
||||
BOP_PROCEXIT(tb, tp);
|
||||
bnode_Check(tb);
|
||||
bnode_Release(tb); /* bnode delete can happen here */
|
||||
DeleteProc(tp);
|
||||
} else
|
||||
@ -987,6 +1007,7 @@ bnode_NewProc(struct bnode *abnode, char *aexecString, char *coreName,
|
||||
free(tp);
|
||||
return errno;
|
||||
}
|
||||
bozo_Log("%s started pid %ld: %s\n", abnode->name, cpid, aexecString);
|
||||
|
||||
bnode_FreeTokens(tlist);
|
||||
allProcs = tp;
|
||||
|
@ -67,6 +67,8 @@ struct bnode {
|
||||
short flags; /* random flags */
|
||||
char goal; /* 1=running or 0=not running */
|
||||
char fileGoal; /* same, but to be stored in file */
|
||||
afs_int32 errorStopCount; /* number of recent error stops */
|
||||
afs_int32 errorStopDelay; /* seconds to wait before retrying start */
|
||||
};
|
||||
|
||||
struct bnode_proc {
|
||||
@ -140,3 +142,4 @@ extern int bnode_WaitStatus(struct bnode *abnode, int astatus);
|
||||
extern int bnode_SetStat(struct bnode *abnode, int agoal);
|
||||
extern int bnode_CreatePidFile(struct bnode *abnode, struct bnode_proc *aproc, char *name);
|
||||
extern int bnode_DestroyPidFile(struct bnode *abnode, struct bnode_proc *aproc);
|
||||
extern int bnode_ResetErrorCount(struct bnode *abnode);
|
||||
|
@ -869,6 +869,7 @@ stproc(struct bnode *abnode, void *arock)
|
||||
return 0; /* don't do these guys */
|
||||
|
||||
bnode_Hold(abnode);
|
||||
bnode_ResetErrorCount(abnode);
|
||||
bnode_SetStat(abnode, BSTAT_NORMAL);
|
||||
bnode_Release(abnode);
|
||||
return 0;
|
||||
@ -1027,6 +1028,7 @@ SBOZO_Restart(struct rx_call *acall, char *ainstance)
|
||||
bnode_Hold(tb);
|
||||
bnode_SetStat(tb, BSTAT_SHUTDOWN);
|
||||
code = bnode_WaitStatus(tb, BSTAT_SHUTDOWN); /* this can fail */
|
||||
bnode_ResetErrorCount(tb);
|
||||
bnode_SetStat(tb, BSTAT_NORMAL);
|
||||
bnode_Release(tb);
|
||||
|
||||
@ -1056,6 +1058,7 @@ SBOZO_SetTStatus(struct rx_call *acall, char *ainstance, afs_int32 astatus)
|
||||
goto fail;
|
||||
}
|
||||
bnode_Hold(tb);
|
||||
bnode_ResetErrorCount(tb);
|
||||
code = bnode_SetStat(tb, astatus);
|
||||
bnode_Release(tb);
|
||||
|
||||
|
@ -43,6 +43,7 @@ static int ez_getparm(struct bnode *bnode, afs_int32, char *, afs_int32);
|
||||
static int ez_procstarted(struct bnode *bnode, struct bnode_proc *proc);
|
||||
|
||||
#define SDTIME 60 /* time in seconds given to a process to evaporate */
|
||||
#define ERROR_RESET_TIME 60 /* time in seconds to wait before resetting error count state */
|
||||
|
||||
struct bnode_ops ezbnode_ops = {
|
||||
ez_create,
|
||||
@ -128,18 +129,27 @@ ez_create(char *ainstance, char *acommand, char *unused1, char *unused2,
|
||||
return (struct bnode *)te;
|
||||
}
|
||||
|
||||
/* called to SIGKILL a process if it doesn't terminate normally */
|
||||
/* called to SIGKILL a process if it doesn't terminate normally
|
||||
* or to retry start after an error stop. */
|
||||
static int
|
||||
ez_timeout(struct bnode *bn)
|
||||
{
|
||||
struct ezbnode *abnode = (struct ezbnode *)bn;
|
||||
|
||||
if (!abnode->waitingForShutdown)
|
||||
return 0; /* spurious */
|
||||
/* send kill and turn off timer */
|
||||
bnode_StopProc(abnode->proc, SIGKILL);
|
||||
abnode->killSent = 1;
|
||||
bnode_SetTimeout((struct bnode *)abnode, 0);
|
||||
if (abnode->waitingForShutdown) {
|
||||
/* send kill and turn off timer */
|
||||
bnode_StopProc(abnode->proc, SIGKILL);
|
||||
abnode->killSent = 1;
|
||||
bnode_SetTimeout((struct bnode *)abnode, 0);
|
||||
} else if (!abnode->running && abnode->b.flags & BNODE_ERRORSTOP) {
|
||||
/* was stopped for too many errors, retrying */
|
||||
/* reset error count after running for a bit */
|
||||
bnode_SetTimeout(bn, ERROR_RESET_TIME);
|
||||
bnode_SetStat(bn, BSTAT_NORMAL);
|
||||
} else {
|
||||
bnode_SetTimeout(bn, 0); /* one shot timer */
|
||||
bnode_ResetErrorCount(bn);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -153,6 +163,8 @@ ez_getstat(struct bnode *bn, afs_int32 * astatus)
|
||||
temp = BSTAT_SHUTTINGDOWN;
|
||||
else if (abnode->running)
|
||||
temp = BSTAT_NORMAL;
|
||||
else if (abnode->b.flags & BNODE_ERRORSTOP)
|
||||
temp = BSTAT_STARTINGUP;
|
||||
else
|
||||
temp = BSTAT_SHUTDOWN;
|
||||
*astatus = temp;
|
||||
@ -205,7 +217,7 @@ ez_procexit(struct bnode *bn, struct bnode_proc *aproc)
|
||||
struct ezbnode *abnode = (struct ezbnode *)bn;
|
||||
|
||||
/* process has exited */
|
||||
afs_int32 code;
|
||||
afs_int32 code = 0;
|
||||
|
||||
if (DoPidFiles) {
|
||||
bozo_DeletePidFile(bn->name, NULL);
|
||||
@ -218,8 +230,11 @@ ez_procexit(struct bnode *bn, struct bnode_proc *aproc)
|
||||
bnode_SetTimeout((struct bnode *) abnode, 0); /* clear timer */
|
||||
if (abnode->b.goal)
|
||||
code = ez_setstat((struct bnode *) abnode, BSTAT_NORMAL);
|
||||
else
|
||||
code = 0;
|
||||
else if (abnode->b.flags & BNODE_ERRORSTOP && abnode->b.errorStopDelay) {
|
||||
bozo_Log("%s will retry start in %d seconds\n", abnode->b.name,
|
||||
abnode->b.errorStopDelay);
|
||||
bnode_SetTimeout(bn, abnode->b.errorStopDelay);
|
||||
}
|
||||
return code;
|
||||
}
|
||||
|
||||
|
@ -472,7 +472,7 @@ fs_create(char *ainstance, char *afilecmd, char *avolcmd, char *asalcmd,
|
||||
goto done;
|
||||
}
|
||||
bnode_SetTimeout(fsbnode2bnode(te), POLLTIME);
|
||||
/* ask for timeout activations every 10 seconds */
|
||||
/* ask for timeout activations every 20 seconds */
|
||||
RestoreSalFlag(te); /* restore needsSalvage flag based on file's existence */
|
||||
SetNeedsClock(te); /* compute needsClock field */
|
||||
|
||||
@ -601,7 +601,7 @@ dafs_create(char *ainstance, char *afilecmd, char *avolcmd,
|
||||
goto done;
|
||||
}
|
||||
bnode_SetTimeout(fsbnode2bnode(te), POLLTIME);
|
||||
/* ask for timeout activations every 10 seconds */
|
||||
/* ask for timeout activations every 20 seconds */
|
||||
RestoreSalFlag(te); /* restore needsSalvage flag based on file's existence */
|
||||
SetNeedsClock(te); /* compute needsClock field */
|
||||
|
||||
@ -680,6 +680,16 @@ fs_timeout(struct bnode *bn)
|
||||
SDTIME);
|
||||
}
|
||||
}
|
||||
|
||||
if ((abnode->b.flags & BNODE_ERRORSTOP) && !abnode->salRunning
|
||||
&& !abnode->volRunning && !abnode->fileRunning && !abnode->scanRunning
|
||||
&& !abnode->salsrvRunning) {
|
||||
bnode_SetStat(bn, BSTAT_NORMAL);
|
||||
}
|
||||
else {
|
||||
bnode_ResetErrorCount(bn);
|
||||
}
|
||||
|
||||
SetNeedsClock(abnode);
|
||||
return 0;
|
||||
}
|
||||
@ -784,6 +794,8 @@ fs_procexit(struct bnode *bn, struct bnode_proc *aproc)
|
||||
static void
|
||||
SetNeedsClock(struct fsbnode *ab)
|
||||
{
|
||||
afs_int32 timeout = POLLTIME;
|
||||
|
||||
if ((ab->fileSDW && !ab->fileKillSent) || (ab->volSDW && !ab->volKillSent)
|
||||
|| (ab->scanSDW && !ab->scanKillSent) || (ab->salSDW && !ab->salKillSent)
|
||||
|| (ab->salsrvSDW && !ab->salsrvKillSent)) {
|
||||
@ -791,15 +803,29 @@ SetNeedsClock(struct fsbnode *ab)
|
||||
ab->needsClock = 1;
|
||||
} else if (ab->b.goal == 1 && ab->fileRunning && ab->volRunning
|
||||
&& (!ab->scancmd || ab->scanRunning)
|
||||
&& (!ab->salsrvcmd || ab->salsrvRunning))
|
||||
ab->needsClock = 0; /* running normally */
|
||||
else if (ab->b.goal == 0 && !ab->fileRunning && !ab->volRunning
|
||||
&& !ab->salRunning && !ab->scanRunning && !ab->salsrvRunning)
|
||||
ab->needsClock = 0; /* halted normally */
|
||||
else
|
||||
&& (!ab->salsrvcmd || ab->salsrvRunning)) {
|
||||
if (ab->b.errorStopCount) {
|
||||
/* reset error count after running for a bit */
|
||||
ab->needsClock = 1;
|
||||
} else {
|
||||
ab->needsClock = 0; /* running normally */
|
||||
}
|
||||
} else if ((ab->b.goal == 0) && !ab->fileRunning && !ab->volRunning
|
||||
&& !ab->salRunning && !ab->scanRunning && !ab->salsrvRunning) {
|
||||
if (ab->b.flags & BNODE_ERRORSTOP && ab->b.errorStopDelay) {
|
||||
bozo_Log("%s will retry start in %d seconds\n", ab->b.name,
|
||||
ab->b.errorStopDelay);
|
||||
ab->needsClock = 1; /* halted for errors, retry later */
|
||||
timeout = ab->b.errorStopDelay;
|
||||
} else {
|
||||
ab->needsClock = 0; /* halted normally */
|
||||
}
|
||||
} else
|
||||
ab->needsClock = 1; /* other */
|
||||
if (ab->needsClock && !bnode_PendingTimeout(fsbnode2bnode(ab)))
|
||||
bnode_SetTimeout(fsbnode2bnode(ab), POLLTIME);
|
||||
|
||||
if (ab->needsClock && (!bnode_PendingTimeout(fsbnode2bnode(ab))
|
||||
|| ab->b.period != timeout))
|
||||
bnode_SetTimeout(fsbnode2bnode(ab), timeout);
|
||||
if (!ab->needsClock)
|
||||
bnode_SetTimeout(fsbnode2bnode(ab), 0);
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user