mirror of
https://git.openafs.org/openafs.git
synced 2025-01-21 00:10:15 +00:00
Windows: Make CM resilient to transient VNOVOL
The 1.6.0 and 1.6.1 file servers send transient VNOVOL errors which are not indicative of the volume not being present. For example, VNOVOL can be sent during a transition to a VBUSY state prior to salvaging or when cloning a .backup volume instance. As a result the cache manager must attempt at least one retry when a VNOVOL is received but there are no changes to the volume location information. This patchset records the VNOVOL error in the cm_req_t structure. If the volume is replicated, the volume's server reference is placed into a busy state. If the volume is not replicated, the thread is paused for two seconds. In both cases, the request is retried. If the VNOVOL error is received a second time from the same server, the volume server reference is deleted as before. This is done to prevent repeated requests to the VLDB server and the file server that are expected to fail. The server reference will be restored to the volume on the next volume location update. Change-Id: Ica51f853683f80cb17c804cdc216f7a113cca60a Reviewed-on: http://gerrit.openafs.org/7353 Tested-by: BuildBot <buildbot@rampaginggeek.com> Tested-by: Jeffrey Altman <jaltman@secure-endpoints.com> Reviewed-by: Jeffrey Altman <jaltman@secure-endpoints.com>
This commit is contained in:
parent
46c1f1391b
commit
1af906799b
@ -688,25 +688,53 @@ cm_Analyze(cm_conn_t *connp,
|
||||
|
||||
if (cm_ServerEqual(tsrp->server, serverp)) {
|
||||
/* REDIRECT */
|
||||
if (errorCode == VMOVED || errorCode == VNOVOL) {
|
||||
osi_Log2(afsd_logp, "volume %d not present on server %s",
|
||||
switch (errorCode) {
|
||||
case VMOVED:
|
||||
osi_Log2(afsd_logp, "volume %u moved from server %s",
|
||||
fidp->volume, osi_LogSaveString(afsd_logp,addr));
|
||||
tsrp->status = srv_deleted;
|
||||
if (fidp)
|
||||
cm_RemoveVolumeFromServer(serverp, fidp->volume);
|
||||
} else {
|
||||
osi_Log2(afsd_logp, "volume %d instance on server %s marked offline",
|
||||
fidp->volume, osi_LogSaveString(afsd_logp,addr));
|
||||
tsrp->status = srv_offline;
|
||||
break;
|
||||
case VNOVOL:
|
||||
/*
|
||||
* The 1.6.0 and 1.6.1 file servers send transient VNOVOL errors which
|
||||
* are not indicative of the volume not being present. For example,
|
||||
* VNOVOL can be sent during a transition to a VBUSY state prior to
|
||||
* salvaging or when cloning a .backup volume instance. As a result
|
||||
* the cache manager must attempt at least one retry when a VNOVOL is
|
||||
* received but there are no changes to the volume location information.
|
||||
*/
|
||||
if (reqp->vnovolError > 0 && cm_ServerEqual(reqp->errorServp, serverp)) {
|
||||
osi_Log2(afsd_logp, "volume %u not present on server %s",
|
||||
fidp->volume, osi_LogSaveString(afsd_logp,addr));
|
||||
tsrp->status = srv_deleted;
|
||||
if (fidp)
|
||||
cm_RemoveVolumeFromServer(serverp, fidp->volume);
|
||||
} else {
|
||||
osi_Log2(afsd_logp, "VNOVOL received for volume %u from server %s",
|
||||
fidp->volume, osi_LogSaveString(afsd_logp,addr));
|
||||
if (replicated) {
|
||||
cm_SetServerBusyStatus(serversp, serverp);
|
||||
} else {
|
||||
Sleep(2000);
|
||||
}
|
||||
}
|
||||
break;
|
||||
default:
|
||||
osi_Log3(afsd_logp, "volume %u exists on server %s with status %u",
|
||||
fidp->volume, osi_LogSaveString(afsd_logp,addr), tsrp->status);
|
||||
}
|
||||
/* break; */
|
||||
} else {
|
||||
osi_Log3(afsd_logp, "volume %d exists on server %s with status %u",
|
||||
fidp->volume, osi_LogSaveString(afsd_logp,addr), tsrp->status);
|
||||
}
|
||||
}
|
||||
lock_ReleaseWrite(&cm_serverLock);
|
||||
|
||||
/* Remember that the VNOVOL error occurred */
|
||||
if (errorCode == VNOVOL) {
|
||||
reqp->errorServp = serverp;
|
||||
reqp->vnovolError++;
|
||||
}
|
||||
|
||||
/* Free the server list before cm_ForceUpdateVolume is called */
|
||||
if (free_svr_list) {
|
||||
cm_FreeServerList(serverspp, 0);
|
||||
@ -779,7 +807,7 @@ cm_Analyze(cm_conn_t *connp,
|
||||
LogEvent(EVENTLOG_WARNING_TYPE, MSG_RX_HARD_DEAD_TIME_EXCEEDED, addr);
|
||||
osi_Log1(afsd_logp, "cm_Analyze: hardDeadTime or idleDeadtime exceeded addr[%s]",
|
||||
osi_LogSaveString(afsd_logp,addr));
|
||||
reqp->tokenIdleErrorServp = serverp;
|
||||
reqp->errorServp = serverp;
|
||||
reqp->idleError++;
|
||||
}
|
||||
|
||||
@ -947,7 +975,7 @@ cm_Analyze(cm_conn_t *connp,
|
||||
}
|
||||
|
||||
if (replicated && serverp) {
|
||||
reqp->tokenIdleErrorServp = serverp;
|
||||
reqp->errorServp = serverp;
|
||||
reqp->tokenError = errorCode;
|
||||
|
||||
if (timeLeft > 2)
|
||||
@ -1023,7 +1051,7 @@ cm_Analyze(cm_conn_t *connp,
|
||||
|
||||
if (serverp) {
|
||||
if (reqp->flags & CM_REQ_NEW_CONN_FORCED) {
|
||||
reqp->tokenIdleErrorServp = serverp;
|
||||
reqp->errorServp = serverp;
|
||||
reqp->tokenError = errorCode;
|
||||
} else {
|
||||
reqp->flags |= CM_REQ_NEW_CONN_FORCED;
|
||||
@ -1071,7 +1099,7 @@ cm_Analyze(cm_conn_t *connp,
|
||||
errorCode, s);
|
||||
|
||||
if (serverp) {
|
||||
reqp->tokenIdleErrorServp = serverp;
|
||||
reqp->errorServp = serverp;
|
||||
reqp->tokenError = errorCode;
|
||||
retry = 1;
|
||||
}
|
||||
@ -1084,7 +1112,7 @@ cm_Analyze(cm_conn_t *connp,
|
||||
* and force the use of another server.
|
||||
*/
|
||||
if (serverp) {
|
||||
reqp->tokenIdleErrorServp = serverp;
|
||||
reqp->errorServp = serverp;
|
||||
reqp->tokenError = errorCode;
|
||||
retry = 1;
|
||||
}
|
||||
@ -1280,15 +1308,15 @@ long cm_ConnByMServers(cm_serverRef_t *serversp, afs_uint32 replicated, cm_user_
|
||||
continue;
|
||||
|
||||
tsp = tsrp->server;
|
||||
if (reqp->tokenIdleErrorServp) {
|
||||
if (reqp->errorServp) {
|
||||
/*
|
||||
* search the list until we find the server
|
||||
* that failed last time. When we find it
|
||||
* clear the error, skip it and try the next one
|
||||
* in the list.
|
||||
*/
|
||||
if (tsp == reqp->tokenIdleErrorServp)
|
||||
reqp->tokenIdleErrorServp = NULL;
|
||||
if (tsp == reqp->errorServp)
|
||||
reqp->errorServp = NULL;
|
||||
continue;
|
||||
}
|
||||
if (tsp) {
|
||||
|
@ -62,9 +62,10 @@ typedef struct cm_req {
|
||||
int rpcError; /* RPC error code */
|
||||
int volumeError; /* volume error code */
|
||||
int accessError; /* access error code */
|
||||
struct cm_server * tokenIdleErrorServp; /* server that reported a token/idle error other than expired */
|
||||
struct cm_server * errorServp; /* server that reported a token/idle error other than expired */
|
||||
int tokenError;
|
||||
int idleError;
|
||||
int vnovolError;
|
||||
afs_uint32 flags;
|
||||
clientchar_t * tidPathp;
|
||||
clientchar_t * relPathp;
|
||||
|
Loading…
Reference in New Issue
Block a user