From 1af906799b2de90d41139dadaf2dd654e4fd2df3 Mon Sep 17 00:00:00 2001 From: Jeffrey Altman Date: Sat, 5 May 2012 20:46:08 -0400 Subject: [PATCH] Windows: Make CM resilient to transient VNOVOL The 1.6.0 and 1.6.1 file servers send transient VNOVOL errors which are not indicative of the volume not being present. For example, VNOVOL can be sent during a transition to a VBUSY state prior to salvaging or when cloning a .backup volume instance. As a result the cache manager must attempt at least one retry when a VNOVOL is received but there are no changes to the volume location information. This patchset records the VNOVOL error in the cm_req_t structure. If the volume is replicated, the volume's server reference is placed into a busy state. If the volume is not replicated, the thread is paused for two seconds. In both cases, the request is retried. If the VNOVOL error is received a second time from the same server, the volume server reference is deleted as before. This is done to prevent repeated requests to the VLDB server and the file server that are expected to fail. The server reference will be restored to the volume on the next volume location update. 
Change-Id: Ica51f853683f80cb17c804cdc216f7a113cca60a Reviewed-on: http://gerrit.openafs.org/7353 Tested-by: BuildBot Tested-by: Jeffrey Altman Reviewed-by: Jeffrey Altman --- src/WINNT/afsd/cm_conn.c | 64 +++++++++++++++++++++++++++++----------- src/WINNT/afsd/cm_conn.h | 3 +- 2 files changed, 48 insertions(+), 19 deletions(-) diff --git a/src/WINNT/afsd/cm_conn.c b/src/WINNT/afsd/cm_conn.c index ab95902fae..15f0f502bd 100644 --- a/src/WINNT/afsd/cm_conn.c +++ b/src/WINNT/afsd/cm_conn.c @@ -688,25 +688,53 @@ cm_Analyze(cm_conn_t *connp, if (cm_ServerEqual(tsrp->server, serverp)) { /* REDIRECT */ - if (errorCode == VMOVED || errorCode == VNOVOL) { - osi_Log2(afsd_logp, "volume %d not present on server %s", + switch (errorCode) { + case VMOVED: + osi_Log2(afsd_logp, "volume %u moved from server %s", fidp->volume, osi_LogSaveString(afsd_logp,addr)); tsrp->status = srv_deleted; if (fidp) cm_RemoveVolumeFromServer(serverp, fidp->volume); - } else { - osi_Log2(afsd_logp, "volume %d instance on server %s marked offline", - fidp->volume, osi_LogSaveString(afsd_logp,addr)); - tsrp->status = srv_offline; + break; + case VNOVOL: + /* + * The 1.6.0 and 1.6.1 file servers send transient VNOVOL errors which + * are no indicative of the volume not being present. For example, + * VNOVOL can be sent during a transition to a VBUSY state prior to + * salvaging or when cloning a .backup volume instance. As a result + * the cache manager must attempt at least one retry when a VNOVOL is + * receive but there are no changes to the volume location information. 
+ */ + if (reqp->vnovolError > 0 && cm_ServerEqual(reqp->errorServp, serverp)) { + osi_Log2(afsd_logp, "volume %u not present on server %s", + fidp->volume, osi_LogSaveString(afsd_logp,addr)); + tsrp->status = srv_deleted; + if (fidp) + cm_RemoveVolumeFromServer(serverp, fidp->volume); + } else { + osi_Log2(afsd_logp, "VNOVOL received for volume %u from server %s", + fidp->volume, osi_LogSaveString(afsd_logp,addr)); + if (replicated) { + cm_SetServerBusyStatus(serversp, serverp); + } else { + Sleep(2000); + } + } + break; + default: + osi_Log3(afsd_logp, "volume %u exists on server %s with status %u", + fidp->volume, osi_LogSaveString(afsd_logp,addr), tsrp->status); } - /* break; */ - } else { - osi_Log3(afsd_logp, "volume %d exists on server %s with status %u", - fidp->volume, osi_LogSaveString(afsd_logp,addr), tsrp->status); } } lock_ReleaseWrite(&cm_serverLock); + /* Remember that the VNOVOL error occurred */ + if (errorCode == VNOVOL) { + reqp->errorServp = serverp; + reqp->vnovolError++; + } + /* Free the server list before cm_ForceUpdateVolume is called */ if (free_svr_list) { cm_FreeServerList(serverspp, 0); @@ -779,7 +807,7 @@ cm_Analyze(cm_conn_t *connp, LogEvent(EVENTLOG_WARNING_TYPE, MSG_RX_HARD_DEAD_TIME_EXCEEDED, addr); osi_Log1(afsd_logp, "cm_Analyze: hardDeadTime or idleDeadtime exceeded addr[%s]", osi_LogSaveString(afsd_logp,addr)); - reqp->tokenIdleErrorServp = serverp; + reqp->errorServp = serverp; reqp->idleError++; } @@ -947,7 +975,7 @@ cm_Analyze(cm_conn_t *connp, } if (replicated && serverp) { - reqp->tokenIdleErrorServp = serverp; + reqp->errorServp = serverp; reqp->tokenError = errorCode; if (timeLeft > 2) @@ -1023,7 +1051,7 @@ cm_Analyze(cm_conn_t *connp, if (serverp) { if (reqp->flags & CM_REQ_NEW_CONN_FORCED) { - reqp->tokenIdleErrorServp = serverp; + reqp->errorServp = serverp; reqp->tokenError = errorCode; } else { reqp->flags |= CM_REQ_NEW_CONN_FORCED; @@ -1071,7 +1099,7 @@ cm_Analyze(cm_conn_t *connp, errorCode, s); if (serverp) { - 
reqp->tokenIdleErrorServp = serverp; + reqp->errorServp = serverp; reqp->tokenError = errorCode; retry = 1; } @@ -1084,7 +1112,7 @@ cm_Analyze(cm_conn_t *connp, * and force the use of another server. */ if (serverp) { - reqp->tokenIdleErrorServp = serverp; + reqp->errorServp = serverp; reqp->tokenError = errorCode; retry = 1; } @@ -1280,15 +1308,15 @@ long cm_ConnByMServers(cm_serverRef_t *serversp, afs_uint32 replicated, cm_user_ continue; tsp = tsrp->server; - if (reqp->tokenIdleErrorServp) { + if (reqp->errorServp) { /* * search the list until we find the server * that failed last time. When we find it * clear the error, skip it and try the next one * in the list. */ - if (tsp == reqp->tokenIdleErrorServp) - reqp->tokenIdleErrorServp = NULL; + if (tsp == reqp->errorServp) + reqp->errorServp = NULL; continue; } if (tsp) { diff --git a/src/WINNT/afsd/cm_conn.h b/src/WINNT/afsd/cm_conn.h index 2c7bdfc5c3..affae47ae9 100644 --- a/src/WINNT/afsd/cm_conn.h +++ b/src/WINNT/afsd/cm_conn.h @@ -62,9 +62,10 @@ typedef struct cm_req { int rpcError; /* RPC error code */ int volumeError; /* volume error code */ int accessError; /* access error code */ - struct cm_server * tokenIdleErrorServp; /* server that reported a token/idle error other than expired */ + struct cm_server * errorServp; /* server that reported a token/idle error other than expired */ int tokenError; int idleError; + int vnovolError; afs_uint32 flags; clientchar_t * tidPathp; clientchar_t * relPathp;