From 428400fb831f7b4673973a0d2842ff536b7952c0 Mon Sep 17 00:00:00 2001
From: Andrew Deason
Date: Fri, 13 Jan 2012 13:43:16 -0500
Subject: [PATCH] vol: remove SYNC fatal_error processing

Currently SYNC clients will "disable" themselves on certain error
patterns. For example, if the server end closes its file descriptor too
many times, or takes too long and then closes the fd, the SYNC client
will return an error and set fatal_error. On any subsequent SYNC
requests, the request will immediately fail without contacting the
server, often making SYNC client programs effectively useless until they
are restarted.

There isn't really any reason to cause future requests to fail.
Transient problems in the fileserver can easily make this situation
possible (e.g. a fileserver can crash but still take several minutes to
close the SYNC fd while the core is written to disk), and so while we
may return an error for a specific problematic request, future requests
may be fine.

So, just remove everything related to fatal_error, so future SYNC
requests can continue to be attempted. Adjust some log messages to
reflect the new behavior.

Reviewed-on: http://gerrit.openafs.org/6548 Tested-by: BuildBot Reviewed-by: Derrick Brashear (cherry picked from commit 40bf6dee2409197f7494c3d09bf2dea7c248d185) Change-Id: I0f7a1792afd1ace3beabe238107d0a5069ccbb44 Reviewed-on: http://gerrit.openafs.org/6609 Tested-by: BuildBot Reviewed-by: Derrick Brashear --- src/vol/daemon_com.c | 12 +++--------- src/vol/daemon_com.h | 1 - src/vol/fssync-client.c | 2 +- src/vol/salvsync-client.c | 2 +- 4 files changed, 5 insertions(+), 12 deletions(-) diff --git a/src/vol/daemon_com.c b/src/vol/daemon_com.c index 26d5cb7c8e..acb7360af1 100644 --- a/src/vol/daemon_com.c +++ b/src/vol/daemon_com.c @@ -275,16 +275,11 @@ SYNC_ask(SYNC_client_state * state, SYNC_command * com, SYNC_response * res) int tries; afs_uint32 now, timeout, code=SYNC_OK; - if (state->fatal_error) { - return SYNC_COM_ERROR; - } - if (state->fd == OSI_NULLSOCKET) { SYNC_connect(state); } if (state->fd == OSI_NULLSOCKET) { - state->fatal_error = 1; return SYNC_COM_ERROR; } @@ -320,10 +315,9 @@ SYNC_ask(SYNC_client_state * state, SYNC_command * com, SYNC_response * res) } if (code == SYNC_COM_ERROR) { - Log("SYNC_ask: fatal protocol error on circuit '%s'; disabling sync " - "protocol until next server restart\n", - state->proto_name); - state->fatal_error = 1; + Log("SYNC_ask: too many / too latent fatal protocol errors on circuit " + "'%s'; giving up (tries %d timeout %d)\n", + state->proto_name, tries, timeout); } return code; diff --git a/src/vol/daemon_com.h b/src/vol/daemon_com.h index 249c270da8..dabaf3b578 100644 --- a/src/vol/daemon_com.h +++ b/src/vol/daemon_com.h @@ -144,7 +144,6 @@ typedef struct SYNC_client_state { int retry_limit; /**< max number of times for SYNC_ask to retry */ afs_int32 hard_timeout; /**< upper limit on time to keep trying */ char * proto_name; /**< sync protocol associated with this conn */ - byte fatal_error; /**< nonzero if fatal error on this client conn */ afs_uint32 pkt_seq; /**< packet xmit sequence counter */ afs_uint32 
com_seq; /**< command xmit sequence counter */ } SYNC_client_state; diff --git a/src/vol/fssync-client.c b/src/vol/fssync-client.c index be0a0ad631..7339de9448 100644 --- a/src/vol/fssync-client.c +++ b/src/vol/fssync-client.c @@ -132,7 +132,7 @@ FSYNC_askfs(SYNC_command * com, SYNC_response * res) break; case SYNC_COM_ERROR: case SYNC_BAD_COMMAND: - Log("FSYNC_askfs: fatal FSSYNC protocol error; volume management functionality disabled until next fileserver restart\n"); + Log("FSYNC_askfs: internal FSSYNC protocol error %d\n", code); break; case SYNC_DENIED: Log("FSYNC_askfs: FSSYNC request denied for reason=%d\n", res->hdr.reason); diff --git a/src/vol/salvsync-client.c b/src/vol/salvsync-client.c index 84d65ed438..b625c885e2 100644 --- a/src/vol/salvsync-client.c +++ b/src/vol/salvsync-client.c @@ -108,7 +108,7 @@ SALVSYNC_askSalv(SYNC_command * com, SYNC_response * res) break; case SYNC_COM_ERROR: case SYNC_BAD_COMMAND: - Log("SALVSYNC_askSalv: fatal SALVSYNC protocol error; online salvager functionality disabled until next fileserver restart\n"); + Log("SALVSYNC_askSalv: internal SALVSYNC protocol error %d\n", code); break; case SYNC_DENIED: Log("SALVSYNC_askSalv: SALVSYNC request denied for reason=%d\n", res->hdr.reason);