diff --git a/src/tviced/serialize_state.c b/src/tviced/serialize_state.c index 6b2d881493..3fbb5f130e 100644 --- a/src/tviced/serialize_state.c +++ b/src/tviced/serialize_state.c @@ -138,13 +138,13 @@ fs_stateSave(void) * BUT, this still has one flaw -- what do we do about rx worker threads that * are blocked in the host package making an RPC call to a cm??? * - * perhaps we need a refcounter that keeps track of threads blocked in rpc calls - * with H_LOCK dropped (and the host struct likely left in an inconsistent state) - * - * or better yet, we need to associate a state machine with each host object - * (kind of like demand attach Volume structures). - * - * sigh. I suspect we'll need to revisit this issue + * currently we try to detect if a host struct is in an inconsistent state + * when we go to save it to disk, and just skip the hosts that we think may + * be inconsistent (see h_isBusy_r in host.c). This has the problem of causing + * more InitCallBackState's when we come back up, but the number of hosts in + * such a state should be small. In the future, we could try to lock hosts + * (with some deadline so we don't wait forever) before serializing, but at + * least for now it does not seem worth the trouble. */ if (fs_state.options.fs_state_verify_before_save) { diff --git a/src/viced/host.c b/src/viced/host.c index 558f7a714f..59ed73e980 100644 --- a/src/viced/host.c +++ b/src/viced/host.c @@ -2920,6 +2920,37 @@ static int h_stateVerifyUuidHash(struct fs_dump_state * state, struct host * h); static void h_hostToDiskEntry_r(struct host * in, struct hostDiskEntry * out); static void h_diskEntryToHost_r(struct hostDiskEntry * in, struct host * out); +/** + * Is this host busy? + * + * This is just a hint and should not be trusted; this should probably only be + * used by the host state serialization code when trying to detect if a host + * can be sanely serialized to disk or not. If this function returns 1, the + * host may be in an invalid state and thus should not be saved to disk. + */ +static int +h_isBusy_r(struct host *host) +{ + struct Lock *hostLock = &host->lock; + int locked = 0; + + LOCK_LOCK(hostLock); + if (hostLock->excl_locked || hostLock->readers_reading) { + locked = 1; + } + LOCK_UNLOCK(hostLock); + + if (locked) { + return 1; + } + + if ((host->hostFlags & HWHO_INPROGRESS) || !(host->hostFlags & ALTADDR)) { + /* We shouldn't hit this if the host wasn't locked, but just in case... */ + return 1; + } + + return 0; +} /* this procedure saves all host state to disk for fast startup */ int @@ -3241,6 +3272,16 @@ h_stateSaveHost(struct host * host, void* rock) struct iovec iov[4]; int iovcnt = 2; + if (h_isBusy_r(host)) { + char hoststr[16]; + ViceLog(1, ("Not saving host %s:%d to disk; host appears busy\n", + afs_inet_ntoa_r(host->host, hoststr), (int)ntohs(host->port))); + /* Make sure we don't try to save callbacks to disk for this host, or + * we'll get confused on restore */ + DeleteAllCallBacks_r(host, 1); + return 0; + } + memset(&hdr, 0, sizeof(hdr)); if (state->h_hdr->index_max < host->index) {