From 51ec2670116d95bb6bdcd7871fce685fc2eaaeb0 Mon Sep 17 00:00:00 2001 From: Tom Keiser Date: Fri, 17 Mar 2006 19:54:26 +0000 Subject: [PATCH] dafs-20060317 FIXES 26648 demand attach/fast restart fileserver --- Makefile.in | 24 +- acinclude.m4 | 16 + configure.in | 1 + src/auth/Makefile.in | 2 +- src/bozo/bos.c | 271 +- src/bozo/bosserver.c | 3 +- src/bozo/fsbnodeops.c | 441 ++- src/cf/osconf.m4 | 12 + src/config/param.rs_aix51.h | 2 - src/config/param.rs_aix52.h | 2 - src/config/param.rs_aix53.h | 2 - src/config/stds.h | 25 +- src/rx/rx_queue.h | 32 + src/tsalvaged/Makefile.in | 200 ++ src/tsalvaged/salvsync-debug.c | 475 +++ src/tviced/Makefile.in | 43 +- src/tviced/NTMakefile | 2 +- src/tviced/serialize_state.c | 1120 +++++++ src/tviced/serialize_state.h | 311 ++ src/tviced/state_analyzer.c | 2004 ++++++++++++ src/tvolser/Makefile.in | 15 +- src/util/Makefile.in | 5 +- src/util/afsutil_prototypes.h | 7 + src/util/dirpath.c | 10 + src/util/dirpath.hin | 16 + src/util/dirpath_nt.h | 16 + src/util/errors.h | 1 + src/util/strnlen.c | 35 + src/viced/Makefile.in | 1 + src/viced/NTMakefile | 2 + src/viced/afsfileprocs.c | 25 +- src/viced/callback.c | 1149 ++++++- src/viced/callback.h | 158 + src/viced/host.c | 634 +++- src/viced/host.h | 24 +- src/viced/viced.c | 234 +- src/viced/viced.h | 60 +- src/viced/viced_prototypes.h | 23 + src/vol/Makefile.in | 82 +- src/vol/NTMakefile | 2 + src/vol/daemon_com.c | 473 +++ src/vol/daemon_com.h | 141 + src/vol/fssync-client.c | 222 ++ src/vol/fssync-debug.c | 1148 +++++++ src/vol/fssync-server.c | 1179 ++++++++ src/vol/fssync.c | 751 ----- src/vol/fssync.h | 137 +- src/vol/nuke.c | 1 + src/vol/partition.c | 90 +- src/vol/partition.h | 31 +- src/vol/purge.c | 19 +- src/vol/salvage.h | 5 + src/vol/salvaged.c | 738 +++++ src/vol/salvager.c | 499 +++ src/vol/salvsync-client.c | 172 ++ src/vol/salvsync-server.c | 1009 +++++++ src/vol/salvsync.h | 111 + src/vol/test/listVicepx.c | 1 + src/vol/test/updateDirInode.c | 1 + src/vol/vnode.c 
| 320 +- src/vol/vnode.h | 2 + src/vol/vol-salvage.c | 614 +--- src/vol/vol-salvage.h | 282 ++ src/vol/voldefs.h | 3 + src/vol/volinodes.h | 5 + src/vol/volume.c | 5191 ++++++++++++++++++++++++++++---- src/vol/volume.h | 316 +- src/volser/NTMakefile | 2 + src/volser/dumpstuff.c | 1 + src/volser/volprocs.c | 22 +- src/volser/volser.p.h | 3 +- 71 files changed, 18626 insertions(+), 2350 deletions(-) create mode 100644 src/tsalvaged/Makefile.in create mode 100644 src/tsalvaged/salvsync-debug.c create mode 100644 src/tviced/serialize_state.c create mode 100644 src/tviced/serialize_state.h create mode 100644 src/tviced/state_analyzer.c create mode 100644 src/util/strnlen.c create mode 100644 src/viced/callback.h create mode 100644 src/vol/daemon_com.c create mode 100644 src/vol/daemon_com.h create mode 100644 src/vol/fssync-client.c create mode 100644 src/vol/fssync-debug.c create mode 100644 src/vol/fssync-server.c delete mode 100644 src/vol/fssync.c create mode 100644 src/vol/salvaged.c create mode 100644 src/vol/salvager.c create mode 100644 src/vol/salvsync-client.c create mode 100644 src/vol/salvsync-server.c create mode 100644 src/vol/salvsync.h create mode 100644 src/vol/vol-salvage.h diff --git a/Makefile.in b/Makefile.in index 7e8033d60a..209d9b272b 100644 --- a/Makefile.in +++ b/Makefile.in @@ -213,6 +213,24 @@ sgiefs: vol: cmd comerr dir afs sgiefs ${COMPILE_PART1} vol ${COMPILE_PART2} +tsalvaged: vol libafsrpc libafsauthent cmd util + set -x; \ + if test "@DEMAND_ATTACH@" = "yes" ; then \ + case ${SYS_NAME} in \ + alpha_dux*|sgi_*|sun*_5*|rs_aix*|*linux*|hp_ux11*|ia64_hpux*|*fbsd*|*nbsd2*) \ + ${COMPILE_PART1} tsalvaged ${COMPILE_PART2} ;; \ + *_darwin_[1-6][0-9]) \ + echo Not building MT tsalvaged for ${SYS_NAME} ;; \ + *_darwin_*) \ + ${COMPILE_PART1} tsalvaged ${COMPILE_PART2} ;; \ + *) \ + echo Not building MT tsalvaged for ${SYS_NAME} ;; \ + esac \ + else \ + echo skipping tsalvaged ; \ + fi + + vlserver: cmd comerr vol audit vlserver_depinstall 
${COMPILE_PART1} vlserver ${COMPILE_PART2} @@ -569,13 +587,13 @@ jafs: libjafs jafsadm: libjafsadm finale: project cmd comerr afsd butc tbutc @ENABLE_KERNEL_MODULE@ libuafs audit kauth log package \ - ptserver scout bu_utils ubik uss bozo vfsck volser tvolser \ + ptserver scout bu_utils ubik uss bozo vfsck volser tvolser tsalvaged \ venus update xstat afsmonitor dauth rxdebug libafsrpc \ libafsauthent shlibafsrpc shlibafsauthent libadmin login man-pages ${COMPILE_PART1} finale ${COMPILE_PART2} finale_nolibafs: project cmd comerr afsd butc tbutc libuafs audit kauth log package \ - ptserver scout bu_utils ubik uss bozo vfsck volser tvolser \ + ptserver scout bu_utils ubik uss bozo vfsck volser tvolser tsalvaged \ venus update xstat afsmonitor dauth rxdebug libafsrpc \ libafsauthent shlibafsrpc shlibafsauthent libadmin login man-pages ${COMPILE_PART1} finale ${COMPILE_PART2} @@ -633,6 +651,7 @@ clean2: -${COMPILE_PART1} tviced ${COMPILE_CLEAN} -${COMPILE_PART1} volser ${COMPILE_CLEAN} -${COMPILE_PART1} tvolser ${COMPILE_CLEAN} + -${COMPILE_PART1} tsalvaged ${COMPILE_CLEAN} -${COMPILE_PART1} venus ${COMPILE_CLEAN} -${COMPILE_PART1} venus/test ${COMPILE_CLEAN} -${COMPILE_PART1} afsd ${COMPILE_CLEAN} @@ -791,6 +810,7 @@ distclean: clean src/tests/Makefile \ src/tests/run-tests \ src/tests/OpenAFS/Dirpath.pm \ + src/tsalvaged/Makefile \ src/tsm41/Makefile \ src/tviced/Makefile \ src/tvolser/Makefile \ diff --git a/acinclude.m4 b/acinclude.m4 index c9b8417dd7..d33fec3f10 100644 --- a/acinclude.m4 +++ b/acinclude.m4 @@ -33,6 +33,8 @@ AC_ARG_ENABLE( fast-restart, [ --enable-fast-restart enable fast startup of file server without salvaging],, enable_fast_restart="no") AC_ARG_ENABLE( bitmap-later, [ --enable-bitmap-later enable fast startup of file server by not reading bitmap till needed],, enable_bitmap_later="no") +AC_ARG_ENABLE( demand-attach-fs, +[ --enable-demand-attach-fs enable Demand Attach Fileserver (please see documentation)],, enable_demand_attach_fs="no") 
AC_ARG_ENABLE( full-vos-listvol-switch, [ --disable-full-vos-listvol-switch disable vos full listvol switch for formatted output],, enable_full_vos_listvol_switch="yes") AC_ARG_WITH(dux-kernel-headers, @@ -948,6 +950,20 @@ if test "$enable_bitmap_later" = "yes"; then AC_DEFINE(BITMAP_LATER, 1, [define if you want to salvager to check bitmasks later]) fi +if test "$enable_demand_attach_fs" = "yes"; then + AC_DEFINE(DEMAND_ATTACH_ENABLE, 1, [define if you want the demand attach fileserver]) + DEMAND_ATTACH="yes" +else + DEMAND_ATTACH="no" +fi +AC_SUBST(DEMAND_ATTACH) + +if test "$enable_fast_restart" = "yes" && + test "$enable_demand_attach_fs" = "yes" ; then + AC_MSG_ERROR([The Demand Attach and Fast Restart extensions are mutually exclusive. Demand Attach fileservers automatically salvage volumes in the background, thereby making Fast Restart pointless.]) + exit 1 +fi + if test "$enable_full_vos_listvol_switch" = "yes"; then AC_DEFINE(FULL_LISTVOL_SWITCH, 1, [define if you want to want listvol switch]) fi diff --git a/configure.in b/configure.in index e96a93be9b..c20cce9f2c 100644 --- a/configure.in +++ b/configure.in @@ -106,6 +106,7 @@ src/tbutc/Makefile \ src/tests/Makefile \ src/tests/run-tests \ src/tests/OpenAFS/Dirpath.pm \ +src/tsalvaged/Makefile \ src/tsm41/Makefile \ src/tviced/Makefile \ src/tvolser/Makefile \ diff --git a/src/auth/Makefile.in b/src/auth/Makefile.in index 33797066b1..975775badb 100644 --- a/src/auth/Makefile.in +++ b/src/auth/Makefile.in @@ -96,7 +96,7 @@ test: cd test; $(MAKE) clean: - $(RM) -f *.o *.a copyauth setkey auth.h cellconfig.h acfg_errors.c ktc_errors.c core\ + $(RM) -f *.o *.a copyauth setkey auth.h cellconfig.h acfg_errors.c ktc_errors.c core \ AFS_component_version_number.c include ../config/Makefile.version diff --git a/src/bozo/bos.c b/src/bozo/bos.c index ad5a00f4f8..cca66c03a6 100644 --- a/src/bozo/bos.c +++ b/src/bozo/bos.c @@ -52,10 +52,12 @@ static DoStat(); #include "bosint.h" -#define MRAFS_OFFSET 9 -#define 
ADDPARMOFFSET 26 +/* command offsets for bos salvage command */ +#define MRAFS_OFFSET 10 +#define ADDPARMOFFSET 27 -static struct SalvageParms { +/* MR-AFS salvage parameters */ +struct MRAFSSalvageParms { afs_int32 Optdebug; afs_int32 Optnowrite; afs_int32 Optforce; @@ -74,7 +76,7 @@ static struct SalvageParms { afs_int32 OptLogLevel; afs_int32 OptRxDebug; afs_uint32 OptResidencies; -} mrafsParm; +}; /* dummy routine for the audit work. It should do nothing since audits */ /* occur at the server level and bos is not a server. */ @@ -1224,17 +1226,11 @@ StopServer(as) #define PARMBUFFERSSIZE 32 -static -DoSalvage(aconn, aparm1, aparm2, aoutName, showlog, parallel, atmpDir, - orphans) - struct rx_connection *aconn; - char *aoutName; - char *aparm1; - char *aparm2; - afs_int32 showlog; - char *parallel; - char *atmpDir; - char *orphans; +static afs_int32 +DoSalvage(struct rx_connection * aconn, char * aparm1, char * aparm2, + char * aoutName, afs_int32 showlog, char * parallel, + char * atmpDir, char * orphans, int dafs, + struct MRAFSSalvageParms * mrafsParm) { register afs_int32 code; char *parms[6]; @@ -1285,19 +1281,43 @@ DoSalvage(aconn, aparm1, aparm2, aoutName, showlog, parallel, atmpDir, parms[code] = ""; if (!aparm2) aparm2 = ""; + /* MUST pass canonical (wire-format) salvager path to bosserver */ - strncpy(tbuffer, AFSDIR_CANONICAL_SERVER_SALVAGER_FILEPATH, BOZO_BSSIZE); if (*aparm2 != 0) { - if ((strlen(tbuffer) + 1 + strlen(partName) + 1 + strlen(aparm2) + - 1) > BOZO_BSSIZE) { - printf("bos: command line too big\n"); - return (E2BIG); + /* single volume salvage */ + if (dafs) { + /* for DAFS, we call the salvagserver binary with special options. 
+ * in this mode, it simply uses SALVSYNC to tell the currently + * running salvageserver to offline and salvage the volume in question */ + strncpy(tbuffer, AFSDIR_CANONICAL_SERVER_SALSRV_FILEPATH, BOZO_BSSIZE); + + if ((strlen(tbuffer) + 9 + strlen(partName) + 1 + strlen(aparm2) + + 1) > BOZO_BSSIZE) { + printf("bos: command line too big\n"); + return (E2BIG); + } + + strcat(tbuffer, " -client "); + strcat(tbuffer, partName); + strcat(tbuffer, " "); + strcat(tbuffer, aparm2); + } else { + strncpy(tbuffer, AFSDIR_CANONICAL_SERVER_SALVAGER_FILEPATH, BOZO_BSSIZE); + + if ((strlen(tbuffer) + 1 + strlen(partName) + 1 + strlen(aparm2) + + 1) > BOZO_BSSIZE) { + printf("bos: command line too big\n"); + return (E2BIG); + } + + strcat(tbuffer, " "); + strcat(tbuffer, partName); + strcat(tbuffer, " "); + strcat(tbuffer, aparm2); } - strcat(tbuffer, " "); - strcat(tbuffer, partName); - strcat(tbuffer, " "); - strcat(tbuffer, aparm2); } else { + /* partition salvage */ + strncpy(tbuffer, AFSDIR_CANONICAL_SERVER_SALVAGER_FILEPATH, BOZO_BSSIZE); if ((strlen(tbuffer) + 4 + strlen(partName) + 1) > BOZO_BSSIZE) { printf("bos: command line too big\n"); return (E2BIG); @@ -1306,75 +1326,82 @@ DoSalvage(aconn, aparm1, aparm2, aoutName, showlog, parallel, atmpDir, strcat(tbuffer, partName); } - /* add the parallel option if given */ - if (parallel != NULL) { - if ((strlen(tbuffer) + 11 + strlen(parallel) + 1) > BOZO_BSSIZE) { - printf("bos: command line too big\n"); - return (E2BIG); + /* For DAFS, specifying a single volume does not result in a standard + * salvager call. Instead, it simply results in a SALVSYNC call to the + * online salvager daemon. This interface does not give us the same rich + * set of call flags. 
Thus, we skip these steps for DAFS single-volume + * calls */ + if (!dafs || (*aparm2 == 0)) { + /* add the parallel option if given */ + if (parallel != NULL) { + if ((strlen(tbuffer) + 11 + strlen(parallel) + 1) > BOZO_BSSIZE) { + printf("bos: command line too big\n"); + return (E2BIG); + } + strcat(tbuffer, " -parallel "); + strcat(tbuffer, parallel); } - strcat(tbuffer, " -parallel "); - strcat(tbuffer, parallel); - } - /* add the tmpdir option if given */ - if (atmpDir != NULL) { - if ((strlen(tbuffer) + 9 + strlen(atmpDir) + 1) > BOZO_BSSIZE) { - printf("bos: command line too big\n"); - return (E2BIG); + /* add the tmpdir option if given */ + if (atmpDir != NULL) { + if ((strlen(tbuffer) + 9 + strlen(atmpDir) + 1) > BOZO_BSSIZE) { + printf("bos: command line too big\n"); + return (E2BIG); + } + strcat(tbuffer, " -tmpdir "); + strcat(tbuffer, atmpDir); } - strcat(tbuffer, " -tmpdir "); - strcat(tbuffer, atmpDir); - } - /* add the orphans option if given */ - if (orphans != NULL) { - if ((strlen(tbuffer) + 10 + strlen(orphans) + 1) > BOZO_BSSIZE) { - printf("bos: command line too big\n"); - return (E2BIG); + /* add the orphans option if given */ + if (orphans != NULL) { + if ((strlen(tbuffer) + 10 + strlen(orphans) + 1) > BOZO_BSSIZE) { + printf("bos: command line too big\n"); + return (E2BIG); + } + strcat(tbuffer, " -orphans "); + strcat(tbuffer, orphans); } - strcat(tbuffer, " -orphans "); - strcat(tbuffer, orphans); - } - if (mrafsParm.Optdebug) - strcat(tbuffer, " -debug"); - if (mrafsParm.Optnowrite) - strcat(tbuffer, " -nowrite"); - if (mrafsParm.Optforce) - strcat(tbuffer, " -force"); - if (mrafsParm.Optoktozap) - strcat(tbuffer, " -oktozap"); - if (mrafsParm.Optrootfiles) - strcat(tbuffer, " -rootfiles"); - if (mrafsParm.Optsalvagedirs) - strcat(tbuffer, " -salvagedirs"); - if (mrafsParm.Optblockreads) - strcat(tbuffer, " -blockreads"); - if (mrafsParm.OptListResidencies) - strcat(tbuffer, " -ListResidencies"); - if (mrafsParm.OptSalvageRemote) - 
strcat(tbuffer, " -SalvageRemote"); - if (mrafsParm.OptSalvageArchival) - strcat(tbuffer, " -SalvageArchival"); - if (mrafsParm.OptIgnoreCheck) - strcat(tbuffer, " -IgnoreCheck"); - if (mrafsParm.OptForceOnLine) - strcat(tbuffer, " -ForceOnLine"); - if (mrafsParm.OptUseRootDirACL) - strcat(tbuffer, " -UseRootDirACL"); - if (mrafsParm.OptTraceBadLinkCounts) - strcat(tbuffer, " -TraceBadLinkCounts"); - if (mrafsParm.OptDontAskFS) - strcat(tbuffer, " -DontAskFS"); - if (mrafsParm.OptLogLevel) { - sprintf(pbuffer, " -LogLevel %ld", mrafsParm.OptLogLevel); - strcat(tbuffer, pbuffer); - } - if (mrafsParm.OptRxDebug) - strcat(tbuffer, " -rxdebug"); - if (mrafsParm.OptResidencies) { - sprintf(pbuffer, " -Residencies %lu", mrafsParm.OptResidencies); - strcat(tbuffer, pbuffer); + if (mrafsParm->Optdebug) + strcat(tbuffer, " -debug"); + if (mrafsParm->Optnowrite) + strcat(tbuffer, " -nowrite"); + if (mrafsParm->Optforce) + strcat(tbuffer, " -force"); + if (mrafsParm->Optoktozap) + strcat(tbuffer, " -oktozap"); + if (mrafsParm->Optrootfiles) + strcat(tbuffer, " -rootfiles"); + if (mrafsParm->Optsalvagedirs) + strcat(tbuffer, " -salvagedirs"); + if (mrafsParm->Optblockreads) + strcat(tbuffer, " -blockreads"); + if (mrafsParm->OptListResidencies) + strcat(tbuffer, " -ListResidencies"); + if (mrafsParm->OptSalvageRemote) + strcat(tbuffer, " -SalvageRemote"); + if (mrafsParm->OptSalvageArchival) + strcat(tbuffer, " -SalvageArchival"); + if (mrafsParm->OptIgnoreCheck) + strcat(tbuffer, " -IgnoreCheck"); + if (mrafsParm->OptForceOnLine) + strcat(tbuffer, " -ForceOnLine"); + if (mrafsParm->OptUseRootDirACL) + strcat(tbuffer, " -UseRootDirACL"); + if (mrafsParm->OptTraceBadLinkCounts) + strcat(tbuffer, " -TraceBadLinkCounts"); + if (mrafsParm->OptDontAskFS) + strcat(tbuffer, " -DontAskFS"); + if (mrafsParm->OptLogLevel) { + sprintf(pbuffer, " -LogLevel %ld", mrafsParm->OptLogLevel); + strcat(tbuffer, pbuffer); + } + if (mrafsParm->OptRxDebug) + strcat(tbuffer, " -rxdebug"); + if 
(mrafsParm->OptResidencies) { + sprintf(pbuffer, " -Residencies %lu", mrafsParm->OptResidencies); + strcat(tbuffer, pbuffer); + } } parms[0] = tbuffer; @@ -1481,22 +1508,36 @@ SalvageCmd(as) char tname[BOZO_BSSIZE]; afs_int32 newID; extern struct ubik_client *cstruct; - afs_int32 curGoal, showlog = 0, mrafs = 0; + afs_int32 curGoal, showlog = 0, dafs = 0, mrafs = 0; char *parallel; char *tmpDir; char *orphans; char *tp; + char * serviceName; + struct MRAFSSalvageParms mrafsParm; memset(&mrafsParm, 0, sizeof(mrafsParm)); /* parm 0 is machine name, 1 is partition, 2 is volume, 3 is -all flag */ tconn = GetConn(as, 0); - /* Find out whether fileserver is running MR-AFS (has a scanner instance) */ - /* XXX this should really be done some other way, potentially by RPC */ tp = &tname[0]; - if (code = BOZO_GetInstanceParm(tconn, "fs", 3, &tp) == 0) - mrafs = 1; + + /* find out whether fileserver is running demand attach fs */ + if (code = BOZO_GetInstanceParm(tconn, "dafs", 0, &tp) == 0) { + dafs = 1; + serviceName = "dafs"; + /* Find out whether fileserver is running MR-AFS (has a scanner instance) */ + /* XXX this should really be done some other way, potentially by RPC */ + if (code = BOZO_GetInstanceParm(tconn, serviceName, 4, &tp) == 0) + mrafs = 1; + } else { + serviceName = "fs"; + /* Find out whether fileserver is running MR-AFS (has a scanner instance) */ + /* XXX this should really be done some other way, potentially by RPC */ + if (code = BOZO_GetInstanceParm(tconn, serviceName, 3, &tp) == 0) + mrafs = 1; + } /* we can do a volume, a partition or the whole thing, but not mixtures * thereof */ @@ -1542,6 +1583,14 @@ SalvageCmd(as) orphans = as->parms[8].items->data; } + if (dafs) { + if (!as->parms[9].items) { /* -forceDAFS flag */ + printf("This is a demand attach fileserver. 
Are you sure you want to proceed with a manual salvage?\n"); + printf("must specify -forceDAFS flag in order to proceed.\n"); + return EINVAL; + } + } + if (mrafs) { if (as->parms[MRAFS_OFFSET].items) mrafsParm.Optdebug = 1; @@ -1597,7 +1646,7 @@ SalvageCmd(as) } else { int stop = 0; - for (i = 9; i < ADDPARMOFFSET; i++) { + for (i = MRAFS_OFFSET; i < ADDPARMOFFSET; i++) { if (as->parms[i].items) { printf(" %s only possible for MR-AFS fileserver.\n", as->parms[i].name); @@ -1610,12 +1659,12 @@ SalvageCmd(as) if (as->parms[4].items) { /* salvage whole enchilada */ - curGoal = GetServerGoal(tconn, "fs"); + curGoal = GetServerGoal(tconn, serviceName); if (curGoal == BSTAT_NORMAL) { - printf("bos: shutting down fs.\n"); - code = BOZO_SetTStatus(tconn, "fs", BSTAT_SHUTDOWN); + printf("bos: shutting down '%s'.\n", serviceName); + code = BOZO_SetTStatus(tconn, serviceName, BSTAT_SHUTDOWN); if (code) { - printf("bos: failed to stop 'fs' (%s)\n", em(code)); + printf("bos: failed to stop '%s' (%s)\n", serviceName, em(code)); return code; } code = BOZO_WaitAll(tconn); /* wait for shutdown to complete */ @@ -1626,12 +1675,12 @@ SalvageCmd(as) /* now do the salvage operation */ printf("Starting salvage.\n"); rc = DoSalvage(tconn, NULL, NULL, outName, showlog, parallel, tmpDir, - orphans); + orphans, dafs, &mrafsParm); if (curGoal == BSTAT_NORMAL) { - printf("bos: restarting fs.\n"); - code = BOZO_SetTStatus(tconn, "fs", BSTAT_NORMAL); + printf("bos: restarting %s.\n", serviceName); + code = BOZO_SetTStatus(tconn, serviceName, BSTAT_NORMAL); if (code) { - printf("bos: failed to restart 'fs' (%s)\n", em(code)); + printf("bos: failed to restart '%s' (%s)\n", serviceName, em(code)); return code; } } @@ -1651,13 +1700,13 @@ SalvageCmd(as) as->parms[1].items->data); return -1; } - curGoal = GetServerGoal(tconn, "fs"); + curGoal = GetServerGoal(tconn, serviceName); /* salvage a whole partition (specified by parms[1]) */ if (curGoal == BSTAT_NORMAL) { - printf("bos: shutting down 
fs.\n"); - code = BOZO_SetTStatus(tconn, "fs", BSTAT_SHUTDOWN); + printf("bos: shutting down '%s'.\n", serviceName); + code = BOZO_SetTStatus(tconn, serviceName, BSTAT_SHUTDOWN); if (code) { - printf("bos: can't stop 'fs' (%s)\n", em(code)); + printf("bos: can't stop '%s' (%s)\n", serviceName, em(code)); return code; } code = BOZO_WaitAll(tconn); /* wait for shutdown to complete */ @@ -1668,12 +1717,12 @@ SalvageCmd(as) /* now do the salvage operation */ printf("Starting salvage.\n"); rc = DoSalvage(tconn, as->parms[1].items->data, NULL, outName, - showlog, parallel, tmpDir, orphans); + showlog, parallel, tmpDir, orphans, dafs, &mrafsParm); if (curGoal == BSTAT_NORMAL) { - printf("bos: restarting fs.\n"); - code = BOZO_SetTStatus(tconn, "fs", BSTAT_NORMAL); + printf("bos: restarting '%s'.\n", serviceName); + code = BOZO_SetTStatus(tconn, serviceName, BSTAT_NORMAL); if (code) { - printf("bos: failed to restart 'fs' (%s)\n", em(code)); + printf("bos: failed to restart '%s' (%s)\n", serviceName, em(code)); return code; } } @@ -1723,7 +1772,7 @@ SalvageCmd(as) } printf("Starting salvage.\n"); rc = DoSalvage(tconn, as->parms[1].items->data, tname, outName, - showlog, parallel, tmpDir, orphans); + showlog, parallel, tmpDir, orphans, dafs, &mrafsParm); if (rc) return rc; } @@ -2153,6 +2202,8 @@ main(argc, argv) "directory to place tmp files"); cmd_AddParm(ts, "-orphans", CMD_SINGLE, CMD_OPTIONAL, "ignore | remove | attach"); + cmd_AddParm(ts, "-forceDAFS", CMD_FLAG, CMD_OPTIONAL, + "(DAFS) force salvage of demand attach fileserver"); cmd_AddParm(ts, "-debug", CMD_FLAG, CMD_OPTIONAL, "(MR-AFS) Run in Debugging mode"); cmd_AddParm(ts, "-nowrite", CMD_FLAG, CMD_OPTIONAL, diff --git a/src/bozo/bosserver.c b/src/bozo/bosserver.c index 635a6810e4..2351eeb066 100644 --- a/src/bozo/bosserver.c +++ b/src/bozo/bosserver.c @@ -51,7 +51,7 @@ RCSID #define BOZO_LWP_STACKSIZE 16000 extern int BOZO_ExecuteRequest(); extern int RXSTATS_ExecuteRequest(); -extern struct bnode_ops 
fsbnode_ops, ezbnode_ops, cronbnode_ops; +extern struct bnode_ops fsbnode_ops, dafsbnode_ops, ezbnode_ops, cronbnode_ops; void bozo_Log(); @@ -895,6 +895,7 @@ main(int argc, char **argv, char **envp) } bnode_Register("fs", &fsbnode_ops, 3); + bnode_Register("dafs", &dafsbnode_ops, 4); bnode_Register("simple", &ezbnode_ops, 1); bnode_Register("cron", &cronbnode_ops, 2); diff --git a/src/bozo/fsbnodeops.c b/src/bozo/fsbnodeops.c index 2ac65e4621..e38670e80e 100644 --- a/src/bozo/fsbnodeops.c +++ b/src/bozo/fsbnodeops.c @@ -41,13 +41,6 @@ RCSID #include #include "bnode.h" -static int fs_timeout(), fs_getstat(), fs_setstat(), fs_delete(); -static int fs_procexit(), fs_getstring(), fs_getparm(), fs_restartp(); -static int fs_hascore(); -struct bnode *fs_create(); - -static SetNeedsClock(); -static NudgeProcs(); static int emergency = 0; @@ -76,6 +69,77 @@ static int emergency = 0; The needsSalvage flag is cleared when the salvager exits. */ +struct fsbnode { + struct bnode b; + afs_int32 timeSDStarted; /* time shutdown operation started */ + char *filecmd; /* command to start primary file server */ + char *volcmd; /* command to start secondary vol server */ + char *salsrvcmd; /* command to start salvageserver (demand attach fs) */ + char *salcmd; /* command to start salvager */ + char *scancmd; /* command to start scanner (MR-AFS) */ + struct bnode_proc *fileProc; /* process for file server */ + struct bnode_proc *volProc; /* process for vol server */ + struct bnode_proc *salsrvProc; /* process for salvageserver (demand attach fs) */ + struct bnode_proc *salProc; /* process for salvager */ + struct bnode_proc *scanProc; /* process for scanner (MR-AFS) */ + afs_int32 lastFileStart; /* last start for file */ + afs_int32 lastVolStart; /* last start for vol */ + afs_int32 lastSalsrvStart; /* last start for salvageserver (demand attach fs) */ + afs_int32 lastScanStart; /* last start for scanner (MR-AFS) */ + char fileRunning; /* file process is running */ + char volRunning; 
/* volser is running */ + char salsrvRunning; /* salvageserver is running (demand attach fs) */ + char salRunning; /* salvager is running */ + char scanRunning; /* scanner is running (MR_AFS) */ + char fileSDW; /* file shutdown wait */ + char volSDW; /* vol shutdown wait */ + char salsrvSDW; /* salvageserver shutdown wait (demand attach fs) */ + char salSDW; /* waiting for the salvager to shutdown */ + char scanSDW; /* scanner shutdown wait (MR_AFS) */ + char fileKillSent; /* kill signal has been sent */ + char volKillSent; + char salsrvKillSent; /* kill signal has been sent (demand attach fs) */ + char salKillSent; + char scanKillSent; /* kill signal has been sent (MR_AFS) */ + char needsSalvage; /* salvage before running */ + char needsClock; /* do we need clock ticks */ +}; + + + +struct bnode * fs_create(char *ainstance, char *afilecmd, char *avolcmd, + char *asalcmd, char *ascancmd); +struct bnode * dafs_create(char *ainstance, char *afilecmd, char *avolcmd, + char * asalsrvcmd, char *asalcmd, char *ascancmd); + +static int fs_hascore(register struct ezbnode *abnode); +static int fs_restartp(register struct fsbnode *abnode); +static int SetSalFlag(register struct fsbnode *abnode, register int aflag); +static int RestoreSalFlag(register struct fsbnode *abnode); +static int fs_delete(struct fsbnode *abnode); +static int fs_timeout(struct fsbnode *abnode); +static int fs_getstat(struct fsbnode *abnode, afs_int32 * astatus); +static int fs_setstat(register struct fsbnode *abnode, afs_int32 astatus); +static int fs_procexit(struct fsbnode *abnode, struct bnode_proc *aproc); +static int fs_getstring(struct fsbnode *abnode, char *abuffer, afs_int32 alen); + + +static int fs_getparm(struct fsbnode *abnode, afs_int32 aindex, + char *abuffer, afs_int32 alen); +static int dafs_getparm(struct fsbnode *abnode, afs_int32 aindex, + char *abuffer, afs_int32 alen); + +#ifdef AFS_NT40_ENV +static void AppendExecutableExtension(char *cmd); +#else +#define
AppendExecutableExtension(x) +#endif + +static void SetNeedsClock(register struct fsbnode *ab); +static int NudgeProcs(register struct fsbnode *abnode); + + + struct bnode_ops fsbnode_ops = { fs_create, fs_timeout, @@ -89,36 +153,21 @@ struct bnode_ops fsbnode_ops = { fs_hascore, }; -struct fsbnode { - struct bnode b; - afs_int32 timeSDStarted; /* time shutdown operation started */ - char *filecmd; /* command to start primary file server */ - char *volcmd; /* command to start secondary vol server */ - char *salcmd; /* command to start salvager */ - char *scancmd; /* command to start scanner (MR-AFS) */ - struct bnode_proc *fileProc; /* process for file server */ - struct bnode_proc *volProc; /* process for vol server */ - struct bnode_proc *salProc; /* process for salvager */ - struct bnode_proc *scanProc; /* process for scanner (MR-AFS) */ - afs_int32 lastFileStart; /* last start for file */ - afs_int32 lastVolStart; /* last start for vol */ - afs_int32 lastScanStart; /* last start for scanner (MR-AFS) */ - char fileRunning; /* file process is running */ - char volRunning; /* volser is running */ - char salRunning; /* salvager is running */ - char scanRunning; /* scanner is running (MR_AFS) */ - char fileSDW; /* file shutdown wait */ - char volSDW; /* vol shutdown wait */ - char salSDW; /* waiting for the salvager to shutdown */ - char scanSDW; /* scanner shutdown wait (MR_AFS) */ - char fileKillSent; /* kill signal has been sent */ - char volKillSent; - char salKillSent; - char scanKillSent; /* kill signal has been sent (MR_AFS) */ - char needsSalvage; /* salvage before running */ - char needsClock; /* do we need clock ticks */ +/* demand attach fs bnode ops */ +struct bnode_ops dafsbnode_ops = { + dafs_create, + fs_timeout, + fs_getstat, + fs_setstat, + fs_delete, + fs_procexit, + fs_getstring, + dafs_getparm, + fs_restartp, + fs_hascore, }; + /* Function to tell whether this bnode has a core file or not. 
You might * think that this could be in bnode.c, and decide what core files to check * for based on the bnode's coreName property, but that doesn't work because @@ -140,6 +189,11 @@ fs_hascore(register struct ezbnode *abnode) if (access(tbuffer, 0) == 0) return 1; + /* see if salvageserver left a core file */ + bnode_CoreName(abnode, "salsrv", tbuffer); + if (access(tbuffer, 0) == 0) + return 1; + /* see if salvager left a core file */ bnode_CoreName(abnode, "salv", tbuffer); if (access(tbuffer, 0) == 0) @@ -198,6 +252,25 @@ fs_restartp(register struct fsbnode *abnode) if (code) return code; + if (abnode->salsrvcmd) { /* only in demand attach fs */ + /* now do same for salsrvcmd (demand attach fs) */ + code = bnode_ParseLine(abnode->salsrvcmd, &tt); + if (code) + return 0; + if (!tt) + return 0; + code = stat(tt->key, &tstat); + if (code) { + bnode_FreeTokens(tt); + return 0; + } + if (tstat.st_ctime > abnode->lastScanStart) + code = 1; + else + code = 0; + bnode_FreeTokens(tt); + } + if (abnode->scancmd) { /* Only in MR-AFS */ /* now do same for scancmd (MR-AFS) */ code = bnode_ParseLine(abnode->scancmd, &tt); @@ -228,14 +301,17 @@ SetSalFlag(register struct fsbnode *abnode, register int aflag) char tbuffer[AFSDIR_PATH_MAX]; int fd; - abnode->needsSalvage = aflag; - strcompose(tbuffer, AFSDIR_PATH_MAX, AFSDIR_SERVER_LOCAL_DIRPATH, "/", - SALFILE, abnode->b.name, NULL); - if (aflag) { - fd = open(tbuffer, O_CREAT | O_TRUNC | O_RDWR, 0666); - close(fd); - } else { - unlink(tbuffer); + /* don't use the salvage flag for demand attach fs */ + if (abnode->salsrvcmd == NULL) { + abnode->needsSalvage = aflag; + strcompose(tbuffer, AFSDIR_PATH_MAX, AFSDIR_SERVER_LOCAL_DIRPATH, "/", + SALFILE, abnode->b.name, NULL); + if (aflag) { + fd = open(tbuffer, O_CREAT | O_TRUNC | O_RDWR, 0666); + close(fd); + } else { + unlink(tbuffer); + } } return 0; } @@ -246,13 +322,18 @@ RestoreSalFlag(register struct fsbnode *abnode) { char tbuffer[AFSDIR_PATH_MAX]; - strcompose(tbuffer, 
AFSDIR_PATH_MAX, AFSDIR_SERVER_LOCAL_DIRPATH, "/", - SALFILE, abnode->b.name, NULL); - if (access(tbuffer, 0) == 0) { - /* file exists, so need to salvage */ - abnode->needsSalvage = 1; - } else { + /* never set needs salvage flag for demand attach fs */ + if (abnode->salsrvcmd != NULL) { abnode->needsSalvage = 0; + } else { + strcompose(tbuffer, AFSDIR_PATH_MAX, AFSDIR_SERVER_LOCAL_DIRPATH, "/", + SALFILE, abnode->b.name, NULL); + if (access(tbuffer, 0) == 0) { + /* file exists, so need to salvage */ + abnode->needsSalvage = 1; + } else { + abnode->needsSalvage = 0; + } } return 0; } @@ -272,6 +353,8 @@ fs_delete(struct fsbnode *abnode) free(abnode->filecmd); free(abnode->volcmd); free(abnode->salcmd); + if (abnode->salsrvcmd) + free(abnode->salsrvcmd); if (abnode->scancmd) free(abnode->scancmd); free(abnode); @@ -304,95 +387,235 @@ fs_create(char *ainstance, char *afilecmd, char *avolcmd, char *asalcmd, char *fileCmdpath, *volCmdpath, *salCmdpath, *scanCmdpath; int bailout = 0; - fileCmdpath = volCmdpath = salCmdpath = NULL; + fileCmdpath = volCmdpath = salCmdpath = scanCmdpath = NULL; te = NULL; /* te is a struct fsbnode *, so it is cleared separately from the char * chain */ /* construct local paths from canonical (wire-format) paths */ if (ConstructLocalBinPath(afilecmd, &fileCmdpath)) { bozo_Log("BNODE: command path invalid '%s'\n", afilecmd); bailout = 1; + goto done; } if (ConstructLocalBinPath(avolcmd, &volCmdpath)) { bozo_Log("BNODE: command path invalid '%s'\n", avolcmd); bailout = 1; + goto done; } if (ConstructLocalBinPath(asalcmd, &salCmdpath)) { bozo_Log("BNODE: command path invalid '%s'\n", asalcmd); bailout = 1; + goto done; } if (ascancmd && strlen(ascancmd)) { if (ConstructLocalBinPath(ascancmd, &scanCmdpath)) { bozo_Log("BNODE: command path invalid '%s'\n", ascancmd); bailout = 1; + goto done; } } if (!bailout) { sscanf(fileCmdpath, "%s", cmdname); -#ifdef AFS_NT40_ENV AppendExecutableExtension(cmdname); -#endif if (stat(cmdname, &tstat)) { bozo_Log("BNODE: file server binary '%s' not found\n", cmdname); bailout = 1; + goto done; }
sscanf(volCmdpath, "%s", cmdname); -#ifdef AFS_NT40_ENV AppendExecutableExtension(cmdname); -#endif if (stat(cmdname, &tstat)) { bozo_Log("BNODE: volume server binary '%s' not found\n", cmdname); bailout = 1; + goto done; } sscanf(salCmdpath, "%s", cmdname); -#ifdef AFS_NT40_ENV AppendExecutableExtension(cmdname); -#endif if (stat(cmdname, &tstat)) { bozo_Log("BNODE: salvager binary '%s' not found\n", cmdname); bailout = 1; + goto done; } if (ascancmd && strlen(ascancmd)) { sscanf(scanCmdpath, "%s", cmdname); -#ifdef AFS_NT40_ENV AppendExecutableExtension(cmdname); -#endif if (stat(cmdname, &tstat)) { bozo_Log("BNODE: scanner binary '%s' not found\n", cmdname); bailout = 1; + goto done; } } } - if (bailout) { - free(fileCmdpath); - free(volCmdpath); - free(salCmdpath); - return NULL; - } - te = (struct fsbnode *)malloc(sizeof(struct fsbnode)); + if (te == NULL) { + bailout = 1; + goto done; + } memset(te, 0, sizeof(struct fsbnode)); te->filecmd = fileCmdpath; te->volcmd = volCmdpath; + te->salsrvcmd = NULL; te->salcmd = salCmdpath; if (ascancmd && strlen(ascancmd)) te->scancmd = scanCmdpath; else te->scancmd = NULL; if (bnode_InitBnode(te, &fsbnode_ops, ainstance) != 0) { - free(te); - free(fileCmdpath); - free(volCmdpath); - free(salCmdpath); - return NULL; + bailout = 1; + goto done; } bnode_SetTimeout(te, POLLTIME); /* ask for timeout activations every 10 seconds */ RestoreSalFlag(te); /* restore needsSalvage flag based on file's existence */ SetNeedsClock(te); /* compute needsClock field */ + + done: + if (bailout) { + if (te) + free(te); + if (fileCmdpath) + free(fileCmdpath); + if (volCmdpath) + free(volCmdpath); + if (salCmdpath) + free(salCmdpath); + if (scanCmdpath) + free(scanCmdpath); + return NULL; + } + + return (struct bnode *)te; +} + +/* create a demand attach fs bnode */ +struct bnode * +dafs_create(char *ainstance, char *afilecmd, char *avolcmd, + char * asalsrvcmd, char *asalcmd, char *ascancmd) +{ + struct stat tstat; + register struct fsbnode 
*te; + char cmdname[AFSDIR_PATH_MAX]; + char *fileCmdpath, *volCmdpath, *salsrvCmdpath, *salCmdpath, *scanCmdpath; + int bailout = 0; + + te = fileCmdpath = volCmdpath = salsrvCmdpath = salCmdpath = scanCmdpath = NULL; + + /* construct local paths from canonical (wire-format) paths */ + if (ConstructLocalBinPath(afilecmd, &fileCmdpath)) { + bozo_Log("BNODE: command path invalid '%s'\n", afilecmd); + bailout = 1; + goto done; + } + if (ConstructLocalBinPath(avolcmd, &volCmdpath)) { + bozo_Log("BNODE: command path invalid '%s'\n", avolcmd); + bailout = 1; + goto done; + } + if (ConstructLocalBinPath(asalsrvcmd, &salsrvCmdpath)) { + bozo_Log("BNODE: command path invalid '%s'\n", asalsrvcmd); + bailout = 1; + goto done; + } + if (ConstructLocalBinPath(asalcmd, &salCmdpath)) { + bozo_Log("BNODE: command path invalid '%s'\n", asalcmd); + bailout = 1; + goto done; + } + + if (ascancmd && strlen(ascancmd)) { + if (ConstructLocalBinPath(ascancmd, &scanCmdpath)) { + bozo_Log("BNODE: command path invalid '%s'\n", ascancmd); + bailout = 1; + goto done; + } + } + + if (!bailout) { + sscanf(fileCmdpath, "%s", cmdname); + AppendExecutableExtension(cmdname); + if (stat(cmdname, &tstat)) { + bozo_Log("BNODE: file server binary '%s' not found\n", cmdname); + bailout = 1; + goto done; + } + + sscanf(volCmdpath, "%s", cmdname); + AppendExecutableExtension(cmdname); + if (stat(cmdname, &tstat)) { + bozo_Log("BNODE: volume server binary '%s' not found\n", cmdname); + bailout = 1; + goto done; + } + + sscanf(salsrvCmdpath, "%s", cmdname); + AppendExecutableExtension(cmdname); + if (stat(cmdname, &tstat)) { + bozo_Log("BNODE: salvageserver binary '%s' not found\n", cmdname); + bailout = 1; + goto done; + } + + sscanf(salCmdpath, "%s", cmdname); + AppendExecutableExtension(cmdname); + if (stat(cmdname, &tstat)) { + bozo_Log("BNODE: salvager binary '%s' not found\n", cmdname); + bailout = 1; + goto done; + } + + if (ascancmd && strlen(ascancmd)) { + sscanf(scanCmdpath, "%s", cmdname); + 
AppendExecutableExtension(cmdname); + if (stat(cmdname, &tstat)) { + bozo_Log("BNODE: scanner binary '%s' not found\n", cmdname); + bailout = 1; + goto done; + } + } + } + + te = (struct fsbnode *)malloc(sizeof(struct fsbnode)); + if (te == NULL) { + bailout = 1; + goto done; + } + memset(te, 0, sizeof(struct fsbnode)); + te->filecmd = fileCmdpath; + te->volcmd = volCmdpath; + te->salsrvcmd = salsrvCmdpath; + te->salcmd = salCmdpath; + if (ascancmd && strlen(ascancmd)) + te->scancmd = scanCmdpath; + else + te->scancmd = NULL; + if (bnode_InitBnode(te, &dafsbnode_ops, ainstance) != 0) { + bailout = 1; + goto done; + } + bnode_SetTimeout(te, POLLTIME); /* ask for timeout activations every 10 seconds */ + RestoreSalFlag(te); /* restore needsSalvage flag based on file's existence */ + SetNeedsClock(te); /* compute needsClock field */ + + done: + if (bailout) { + if (te) + free(te); + if (fileCmdpath) + free(fileCmdpath); + if (volCmdpath) + free(volCmdpath); + if (salsrvCmdpath) + free(salsrvCmdpath); + if (salCmdpath) + free(salCmdpath); + if (scanCmdpath) + free(scanCmdpath); + return NULL; + } + return (struct bnode *)te; } @@ -431,6 +654,15 @@ fs_timeout(struct fsbnode *abnode) FSSDTIME); } } + if (abnode->salsrvSDW) { + if (!abnode->salsrvKillSent && now - abnode->timeSDStarted > SDTIME) { + bnode_StopProc(abnode->salsrvProc, SIGKILL); + abnode->salsrvKillSent = 1; + bozo_Log + ("bos shutdown: salvageserver failed to shutdown within %d seconds\n", + SDTIME); + } + } if (abnode->scanSDW) { if (!abnode->scanKillSent && now - abnode->timeSDStarted > SDTIME) { bnode_StopProc(abnode->scanProc, SIGKILL); @@ -449,15 +681,17 @@ fs_getstat(struct fsbnode *abnode, afs_int32 * astatus) { register afs_int32 temp; if (abnode->volSDW || abnode->fileSDW || abnode->salSDW - || abnode->scanSDW) + || abnode->scanSDW || abnode->salsrvSDW) temp = BSTAT_SHUTTINGDOWN; else if (abnode->salRunning) temp = BSTAT_NORMAL; else if (abnode->volRunning && abnode->fileRunning - && 
(!abnode->scancmd || abnode->scanRunning)) + && (!abnode->scancmd || abnode->scanRunning) + && (!abnode->salsrvcmd || abnode->salsrvRunning)) temp = BSTAT_NORMAL; else if (!abnode->salRunning && !abnode->volRunning - && !abnode->fileRunning && !abnode->scanRunning) + && !abnode->fileRunning && !abnode->scanRunning + && !abnode->salsrvRunning) temp = BSTAT_SHUTDOWN; else temp = BSTAT_STARTINGUP; @@ -508,6 +742,11 @@ fs_procexit(struct fsbnode *abnode, struct bnode_proc *aproc) abnode->scanRunning = 0; abnode->scanSDW = 0; abnode->scanKillSent = 0; + } else if (aproc == abnode->salsrvProc) { + abnode->salsrvProc = 0; + abnode->salsrvRunning = 0; + abnode->salsrvSDW = 0; + abnode->salsrvKillSent = 0; } /* now restart anyone who needs to restart */ @@ -515,14 +754,15 @@ fs_procexit(struct fsbnode *abnode, struct bnode_proc *aproc) } /* make sure we're periodically checking the state if we need to */ -static int +static void SetNeedsClock(register struct fsbnode *ab) { if (ab->b.goal == 1 && ab->fileRunning && ab->volRunning - && (!ab->scancmd || ab->scanRunning)) + && (!ab->scancmd || ab->scanRunning) + && (!ab->salsrvcmd || ab->salsrvRunning)) ab->needsClock = 0; /* running normally */ else if (ab->b.goal == 0 && !ab->fileRunning && !ab->volRunning - && !ab->salRunning && !ab->scanRunning) + && !ab->salRunning && !ab->scanRunning && !ab->salsrvRunning) ab->needsClock = 0; /* halted normally */ else ab->needsClock = 1; /* other */ @@ -562,6 +802,18 @@ NudgeProcs(register struct fsbnode *abnode) abnode->volRunning = 1; } } + if (abnode->salsrvcmd) { + if (!abnode->salsrvRunning) { + abnode->lastSalsrvStart = FT_ApproxTime(); + code = + bnode_NewProc(abnode, abnode->salsrvcmd, "salsrv", + &tp); + if (code == 0) { + abnode->salsrvProc = tp; + abnode->salsrvRunning = 1; + } + } + } if (abnode->scancmd) { if (!abnode->scanRunning) { abnode->lastScanStart = FT_ApproxTime(); @@ -576,7 +828,8 @@ NudgeProcs(register struct fsbnode *abnode) } } else { /* file is not running */ 
/* see how to start */ - if (!abnode->needsSalvage) { + /* for demand attach fs, needsSalvage flag is ignored */ + if (!abnode->needsSalvage || abnode->salsrvcmd) { /* no crash apparent, just start up normally */ if (!abnode->fileRunning) { abnode->lastFileStart = FT_ApproxTime(); @@ -596,6 +849,16 @@ NudgeProcs(register struct fsbnode *abnode) abnode->volRunning = 1; } } + if (abnode->salsrvcmd && !abnode->salsrvRunning) { + abnode->lastSalsrvStart = FT_ApproxTime(); + code = + bnode_NewProc(abnode, abnode->salsrvcmd, "salsrv", + &tp); + if (code == 0) { + abnode->salsrvProc = tp; + abnode->salsrvRunning = 1; + } + } if (abnode->scancmd && !abnode->scanRunning) { abnode->lastScanStart = FT_ApproxTime(); code = @@ -656,6 +919,11 @@ NudgeProcs(register struct fsbnode *abnode) abnode->volSDW = 1; abnode->timeSDStarted = now; } + if (abnode->salsrvRunning && !abnode->salsrvSDW) { + bnode_StopProc(abnode->salsrvProc, SIGTERM); + abnode->salsrvSDW = 1; + abnode->timeSDStarted = now; + } if (abnode->scanRunning && !abnode->scanSDW) { bnode_StopProc(abnode->scanProc, SIGTERM); abnode->scanSDW = 1; @@ -724,3 +992,22 @@ fs_getparm(struct fsbnode *abnode, afs_int32 aindex, char *abuffer, return BZDOM; return 0; } + +static int +dafs_getparm(struct fsbnode *abnode, afs_int32 aindex, char *abuffer, + afs_int32 alen) +{ + if (aindex == 0) + strcpy(abuffer, abnode->filecmd); + else if (aindex == 1) + strcpy(abuffer, abnode->volcmd); + else if (aindex == 2) + strcpy(abuffer, abnode->salsrvcmd); + else if (aindex == 3) + strcpy(abuffer, abnode->salcmd); + else if (aindex == 4 && abnode->scancmd) + strcpy(abuffer, abnode->scancmd); + else + return BZDOM; + return 0; +} diff --git a/src/cf/osconf.m4 b/src/cf/osconf.m4 index 9fe6161d8b..22daf81e3e 100644 --- a/src/cf/osconf.m4 +++ b/src/cf/osconf.m4 @@ -971,6 +971,18 @@ case $AFS_SYSNAME in ;; esac + + +dnl pthreads fixes +case $AFS_SYSNAME in +dnl we'll go ahead and turn on XOPEN2K and ISO_C99 +dnl if this causes problems, we should 
scale back to _XOPEN_SOURCE=500 + *linux*) + MT_CFLAGS="${MT_CFLAGS} -D_XOPEN_SOURCE=600 -D_BSD_SOURCE" + ;; +esac + + dnl Disable the default for debugging/optimization if not enabled if test "x$enable_debug_kernel" = "xno"; then KERN_DBG= diff --git a/src/config/param.rs_aix51.h b/src/config/param.rs_aix51.h index ecfe978c4e..cd49793bae 100644 --- a/src/config/param.rs_aix51.h +++ b/src/config/param.rs_aix51.h @@ -25,8 +25,6 @@ #ifdef AFS_NAMEI_ENV #define AFS_64BIT_IOPS_ENV 1 #endif -#define BITMAP_LATER 1 -#define FAST_RESTART 1 #define AFS_HAVE_FLOCK_SYSID 1 diff --git a/src/config/param.rs_aix52.h b/src/config/param.rs_aix52.h index 0ee9986ec9..b20bb378dc 100644 --- a/src/config/param.rs_aix52.h +++ b/src/config/param.rs_aix52.h @@ -26,8 +26,6 @@ #ifdef AFS_NAMEI_ENV #define AFS_64BIT_IOPS_ENV 1 #endif -#define BITMAP_LATER 1 -#define FAST_RESTART 1 #define AFS_HAVE_FLOCK_SYSID 1 diff --git a/src/config/param.rs_aix53.h b/src/config/param.rs_aix53.h index ba4f151f3c..ecfb3671a2 100644 --- a/src/config/param.rs_aix53.h +++ b/src/config/param.rs_aix53.h @@ -27,8 +27,6 @@ #ifdef AFS_NAMEI_ENV #define AFS_64BIT_IOPS_ENV 1 #endif -#define BITMAP_LATER 1 -#define FAST_RESTART 1 #define AFS_HAVE_FLOCK_SYSID 1 diff --git a/src/config/stds.h b/src/config/stds.h index 7b256b6735..9266b0c7f6 100644 --- a/src/config/stds.h +++ b/src/config/stds.h @@ -56,8 +56,16 @@ typedef unsigned __int64 afs_uint64; typedef long long afs_int64; typedef unsigned long long afs_uint64; #endif -#define ZeroInt64(a) (a) = 0 +#define ZeroInt64(a) (a = 0) #define AssignInt64(a, b) *(b) = (a) +#define IncInt64(a) (*(a))++ +#define IncUInt64(a) (*(a))++ +#define DecInt64(a) (*(a))-- +#define DecUInt64(a) (*(a))-- +#define GTInt64(a,b) ((a) > (b)) +#define GEInt64(a,b) ((a) >= (b)) +#define LEInt64(a,b) ((a) <= (b)) +#define LTInt64(a,b) ((a) < (b)) #define AddInt64(a,b,c) *(c) = (afs_int64)(a) + (afs_int64)(b) #define AddUInt64(a,b,c) *(c) = (afs_uint64)(a) + (afs_uint64)(b) #define 
SubtractInt64(a,b,c) *(c) = (afs_int64)(a) - (afs_int64)(b) @@ -83,8 +91,16 @@ struct u_Int64 { afs_uint32 low; }; typedef struct u_Int64 afs_uint64; -#define ZeroInt64(a) (a).high = (a).low = 0 +#define ZeroInt64(a) ((a).high = (a).low = 0) #define AssignInt64(a, b) (b)->high = (a).high; (b)->low = (a).low +#define IncInt64(a) ((++((a)->low)) ? 0 : (a)->high++ ) +#define IncUInt64(a) ((++((a)->low)) ? 0 : (a)->high++ ) +#define DecInt64(a) (((a)->low)-- ? 0 : (a)->high-- ) +#define DecUInt64(a) (((a)->low)-- ? 0 : (a)->high-- ) +#define GTInt64(a,b) (((a).high > (b).high) || (((a).high == (b).high) && ((a).low > (b).low))) +#define GEInt64(a,b) (((a).high > (b).high) || (((a).high == (b).high) && ((a).low >= (b).low))) +#define LEInt64(a,b) (((a).high < (b).high) || (((a).high == (b).high) && ((a).low <= (b).low))) +#define LTInt64(a,b) (((a).high < (b).high) || (((a).high == (b).high) && ((a).low < (b).low))) #define CompareInt64(a,b) (((afs_int32)(a).high - (afs_int32)(b).high) || (((a).high == (b).high) && ((a).low - (b).low))) #define AddInt64(a, b, c) { afs_int64 _a, _b; _a = a; _b = b; (c)->low = _a.low + _b.low; (c)->high = _a.high + _b.high + ((c)->low < _b.low); } #define SubtractInt64(a, b, c) { afs_int64 _a, _b; _a = a; _b = b; (c)->low = _a.low - _b.low; (c)->high = _a.high - _b.high - (_a.low < _b.low); } @@ -246,4 +262,9 @@ struct afsUUID { }; typedef struct afsUUID afsUUID; +/* for now, demand attach fileserver is only supported on unix pthreads builds */ +#if defined(DEMAND_ATTACH_ENABLE) && defined(AFS_PTHREAD_ENV) && !defined(AFS_NT40_ENV) +#define AFS_DEMAND_ATTACH_FS 1 +#endif + #endif /* OPENAFS_CONFIG_AFS_STDS_H */ diff --git a/src/rx/rx_queue.h b/src/rx/rx_queue.h index fcd813c407..1e930a6765 100644 --- a/src/rx/rx_queue.h +++ b/src/rx/rx_queue.h @@ -78,6 +78,13 @@ for (n=0, queue_Scan(&myqueue, qe, nqe, myelement), n++) {} #define _RXQSP(q1,q2,i,a,b,c,d,x,y) if (!queue_IsEnd(q1,i->c)) \ (((y->b->a=q2->a)->b=y->b), ((x->a->b=q2)->a=x->a),
((i->c=q1)->d=i)) +/* This one moves a chain of elements from (s) to (e) from its + * current position to either before or after element (i) + * if (a,b,x,y) is (prev,next,s,e) then chain is moved before (i) + * if (a,b,x,y) is (next,prev,e,s) then chain is moved after (i) */ +#define _RXQMV(i, s, e, a, b, x, y) if (i->a != y) \ + (((e->next->prev=s->prev)->next=e->next), ((i->a->b=x)->a=i->a), ((y->b=i)->a=y)) + /* Basic remove operation. Doesn't update the queue item to indicate it's been removed */ #define _RXQR(i) ((_RXQ(i)->prev->next=_RXQ(i)->next)->prev=_RXQ(i)->prev) @@ -120,6 +127,12 @@ for (n=0, queue_Scan(&myqueue, qe, nqe, myelement), n++) {} #define queue_Replace(q1,q2) if (queue_IsEmpty(q2)) queue_Init(q1); else \ (*_RXQ(q1) = *_RXQ(q2), _RXQ(q1)->next->prev = _RXQ(q1)->prev->next = _RXQ(q1), queue_Init(q2)) +/* move a chain of elements beginning at (s) and ending at (e) before node (i) */ +#define queue_MoveChainBefore(i, s, e) _RXQMV(_RXQ(i),_RXQ(s),_RXQ(e),prev,next,_RXQ(s),_RXQ(e)) + +/* move a chain of elements beginning at (s) and ending at (e) after node (i) */ +#define queue_MoveChainAfter(i, s, e) _RXQMV(_RXQ(i),_RXQ(s),_RXQ(e),next,prev,_RXQ(e),_RXQ(s)) + /* Remove a queue element (*i) from it's queue. The next field is 0'd, so that any further use of this q entry will hopefully cause a core dump. 
Multiple removes of the same queue item are not supported */ #define queue_Remove(i) (_RXQR(i), _RXQ(i)->next = 0) @@ -155,6 +168,10 @@ for (n=0, queue_Scan(&myqueue, qe, nqe, myelement), n++) {} /* Returns false if the item was removed from a queue OR is uninitialized (zero) */ #define queue_IsOnQueue(i) (_RXQ(i)->next != 0) +/* Returns true if the item was removed from a queue OR is uninitialized (zero) */ +/* Return false if the queue item is currently in a queue */ +#define queue_IsNotOnQueue(i) (_RXQ(i)->next == 0) + /* Returns true if the queue item (i) is the first element of the queue (q) */ #define queue_IsFirst(q,i) (_RXQ(q)->first == _RXQ(i)) @@ -164,6 +181,9 @@ for (n=0, queue_Scan(&myqueue, qe, nqe, myelement), n++) {} /* Returns true if the queue item (i) is the end of the queue (q), that is, i is the head of the queue */ #define queue_IsEnd(q,i) (_RXQ(q) == _RXQ(i)) +/* Returns false if the queue item (i) is the end of the queue (q), that is, i is the head of the queue */ +#define queue_IsNotEnd(q,i) (_RXQ(q) != _RXQ(i)) + /* Prototypical loop to scan an entire queue forwards. q is the queue * head, qe is the loop variable, next is a variable used to store the * queue entry for the next iteration of the loop, s is the user's @@ -180,12 +200,24 @@ for (n=0, queue_Scan(&myqueue, qe, nqe, myelement), n++) {} !queue_IsEnd(q, qe); \ (qe) = (next), next = queue_Next(qe, s) +/* similar to queue_Scan except start at element 'start' instead of the beginning */ +#define queue_ScanFrom(q, start, qe, next, s) \ + (qe) = (struct s*)(start), next = queue_Next(qe, s); \ + !queue_IsEnd(q, qe); \ + (qe) = (next), next = queue_Next(qe, s) + /* This is similar to queue_Scan, but scans from the end of the queue to the beginning. Next is the previous queue entry. 
*/ #define queue_ScanBackwards(q, qe, prev, s) \ (qe) = queue_Last(q, s), prev = queue_Prev(qe, s); \ !queue_IsEnd(q, qe); \ (qe) = prev, prev = queue_Prev(qe, s) +/* This is similar to queue_ScanBackwards, but start at element 'start' instead of the end. Next is the previous queue entry. */ +#define queue_ScanBackwardsFrom(q, start, qe, prev, s) \ + (qe) = (struct s*)(start), prev = queue_Prev(qe, s); \ + !queue_IsEnd(q, qe); \ + (qe) = prev, prev = queue_Prev(qe, s) + #define queue_Count(q, qe, nqe, s, n) \ for (n=0, queue_Scan(q, qe, nqe, s), n++) {} #endif /* _RX_QUEUE_ */ diff --git a/src/tsalvaged/Makefile.in b/src/tsalvaged/Makefile.in new file mode 100644 index 0000000000..1f4ccc6001 --- /dev/null +++ b/src/tsalvaged/Makefile.in @@ -0,0 +1,200 @@ +# Copyright 2000, International Business Machines Corporation and others. +# All Rights Reserved. +# +# This software has been released under the terms of the IBM Public +# License. For details, see the LICENSE file in the top-level source +# directory or online at http://www.openafs.org/dl/license10.html +# +# Portions Copyright (c) 2003 Apple Computer, Inc. +# Portions Copyright (c) 2006 Sine Nomine Associates + +srcdir=@srcdir@ +include @TOP_OBJDIR@/src/config/Makefile.config + +CC=${MT_CC} +CFLAGS=${COMMON_CFLAGS} -I.. -DNINTERFACE ${MT_CFLAGS} -DRXDEBUG -DFSSYNC_BUILD_CLIENT \ + -DSALVSYNC_BUILD_SERVER -DSALVSYNC_BUILD_CLIENT + +CCRULE=${CC} ${CFLAGS} -c $? 
+ +VICED=../viced +VLSERVER=../vlserver +LWP=../lwp +LIBACL=../libacl +UTIL=../util +DIR=../dir +VOL=../vol +FSINT=../fsint + +SALVAGEDOBJS=salvaged.o vol-salvage.o physio.o + +DIROBJS=buffer.o dir.o salvage.o + +LWPOBJS=lock.o threadname.o + +UTILOBJS=assert.o uuid.o serverLog.o fileutil.o netutils.o dirpath.o volparse.o flipbase64.o softsig.o fstab.o + +VLIBOBJS=vnode.o volume.o vutil.o partition.o fssync-client.o \ + clone.o nuke.o devname.o listinodes.o ihandle.o \ + namei_ops.o salvsync-server.o salvsync-client.o \ + daemon_com.o + +OBJECTS= ${SALVAGEDOBJS} ${UTILOBJS} ${VLIBOBJS} ${DIROBJS} ${LWPOBJS} + +FSSDEBUG_OBJS = fssync-debug.o physio.o common.o ${UTILOBJS} ${VLIBOBJS} ${DIROBJS} ${LWPOBJS} + +SSSDEBUG_OBJS = salvsync-debug.o physio.o common.o ${UTILOBJS} ${VLIBOBJS} ${DIROBJS} ${LWPOBJS} + +LIBS=${TOP_LIBDIR}/libafsauthent.a ${TOP_LIBDIR}/libafsrpc.a ${TOP_LIBDIR}/util.a ${TOP_LIBDIR}/libcmd.a + +INSTALL_TARGS = ${DESTDIR}${afssrvlibexecdir}/salvageserver \ + ${DESTDIR}${afssrvsbindir}/fssync-debug \ + ${DESTDIR}${afssrvsbindir}/salvsync-debug + +DEST_TARGS = ${DEST}/root.server/usr/afs/bin/salvageserver \ + ${DEST}/root.server/usr/afs/bin/fssync-debug \ + ${DEST}/root.server/usr/afs/bin/salvsync-debug + +all: salvageserver fssync-debug salvsync-debug + +salvaged.o: ${VOL}/salvaged.c + ${CCRULE} + +vol-salvage.o: ${VOL}/vol-salvage.c + ${CCRULE} + +physio.o: ${VOL}/physio.c + ${CCRULE} + +fssync-debug.o: ${VOL}/fssync-debug.c + ${CCRULE} + +salvsync-debug.o: salvsync-debug.c + ${CCRULE} + +assert.o: ${UTIL}/assert.c + ${CCRULE} + +uuid.o: ${UTIL}/uuid.c + ${CCRULE} + +serverLog.o: ${UTIL}/serverLog.c + ${CCRULE} + +fileutil.o: ${UTIL}/fileutil.c + ${CCRULE} + +volparse.o: ${UTIL}/volparse.c + ${CCRULE} + +flipbase64.o: ${UTIL}/flipbase64.c + ${CCRULE} + +netutils.o: ${UTIL}/netutils.c + ${CCRULE} + +dirpath.o: ${UTIL}/dirpath.c + ${CCRULE} + +softsig.o: ${UTIL}/softsig.c + ${CCRULE} + +buffer.o: ${DIR}/buffer.c + ${CCRULE} + +dir.o: ${DIR}/dir.c + 
${CCRULE} + +salvage.o: ${DIR}/salvage.c + ${CCRULE} + +lock.o: ${LWP}/lock.c + ${CCRULE} + +threadname.o: ${LWP}/threadname.c + ${CCRULE} + +vnode.o: ${VOL}/vnode.c + ${CCRULE} + +volume.o: ${VOL}/volume.c + ${CCRULE} + +vutil.o: ${VOL}/vutil.c + ${CCRULE} + +partition.o: ${VOL}/partition.c + ${CCRULE} + +fssync-client.o: ${VOL}/fssync-client.c + ${CCRULE} + +salvsync-server.o: ${VOL}/salvsync-server.c + ${CCRULE} + +salvsync-client.o: ${VOL}/salvsync-client.c + ${CCRULE} + +daemon_com.o: ${VOL}/daemon_com.c + ${CCRULE} + +clone.o: ${VOL}/clone.c + ${CCRULE} + +nuke.o: ${VOL}/nuke.c + ${CCRULE} + +devname.o: ${VOL}/devname.c + ${CCRULE} + +# only for darwin? +fstab.o: ${UTIL}/fstab.c + ${CCRULE} + +common.o: ${VOL}/common.c + ${CCRULE} + +listinodes.o: ${VOL}/listinodes.c + ${CCRULE} + +ihandle.o: ${VOL}/ihandle.c + ${CCRULE} + +namei_ops.o: ${VOL}/namei_ops.c + ${CCRULE} + +salvageserver: ${OBJECTS} ${LIBS} + ${CC} ${LDFLAGS} -o salvageserver ${OBJECTS} ${LIBS} ${MT_LIBS} ${XLIBS} + +fssync-debug: ${FSSDEBUG_OBJS} ${LIBS} + ${CC} ${LDFLAGS} -o fssync-debug ${FSSDEBUG_OBJS} ${LIBS} ${MT_LIBS} ${XLIBS} + +salvsync-debug: ${SSSDEBUG_OBJS} ${LIBS} + ${CC} ${LDFLAGS} -o salvsync-debug ${SSSDEBUG_OBJS} ${LIBS} ${MT_LIBS} ${XLIBS} + +${DEST}/root.server/usr/afs/bin/salvageserver: salvageserver + ${INSTALL} -ns $? $@ + +${DEST}/root.server/usr/afs/bin/fssync-debug: fssync-debug + ${INSTALL} -s $? $@ + +${DEST}/root.server/usr/afs/bin/salvsync-debug: salvsync-debug + ${INSTALL} -s $? $@ + +install: ${INSTALL_TARGS} + +clean: + $(RM) -f *.o salvageserver fssync-debug salvsync-debug core AFS_component_version_number.c + +include ../config/Makefile.version + +${DESTDIR}${afssrvlibexecdir}/salvageserver: salvageserver + ${INSTALL} -ns $? $@ + +${DESTDIR}${afssrvsbindir}/fssync-debug: fssync-debug + ${INSTALL} -s $? $@ + +${DESTDIR}${afssrvsbindir}/salvsync-debug: salvsync-debug + ${INSTALL} -s $?
$@ + +dest: ${DEST_TARGS} diff --git a/src/tsalvaged/salvsync-debug.c b/src/tsalvaged/salvsync-debug.c new file mode 100644 index 0000000000..4d4949aff2 --- /dev/null +++ b/src/tsalvaged/salvsync-debug.c @@ -0,0 +1,475 @@ +/* + * Copyright 2006, Sine Nomine Associates and others. + * All Rights Reserved. + * + * This software has been released under the terms of the IBM Public + * License. For details, see the LICENSE file in the top-level source + * directory or online at http://www.openafs.org/dl/license10.html + */ + +/* Main program file. Define globals. */ +#define MAIN 1 + +/* + * salvsync debug tool + */ + + +#include +#include + +RCSID + ("$Header$"); + +#include +#include +#include +#include +#include +#include +#include +#ifdef AFS_NT40_ENV +#include +#include +#else +#include +#include +#ifndef ITIMER_REAL +#include +#endif /* ITIMER_REAL */ +#endif +#include +#include +#include + + +#include + +#ifndef AFS_NT40_ENV +#include +#endif + +#include +#include +#include + +#include "nfs.h" +#include "lwp.h" +#include "lock.h" +#include "ihandle.h" +#include "vnode.h" +#include "volume.h" +#include "partition.h" +#include "daemon_com.h" +#include "salvsync.h" +#ifdef AFS_NT40_ENV +#include +#endif + +int VolumeChanged; /* hack to make dir package happy */ + + +#ifndef AFS_DEMAND_ATTACH_FS +int +main(int argc, char ** argv) +{ + fprintf(stderr, "*** salvsync-debug is only supported for OpenAFS builds with the demand-attach fileserver extension\n"); + return -1; +} +#else /* AFS_DEMAND_ATTACH_FS */ + +struct salv_state { + afs_uint32 prio; + afs_uint32 volume; + char partName[16]; +}; + +struct state { + afs_int32 reason; + struct salv_state * sop; +}; + +static int common_prolog(struct cmd_syndesc *, struct state *); +static int common_salv_prolog(struct cmd_syndesc *, struct state *); + +static int do_salvop(struct state *, afs_int32 command, SYNC_response * res); + +static char * response_code_to_string(afs_int32); +static char * 
command_code_to_string(afs_int32); +static char * reason_code_to_string(afs_int32); +static char * program_type_to_string(afs_int32); +static char * state_code_to_string(afs_int32); + + +static int OpStats(struct cmd_syndesc * as, char * rock); +static int OpSalvage(struct cmd_syndesc * as, char * rock); +static int OpCancel(struct cmd_syndesc * as, char * rock); +static int OpCancelAll(struct cmd_syndesc * as, char * rock); +static int OpRaisePrio(struct cmd_syndesc * as, char * rock); +static int OpQuery(struct cmd_syndesc * as, char * rock); + + +#ifndef AFS_NT40_ENV +#include "AFS_component_version_number.c" +#endif +#define MAX_ARGS 128 + +#define COMMON_PARMS_OFFSET 13 +#define COMMON_PARMS(ts) \ + cmd_Seek(ts, COMMON_PARMS_OFFSET); \ + cmd_AddParm(ts, "-reason", CMD_SINGLE, CMD_OPTIONAL, "sync protocol reason code"); \ + cmd_AddParm(ts, "-programtype", CMD_SINGLE, CMD_OPTIONAL, "program type code") + +#define COMMON_SALV_PARMS_OFFSET 10 +#define COMMON_SALV_PARMS(ts) \ + cmd_Seek(ts, COMMON_SALV_PARMS_OFFSET); \ + cmd_AddParm(ts, "-volumeid", CMD_SINGLE, 0, "volume id"); \ + cmd_AddParm(ts, "-partition", CMD_SINGLE, CMD_OPTIONAL, "partition name"); \ + cmd_AddParm(ts, "-priority", CMD_SINGLE, CMD_OPTIONAL, "priority") + +#define SALV_PARMS_DECL(ts) \ + COMMON_SALV_PARMS(ts); \ + COMMON_PARMS(ts) + +#define COMMON_PARMS_DECL(ts) \ + COMMON_PARMS(ts) + +int +main(int argc, char **argv) +{ + struct cmd_syndesc *ts; + int err = 0; + int i; + extern char cml_version_number[]; + + /* Initialize directory paths */ + if (!(initAFSDirPath() & AFSDIR_SERVER_PATHS_OK)) { +#ifdef AFS_NT40_ENV + ReportErrorEventAlt(AFSEVT_SVR_NO_INSTALL_DIR, 0, argv[0], 0); +#endif + fprintf(stderr, "%s: Unable to obtain AFS server directory.\n", + argv[0]); + exit(2); + } + + + ts = cmd_CreateSyntax("stats", OpStats, 0, "get salvageserver statistics (SALVSYNC_NOP opcode)"); + COMMON_PARMS_DECL(ts); + cmd_CreateAlias(ts, "nop"); + + ts = cmd_CreateSyntax("salvage", OpSalvage, 0,
"schedule a salvage (SALVSYNC_SALVAGE opcode)"); + SALV_PARMS_DECL(ts); + + ts = cmd_CreateSyntax("cancel", OpCancel, 0, "cancel a salvage (SALVSYNC_CANCEL opcode)"); + SALV_PARMS_DECL(ts); + + ts = cmd_CreateSyntax("raiseprio", OpRaisePrio, 0, "raise a salvage priority (SALVSYNC_RAISEPRIO opcode)"); + SALV_PARMS_DECL(ts); + cmd_CreateAlias(ts, "rp"); + + ts = cmd_CreateSyntax("query", OpQuery, 0, "query salvage status (SALVSYNC_QUERY opcode)"); + SALV_PARMS_DECL(ts); + cmd_CreateAlias(ts, "qry"); + + ts = cmd_CreateSyntax("kill", OpCancelAll, 0, "cancel all scheduled salvages (SALVSYNC_CANCELALL opcode)"); + COMMON_PARMS_DECL(ts); + + err = cmd_Dispatch(argc, argv); + exit(err); +} + +static int +common_prolog(struct cmd_syndesc * as, struct state * state) +{ + register struct cmd_item *ti; + +#ifdef AFS_NT40_ENV + if (afs_winsockInit() < 0) { + Exit(1); + } +#endif + + VInitVolumePackage(debugUtility, 1, 1, + DONT_CONNECT_FS, 0); + DInit(1); + + /* -reason: default to SALVSYNC_WHATEVER so state->reason is never read uninitialized */ + ti = as->parms[COMMON_PARMS_OFFSET].items; + state->reason = ti ? atoi(ti->data) : SALVSYNC_WHATEVER; + if ((ti = as->parms[COMMON_PARMS_OFFSET+1].items)) { /* -programtype */ + if (!strcmp(ti->data, "fileServer")) { + programType = fileServer; + } else if (!strcmp(ti->data, "volumeUtility")) { + programType = volumeUtility; + } else if (!strcmp(ti->data, "salvager")) { + programType = salvager; + } else if (!strcmp(ti->data, "salvageServer")) { + programType = salvageServer; + } else { + programType = (ProgramType) atoi(ti->data); + } + } + + VConnectSALV(); + + return 0; +} + +static int +common_salv_prolog(struct cmd_syndesc * as, struct state * state) +{ + register struct cmd_item *ti; + char pname[100], *temp; + + state->sop = (struct salv_state *) calloc(1, sizeof(struct salv_state)); + assert(state->sop != NULL); + + if ((ti = as->parms[COMMON_SALV_PARMS_OFFSET].items)) { /* -volumeid */ + state->sop->volume = atoi(ti->data); + } else { + fprintf(stderr, "required argument -volumeid not given\n"); + } + + if ((ti
= as->parms[COMMON_SALV_PARMS_OFFSET+1].items)) { /* -partition */ + strlcpy(state->sop->partName, ti->data, sizeof(state->sop->partName)); + } else { + memset(state->sop->partName, 0, sizeof(state->sop->partName)); + } + + if ((ti = as->parms[COMMON_SALV_PARMS_OFFSET+2].items)) { /* -prio */ + state->sop->prio = atoi(ti->data); + } else { + state->sop->prio = 0; + } + + return 0; +} + +static int +do_salvop(struct state * state, afs_int32 command, SYNC_response * res) +{ + afs_int32 code; + SALVSYNC_response_hdr hdr_l, *hdr; + SYNC_response res_l; + + if (!res) { + res = &res_l; + res->payload.len = sizeof(hdr_l); + res->payload.buf = hdr = &hdr_l; + } else { + hdr = (SALVSYNC_response_hdr *) res->payload.buf; + } + + fprintf(stderr, "calling SALVSYNC_SalvageVolume with command code %d (%s)\n", + command, command_code_to_string(command)); + + code = SALVSYNC_SalvageVolume(state->sop->volume, + state->sop->partName, + command, + state->reason, + state->sop->prio, + res); + + switch (code) { + case SYNC_OK: + case SYNC_DENIED: + break; + default: + fprintf(stderr, "possible sync protocol error. 
return code was %d\n", code); + } + + fprintf(stderr, "SALVSYNC_SalvageVolume returned %d (%s)\n", code, response_code_to_string(code)); + fprintf(stderr, "protocol response code was %d (%s)\n", + res->hdr.response, response_code_to_string(res->hdr.response)); + fprintf(stderr, "protocol reason code was %d (%s)\n", + res->hdr.reason, reason_code_to_string(res->hdr.reason)); + + printf("state = {\n"); + if (res->hdr.flags & SALVSYNC_FLAG_VOL_STATS_VALID) { + printf("\tstate = %d (%s)\n", + hdr->state, state_code_to_string(hdr->state)); + printf("\tprio = %d\n", hdr->prio); + } + printf("\tsq_len = %d\n", hdr->sq_len); + printf("\tpq_len = %d\n", hdr->pq_len); + printf("}\n"); + VDisconnectSALV(); + return code; +} + +static char * +response_code_to_string(afs_int32 response) +{ + switch (response) { + case SYNC_OK: + return "SYNC_OK"; + case SYNC_DENIED: + return "SYNC_DENIED"; + case SYNC_COM_ERROR: + return "SYNC_COM_ERROR"; + case SYNC_BAD_COMMAND: + return "SYNC_BAD_COMMAND"; + case SYNC_FAILED: + return "SYNC_FAILED"; + default: + return "**UNKNOWN**"; + } +} + +static char * +command_code_to_string(afs_int32 command) +{ + switch (command) { + case SYNC_COM_CHANNEL_CLOSE: + return "SYNC_COM_CHANNEL_CLOSE"; + case SALVSYNC_NOP: + return "SALVSYNC_NOP"; + case SALVSYNC_SALVAGE: + return "SALVSYNC_SALVAGE"; + case SALVSYNC_CANCEL: + return "SALVSYNC_CANCEL"; + case SALVSYNC_RAISEPRIO: + return "SALVSYNC_RAISEPRIO"; + case SALVSYNC_QUERY: + return "SALVSYNC_QUERY"; + case SALVSYNC_CANCELALL: + return "SALVSYNC_CANCELALL"; + default: + return "**UNKNOWN**"; + } +} + +static char * +reason_code_to_string(afs_int32 reason) +{ + switch (reason) { + case SALVSYNC_WHATEVER: + return "SALVSYNC_WHATEVER"; + case SALVSYNC_ERROR: + return "SALVSYNC_ERROR"; + case SALVSYNC_OPERATOR: + return "SALVSYNC_OPERATOR"; + case SALVSYNC_SHUTDOWN: + return "SALVSYNC_SHUTDOWN"; + case SALVSYNC_NEEDED: + return "SALVSYNC_NEEDED"; + default: + return "**UNKNOWN**"; + } +} + +static char *
+program_type_to_string(afs_int32 type) +{ + switch ((ProgramType)type) { + case fileServer: + return "fileServer"; + case volumeUtility: + return "volumeUtility"; + case salvager: + return "salvager"; + case salvageServer: + return "salvageServer"; + default: + return "**UNKNOWN**"; + } +} + +static char * +state_code_to_string(afs_int32 state) +{ + switch (state) { + case SALVSYNC_STATE_UNKNOWN: + return "SALVSYNC_STATE_UNKNOWN"; + case SALVSYNC_STATE_QUEUED: + return "SALVSYNC_STATE_QUEUED"; + case SALVSYNC_STATE_SALVAGING: + return "SALVSYNC_STATE_SALVAGING"; + case SALVSYNC_STATE_ERROR: + return "SALVSYNC_STATE_ERROR"; + case SALVSYNC_STATE_DONE: + return "SALVSYNC_STATE_DONE"; + default: + return "**UNKNOWN**"; + } +} + +static int +OpStats(struct cmd_syndesc * as, char * rock) +{ + struct state state; + + common_prolog(as, &state); + common_salv_prolog(as, &state); + + do_salvop(&state, SALVSYNC_NOP, NULL); + + return 0; +} + +static int +OpSalvage(struct cmd_syndesc * as, char * rock) +{ + struct state state; + + common_prolog(as, &state); + common_salv_prolog(as, &state); + + do_salvop(&state, SALVSYNC_SALVAGE, NULL); + + return 0; +} + +static int +OpCancel(struct cmd_syndesc * as, char * rock) +{ + struct state state; + + common_prolog(as, &state); + common_salv_prolog(as, &state); + + do_salvop(&state, SALVSYNC_CANCEL, NULL); + + return 0; +} + +static int +OpCancelAll(struct cmd_syndesc * as, char * rock) +{ + struct state state; + + common_prolog(as, &state); + common_salv_prolog(as, &state); + + do_salvop(&state, SALVSYNC_CANCELALL, NULL); + + return 0; +} + +static int +OpRaisePrio(struct cmd_syndesc * as, char * rock) +{ + struct state state; + + common_prolog(as, &state); + common_salv_prolog(as, &state); + + do_salvop(&state, SALVSYNC_RAISEPRIO, NULL); + + return 0; +} + +static int +OpQuery(struct cmd_syndesc * as, char * rock) +{ + struct state state; + + common_prolog(as, &state); + common_salv_prolog(as, &state); + + do_salvop(&state, 
SALVSYNC_QUERY, NULL); + + return 0; +} + +#endif /* AFS_DEMAND_ATTACH_FS */ diff --git a/src/tviced/Makefile.in b/src/tviced/Makefile.in index b10e1a4ca8..68363fc543 100644 --- a/src/tviced/Makefile.in +++ b/src/tviced/Makefile.in @@ -11,7 +11,7 @@ srcdir=@srcdir@ include @TOP_OBJDIR@/src/config/Makefile.config CC=${MT_CC} -CFLAGS=${COMMON_CFLAGS} -I.. -DNINTERFACE ${MT_CFLAGS} -DRXDEBUG +CFLAGS=${COMMON_CFLAGS} -I.. -DNINTERFACE ${MT_CFLAGS} -DRXDEBUG -DFSSYNC_BUILD_SERVER -DSALVSYNC_BUILD_CLIENT CCRULE=${CC} ${CFLAGS} -c $? @@ -24,7 +24,7 @@ DIR=../dir VOL=../vol FSINT=../fsint -VICEDOBJS=viced.o afsfileprocs.o host.o physio.o callback.o +VICEDOBJS=viced.o afsfileprocs.o host.o physio.o callback.o serialize_state.o VLSERVEROBJS=vldbint.cs.o vldbint.xdr.o @@ -36,18 +36,20 @@ UTILOBJS=assert.o uuid.o serverLog.o fileutil.o netutils.o dirpath.o volparse.o DIROBJS=buffer.o dir.o salvage.o -VOLOBJS= vnode.o volume.o vutil.o partition.o fssync.o purge.o \ +VOLOBJS= vnode.o volume.o vutil.o partition.o fssync-server.o \ clone.o devname.o common.o ihandle.o listinodes.o namei_ops.o \ - fstab.o + fstab.o salvsync-client.o daemon_com.o FSINTOBJS= afsaux.o afscbint.cs.o afsint.ss.o afsint.xdr.o objects= ${VICEDOBJS} ${VLSERVEROBJS} ${LWPOBJS} ${LIBACLOBJS} \ ${UTILOBJS} ${DIROBJS} ${VOLOBJS} ${FSINTOBJS} +SDBGOBJS = state_analyzer.o uuid.o dirpath.o fileutil.o ${TOP_LIBDIR}/util.a + LIBS=${TOP_LIBDIR}/libafsauthent.a ${TOP_LIBDIR}/libafsrpc.a ${TOP_LIBDIR}/util.a -all: fileserver +all: fileserver state_analyzer viced.o: ${VICED}/viced.c ${CCRULE} @@ -64,6 +66,9 @@ physio.o: ${VICED}/physio.c callback.o: ${VICED}/callback.c ${CCRULE} +serialize_state.o: ./serialize_state.c + ${CCRULE} + assert.o: ${UTIL}/assert.c ${CCRULE} @@ -130,10 +135,16 @@ vutil.o: ${VOL}/vutil.c partition.o: ${VOL}/partition.c ${CCRULE} -fssync.o: ${VOL}/fssync.c +fssync-server.o: ${VOL}/fssync-server.c ${CCRULE} -purge.o: ${VOL}/purge.c +fssync-client.o: ${VOL}/fssync-client.c + ${CCRULE} + 
+salvsync-client.o: ${VOL}/salvsync-client.c + ${CCRULE} + +daemon_com.o: ${VOL}/daemon_com.c ${CCRULE} clone.o: ${VOL}/clone.c @@ -179,21 +190,33 @@ afsint.ss.o: ${FSINT}/afsint.ss.c afsint.xdr.o: ${FSINT}/afsint.xdr.c ${CCRULE} +state_analyzer.o: state_analyzer.c + ${CCRULE} + fileserver: ${objects} ${LIBS} ${CC} ${LDFLAGS} -o fileserver ${objects} ${LIBS} ${MT_LIBS} ${XLIBS} +state_analyzer: ${SDBGOBJS} + ${CC} ${LDFLAGS} -o state_analyzer ${SDBGOBJS} ${MT_LIBS} ${XLIBS} + ${DEST}/root.server/usr/afs/bin/fileserver: fileserver ${INSTALL} -ns $? $@ -install: ${DESTDIR}${afssrvlibexecdir}/fileserver +${DEST}/root.server/usr/afs/bin/state_analyzer: state_analyzer + ${INSTALL} $? $@ + +install: ${DESTDIR}${afssrvlibexecdir}/fileserver ${DESTDIR}${afssrvsbindir}/state_analyzer clean: - $(RM) -f *.o fileserver core AFS_component_version_number.c + $(RM) -f *.o fileserver state_analyzer core AFS_component_version_number.c include ../config/Makefile.version ${DESTDIR}${afssrvlibexecdir}/fileserver: fileserver ${INSTALL} -ns $? $@ -dest: ${DEST}/root.server/usr/afs/bin/fileserver +${DESTDIR}${afssrvsbindir}/state_analyzer: state_analyzer + ${INSTALL} $? $@ + +dest: ${DEST}/root.server/usr/afs/bin/fileserver ${DEST}/root.server/usr/afs/bin/state_analyzer diff --git a/src/tviced/NTMakefile b/src/tviced/NTMakefile index e9e2c270e9..e58c5cc226 100644 --- a/src/tviced/NTMakefile +++ b/src/tviced/NTMakefile @@ -5,7 +5,7 @@ # License. For details, see the LICENSE file in the top-level source # directory or online at http://www.openafs.org/dl/license10.html -AFSDEV_AUXCDEFINES = -DAFS_PTHREAD_ENV -DRXDEBUG +AFSDEV_AUXCDEFINES = -DAFS_PTHREAD_ENV -DRXDEBUG -DFSSYNC_BUILD_SERVER RELDIR=tviced !INCLUDE ..\config\NTMakefile.$(SYS_NAME) diff --git a/src/tviced/serialize_state.c b/src/tviced/serialize_state.c new file mode 100644 index 0000000000..c1b4583153 --- /dev/null +++ b/src/tviced/serialize_state.c @@ -0,0 +1,1120 @@ +/* + * Copyright 2006, Sine Nomine Associates and others. 
+ * All Rights Reserved. + * + * This software has been released under the terms of the IBM Public + * License. For details, see the LICENSE file in the top-level source + * directory or online at http://www.openafs.org/dl/license10.html + */ + +/* + * demand attach fs + * fileserver state serialization + */ + +#include +#include + +RCSID + ("$Header$"); + +#include +#include /* for malloc() */ +#include /* ANSI standard location for time stuff */ +#ifdef AFS_NT40_ENV +#include +#include +#else +#include +#include +#endif +#ifdef HAVE_STRING_H +#include +#else +#ifdef HAVE_STRINGS_H +#include +#endif +#endif +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../viced/viced_prototypes.h" +#include "../viced/viced.h" +#include "../viced/host.h" +#include "../viced/callback.h" +#include "serialize_state.h" + +/*@+fcnmacros +macrofcndecl@*/ +#ifdef O_LARGEFILE +#ifdef S_SPLINT_S +extern off64_t afs_lseek(int FD, off64_t O, int F); +#endif /*S_SPLINT_S */ +#define afs_lseek(FD, O, F) lseek64(FD, (off64_t)(O), F) +#define afs_stat stat64 +#define afs_fstat fstat64 +#define afs_open open64 +#define afs_fopen fopen64 +#define afs_ftruncate ftruncate64 +#define afs_mmap mmap64 +#ifdef AFS_AIX_ENV +extern void * mmap64(); /* ugly hack since aix build env appears to be somewhat broken */ +#endif +#else /* !O_LARGEFILE */ +#ifdef S_SPLINT_S +extern off_t afs_lseek(int FD, off_t O, int F); +#endif /*S_SPLINT_S */ +#define afs_lseek(FD, O, F) lseek(FD, (off_t)(O), F) +#define afs_stat stat +#define afs_fstat fstat +#define afs_open open +#define afs_fopen fopen +#define afs_ftruncate ftruncate +#define afs_mmap mmap +#endif /* !O_LARGEFILE */ +/*@=fcnmacros =macrofcndecl@*/ + + +#ifdef AFS_DEMAND_ATTACH_FS + +/* + * demand attach fs + * state dump routines + * + * in order to make state dump/restore as 
fast as possible, + * we use memory mapped files + * + * if this causes problems on certain platforms, the APIs + * have been written so that it will be very simple to go + * back to standard I/O for just those poorly written platforms + */ +#define FS_STATE_USE_MMAP + + +#ifdef FS_STATE_USE_MMAP +#define FS_STATE_INIT_FILESIZE (8 * 1024 * 1024) /* truncate to 8MB initially */ +#include +#endif + +static int fs_stateCreateDump(struct fs_dump_state * state); +static int fs_stateLoadDump(struct fs_dump_state * state); +static int fs_stateInvalidateDump(struct fs_dump_state * state); +static int fs_stateCommitDump(struct fs_dump_state * state); +static int fs_stateCloseDump(struct fs_dump_state * state); + +#ifdef FS_STATE_USE_MMAP +static int fs_stateSizeFile(struct fs_dump_state * state); +static int fs_stateResizeFile(struct fs_dump_state * state, size_t min_add); +static int fs_stateTruncateFile(struct fs_dump_state * state); + +static int fs_stateMapFile(struct fs_dump_state * state); +static int fs_stateUnmapFile(struct fs_dump_state * state); + +static int fs_stateIncCursor(struct fs_dump_state * state, size_t len); +static int fs_stateCheckIOSafety(struct fs_dump_state * state, + size_t len); +#endif + +static int fs_stateFillHeader(struct fs_state_header * hdr); +static int fs_stateCheckHeader(struct fs_state_header * hdr); + +static int fs_stateAlloc(struct fs_dump_state * state); +static int fs_stateFree(struct fs_dump_state * state); + +extern afsUUID FS_HostUUID; +extern char cml_version_number[]; + +/* + * demand attach fs + * save all fileserver state + */ +int +fs_stateSave(void) +{ + int ret = 0, verified = 1; + struct fs_dump_state state; + + /* save and restore need to be atomic wrt other host package operations */ + H_LOCK; + + ViceLog(0, ("fs_stateSave: commencing fileserver state dump\n")); + + if (fs_stateAlloc(&state)) { + ViceLog(0, ("fs_stateSave: memory allocation failed; dump aborted\n")); + ret = 1; + goto done; + } + + /* XXX + * on busy 
servers, these checks will inevitably fail since stuff drops H_LOCK + * all over the place (with structs left in inconsistent states) while RPCs to + * clients happen (grumble, grumble, the host package needs to be rewritten...) + * + * the current hack is to force the background threads that deal with host and + * callback state offline early in the shutdown process, do VShutdown, come + * back and wait for those threads to die, THEN do the state dump + * + * BUT, this still has one flaw -- what do we do about rx worker threads that + * are blocked in the host package making an RPC call to a cm??? + * + * perhaps we need a refcounter that keeps track of threads blocked in rpc calls + * with H_LOCK dropped (and the host struct likely left in an inconsistent state) + * + * or better yet, we need to associate a state machine with each host object + * (kind of like demand attach Volume structures). + * + * sigh. I suspect we'll need to revisit this issue + */ + + if (fs_state.options.fs_state_verify_before_save) { + ViceLog(0, ("fs_stateSave: performing internal consistency checks before proceeding with state dump\n")); + + if (h_stateVerify(&state)) { + ViceLog(0, ("fs_stateSave: error: host table consistency checks failed; state dump will not be marked clean\n")); + verified = 0; + ret = 1; + } + + if (cb_stateVerify(&state)) { + ViceLog(0, ("fs_stateSave: error: callback table consistency checks failed; state dump will not be marked clean\n")); + verified = 0; + ret = 1; + } + + /* if a consistency check asserted the bail flag, reset it */ + state.bail = 0; + + ViceLog(0, ("fs_stateSave: proceeding with dump\n")); + } + + if (fs_stateCreateDump(&state)) { + ViceLog(0, ("fs_stateSave: error: dump create failed\n")); + ret = 1; + goto done; + } + + if (h_stateSave(&state)) { + ViceLog(0, ("fs_stateSave: error: host state dump failed\n")); + ret = 1; + goto done; + } + + if (cb_stateSave(&state)) { + ViceLog(0, ("fs_stateSave: error: callback state dump failed\n")); + 
ret = 1; + goto done; + } + + if (!verified) { + state.bail = 1; + } + + if (fs_stateCommitDump(&state)) { + ViceLog(0, ("fs_stateSave: error: dump commit failed\n")); + ret = 1; + goto done; + } + + if (verified) { + ViceLog(0, ("fs_stateSave: fileserver state dump completed successfully\n")); + } else { + ViceLog(0, ("fs_stateSave: fileserver state dump completed, but not marked clean.\n")); + ViceLog(0, ("fs_stateSave: please save a copy of '%s' for use by technical support\n", + state.fn)); + } + + done: + if (state.fd >= 0) + fs_stateCloseDump(&state); + fs_stateFree(&state); + H_UNLOCK; + return ret; +} + +/* + * demand attach fs + * restore all fileserver state + * + * this function must appear as one atomic operation to the host and callback + * packages, hence H_LOCK is held for the entirety of the process. + */ +int +fs_stateRestore(void) +{ + int ret = 0; + struct fs_dump_state state; + + /* save and restore need to be atomic wrt other host package operations */ + H_LOCK; + + ViceLog(0, ("fs_stateRestore: commencing fileserver state restore\n")); + + if (fs_stateAlloc(&state)) { + ViceLog(0, ("fs_stateRestore: memory allocation failed\n")); + ret = 1; + goto done; + } + + if (fs_stateLoadDump(&state)) { + ViceLog(0, ("fs_stateRestore: failed to load dump file '%s'\n", state.fn)); + ret = 1; + goto done; + } + + if (fs_stateInvalidateDump(&state)) { + ViceLog(0, ("fs_stateRestore: failed to invalidate dump file '%s'\n", state.fn)); + ret = 1; + goto done; + } + + + if (state.flags.do_host_restore) { + if (h_stateRestore(&state)) { + ViceLog(0, ("fs_stateRestore: error: host state restore failed. exiting avoid further corruption\n")); + exit(0); + } + ViceLog(0, ("fs_stateRestore: host table restored\n")); + + if (cb_stateRestore(&state)) { + ViceLog(0, ("fs_stateRestore: error: callback state restore failed. 
exiting to avoid further corruption\n")); + exit(0); + } + ViceLog(0, ("fs_stateRestore: FileEntry and CallBack tables restored\n")); + + if (h_stateRestoreIndices(&state)) { + ViceLog(0, ("fs_stateRestore: error: host index remapping failed. exiting to avoid further corruption\n")); + exit(0); + } + ViceLog(0, ("fs_stateRestore: host table indices remapped\n")); + + if (cb_stateRestoreIndices(&state)) { + ViceLog(0, ("fs_stateRestore: error: callback index remapping failed. exiting to avoid further corruption\n")); + exit(0); + } + ViceLog(0, ("fs_stateRestore: FileEntry and CallBack indices remapped\n")); + } + + ViceLog(0, ("fs_stateRestore: restore phase complete\n")); + + if (fs_state.options.fs_state_verify_after_restore) { + ViceLog(0, ("fs_stateRestore: beginning state verification phase\n")); + + if (state.flags.do_host_restore) { + if (h_stateVerify(&state)) { + ViceLog(0, ("fs_stateRestore: error: host table consistency checks failed; exiting to avoid further corruption\n")); + exit(0); + } + + if (cb_stateVerify(&state)) { + ViceLog(0, ("fs_stateRestore: error: callback table consistency checks failed; exiting to avoid further corruption\n")); + exit(0); + } + } + + ViceLog(0, ("fs_stateRestore: fileserver state verification complete\n")); + } + + ViceLog(0, ("fs_stateRestore: restore was successful\n")); + + done: + if (state.fd >= 0) { + fs_stateInvalidateDump(&state); + fs_stateCloseDump(&state); + } + fs_stateFree(&state); + H_UNLOCK; + return ret; +} + +static int +fs_stateCreateDump(struct fs_dump_state * state) +{ + int fd, ret = 0; + char savedump[MAXPATHLEN]; + struct afs_stat status; + + afs_snprintf(savedump, sizeof(savedump), "%s.old", state->fn); + + if (afs_stat(state->fn, &status) == 0) { + renamefile(state->fn, savedump); + } + + if (((fd = afs_open(state->fn, + O_RDWR | O_CREAT | O_TRUNC, + S_IRUSR | S_IWUSR)) == -1) || + (afs_fstat(fd, &status) == -1)) { + ViceLog(0, ("fs_stateCreateDump: failed to create state dump file '%s'\n", + 
state->fn)); + ret = 1; + goto done; + } + + state->fd = fd; + state->mode = FS_STATE_DUMP_MODE; + memset(state->hdr, 0, sizeof(struct fs_state_header)); + fs_stateIncEOF(state, sizeof(struct fs_state_header)); + +#ifdef FS_STATE_USE_MMAP + if (fs_stateSizeFile(state)) { + ViceLog(0, ("fs_stateCreateDump: failed to resize state dump file '%s'\n", + state->fn)); + ret = 1; + goto done; + } + + if (fs_stateMapFile(state)) { + ViceLog(0, ("fs_stateCreateDump: failed to memory map state dump file '%s'\n", + state->fn)); + ret = 1; + goto done; + } +#endif + + ret = fs_stateInvalidateDump(state); + + done: + return ret; +} + +static int +fs_stateInvalidateDump(struct fs_dump_state * state) +{ + afs_uint64 z; + int ret = 0; + struct fs_state_header hdr; + +#ifdef FS_STATE_USE_MMAP + if (state->mmap.map == NULL) { + return 1; + } +#endif + + memcpy(&hdr, state->hdr, sizeof(hdr)); + hdr.valid = 0; + ZeroInt64(z); + + /* write a bogus header to flag dump in progress */ + if (fs_stateWriteHeader(state, &z, &hdr, sizeof(hdr))) { + ViceLog(0, ("fs_stateInvalidateDump: failed to invalidate old dump file header '%s'\n", + state->fn)); + ret = 1; + goto done; + } + if (fs_stateSync(state)) { + ViceLog(0, ("fs_stateInvalidateDump: failed to sync changes to disk\n")); + ret = 1; + goto done; + } + + done: + return ret; +} + +static int +fs_stateCommitDump(struct fs_dump_state * state) +{ + afs_uint64 z; + int ret = 0; + + ZeroInt64(z); + +#ifdef FS_STATE_USE_MMAP + if (fs_stateTruncateFile(state)) { + ViceLog(0, ("fs_stateCommitDump: failed to truncate dump file to proper size\n")); + ret = 1; + goto done; + } +#endif + + /* ensure that all pending data I/Os for the state file have been committed + * _before_ we make the metadata I/Os */ + if (fs_stateSync(state)) { + ViceLog(0, ("fs_stateCommitDump: failed to sync changes to disk\n")); + ret = 1; + goto done; + } + +#ifdef FS_STATE_USE_MMAP + /* XXX madvise may not exist on all platforms, so + * we may need to add some ifdefs at 
some point... */ + { + madvise((((char *)state->mmap.map) + sizeof(struct fs_state_header)), + state->mmap.size - sizeof(struct fs_state_header), + MADV_DONTNEED); + } +#endif + + /* build the header, and write it to disk */ + fs_stateFillHeader(state->hdr); + if (state->bail) { + state->hdr->valid = 0; + } + if (fs_stateWriteHeader(state, &z, state->hdr, sizeof(struct fs_state_header))) { + ViceLog(0, ("fs_stateCommitDump: failed to write header to dump file '%s'\n", + state->fn)); + ret = 1; + goto done; + } + if (fs_stateSync(state)) { + ViceLog(0, ("fs_stateCommitDump: failed to sync new header to disk\n")); + ret = 1; + goto done; + } + + done: + return ret; +} + +static int +fs_stateLoadDump(struct fs_dump_state * state) +{ + afs_uint64 z; + int fd, ret = 0; + struct afs_stat status; + afs_int32 now = FT_ApproxTime(); + + ZeroInt64(z); + + if ((fd = afs_open(state->fn, O_RDWR)) == -1 || + (afs_fstat(fd, &status) == -1)) { + ViceLog(0, ("fs_stateLoadDump: failed to load state dump file '%s'\n", + state->fn)); + ret = 1; + goto done; + } + state->fd = fd; + state->mode = FS_STATE_LOAD_MODE; + state->file_len = status.st_size; + +#ifdef FS_STATE_USE_MMAP + if (fs_stateMapFile(state)) { + ViceLog(0, ("fs_stateLoadDump: failed to memory map state dump file '%s'\n", + state->fn)); + ret = 1; + goto done; + } +#endif + + if (fs_stateReadHeader(state, &z, state->hdr, sizeof(struct fs_state_header))) { + ViceLog(0, ("fs_stateLoadDump: failed to read header from dump file '%s'\n", + state->fn)); + ret = 1; + goto done; + } + + /* check the validity of the header */ + if (fs_stateCheckHeader(state->hdr)) { + ViceLog(1, ("fs_stateLoadDump: header failed validity checks; not restoring '%s'\n", + state->fn)); + ret = 1; + goto done; + } + + if ((state->hdr->timestamp + HOST_STATE_VALID_WINDOW) >= now) { + state->flags.do_host_restore = 1; + } else { + ViceLog(0, ("fs_stateLoadDump: warning: dump is too old for host and callback restore; skipping those steps\n")); + } + + 
done: + return ret; +} + +static int +fs_stateCloseDump(struct fs_dump_state * state) +{ +#ifdef FS_STATE_USE_MMAP + fs_stateUnmapFile(state); +#endif + close(state->fd); + return 0; +} + +int +fs_stateWrite(struct fs_dump_state * state, + void * buf, size_t len) +{ + int ret = 0; + +#ifdef FS_STATE_USE_MMAP + if (fs_stateCheckIOSafety(state, len)) { + if (fs_stateResizeFile(state, len)) { + ViceLog(0, ("fs_stateWrite: could not resize dump file '%s'\n", + state->fn)); + ret = 1; + goto done; + } + } + + memcpy(state->mmap.cursor, buf, len); + fs_stateIncCursor(state, len); +#else + if (write(state->fd, buf, len) != len) { + ViceLog(0, ("fs_stateWrite: write failed\n")); + ret = 1; + goto done; + } +#endif + + done: + return ret; +} + +int +fs_stateRead(struct fs_dump_state * state, + void * buf, size_t len) +{ + int ret = 0; + +#ifdef FS_STATE_USE_MMAP + if (fs_stateCheckIOSafety(state, len)) { + ViceLog(0, ("fs_stateRead: read beyond EOF for dump file '%s'\n", + state->fn)); + ret = 1; + goto done; + } + + memcpy(buf, state->mmap.cursor, len); + fs_stateIncCursor(state, len); +#else + if (read(state->fd, buf, len) != len) { + ViceLog(0, ("fs_stateRead: read failed\n")); + ret = 1; + goto done; + } +#endif + + done: + return ret; +} + +int +fs_stateWriteV(struct fs_dump_state * state, + struct iovec * iov, int niov) +{ + int i, ret = 0; + size_t len = 0; + + for (i=0; i < niov; i++) { + len += iov[i].iov_len; + } + +#ifdef FS_STATE_USE_MMAP + if (fs_stateCheckIOSafety(state, len)) { + if (fs_stateResizeFile(state, len)) { + ViceLog(0, ("fs_stateWrite: could not resize dump file '%s'\n", + state->fn)); + ret = 1; + goto done; + } + } + + for (i=0; i < niov; i++) { + memcpy(state->mmap.cursor, iov[i].iov_base, iov[i].iov_len); + fs_stateIncCursor(state, iov[i].iov_len); + } +#else + if (writev(state->fd, iov, niov) != len) { + ViceLog(0, ("fs_stateWriteV: write failed\n")); + ret = 1; + goto done; + } +#endif + + done: + return ret; +} + +int +fs_stateReadV(struct 
fs_dump_state * state,
+              struct iovec * iov, int niov)
+{
+    int i, ret = 0;
+    size_t len = 0;
+
+    for (i=0; i < niov; i++) {
+        len += iov[i].iov_len;
+    }
+
+#ifdef FS_STATE_USE_MMAP
+    if (fs_stateCheckIOSafety(state, len)) {
+        ViceLog(0, ("fs_stateRead: read beyond EOF for dump file '%s'\n",
+                    state->fn));
+        ret = 1;
+        goto done;
+    }
+
+    for (i=0; i < niov; i++) {
+        memcpy(iov[i].iov_base, state->mmap.cursor, iov[i].iov_len);
+        fs_stateIncCursor(state, iov[i].iov_len);
+    }
+#else
+    if (readv(state->fd, iov, niov) != len) {
+        ViceLog(0, ("fs_stateReadV: read failed\n"));
+        ret = 1;
+        goto done;
+    }
+#endif
+
+ done:
+    return ret;
+}
+
+/*
+ * seek to *offset in the dump file, then write len bytes from hdr.
+ * returns 0 on success, 1 if either the seek or the write fails.
+ */
+int
+fs_stateWriteHeader(struct fs_dump_state * state,
+                    afs_uint64 * offset,
+                    void * hdr, size_t len)
+{
+    int ret = 0;
+
+    if (fs_stateSeek(state, offset)) {
+        ViceLog(0, ("fs_stateWriteHeader: could not seek to correct position in dump file '%s'\n",
+                    state->fn));
+        ret = 1;
+        goto done;
+    }
+
+    if (fs_stateWrite(state, hdr, len)) {
+        ViceLog(0, ("fs_stateWriteHeader: write failed\n"));
+        ret = 1;
+        goto done;
+    }
+
+ done:
+    return ret;
+}
+
+/*
+ * seek to *offset in the dump file, then read len bytes into hdr.
+ * returns 0 on success, 1 if either the seek or the read fails.
+ */
+int
+fs_stateReadHeader(struct fs_dump_state * state,
+                   afs_uint64 * offset,
+                   void * hdr, size_t len)
+{
+    int ret = 0;
+
+    if (fs_stateSeek(state, offset)) {
+        ViceLog(0, ("fs_stateReadHeader: could not seek to correct position in dump file '%s'\n",
+                    state->fn));
+        ret = 1;
+        goto done;
+    }
+
+    if (fs_stateRead(state, hdr,len)) {
+        ViceLog(0, ("fs_stateReadHeader: read failed\n"));
+        ret = 1;
+        goto done;
+    }
+
+ done:
+    return ret;
+}
+
+#ifdef FS_STATE_USE_MMAP
+/*
+ * set the dump file length to FS_STATE_INIT_FILESIZE.
+ * returns 0 on success, 1 if the truncate call fails.
+ */
+static int
+fs_stateSizeFile(struct fs_dump_state * state)
+{
+    int ret = 0;
+    state->file_len = FS_STATE_INIT_FILESIZE;
+    if (afs_ftruncate(state->fd, state->file_len) != 0)
+        ret = 1;
+    return ret;
+}
+
+/*
+ * grow the dump file by enough whole FS_STATE_INIT_FILESIZE units to
+ * hold at least min_add more bytes, and remap it.
+ *
+ * fs_stateMapFile() rewinds mmap.cursor/mmap.offset to the start of the
+ * mapping, but fs_stateWrite() and fs_stateWriteV() copy to mmap.cursor
+ * immediately after calling us.  the position held on entry is therefore
+ * saved here and restored once the new mapping is in place; otherwise the
+ * write that triggered the resize would land at the start of the file.
+ *
+ * returns 0 on success, 1 if the truncate or the remap fails.
+ */
+static int
+fs_stateResizeFile(struct fs_dump_state * state, size_t min_add)
+{
+    int ret = 0;
+    afs_foff_t inc;
+    size_t saved_offset;
+
+    /* remember where we were before the mapping goes away */
+    saved_offset = (size_t) state->mmap.offset;
+
+    fs_stateUnmapFile(state);
+
+    inc = ((min_add / FS_STATE_INIT_FILESIZE)+1) * FS_STATE_INIT_FILESIZE;
+    state->file_len += inc;
+
+    if (afs_ftruncate(state->fd, state->file_len) != 0) {
+        ViceLog(0, ("fs_stateResizeFile: truncate failed\n"));
+        ret = 1;
+        goto done;
+    }
+
+    if (fs_stateMapFile(state)) {
+        ViceLog(0, ("fs_stateResizeFile: remapping memory mapped file failed\n"));
+        ret = 1;
+        goto done;
+    }
+
+    /* the fresh mapping starts at offset 0; step back to the saved position */
+    fs_stateIncCursor(state, saved_offset);
+
+ done:
+    return ret;
+}
+
+/*
+ * truncate the dump file to eof_offset.  without AFS_LARGEFILE_ENV only
+ * the low 32 bits of eof_offset are passed to the truncate call.
+ * returns 0 on success, 1 if the truncate call fails.
+ */
+static int
+fs_stateTruncateFile(struct fs_dump_state * state)
+{
+    int ret = 0;
+
+#ifdef AFS_LARGEFILE_ENV
+    if (afs_ftruncate(state->fd, state->eof_offset) != 0) {
+        ret = 1;
+    }
+#else
+    afs_uint32 hi, lo;
+    SplitInt64(state->eof_offset, hi, lo);
+    if (afs_ftruncate(state->fd, lo) != 0) {
+        ret = 1;
+    }
+#endif
+
+    return ret;
+}
+#endif
+
+#ifdef FS_STATE_USE_MMAP
+/*
+ * map file_len bytes of the dump file MAP_SHARED and reset the cursor
+ * and offset to the start of the mapping.  on mmap failure mmap.map is
+ * set to NULL and mmap.size to 0.
+ */
+static int
+fs_stateMapFile(struct fs_dump_state * state)
+{
+    int ret = 0, flags;
+
+    switch(state->mode) {
+    case FS_STATE_LOAD_MODE:
+        flags = PROT_READ | PROT_WRITE; /* loading involves a header invalidation */
+        break;
+    case FS_STATE_DUMP_MODE:
+        flags = PROT_WRITE;
+        break;
+    default:
+        ViceLog(0, ("fs_stateMapFile: invalid dump state mode\n"));
+        return 1;
+    }
+
+    state->mmap.map = afs_mmap(NULL,
+                               state->file_len,
+                               flags,
+                               MAP_SHARED,
+                               state->fd,
+                               0);
+
+    if (state->mmap.map == MAP_FAILED) {
+        state->mmap.size = 0;
+        state->mmap.map = NULL;
+        ViceLog(0, ("fs_stateMapFile: failed to memory map file '%s'\n",
+                    state->fn));
+        ret = 1;
+        goto done;
+    }
+
+    state->mmap.size = state->file_len;
+    state->mmap.cursor = state->mmap.map;
+    state->mmap.offset = 0;
+
+    /* for state loading, accesses will be sequential, so let's give
+     * the VM subsystem a heads up */
+    if (state->mode == FS_STATE_LOAD_MODE) {
+        /* XXX madvise may not exist on all platforms, so
+         * we may need to add some ifdefs at some point... 
*/
+        flags = MADV_SEQUENTIAL | MADV_WILLNEED;
+#ifdef AFS_SUN510_ENV
+        flags |= MADV_ACCESS_LWP; /* added in solaris 9 12/02 */
+#endif
+        madvise(state->mmap.map, state->mmap.size, flags);
+    }
+
+ done:
+    return ret;
+}
+
+/*
+ * tear down the current mapping of the dump file, if there is one.
+ *
+ * mmap.map is NULL when nothing is mapped (fs_stateAlloc zeroes the
+ * state, and fs_stateMapFile resets it on failure), so that case is a
+ * no-op rather than a munmap(NULL, 0) error.  after a successful unmap
+ * the map fields are cleared so that a later call (e.g. fs_stateCloseDump
+ * following a failed fs_stateResizeFile) cannot unmap the same address
+ * range a second time.
+ *
+ * returns 0 on success or when nothing was mapped, 1 if munmap fails.
+ */
+static int
+fs_stateUnmapFile(struct fs_dump_state * state)
+{
+    int ret = 0;
+
+    if (state->mmap.map == NULL) {
+        goto done;
+    }
+
+    if (munmap(state->mmap.map, state->mmap.size) == -1) {
+        ViceLog(0, ("fs_stateUnmapFile: failed to unmap dump file '%s'\n",
+                    state->fn));
+        ret = 1;
+        goto done;
+    }
+
+    state->mmap.map = NULL;
+    state->mmap.cursor = NULL;
+    state->mmap.size = 0;
+
+ done:
+    return ret;
+}
+#endif /* FS_STATE_USE_MMAP */
+
+#ifdef FS_STATE_USE_MMAP
+/*
+ * synchronously flush the whole mapping back to the dump file.
+ *
+ * callers treat a non-zero return as "failed to sync changes to disk",
+ * so an msync failure is reported instead of being dropped.
+ *
+ * returns 0 on success, 1 if msync fails.
+ */
+int
+fs_stateSync(struct fs_dump_state * state)
+{
+    int ret = 0;
+
+    if (msync(state->mmap.map, state->mmap.size, MS_SYNC) == -1) {
+        ViceLog(0, ("fs_stateSync: msync failed for dump file '%s'\n",
+                    state->fn));
+        ret = 1;
+    }
+
+    return ret;
+}
+#else /* !FS_STATE_USE_MMAP */
+int
+fs_stateSync(struct fs_dump_state * state)
+{
+    int ret = 0;
+
+    if (fsync(state->fd) == -1)
+        ret = 1;
+
+ done:
+    return ret;
+}
+#endif /* !FS_STATE_USE_MMAP */
+
+/* advance eof_offset by len bytes; always returns 0 */
+int
+fs_stateIncEOF(struct fs_dump_state * state, afs_int32 len)
+{
+    afs_uint64 temp;
+    FillInt64(temp, 0, len);
+    AddUInt64(state->eof_offset, temp, &state->eof_offset);
+    return 0;
+}
+
+#ifdef FS_STATE_USE_MMAP
+/* advance both mmap.offset and mmap.cursor by len bytes; always returns 0 */
+static int
+fs_stateIncCursor(struct fs_dump_state * state, size_t len)
+{
+    char * p;
+
+    state->mmap.offset += len;
+
+    p = (char *) state->mmap.cursor;
+    p += len;
+    state->mmap.cursor = (void *) p;
+
+    return 0;
+}
+
+/* returns 1 if an access of len bytes at mmap.offset would run past mmap.size, else 0 */
+static int
+fs_stateCheckIOSafety(struct fs_dump_state * state, size_t len)
+{
+    int ret = 0;
+
+    if ((state->mmap.offset + len) > state->mmap.size) {
+        ret = 1;
+    }
+    return ret;
+}
+#endif /* FS_STATE_USE_MMAP */
+
+#ifdef FS_STATE_USE_MMAP
+int
+fs_stateSeek(struct fs_dump_state * state, afs_uint64 * offset)
+{
+    int ret = 0;
+    char * p;
+    afs_uint32 hi, lo;
+
+    SplitInt64(*offset, hi, lo);
+
+    /* update cursor */
+    p = (char *) state->mmap.map;
+#ifdef AFS_64BIT_ENV
+    p += *offset;
+#else
+    p += lo;
+#endif
+    state->mmap.cursor = (void *) p;
+
+    /* update offset */
+#ifdef AFS_LARGEFILE_ENV
+    state->mmap.offset = *offset;
+#else
+    if 
(hi)
+        ret = 1;
+    state->mmap.offset = lo;
+#endif
+
+    return ret;
+}
+#else /* !FS_STATE_USE_MMAP */
+/*
+ * standard I/O variant: position the dump file descriptor at *offset.
+ *
+ * without AFS_LARGEFILE_ENV an offset whose high 32 bits are set is
+ * rejected and no seek is attempted.
+ *
+ * returns 0 on success, 1 on a rejected offset or a failed seek.
+ */
+int
+fs_stateSeek(struct fs_dump_state * state, afs_uint64 * offset)
+{
+    int ret = 0;
+#ifndef AFS_LARGEFILE_ENV
+    afs_uint32 high, low;
+
+    SplitInt64(*offset, high, low);
+    if (high) {
+        /* offset does not fit in 32 bits */
+        ret = 1;
+    } else if (afs_lseek(state->fd, low, SEEK_SET) == -1) {
+        ret = 1;
+    }
+#else
+    if (afs_lseek(state->fd, *offset, SEEK_SET) == -1)
+        ret = 1;
+#endif
+    return ret;
+}
+#endif /* !FS_STATE_USE_MMAP */
+
+/*
+ * populate the top-level dump header: magic and version stamp, sys name
+ * id, save timestamp, server UUID, valid flag, endianness and
+ * stats_detailed flags, and the server version string (a warning is
+ * logged if the string had to be truncated).  always returns 0.
+ */
+static int
+fs_stateFillHeader(struct fs_state_header * hdr)
+{
+    hdr->stamp.magic = FS_STATE_MAGIC;
+    hdr->stamp.version = FS_STATE_VERSION;
+#ifdef SYS_NAME_ID
+    hdr->sys_name = SYS_NAME_ID;
+#else
+    hdr->sys_name = 0xFFFFFFFF;
+#endif
+    hdr->timestamp = FT_ApproxTime();
+    hdr->server_uuid = FS_HostUUID;
+    hdr->valid = 1;
+#ifdef AFSBIG_ENDIAN
+    hdr->endianness = 1;
+#else
+    hdr->endianness = 0;
+#endif
+#ifdef FS_STATS_DETAILED
+    hdr->stats_detailed = 1;
+#else
+    hdr->stats_detailed = 0;
+#endif
+    if (strlcpy(hdr->server_version_string, cml_version_number, sizeof(hdr->server_version_string))
+        >= sizeof(hdr->server_version_string)) {
+        ViceLog(0, ("fs_stateFillHeader: WARNING -- cml_version_number field truncated\n"));
+    }
+    return 0;
+}
+
+static int
+fs_stateCheckHeader(struct fs_state_header * hdr)
+{
+    int ret = 0;
+
+    if (!hdr->valid) {
+        ViceLog(0, ("fs_stateCheckHeader: dump was previously flagged invalid\n"));
+        ret = 1;
+    }
+#ifdef AFSBIG_ENDIAN
+    else if (!hdr->endianness) {
+        ViceLog(0, ("fs_stateCheckHeader: wrong endianness\n"));
+        ret = 1;
+    }
+#else /* AFSLITTLE_ENDIAN */
+    else if (hdr->endianness) {
+        ViceLog(0, ("fs_stateCheckHeader: wrong endianness\n"));
+        ret = 1;
+    }
+#endif /* AFSLITTLE_ENDIAN */
+
+    else if (hdr->stamp.magic != FS_STATE_MAGIC) {
+        ViceLog(0, ("fs_stateCheckHeader: invalid dump header\n"));
+        ret = 1;
+    }
+    else if (hdr->stamp.version != FS_STATE_VERSION) {
+        ViceLog(0, ("fs_stateCheckHeader: unknown dump format version 
number\n")); + ret = 1; + } + +#ifdef FS_STATS_DETAILED + else if (!hdr->stats_detailed) { + ViceLog(0, ("fs_stateCheckHeader: wrong config flags\n")); + ret = 1; + } +#else /* FS_STATS_DETAILED */ + else if (hdr->stats_detailed) { + ViceLog(0, ("fs_stateCheckHeader: wrong config flags\n")); + ret = 1; + } +#endif /* FS_STATS_DETAILED */ + + else if (!afs_uuid_equal(&hdr->server_uuid, &FS_HostUUID)) { + ViceLog(0, ("fs_stateCheckHeader: server UUID does not match this server's UUID\n")); + ret = 1; + } + + /* the cml_version_string is included for informational purposes only. If someone ever + * wants to limit state dump reloading based upon the contents of this string, just + * uncomment the following code. uncommenting this code is _strongly discouraged_ because + * we already make use of the version stamps in the various dump headers to deal with + * data structure version incompatabilities. + else if (strncmp(hdr->server_version_string, cml_version_number, + sizeof(hdr->server_version_string)) != 0) { + ViceLog(0, ("fs_stateCheckHeader: dump from different server version\n")); + ret = 1; + } + */ + + else if (strncmp(hdr->server_version_string, cml_version_number, + sizeof(hdr->server_version_string)) != 0) { + ViceLog(0, ("fs_stateCheckHeader: dump from different server version ; attempting state reload anyway\n")); + } + + + return ret; +} + +static int +fs_stateAlloc(struct fs_dump_state * state) +{ + int ret = 0; + memset(state, 0, sizeof(struct fs_dump_state)); + state->fd = -1; + state->fn = AFSDIR_SERVER_FSSTATE_FILEPATH; + state->hdr = (struct fs_state_header *)malloc(sizeof(struct fs_state_header)); + state->h_hdr = (struct host_state_header *)malloc(sizeof(struct host_state_header)); + state->cb_hdr = (struct callback_state_header *)malloc(sizeof(struct callback_state_header)); + state->cb_timeout_hdr = (struct callback_state_timeout_header *) + malloc(sizeof(struct callback_state_timeout_header)); + state->cb_fehash_hdr = (struct 
callback_state_fehash_header *) + malloc(sizeof(struct callback_state_fehash_header)); + if ((state->hdr == NULL) || (state->h_hdr == NULL) || (state->cb_hdr == NULL) || + (state->cb_timeout_hdr == NULL) || (state->cb_fehash_hdr == NULL)) + ret = 1; + return ret; +} + +static int +fs_stateFree(struct fs_dump_state * state) +{ + if (state->hdr) + free(state->hdr); + if (state->h_hdr) + free(state->h_hdr); + if (state->cb_hdr) + free(state->cb_hdr); + if (state->cb_timeout_hdr) + free(state->cb_timeout_hdr); + if (state->cb_fehash_hdr) + free(state->cb_fehash_hdr); + if (state->h_map.entries) + free(state->h_map.entries); + if (state->fe_map.entries) + free(state->fe_map.entries); + if (state->cb_map.entries) + free(state->cb_map.entries); + return 0; +} + +#endif /* AFS_DEMAND_ATTACH_FS */ diff --git a/src/tviced/serialize_state.h b/src/tviced/serialize_state.h new file mode 100644 index 0000000000..c1a08c08ca --- /dev/null +++ b/src/tviced/serialize_state.h @@ -0,0 +1,311 @@ +/* + * Copyright 2006, Sine Nomine Associates and others. + * All Rights Reserved. + * + * This software has been released under the terms of the IBM Public + * License. 
For details, see the LICENSE file in the top-level source + * directory or online at http://www.openafs.org/dl/license10.html + */ + +/* + * demand attach fs + * fileserver state serialization + */ + +#ifndef _AFS_TVICED_SERIALIZE_STATE_H +#define _AFS_TVICED_SERIALIZE_STATE_H + +#ifdef AFS_DEMAND_ATTACH_FS + +#define FS_STATE_MAGIC 0x62FA841C +#define FS_STATE_VERSION 2 + +#define HOST_STATE_MAGIC 0x7B8C9DAE +#define HOST_STATE_VERSION 2 + +#define HOST_STATE_ENTRY_MAGIC 0xA8B9CADB + +#define CALLBACK_STATE_MAGIC 0x89DE67BC +#define CALLBACK_STATE_VERSION 1 + +#define CALLBACK_STATE_TIMEOUT_MAGIC 0x99DD5511 +#define CALLBACK_STATE_FEHASH_MAGIC 0x77BB33FF +#define CALLBACK_STATE_ENTRY_MAGIC 0x54637281 + +#define ACTIVE_VOLUME_STATE_MAGIC 0xAC7557CA +#define ACTIVE_VOLUME_STATE_VERSION 1 + +#define ACTIVE_VOLUME_STATE_AVEHASH_MAGIC 0xBADDF00D + +#define HOST_STATE_VALID_WINDOW 1800 /* 30 minutes */ + +/* + * on-disk structures + */ +struct disk_version_stamp { + afs_uint32 magic; + afs_uint32 version; +}; + +/* 1024 byte header structure */ +struct fs_state_header { + struct disk_version_stamp stamp; /* version stamp */ + afs_uint32 timestamp; /* timestamp of save */ + afs_uint32 sys_name; /* sys name id for this machine */ + afsUUID server_uuid; /* server's UUID */ + byte valid; /* whether header contents are valid */ + byte endianness; /* endianness sanity check (0 for LE, 1 for BE) */ + byte stats_detailed; /* fs stats detailed sanity check */ + byte padding1[1]; /* padding */ + afs_uint32 reserved1[23]; /* for expansion */ + afs_uint64 avol_offset; /* offset of active volumes structure */ + afs_uint64 h_offset; /* offset of host_state_header structure */ + afs_uint64 cb_offset; /* offset of callback_state_header structure */ + afs_uint64 vlru_offset; /* offset of vlru state structure */ + afs_uint32 reserved2[56]; /* for expansion */ + char server_version_string[128]; /* version string from AFS_component_version_number.c */ + afs_uint32 reserved3[128]; /* for 
expansion */ +}; + +/* + * host package serialization + */ + +/* 256 byte header for the host state data */ +struct host_state_header { + struct disk_version_stamp stamp; /* host state version stamp */ + afs_uint32 records; /* number of stored host records */ + afs_uint32 index_max; /* max index value encountered */ + afs_uint32 reserved[60]; /* for expansion */ +}; + +/* 32 byte host entry header */ +struct host_state_entry_header { + afs_uint32 magic; /* stamp */ + afs_uint32 len; /* number of bytes in this record */ + afs_uint32 interfaces; /* number of interfaces included in record */ + afs_uint32 hcps; /* number of hcps entries in record */ + afs_uint32 reserved[4]; +}; + +/* 36 byte host entry structure */ +struct hostDiskEntry { + afs_uint32 host; /* IP address of host interface that is + * currently being used, in network + * byte order */ + afs_uint16 port; /* port address of host */ + afs_uint16 hostFlags; /* bit map */ + byte Console; /* XXXX This host is a console */ + byte hcpsfailed; /* Retry the cps call next time */ + byte hcps_valid; /* prlist_val not null */ +#if FS_STATS_DETAILED + byte InSameNetwork; /*Is host's addr in the same network as + * the File Server's? 
*/ +#else + byte padding1[1]; /* for padding */ +#endif /* FS_STATS_DETAILED */ + afs_uint32 hcps_len; /* length of hcps */ + afs_uint32 LastCall; /* time of last call from host */ + afs_uint32 ActiveCall; /* time of any call but gettime */ + afs_uint32 cpsCall; /* time of last cps call from this host */ + afs_uint32 cblist; /* Call back list for this host */ + afs_uint32 index; /* index for correlating w/ callback dumps */ +}; + +/* + * callback package serialization + */ + +/* 512 byte header */ +struct callback_state_header { + struct disk_version_stamp stamp; /* callback state version stamp */ + afs_uint32 nFEs; /* number of FileEntry records */ + afs_uint32 nCBs; /* number of CallBack records */ + afs_uint32 fe_max; /* max FileEntry index */ + afs_uint32 cb_max; /* max CallBack index */ + afs_int32 tfirst; /* first valid timeout */ + afs_uint32 reserved[115]; /* for expansion */ + afs_uint64 timeout_offset; /* offset of timeout queue heads */ + afs_uint64 fehash_offset; /* offset of file entry hash buckets */ + afs_uint64 fe_offset; /* offset of first file entry */ +}; + +/* 32 byte header */ +struct callback_state_timeout_header { + afs_uint32 magic; /* magic number for timeout header */ + afs_uint32 len; /* total length of header and timeout records */ + afs_uint32 records; /* number of timeout records */ + afs_uint32 reserved[5]; +}; + +/* 32 byte header */ +struct callback_state_fehash_header { + afs_uint32 magic; /* magic number for fehash header */ + afs_uint32 len; /* total length of header and fehash bucket heads */ + afs_uint32 records; /* number of hash buckets */ + afs_uint32 reserved[5]; +}; + +/* 32 byte header */ +struct callback_state_entry_header { + afs_uint32 magic; /* magic number for FE entry */ + afs_uint32 len; /* number of bytes in this record */ + afs_uint32 nCBs; /* number of callbacks for this FE */ + afs_uint32 reserved[5]; +}; + +struct FEDiskEntry { + struct FileEntry fe; + afs_uint32 index; +}; + +struct CBDiskEntry { + struct 
CallBack cb;
+    afs_uint32 index;
+};
+
+/*
+ * active volumes state serialization
+ *
+ * these structures are meant to support
+ * automated salvaging of active volumes
+ * in the event of a fileserver crash
+ */
+
+/* 512 byte header */
+struct active_volume_state_header {
+    struct disk_version_stamp stamp;    /* active volume state version stamp */
+    afs_uint32 nAVEs;                   /* number of ActiveVolumeEntry records */
+    afs_uint32 init_timestamp;          /* timestamp of AVE initialization */
+    afs_uint32 update_timetamp;         /* timestamp of last AVE update; NOTE(review): the field name is misspelled ("timetamp"), renaming it would touch every user */
+    afs_uint32 reserved[119];           /* for expansion */
+    afs_uint64 avehash_offset;          /* offset of active volume entry hash buckets */
+    afs_uint64 ave_offset;              /* offset of first active volume entry */
+};
+
+/* 32 byte header */
+struct active_volume_state_avehash_header {
+    afs_uint32 magic;                   /* magic number for avehash header */
+    afs_uint32 len;                     /* total length of header and avehash bucket heads */
+    afs_uint32 records;                 /* number of hash buckets */
+    afs_uint32 reserved[5];
+};
+
+typedef afs_uint32 active_volume_state_avehash_entry;
+
+/* active volume entry */
+struct AVDiskEntry {
+    afs_uint32 volume;
+    afs_uint32 partition;
+    afs_uint32 hash_next;               /* presumably links to the next entry in the same hash bucket -- TODO confirm */
+};
+
+
+/*
+ * dump runtime state
+ */
+struct idx_map_entry_t {
+    afs_uint32 old_idx;                 /* index from last runtime (this type backs the host, FE and CB maps in fs_dump_state) */
+    afs_uint32 new_idx;                 /* corresponding index for this runtime */
+};
+
+
+/* verification process sanity check constants
+ *
+ * make them fairly large so we don't get
+ * false positives
+ */
+#define FS_STATE_H_MAX_UUID_HASH_CHAIN_LEN     100000     /* max elements in a host uuid-hash chain */
+#define FS_STATE_H_MAX_ADDR_HASH_CHAIN_LEN    2000000     /* max elements in a host ipv4-hash chain */
+#define FS_STATE_FE_MAX_HASH_CHAIN_LEN         100000     /* max elements in a FE fid-hash chain */
+#define FS_STATE_FCB_MAX_LIST_LEN              100000     /* max elements in a per-FE CB list */
+#define FS_STATE_HCB_MAX_LIST_LEN              100000     /* max elements in a per-host CB list */
+#define FS_STATE_TCB_MAX_LIST_LEN              100000     /* max 
elements in a per-timeout CB list */ + + +/* + * main state serialization state structure + */ + +struct fs_dump_state { + enum { + FS_STATE_DUMP_MODE, + FS_STATE_LOAD_MODE + } mode; + struct { + byte do_host_restore; /* whether host restore should be done */ + byte some_steps_skipped; /* whether some steps were skipped */ + byte warnings_generated; /* whether any warnings were generated during restore */ + } flags; + afs_fsize_t file_len; + int fd; /* fd of the current dump file */ + int bail; /* non-zero if something went wrong */ + char * fn; /* name of the current dump file */ + struct { /* memory map of dump file */ + void * map; + void * cursor; + afs_foff_t offset; + afs_fsize_t size; + } mmap; + struct fs_state_header * hdr; /* main header */ + struct host_state_header * h_hdr; /* header for host state data */ + struct callback_state_header * cb_hdr; /* header for callback state data */ + struct callback_state_timeout_header * cb_timeout_hdr; + struct callback_state_fehash_header * cb_fehash_hdr; + afs_uint64 eof_offset; /* current end of file offset */ + struct { + int len; /* number of host entries in map */ + struct idx_map_entry_t * entries; + } h_map; + struct { + int len; + struct idx_map_entry_t * entries; + } fe_map; + struct { + int len; + struct idx_map_entry_t * entries; + } cb_map; +}; + + +/* prototypes */ + +/* serialize_state.c */ +extern int fs_stateWrite(struct fs_dump_state * state, + void * buf, size_t len); +extern int fs_stateRead(struct fs_dump_state * state, + void * buf, size_t len); +extern int fs_stateWriteV(struct fs_dump_state * state, + struct iovec * iov, int niov); +extern int fs_stateReadV(struct fs_dump_state * state, + struct iovec * iov, int niov); +extern int fs_stateSync(struct fs_dump_state * state); +extern int fs_stateWriteHeader(struct fs_dump_state * state, + afs_uint64 * offset, + void * hdr, size_t len); +extern int fs_stateReadHeader(struct fs_dump_state * state, + afs_uint64 * offset, + void * hdr, size_t len); 
+extern int fs_stateIncEOF(struct fs_dump_state * state, + afs_int32 len); +extern int fs_stateSeek(struct fs_dump_state * state, + afs_uint64 * offset); + +/* host.c */ +extern int h_stateSave(struct fs_dump_state * state); +extern int h_stateRestore(struct fs_dump_state * state); +extern int h_stateRestoreIndices(struct fs_dump_state * state); +extern int h_stateVerify(struct fs_dump_state * state); +extern int h_OldToNew(struct fs_dump_state * state, afs_uint32 old, afs_uint32 * new); + +/* callback.c */ +extern int cb_stateSave(struct fs_dump_state * state); +extern int cb_stateRestore(struct fs_dump_state * state); +extern int cb_stateRestoreIndices(struct fs_dump_state * state); +extern int cb_stateVerify(struct fs_dump_state * state); +extern int cb_stateVerifyHCBList(struct fs_dump_state * state, struct host * host); +extern int fe_OldToNew(struct fs_dump_state * state, afs_uint32 old, afs_uint32 * new); +extern int cb_OldToNew(struct fs_dump_state * state, afs_uint32 old, afs_uint32 * new); + +#endif /* AFS_DEMAND_ATTACH_FS */ +#endif /* _AFS_TVICED_SERIALIZE_STATE_H */ diff --git a/src/tviced/state_analyzer.c b/src/tviced/state_analyzer.c new file mode 100644 index 0000000000..ae8c3ff7ad --- /dev/null +++ b/src/tviced/state_analyzer.c @@ -0,0 +1,2004 @@ +/* + * Copyright 2006, Sine Nomine Associates and others. + * All Rights Reserved. + * + * This software has been released under the terms of the IBM Public + * License. 
For details, see the LICENSE file in the top-level source + * directory or online at http://www.openafs.org/dl/license10.html + */ + +/* + * demand attach fs + * fileserver state serialization + * + * state analyzer + */ + +#include +#include + +RCSID + ("$Header$"); + +#include +#include +#include +#include +#include +#include + +#ifdef HAVE_STRING_H +#include +#else +#ifdef HAVE_STRINGS_H +#include +#endif +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef AFS_ATHENA_STDENV +#include +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include "../util/afsutil_prototypes.h" +#include "../viced/viced.h" +#include "../viced/host.h" +#include "../viced/callback.h" +#include "serialize_state.h" +#include +#include +#include +#include +#include + +/*@+fcnmacros +macrofcndecl@*/ +#ifdef O_LARGEFILE +#ifdef S_SPLINT_S +extern off64_t afs_lseek(int FD, off64_t O, int F); +#endif /*S_SPLINT_S */ +#define afs_lseek(FD, O, F) lseek64(FD, (off64_t)(O), F) +#define afs_stat stat64 +#define afs_fstat fstat64 +#define afs_open open64 +#define afs_fopen fopen64 +#define afs_mmap mmap64 +#ifdef AFS_AIX_ENV +extern void * mmap64(); /* ugly hack since aix build env appears to be somewhat broken */ +#endif +#else /* !O_LARGEFILE */ +#ifdef S_SPLINT_S +extern off_t afs_lseek(int FD, off_t O, int F); +#endif /*S_SPLINT_S */ +#define afs_lseek(FD, O, F) lseek(FD, (off_t)(O), F) +#define afs_stat stat +#define afs_fstat fstat +#define afs_open open +#define afs_fopen fopen +#define afs_mmap mmap +#endif /* !O_LARGEFILE */ +/*@=fcnmacros =macrofcndecl@*/ + + +#ifndef AFS_DEMAND_ATTACH_FS +int +main (int argc, char ** argv) +{ + fprintf(stderr, "%s is only supported for demand attach fileservers\n", + argv[0] ? 
argv[0] : "state analyzer"); + return 1; +} +#else /* AFS_DEMAND_ATTACH_FS */ + +static void usage(char * prog); +static int openFile(char * path); +static void initState(void); + +static void banner(void); +static void prompt(void); + +static void print_help(void); +static void print_global_help(void); +static void print_h_help(void); +static void print_fe_help(void); +static void print_cb_help(void); + +static void dump_hdr(void); +static void dump_h_hdr(void); +static void dump_cb_hdr(void); + +static void dump_cb_timeout(void); +static void dump_cb_fehash(void); + +static void dump_all_hes(void); +static void dump_all_fes(void); +static void dump_all_cbs(void); + +static void dump_he(afs_uint32 idx); +static void dump_fe(afs_uint32 idx); +static void dump_cb(afs_uint32 idx); +static void dump_this_he(void); +static void dump_this_fe(void); +static void dump_this_cb(void); +static void dump_next_he(void); +static void dump_next_fe(void); +static void dump_next_cb(void); +static void dump_prev_he(void); +static void dump_prev_fe(void); +static void dump_prev_cb(void); +static void dump_first_he(void); +static void dump_first_fe(void); +static void dump_first_cb(void); +static void dump_last_he(void); +static void dump_last_fe(void); +static void dump_last_cb(void); +static void dump_he_hdr(void); +static void dump_he_entry(void); +static void dump_he_interfaces(void); +static void dump_he_hcps(void); +static void dump_fe_hdr(void); +static void dump_fe_entry(void); +static void dump_cb_entry(void); + +static void hexdump_map(afs_uint32 offset, afs_uint32 len); + +static int get_hdr(void); +static int get_h_hdr(void); +static int get_cb_hdr(void); +static int get_cb_timeout_hdr(void); +static int get_cb_timeout(void); +static int get_cb_fehash_hdr(void); +static int get_cb_fehash(void); +static int get_he(afs_uint32 idx); +static int get_he_hdr(void); +static int get_he_entry(void); +static int get_fe(afs_uint32 idx); +static int get_fe_hdr(void); +static int 
get_fe_entry(void);
+static int get_cb(afs_uint32 idx);
+static int get_cb_entry(void);
+
+static int find_fe_by_index(afs_uint32 idx);
+static int find_cb_by_index(afs_uint32 idx);
+static int find_fe_by_fid(afs_uint32 vol, afs_uint32 vn, afs_uint32 uniq);
+
+
+/* descriptor and memory map of the state dump being analyzed (set up by openFile) */
+static int dump_fd = -1;
+static void * map = NULL;
+static size_t map_len;
+
+/*
+ * global section headers of the dump: struct copies, the location of each
+ * one inside the memory map, and a "valid" flag per header
+ */
+static struct {
+    struct fs_state_header hdr;
+    struct host_state_header h_hdr;
+    struct callback_state_header cb_hdr;
+    struct callback_state_timeout_header timeout_hdr;
+    struct callback_state_fehash_header fehash_hdr;
+    afs_uint32 * timeout;
+    afs_uint32 * fehash;
+
+    /* pointers into the memory map */
+    void * hdr_p;
+    void * h_hdr_p;
+    void * cb_hdr_p;
+    void * timeout_hdr_p;
+    void * timeout_p;
+    void * fehash_hdr_p;
+    void * fehash_p;
+
+    byte hdr_valid;
+    byte h_hdr_valid;
+    byte cb_hdr_valid;
+    byte timeout_hdr_valid;
+    byte fehash_hdr_valid;
+} hdrs;
+
+/* host entry currently selected in the "h" menu */
+static struct {
+    void * fh;
+    void * cursor;
+    void * ifp;
+    void * hcps;
+    struct host_state_entry_header hdr;
+    struct hostDiskEntry he;
+    afs_uint32 idx;
+    byte hdr_valid;
+    byte he_valid;
+} he_cursor;
+
+static struct {
+    void ** cursor;
+} he_cache;
+
+/* FileEntry currently selected in the "fe" menu */
+static struct {
+    void * ffe;
+    void * cursor;
+    void * fcb;
+    struct callback_state_entry_header hdr;
+    struct FEDiskEntry fe;
+    afs_uint32 idx;
+    byte hdr_valid;
+    byte fe_valid;
+} fe_cursor;
+
+static struct {
+    void ** cursor;
+} fe_cache;
+
+/* CallBack currently selected in the "cb" menu */
+static struct {
+    void * cursor;
+    struct CBDiskEntry cb;
+    afs_uint32 idx;
+    byte cb_valid;
+} cb_cursor;
+
+static struct {
+    void ** cursor;
+} cb_cache;
+
+/*
+ * print the command line synopsis to stderr
+ *
+ * prog -- program name to show (argv[0])
+ */
+static void
+usage(char * prog)
+{
+    /* the %s conversion previously had no matching argument */
+    fprintf(stderr, "usage: %s [<state dump file>]\n", prog);
+}
+
+/*
+ * entry point: print the banner, open either the dump file named on the
+ * command line or AFSDIR_SERVER_FSSTATE_FILEPATH, then run the
+ * interactive prompt
+ *
+ * returns 0 when the prompt exits, 1 on a usage error or if the dump
+ * file could not be opened and mapped
+ */
+int
+main(int argc, char ** argv)
+{
+    banner();
+
+    if (argc > 2 || (argc == 2 && !strcmp(argv[1], "-h"))) {
+        usage(argv[0]);
+        return 1;
+    }
+
+    initState();
+
+    if (argc > 1) {
+        if (openFile(argv[1]))
+            return 1;
+    } else {
+        if (openFile(AFSDIR_SERVER_FSSTATE_FILEPATH))
+            return 1;
+    }
+
+    prompt();
+    return 0;
+}
+
+
+static int 
+openFile(char * path) +{ + int ret = 0; + struct afs_stat status; + + dump_fd = afs_open(path, O_RDWR); + if (dump_fd == -1) { + fprintf(stderr, "dump file '%s' failed to open\n", path); + ret = 1; + goto done; + } + + printf("opened dump file '%s'\n", path); + + if (afs_fstat(dump_fd, &status) == -1) { + fprintf(stderr, "failed to stat file\n"); + ret = 1; + goto done; + } + + map_len = status.st_size; + + map = afs_mmap(NULL, map_len, PROT_READ, MAP_SHARED, dump_fd, 0); + if (map == MAP_FAILED) { + fprintf(stderr, "failed to mmap file\n"); + ret = 1; + goto done; + } + + printf("mapped %d bytes at 0x%x\n", map_len, map); + + done: + if (ret) { + if (map) { + munmap(map, map_len); + map = NULL; + } + if (dump_fd != -1) { + close(dump_fd); + dump_fd = -1; + } + } + return ret; +} + +static void +initState(void) +{ + hdrs.hdr_valid = hdrs.h_hdr_valid = hdrs.cb_hdr_valid = 0; + he_cursor.cursor = fe_cursor.cursor = cb_cursor.cursor = NULL; + he_cursor.fh = fe_cursor.ffe = fe_cursor.fcb = NULL; + he_cache.cursor = fe_cache.cursor = NULL; +} + +static void +banner(void) +{ + fprintf(stderr, "demand attach fs\n"); + fprintf(stderr, "fileserver state analyzer\n"); + fprintf(stderr, "version 0.1\n"); +} + +#define PROGNAME "fs state analyzer" + +static void +prompt(void) +{ + char input[256]; + char prev_input[256]; + char * tok = NULL; + afs_uint32 x, y, z; + enum { + PR_GLOBAL_MODE, + PR_H_MODE, + PR_FE_MODE, + PR_CB_MODE + } mode = PR_GLOBAL_MODE, next_mode; + + next_mode = mode; + input[0] = prev_input[0] = '\0'; + + while (1) { + if (!tok) { + switch(mode) { + case PR_GLOBAL_MODE: + printf(PROGNAME "> "); + break; + case PR_H_MODE: + printf(PROGNAME ": h(%d)> ", he_cursor.idx); + break; + case PR_FE_MODE: + printf(PROGNAME ": fe(%d)> ", fe_cursor.idx); + break; + case PR_CB_MODE: + printf(PROGNAME ": fe(%d):cb(%d)> ", fe_cursor.idx, cb_cursor.idx); + break; + default: + fprintf(stderr, "prompt state broken; aborting\n"); + return; + } + gets(input); + + if 
(!strcmp(input, "")) {
+                /* repeat last command */
+                if (!strcmp(prev_input, "")) {
+                    continue;
+                }
+                strlcpy(input, prev_input, sizeof(input));
+            } else {
+                /* save command for repetition */
+                strlcpy(prev_input, input, sizeof(prev_input));
+            }
+
+            tok = strtok(input, " \t");
+        }
+        /* skip over command separators */
+        while (tok && !strcmp(tok, ";")) {
+            tok = strtok(NULL, "; \t");
+        }
+
+        if (!tok) {
+            continue;
+        }
+
+        if (!strcasecmp(tok, "exit")) {
+            return;
+        } else if (!strcasecmp(tok, "quit")) {
+            /* leave the current menu; from the global menu this exits */
+            switch(mode) {
+            case PR_CB_MODE:
+                next_mode = PR_FE_MODE;
+                break;
+            case PR_FE_MODE:
+            case PR_H_MODE:
+                next_mode = PR_GLOBAL_MODE;
+                break;
+            default:
+                return;
+            }
+        } else if (!strcasecmp(tok, "h")) {
+            /* a menu name followed by a command applies to that command
+             * only; a bare menu name switches menus for good */
+            tok = strtok(NULL, " \t");
+            mode = PR_H_MODE;
+            if (!tok) {
+                next_mode = mode;
+            }
+            continue;
+        } else if (!strcasecmp(tok, "fe")) {
+            tok = strtok(NULL, " \t");
+            mode = PR_FE_MODE;
+            if (!tok) {
+                next_mode = mode;
+            }
+            continue;
+        } else if (!strcasecmp(tok, "fs")) {
+            tok = strtok(NULL, " \t");
+            mode = PR_GLOBAL_MODE;
+            if (!tok) {
+                next_mode = mode;
+            }
+            continue;
+        } else if (!strcasecmp(tok, "cb")) {
+            tok = strtok(NULL, " \t");
+            mode = PR_CB_MODE;
+            if (!tok) {
+                next_mode = mode;
+            }
+            continue;
+        } else if (!strcasecmp(tok, "help")) {
+            switch(mode) {
+            case PR_H_MODE:
+                print_h_help();
+                break;
+            case PR_FE_MODE:
+                print_fe_help();
+                break;
+            case PR_CB_MODE:
+                print_cb_help();
+                break;
+            default:
+                print_global_help();
+            }
+            print_help();
+        } else if (!strcasecmp(tok, "hexdump")) {
+            /* hexdump [<offset> [<len>]] */
+            tok = strtok(NULL, " \t");
+            if (!tok) {
+                hexdump_map(0, map_len);
+                continue;
+            }
+            if (sscanf(tok, "%u", &x) != 1) {
+                fprintf(stderr, "hexdump parse error 1\n");
+                tok = NULL;
+                continue;
+            }
+            tok = strtok(NULL, " \t");
+            if (!tok) {
+                /* map_len - x is unsigned and would wrap for an offset
+                 * past the end of the dump */
+                if (x > map_len) {
+                    fprintf(stderr, "hexdump offset is beyond the end of the dump\n");
+                    continue;
+                }
+                hexdump_map(x, map_len - x);
+                continue;
+            }
+            if (sscanf(tok, "%u", &y) != 1) {
+                fprintf(stderr, "hexdump parse error 2\n");
+                /* discard the rest of the line, as parse error 1 does */
+                tok = NULL;
+                continue;
+            }
+            hexdump_map(x,y);
+        } else if (!strcasecmp(tok, "hdr")) {
+            switch(mode) {
+            case PR_H_MODE:
+                dump_h_hdr();
+                
break; + case PR_FE_MODE: + dump_cb_hdr(); + break; + case PR_CB_MODE: + dump_this_fe(); + break; + default: + dump_hdr(); + } + } else if (!strcasecmp(tok, "this")) { + switch(mode) { + case PR_H_MODE: + dump_this_he(); + break; + case PR_FE_MODE: + dump_this_fe(); + break; + case PR_CB_MODE: + dump_this_cb(); + break; + default: + fprintf(stderr, "command not valid for this mode\n"); + } + } else if (!strcasecmp(tok, "next")) { + switch(mode) { + case PR_H_MODE: + dump_next_he(); + break; + case PR_FE_MODE: + dump_next_fe(); + break; + case PR_CB_MODE: + dump_next_cb(); + break; + default: + fprintf(stderr, "command not valid for this mode\n"); + } + } else if (!strcasecmp(tok, "prev")) { + switch(mode) { + case PR_H_MODE: + dump_prev_he(); + break; + case PR_FE_MODE: + dump_prev_fe(); + break; + case PR_CB_MODE: + dump_prev_cb(); + break; + default: + fprintf(stderr, "command not valid for this mode\n"); + } + } else if (!strcasecmp(tok, "first")) { + switch(mode) { + case PR_H_MODE: + dump_first_he(); + break; + case PR_FE_MODE: + dump_first_fe(); + break; + case PR_CB_MODE: + dump_first_cb(); + break; + default: + fprintf(stderr, "command not valid for this mode\n"); + } + } else if (!strcasecmp(tok, "last")) { + switch(mode) { + case PR_H_MODE: + dump_last_he(); + break; + case PR_FE_MODE: + dump_last_fe(); + break; + case PR_CB_MODE: + dump_last_cb(); + break; + default: + fprintf(stderr, "command not valid for this mode\n"); + } + } else if (!strcasecmp(tok, "dump")) { + switch(mode) { + case PR_H_MODE: + dump_all_hes(); + break; + case PR_FE_MODE: + dump_all_fes(); + break; + case PR_CB_MODE: + dump_all_cbs(); + break; + default: + fprintf(stderr, "command not valid for this mode\n"); + } + } else if (!strcasecmp(tok, "find")) { + tok = strtok(NULL, " \t"); + if (!tok || strcasecmp(tok, "by")) { + tok = NULL; + fprintf(stderr, "find syntax error 1 (%s)\n", + (tok) ? 
tok : "nil"); + continue; + } + tok = strtok(NULL, " \t"); + if (!tok) { + fprintf(stderr, "find syntax error 2\n"); + continue; + } + switch(mode) { + case PR_H_MODE: + fprintf(stderr, "not implemented yet\n"); + break; + case PR_FE_MODE: + if (!strcasecmp(tok, "index")) { + tok = strtok(NULL, " \t"); + if (!tok || sscanf(tok, "%u", &x) != 1) { + tok = NULL; + fprintf(stderr, "find syntax error 3\n"); + continue; + } + if (find_fe_by_index(x)) { + fprintf(stderr, "find returned no results\n"); + } + } else if (!strcasecmp(tok, "fid")) { + tok = strtok(NULL, "(), \t"); + if (!tok || sscanf(tok, "%u", &x) != 1) { + tok = NULL; + fprintf(stderr, "find syntax error 4\n"); + continue; + } + tok = strtok(NULL, "(), \t"); + if (!tok || sscanf(tok, "%u", &y) != 1) { + tok = NULL; + fprintf(stderr, "find syntax error 5\n"); + continue; + } + tok = strtok(NULL, "(), \t"); + if (!tok || sscanf(tok, "%u", &z) != 1) { + tok = NULL; + fprintf(stderr, "find syntax error 6\n"); + continue; + } + if (find_fe_by_fid(x,y,z)) { + fprintf(stderr, "find returned no results\n"); + } + } else { + fprintf(stderr, "unsupported filter type\n"); + } + break; + case PR_CB_MODE: + if (!strcasecmp(tok, "index")) { + tok = strtok(NULL, " \t"); + if (!tok || sscanf(tok, "%u", &x) != 1) { + tok = NULL; + fprintf(stderr, "find syntax error 3\n"); + continue; + } + if (find_cb_by_index(x)) { + fprintf(stderr, "find returned no results\n"); + } + } else { + fprintf(stderr, "unsupported filter type\n"); + } + break; + default: + fprintf(stderr, "find not supported for this menu\n"); + } + } else if (!strcspn(tok, "0123456789")) { + if (sscanf(tok, "%u", &x) == 1) { + switch(mode) { + case PR_H_MODE: + dump_he(x); + break; + case PR_FE_MODE: + dump_fe(x); + break; + case PR_CB_MODE: + dump_cb(x); + break; + default: + fprintf(stderr, "command not available from this menu\n"); + } + } else { + fprintf(stderr, "input parse error ('%s')\n", tok); + } + } else if (mode == PR_FE_MODE) { + if (!strcmp(tok, 
"timeout")) { + dump_cb_timeout(); + } else if (!strcmp(tok, "hash")) { + dump_cb_fehash(); + } + } else { + fprintf(stderr, "unknown command\n"); + } + tok = strtok(NULL, " \t"); + mode = next_mode; + } +} + +static void +print_help(void) +{ + printf("\th <...> -- host menu commands\n"); + printf("\tfe <...> -- FileEntry menu commands\n"); + printf("\tcb <...> -- CallBack menu commands\n"); + printf("\thexdump [ []]\n\t\t -- hex dump the raw data\n"); + printf("\tquit -- quit this menu\n"); + printf("\texit -- exit the debugger\n"); + printf("\thelp -- this help message\n"); +} + +static void +print_global_help(void) +{ + printf("\thdr -- display the fs_state_header struct\n"); +} + +static void +print_h_help(void) +{ + printf("\thdr -- display the host_state_header struct\n"); + printf("\tfirst -- display the first host\n"); + printf("\tprev -- display the previous host\n"); + printf("\tthis -- display this host\n"); + printf("\tnext -- display the next host\n"); + printf("\tlast -- display the last host\n"); + printf("\tdump -- display all hosts\n"); +} + +static void +print_fe_help(void) +{ + printf("\thdr -- display the callback_state_header struct\n"); + printf("\tfirst -- display the first FE\n"); + printf("\tprev -- display the previous FE\n"); + printf("\tthis -- display this FE\n"); + printf("\tnext -- display the next FE\n"); + printf("\tlast -- display the last FE\n"); + printf("\tdump -- display all FEs\n"); + printf("\ttimeout -- display the timeout queue heads\n"); + printf("\thash -- display the file entry hash buckets\n"); + printf("\tfind by index \n\t\t -- find an fe by its array index\n"); + printf("\tfind by fid <(vol,vnode,unique)>\n\t\t -- find an fe by its AFSFid\n"); +} + +static void +print_cb_help(void) +{ + printf("\thdr -- display the callback_state_entry_header struct\n"); + printf("\tfirst -- display the first CB\n"); + printf("\tprev -- display the previous CB\n"); + printf("\tthis -- display this CB\n"); + printf("\tnext -- display 
the next CB\n"); + printf("\tlast -- display the last CB\n"); + printf("\tdump -- display all CBs\n"); +} + +#define DPFTB0 "\t" +#define DPFTB1 "\t\t" +#define DPFTB2 "\t\t\t" + +#define DPFOFF(addr) \ + do { \ + char * _p = (char *)addr; \ + char * _m = (char *)map; \ + printf("loading structure from address 0x%x (offset %u)\n", \ + addr, _p-_m); \ + } while (0) + +/* structs */ +#define DPFSO(T, name) printf(T "%s = {\n", name) +#define DPFSO0(name) DPFSO(DPFTB0, name) +#define DPFSO1(name) DPFSO(DPFTB1, name) +#define DPFSC(T) printf(T "}\n") +#define DPFSC0 DPFSC(DPFTB0) +#define DPFSC1 DPFSC(DPFTB1) + +/* arrays */ +#define DPFAO(T1, T2, name) printf(T1 "%s =\n" T2 "{ ", name) +#define DPFAO0(name) DPFAO(DPFTB0, DPFTB1, name) +#define DPFAO1(name) DPFAO(DPFTB1, DPFTB2, name) +#define DPFAC0 printf(" }\n") +#define DPFAC1 DPFAC0 +#define DPFA1 printf(DPFTB1 " ") +#define DPFA2 printf(DPFTB2 " ") +#define DPFAN printf("\n") +#define DPFALE(type, var) printf("%" type, var) +#define DPFAE(type, var) printf("%" type ",\t", var) + +/* normal vars */ +#define DPFV(T, name, type, var) printf(T "%s = %" type "\n", name, var) +#define DPFV1(name, type, var) DPFV(DPFTB1, name, type, var) +#define DPFV2(name, type, var) DPFV(DPFTB2, name, type, var) + +/* hex */ +#define DPFX(T, name, var) printf(T "%s = 0x%x\n", name, var) +#define DPFX1(name, var) DPFX(DPFTB1, name, var) +#define DPFX2(name, var) DPFX(DPFTB2, name, var) + +/* strings */ +#define DPFS(T, name, var) printf(T "%s = \"%s\"\n", name, var) +#define DPFS1(name, var) DPFS(DPFTB1, name, var) +#define DPFS2(name, var) DPFS(DPFTB2, name, var) + +/* time */ +#define DPFT(T, name, var) \ + do { \ + char * last; \ + printf(T "%s = \"%s\"\n", name, strtok_r(ctime(&(var)), "\r\n", &last)); \ + } while(0) +#define DPFT1(name, var) DPFT(DPFTB1, name, var) +#define DPFT2(name, var) DPFT(DPFTB2, name, var) + +static void +dump_hdr(void) +{ + char uuid_str[40]; + afs_uint32 hi, lo; + + if (get_hdr()) + return; + + 
DPFOFF(map); + DPFSO0("fs_state_header"); + DPFSO1("stamp"); + DPFX2("magic", hdrs.hdr.stamp.magic); + DPFV2("version", "u", hdrs.hdr.stamp.version); + DPFSC1; + DPFT1("timestamp", hdrs.hdr.timestamp); + DPFV1("sys_name", "u", hdrs.hdr.sys_name); + + afsUUID_to_string(&hdrs.hdr.server_uuid, uuid_str, sizeof(uuid_str)); + DPFS1("server_uuid", uuid_str); + DPFV1("valid", "d", hdrs.hdr.valid); + DPFV1("endianness", "d", hdrs.hdr.endianness); + DPFV1("stats_detailed", "d", hdrs.hdr.stats_detailed); + + SplitInt64(hdrs.hdr.h_offset, hi, lo); + DPFSO1("h_offset"); + DPFV2("hi", "u", hi); + DPFV2("lo", "u", lo); + DPFSC1; + + SplitInt64(hdrs.hdr.cb_offset, hi, lo); + DPFSO1("cb_offset"); + DPFV2("hi", "u", hi); + DPFV2("lo", "u", lo); + DPFSC1; + + DPFS1("server_version_string", hdrs.hdr.server_version_string); + DPFSC0; + + if (hdrs.hdr.stamp.magic != FS_STATE_MAGIC) { + fprintf(stderr, "* magic check failed\n"); + } + if (hdrs.hdr.stamp.version != FS_STATE_VERSION) { + fprintf(stderr, "* version check failed\n"); + } +} + +static void +dump_h_hdr(void) +{ + if (get_h_hdr()) + return; + + DPFOFF(hdrs.h_hdr_p); + DPFSO0("host_state_header"); + DPFSO1("stamp"); + DPFX2("magic", hdrs.h_hdr.stamp.magic); + DPFV2("version", "u", hdrs.h_hdr.stamp.version); + DPFSC1; + DPFV1("records", "u", hdrs.h_hdr.records); + DPFV1("index_max", "u", hdrs.h_hdr.index_max); + DPFSC0; + + if (hdrs.h_hdr.stamp.magic != HOST_STATE_MAGIC) { + fprintf(stderr, "* magic check failed\n"); + } + if (hdrs.h_hdr.stamp.version != HOST_STATE_VERSION) { + fprintf(stderr, "* version check failed\n"); + } +} + +static void +dump_cb_hdr(void) +{ + afs_uint32 hi, lo; + + if (get_cb_hdr()) + return; + + DPFOFF(hdrs.cb_hdr_p); + DPFSO0("callback_state_header"); + DPFSO1("stamp"); + DPFX2("magic", hdrs.cb_hdr.stamp.magic); + DPFV2("version", "u", hdrs.cb_hdr.stamp.version); + DPFSC1; + DPFV1("nFEs", "u", hdrs.cb_hdr.nFEs); + DPFV1("nCBs", "u", hdrs.cb_hdr.nCBs); + DPFV1("fe_max", "u", hdrs.cb_hdr.fe_max); + 
DPFV1("cb_max", "u", hdrs.cb_hdr.cb_max);
+    DPFV1("tfirst", "d", hdrs.cb_hdr.tfirst);
+
+    SplitInt64(hdrs.cb_hdr.timeout_offset, hi, lo);
+    DPFSO1("timeout_offset");
+    DPFV2("hi", "u", hi);
+    DPFV2("lo", "u", lo);
+    DPFSC1;
+
+    SplitInt64(hdrs.cb_hdr.fehash_offset, hi, lo);
+    DPFSO1("fehash_offset");
+    DPFV2("hi", "u", hi);
+    DPFV2("lo", "u", lo);
+    DPFSC1;
+
+    SplitInt64(hdrs.cb_hdr.fe_offset, hi, lo);
+    DPFSO1("fe_offset");
+    DPFV2("hi", "u", hi);
+    DPFV2("lo", "u", lo);
+    DPFSC1;
+
+    DPFSC0;
+
+    if (hdrs.cb_hdr.stamp.magic != CALLBACK_STATE_MAGIC) {
+        fprintf(stderr, "* magic check failed\n");
+    }
+    if (hdrs.cb_hdr.stamp.version != CALLBACK_STATE_VERSION) {
+        fprintf(stderr, "* version check failed\n");
+    }
+}
+
+/*
+ * display the callback timeout header followed by the timeout queue heads
+ *
+ * the number of heads printed comes from the header's record count, the
+ * same way dump_cb_fehash sizes its output, rather than from a fixed 128.
+ * NOTE(review): assumes get_cb_timeout() loads timeout_hdr.records
+ * entries into hdrs.timeout -- confirm against that helper.
+ */
+static void
+dump_cb_timeout(void)
+{
+    afs_uint32 i;
+
+    if (get_cb_hdr())
+        return;
+
+    if (get_cb_timeout_hdr())
+        return;
+
+    if (get_cb_timeout())
+        return;
+
+    DPFOFF(hdrs.timeout_hdr_p);
+    DPFSO0("callback_state_timeout_header");
+    DPFX1("magic", hdrs.timeout_hdr.magic);
+    DPFV1("len", "u", hdrs.timeout_hdr.len);
+    DPFV1("records", "u", hdrs.timeout_hdr.records);
+    DPFSC0;
+
+    if (hdrs.timeout_hdr.magic != CALLBACK_STATE_TIMEOUT_MAGIC) {
+        fprintf(stderr, "* magic check failed\n");
+    }
+
+    DPFOFF(hdrs.timeout_p);
+    DPFAO0("timeout");
+    /* every element but the last is followed by a separator; eight per row */
+    for (i = 0; i + 1 < hdrs.timeout_hdr.records; i++) {
+        DPFAE("u", hdrs.timeout[i]);
+        if ((i % 8) == 7) {
+            DPFAN;
+            DPFA1;
+        }
+    }
+    if (hdrs.timeout_hdr.records) {
+        DPFALE("u", hdrs.timeout[hdrs.timeout_hdr.records-1]);
+    }
+    DPFAC0;
+}
+
+/* display the FE hash header followed by the hash bucket heads */
+static void
+dump_cb_fehash(void)
+{
+    int i;
+
+    if (get_cb_hdr())
+        return;
+
+    if (get_cb_fehash_hdr())
+        return;
+
+    if (get_cb_fehash())
+        return;
+
+    DPFOFF(hdrs.fehash_hdr_p);
+    DPFSO0("callback_state_fehash_header");
+    DPFX1("magic", hdrs.fehash_hdr.magic);
+    DPFV1("len", "u", hdrs.fehash_hdr.len);
+    DPFV1("records", "u", hdrs.fehash_hdr.records);
+    DPFSC0;
+
+    if (hdrs.fehash_hdr.magic != CALLBACK_STATE_FEHASH_MAGIC) {
+        fprintf(stderr, "* magic check failed\n");
+    }
+
+    if (!hdrs.fehash_hdr.records) {
+        /* "records - 1" below is unsigned: with no buckets it would wrap
+         * and walk far past the end of the array */
+        fprintf(stderr, "* no fehash buckets present\n");
+        return;
+    }
+
+    DPFOFF(hdrs.fehash_p);
+    DPFAO0("fehash");
+    for (i = 0; i < hdrs.fehash_hdr.records - 1; 
i++) { + DPFAE("u", hdrs.fehash[i]); + if ((i % 8) == 7) { + DPFAN; + DPFA1; + } + } + DPFALE("u", hdrs.fehash[hdrs.fehash_hdr.records-1]); + DPFAC0; +} + +static void +dump_all_hes(void) +{ + int i; + + if (get_h_hdr()) { + fprintf(stderr, "error getting host_state_header\n"); + return; + } + + for (i = 0; i < hdrs.h_hdr.records; i++) { + dump_he(i); + } +} + +static void +dump_all_fes(void) +{ + int i; + + if (get_cb_hdr()) { + fprintf(stderr, "error getting callback_state_header\n"); + return; + } + + for (i = 0; i < hdrs.cb_hdr.nFEs; i++) { + dump_fe(i); + } +} + +static void +dump_all_cbs(void) +{ + int i; + + if (get_fe_hdr()) { + fprintf(stderr, "error getting callback_state_entry_header\n"); + return; + } + + for (i = 0; i < fe_cursor.hdr.nCBs; i++) { + dump_cb(i); + } +} + +static void +dump_he(afs_uint32 idx) +{ + if (get_he(idx)) { + fprintf(stderr, "error getting he %d\n", idx); + return; + } + + DPFOFF(he_cursor.cursor); + dump_he_hdr(); + dump_he_entry(); + dump_he_interfaces(); + dump_he_hcps(); +} + +static void +dump_fe(afs_uint32 idx) +{ + if (get_fe(idx)) { + fprintf(stderr, "error getting fe %d\n", idx); + return; + } + + DPFOFF(fe_cursor.cursor); + dump_fe_hdr(); + dump_fe_entry(); +} + +static void +dump_cb(afs_uint32 idx) +{ + if (get_cb(idx)) { + fprintf(stderr, "error getting cb %d\n", idx); + return; + } + + DPFOFF(cb_cursor.cursor); + dump_cb_entry(); +} + +static void +dump_this_he(void) +{ + dump_he(he_cursor.idx); +} + +static void +dump_this_fe(void) +{ + dump_fe(fe_cursor.idx); +} + +static void +dump_this_cb(void) +{ + dump_cb(cb_cursor.idx); +} + +static void +dump_next_he(void) +{ + if (get_h_hdr()) { + fprintf(stderr, "error getting host_state_header\n"); + return; + } + + if ((he_cursor.idx + 1) >= hdrs.h_hdr.records) { + fprintf(stderr, "no more HEs\n"); + return; + } + + dump_he(he_cursor.idx+1); +} + +static void +dump_next_fe(void) +{ + if (get_cb_hdr()) { + fprintf(stderr, "error getting callback_state_header\n"); + return; 
+ } + + if ((fe_cursor.idx + 1) >= hdrs.cb_hdr.nFEs) { + fprintf(stderr, "no more FEs\n"); + return; + } + + dump_fe(fe_cursor.idx+1); +} + +static void +dump_next_cb(void) +{ + if (get_fe_hdr()) { + fprintf(stderr, "error getting callback_state_entry_header\n"); + return; + } + + if ((cb_cursor.idx + 1) >= fe_cursor.hdr.nCBs) { + fprintf(stderr, "no more CBs\n"); + return; + } + + dump_cb(cb_cursor.idx+1); +} + +static void +dump_prev_he(void) +{ + if (!he_cursor.idx) { + fprintf(stderr, "no more HEs\n"); + return; + } + + dump_he(he_cursor.idx-1); +} + +static void +dump_prev_fe(void) +{ + if (!fe_cursor.idx) { + fprintf(stderr, "no more FEs\n"); + return; + } + + dump_fe(fe_cursor.idx-1); +} + +static void +dump_prev_cb(void) +{ + if (!cb_cursor.idx) { + fprintf(stderr, "no more CBs\n"); + return; + } + + dump_cb(cb_cursor.idx-1); +} + +static void +dump_first_fe(void) +{ + if (get_cb_hdr()) { + fprintf(stderr, "error getting callback_state_header\n"); + return; + } + + if (!hdrs.cb_hdr.nFEs) { + fprintf(stderr, "no FEs present\n"); + return; + } + + dump_fe(0); +} + +static void +dump_first_he(void) +{ + if (get_h_hdr()) { + fprintf(stderr, "error getting host_state_header\n"); + return; + } + + if (!hdrs.h_hdr.records) { + fprintf(stderr, "no HEs present\n"); + return; + } + + dump_he(0); +} + +static void +dump_first_cb(void) +{ + if (get_fe_hdr()) { + fprintf(stderr, "error getting callback_state_entry_header\n"); + return; + } + + if (!fe_cursor.hdr.nCBs) { + fprintf(stderr, "no CBs present\n"); + return; + } + + dump_cb(0); +} + +static void +dump_last_he(void) +{ + if (get_h_hdr()) { + fprintf(stderr, "error getting host_state_header\n"); + return; + } + + if (!hdrs.h_hdr.records) { + fprintf(stderr, "no HEs present\n"); + return; + } + + dump_he(hdrs.h_hdr.records-1); +} + +static void +dump_last_fe(void) +{ + if (get_cb_hdr()) { + fprintf(stderr, "error getting callback_state_header\n"); + return; + } + + if (!hdrs.cb_hdr.nFEs) { + fprintf(stderr, "no 
FEs present\n"); + return; + } + + dump_fe(hdrs.cb_hdr.nFEs-1); +} + +static void +dump_last_cb(void) +{ + if (get_fe_hdr()) { + fprintf(stderr, "error getting callback_state_entry_header\n"); + return; + } + + if (!fe_cursor.hdr.nCBs) { + fprintf(stderr, "no CBs present\n"); + return; + } + + dump_cb(fe_cursor.hdr.nCBs-1); +} + +static void +dump_he_hdr(void) +{ + DPFSO0("host_state_entry_header"); + DPFX1("magic", he_cursor.hdr.magic); + DPFV1("len", "u", he_cursor.hdr.len); + DPFV1("interfaces", "u", he_cursor.hdr.interfaces); + DPFV1("hcps", "u", he_cursor.hdr.hcps); + DPFSC0; + + if (he_cursor.hdr.magic != HOST_STATE_ENTRY_MAGIC) { + fprintf(stderr, "* magic check failed\n"); + } +} + +static void +dump_he_entry(void) +{ + DPFSO0("hostDiskEntry"); + DPFS1("host", afs_inet_ntoa(he_cursor.he.host)); + DPFV1("port", "u", he_cursor.he.port); + DPFX1("hostFlags", he_cursor.he.hostFlags); + DPFV1("Console", "u", he_cursor.he.Console); + DPFV1("hcpsfailed", "u", he_cursor.he.hcpsfailed); + DPFV1("hcps_valid", "u", he_cursor.he.hcps_valid); + if (hdrs.hdr.stats_detailed) { +#ifdef FS_STATS_DETAILED + DPFV1("InSameNetwork", "u", he_cursor.he.InSameNetwork); +#else + DPFV1("InSameNetwork", "u", he_cursor.he.padding1[0]); +#endif + } + DPFV1("hcps_len", "u", he_cursor.he.hcps_len); + DPFT1("LastCall", he_cursor.he.LastCall); + DPFT1("ActiveCall", he_cursor.he.ActiveCall); + DPFT1("cpsCall", he_cursor.he.cpsCall); + DPFV1("cblist", "u", he_cursor.he.cblist); + DPFV1("index", "u", he_cursor.he.index); + DPFSC0; +} + +static void +dump_he_interfaces(void) +{ + char temp_str[40]; + struct Interface * ifp; + int len, i; + + if (!he_cursor.hdr.interfaces) + return; + + len = sizeof(struct Interface) + ((he_cursor.hdr.interfaces-1)*sizeof(struct AddrPort)); + ifp = (struct Interface *) malloc(len); + assert(ifp != NULL); + + memcpy(ifp, he_cursor.ifp, len); + + DPFSO0("Interface"); + DPFV1("numberOfInterfaces", "u", ifp->numberOfInterfaces); + + afsUUID_to_string(&ifp->uuid, 
temp_str, sizeof(temp_str)); + DPFS1("uuid", temp_str); + for (i = 0; i < he_cursor.hdr.interfaces; i++) { + snprintf(temp_str, sizeof(temp_str), "interface[%d]", i); + DPFSO1(temp_str); + DPFS2("addr", afs_inet_ntoa(ifp->interface[i].addr)); + DPFV2("port", "u", ifp->interface[i].port); + DPFSC1; + } + + DPFSC0; + + if (he_cursor.hdr.interfaces != ifp->numberOfInterfaces) { + fprintf(stderr, "* interface count mismatch between header and Interface struct\n"); + } + free(ifp); +} + +static void +dump_he_hcps(void) +{ + char temp_str[40]; + afs_int32 * hcps; + int len, i; + + if (!he_cursor.hdr.hcps) + return; + + len = (he_cursor.hdr.hcps)*sizeof(afs_uint32); + hcps = (afs_int32 *) malloc(len); + assert(hcps != NULL); + memcpy(hcps, he_cursor.hcps, len); + + DPFSO0("hcps"); + DPFAO1("prlist_val"); + for (i = 0; i < he_cursor.hdr.hcps - 1; i++) { + DPFAE("d", hcps[i]); + if ((i % 8) == 7) { + DPFAN; + DPFA2; + } + } + DPFALE("d", hcps[he_cursor.hdr.hcps-1]); + DPFAC1; + DPFSC0; + free(hcps); +} + +static void +dump_fe_hdr(void) +{ + DPFSO0("callback_state_entry_header"); + DPFX1("magic", fe_cursor.hdr.magic); + DPFV1("len", "u", fe_cursor.hdr.len); + DPFV1("nCBs", "u", fe_cursor.hdr.nCBs); + DPFSC0; + + if (fe_cursor.hdr.magic != CALLBACK_STATE_ENTRY_MAGIC) { + fprintf(stderr, "* magic check failed\n"); + } +} + +static void +dump_fe_entry(void) +{ + DPFSO0("FEDiskEntry"); + DPFSO1("fe"); + DPFV2("vnode", "u", fe_cursor.fe.fe.vnode); + DPFV2("unique", "u", fe_cursor.fe.fe.unique); + DPFV2("volid", "u", fe_cursor.fe.fe.volid); + DPFV2("fnext", "u", fe_cursor.fe.fe.fnext); + DPFV2("ncbs", "u", fe_cursor.fe.fe.ncbs); + DPFV2("firstcb", "u", fe_cursor.fe.fe.firstcb); + DPFV2("status", "u", fe_cursor.fe.fe.status); + DPFSC1; + DPFV1("index", "u", fe_cursor.fe.index); + DPFSC0; +} + +static void +dump_cb_entry(void) +{ + DPFSO0("CBDiskEntry"); + DPFSO1("cb"); + DPFV2("cnext", "u", cb_cursor.cb.cb.cnext); + DPFV2("fhead", "u", cb_cursor.cb.cb.fhead); + DPFV2("thead", "u", 
(afs_uint32)cb_cursor.cb.cb.thead); + DPFV2("status", "u", (afs_uint32)cb_cursor.cb.cb.status); + DPFV2("hhead", "u", cb_cursor.cb.cb.hhead); + DPFV2("tprev", "u", cb_cursor.cb.cb.tprev); + DPFV2("tnext", "u", cb_cursor.cb.cb.tnext); + DPFV2("hprev", "u", cb_cursor.cb.cb.hprev); + DPFV2("hnext", "u", cb_cursor.cb.cb.hnext); + DPFSC1; + DPFV1("index", "u", cb_cursor.cb.index); + DPFSC0; +} + +#define DPFHMS printf(" ") +#define DPFHS printf(" ") +#define DPFHN(offset) printf("\n%u\t", offset) +#define DPFHD(x) printf("%02X ", x) +#define DPFHE printf("\n") + +static void +hexdump_map(afs_uint32 offset, afs_uint32 len) +{ + int i; + unsigned char * p = (unsigned char *)map; + afs_uint32 c32; + + if (!len) + return; + + if ((offset + len) > map_len) { + fprintf(stderr, "offset + length exceeds memory map size (%u > %u)\n", + offset+len, map_len); + return; + } + + p += offset; + DPFOFF(p); + DPFHN(offset); + + for (i = offset % 16; i > 0; i--) { + DPFHS; + } + + for (i=0; i < len; i++, p++, offset++) { + if (!(offset % 16)) { + DPFHN(offset); + } else if (!(offset % 8)) { + DPFHMS; + } + DPFHD(*p); + } + DPFHE; +} + +static int +get_hdr(void) +{ + if (!hdrs.hdr_valid) { + if (map_len < sizeof(struct fs_state_header)) { + fprintf(stderr, "corrupt state dump: fs_state_header larger than memory map\n"); + return 1; + } + memcpy(&hdrs.hdr, map, sizeof(hdrs.hdr)); + hdrs.hdr_p = map; + hdrs.hdr_valid = 1; + } + return 0; +} + +static int +get_h_hdr(void) +{ + char * buf; + afs_uint32 hi, lo; + + if (hdrs.h_hdr_valid) + return 0; + + if (get_hdr()) + return 1; + + SplitInt64(hdrs.hdr.h_offset, hi, lo); + + if (hi) { + fprintf(stderr, "hi offset bits set in h_offset; can't get host_state_header\n"); + return 1; + } + if ((lo >= map_len) || + ((lo + sizeof(struct host_state_header)) > map_len) || + (lo + sizeof(struct host_state_header) < lo)) { + fprintf(stderr, "h_offset puts host_state_header beyond end of memory map\n"); + return 1; + } + + buf = (char *) map; + buf += 
lo; + memcpy(&hdrs.h_hdr, buf, sizeof(struct host_state_header)); + hdrs.h_hdr_p = buf; + buf += sizeof(struct host_state_header); + he_cursor.fh = (void *)buf; + return 0; +} + +static int +get_cb_hdr(void) +{ + char * buf; + afs_uint32 hi, lo; + + if (hdrs.cb_hdr_valid) + return 0; + + if (get_hdr()) + return 1; + + SplitInt64(hdrs.hdr.cb_offset, hi, lo); + + if (hi) { + fprintf(stderr, "hi offset bits set in cb_offset; can't get callback_state_header\n"); + return 1; + } + if ((lo >= map_len) || + ((lo + sizeof(struct callback_state_header)) > map_len) || + (lo + sizeof(struct callback_state_header) < lo)) { + fprintf(stderr, "cb_offset puts callback_state_header beyond end of memory map\n"); + return 1; + } + + buf = (char *) map; + buf += lo; + memcpy(&hdrs.cb_hdr, buf, sizeof(struct callback_state_header)); + hdrs.cb_hdr_p = buf; + hdrs.cb_hdr_valid = 1; + + SplitInt64(hdrs.cb_hdr.fe_offset, hi, lo); + + if (hi) { + fprintf(stderr, "hi offset bits set in fe_offset; can't get callback_state_entry_header\n"); + return 1; + } + hi = lo + (hdrs.cb_hdr.nFEs * (sizeof(struct callback_state_entry_header) + + sizeof(struct FEDiskEntry)) + + hdrs.cb_hdr.nCBs * sizeof(struct CBDiskEntry)); + if ((hi > map_len) || + (lo > hi)) { + fprintf(stderr, "fe_offset puts callback_state_entry_header beyond end of memory map\n"); + return 1; + } + + buf = (char *) map; + buf += lo; + fe_cursor.ffe = (void *)buf; + + return 0; +} + +static int +get_cb_timeout_hdr(void) +{ + char * buf; + afs_uint32 hi, lo; + + if (hdrs.timeout_hdr_valid) + return 0; + + if (get_cb_hdr()) + return 1; + + SplitInt64(hdrs.cb_hdr.timeout_offset, hi, lo); + + if (hi) { + fprintf(stderr, "hi offset bits set in timeout_offset; can't get callback_state_timeout_header\n"); + return 1; + } + if ((lo >= map_len) || + ((lo + sizeof(struct callback_state_timeout_header)) > map_len) || + (lo + sizeof(struct callback_state_timeout_header) < lo)) { + fprintf(stderr, "timeout_offset puts 
callback_state_timeout_header beyond end of memory map\n"); + return 1; + } + + buf = (char *) map; + buf += lo; + memcpy(&hdrs.timeout_hdr, buf, sizeof(struct callback_state_timeout_header)); + hdrs.timeout_hdr_p = buf; + hdrs.timeout_hdr_valid = 1; + buf += sizeof(struct callback_state_timeout_header); + hdrs.timeout_p = buf; + + return 0; +} + +static int +get_cb_timeout(void) +{ + char * buf; + + if (hdrs.timeout) + return 0; + + if (get_cb_timeout_hdr()) + return 1; + + hdrs.timeout = (afs_uint32 *) calloc(hdrs.timeout_hdr.records, sizeof(afs_uint32)); + assert(hdrs.timeout != NULL); + memcpy(hdrs.timeout, hdrs.timeout_p, hdrs.timeout_hdr.records * sizeof(afs_uint32)); + return 0; +} + +static int +get_cb_fehash_hdr(void) +{ + char * buf; + afs_uint32 hi, lo; + + if (hdrs.fehash_hdr_valid) + return 0; + + if (get_cb_hdr()) + return 1; + + SplitInt64(hdrs.cb_hdr.fehash_offset, hi, lo); + + if (hi) { + fprintf(stderr, "hi offset bits set in fehash_offset; can't get callback_state_fehash_header\n"); + return 1; + } + if ((lo >= map_len) || + ((lo + sizeof(struct callback_state_fehash_header)) > map_len) || + (lo + sizeof(struct callback_state_fehash_header) < lo)) { + fprintf(stderr, "timeout_offset puts callback_state_fehash_header beyond end of memory map\n"); + return 1; + } + + buf = (char *) map; + buf += lo; + memcpy(&hdrs.fehash_hdr, buf, sizeof(struct callback_state_fehash_header)); + hdrs.fehash_hdr_p = buf; + hdrs.fehash_hdr_valid = 1; + buf += sizeof(struct callback_state_fehash_header); + hdrs.fehash_p = buf; + + return 0; +} + +static int +get_cb_fehash(void) +{ + char * buf; + + if (hdrs.fehash) + return 0; + + if (get_cb_fehash_hdr()) + return 1; + + hdrs.fehash = (afs_uint32 *) calloc(hdrs.fehash_hdr.records, sizeof(afs_uint32)); + assert(hdrs.fehash != NULL); + memcpy(hdrs.fehash, hdrs.fehash_p, hdrs.fehash_hdr.records * sizeof(afs_uint32)); + return 0; +} + +static int +get_he(afs_uint32 idx) +{ + int i; + char * p; + + if (get_h_hdr()) + return 
1; + + if (idx >= hdrs.h_hdr.records) + return 1; + + if (he_cursor.idx == idx && he_cursor.hdr_valid && he_cursor.he_valid) + return 0; + + he_cursor.hdr_valid = he_cursor.he_valid = 0; + + if (he_cache.cursor == NULL) { + he_cache.cursor = (void **) calloc(hdrs.h_hdr.records, sizeof(void *)); + assert(he_cache.cursor != NULL); + } + + if (idx && he_cache.cursor[idx-1] == NULL) { + for (i = 0; i < idx; i++) { + if (he_cache.cursor[i] == NULL) { + get_he(i); + } + } + } + + if (!idx) { + he_cursor.cursor = he_cursor.fh; + } else if (he_cursor.cursor == he_cache.cursor[idx-1]) { + p = (char *)he_cursor.cursor; + p += he_cursor.hdr.len; + he_cursor.cursor = (void *)p; + } else { + he_cursor.cursor = he_cache.cursor[idx-1]; + if (get_he_hdr()) + return 1; + p = (char *)he_cursor.cursor; + p += he_cursor.hdr.len; + he_cursor.cursor = (void *)p; + } + + he_cursor.idx = idx; + he_cache.cursor[idx] = he_cursor.cursor; + + if (get_he_hdr()) + return 1; + if (get_he_entry()) + return 1; + + return 0; +} + +static int +get_he_hdr(void) +{ + memcpy(&he_cursor.hdr, he_cursor.cursor, sizeof(struct host_state_entry_header)); + he_cursor.hdr_valid = 1; + return 0; +} + +static int +get_he_entry(void) +{ + char * p; + + if (!he_cursor.hdr_valid) { + if (get_he_hdr()) { + return 1; + } + } + + p = (char *) he_cursor.cursor; + p += sizeof(struct host_state_entry_header); + + memcpy(&he_cursor.he, p, sizeof(struct hostDiskEntry)); + + he_cursor.he_valid = 1; + p += sizeof(struct hostDiskEntry); + he_cursor.ifp = (void *)p; + if (he_cursor.hdr.interfaces) { + p += sizeof(struct Interface) + ((he_cursor.hdr.interfaces-1)*sizeof(struct AddrPort)); + he_cursor.hcps = (void *)p; + } else { + he_cursor.hcps = he_cursor.ifp; + } + return 0; +} + +static int +get_fe(afs_uint32 idx) +{ + int i; + char * p; + + cb_cursor.cb_valid = 0; + + if (get_cb_hdr()) + return 1; + + if (idx >= hdrs.cb_hdr.nFEs) + return 1; + + if (fe_cursor.idx == idx && fe_cursor.hdr_valid && fe_cursor.fe_valid) + 
return 0; + + fe_cursor.hdr_valid = fe_cursor.fe_valid = 0; + + if (fe_cache.cursor == NULL) { + fe_cache.cursor = (void **) calloc(hdrs.cb_hdr.nFEs, sizeof(void *)); + assert(fe_cache.cursor != NULL); + } + + if (idx && fe_cache.cursor[idx-1] == NULL) { + for (i = 0; i < idx; i++) { + if (fe_cache.cursor[i] == NULL) { + get_fe(i); + } + } + } + + if (!idx) { + fe_cursor.cursor = fe_cursor.ffe; + } else if (fe_cursor.cursor == fe_cache.cursor[idx-1]) { + p = (char *)fe_cursor.cursor; + p += fe_cursor.hdr.len; + fe_cursor.cursor = (void *)p; + } else { + fe_cursor.cursor = fe_cache.cursor[idx-1]; + if (get_fe_hdr()) + return 1; + p = (char *)fe_cursor.cursor; + p += fe_cursor.hdr.len; + fe_cursor.cursor = (void *)p; + } + + fe_cursor.idx = idx; + fe_cache.cursor[idx] = fe_cursor.cursor; + + if (get_fe_hdr()) + return 1; + if (get_fe_entry()) + return 1; + + return 0; +} + +static int +get_fe_hdr(void) +{ + memcpy(&fe_cursor.hdr, fe_cursor.cursor, sizeof(struct callback_state_entry_header)); + fe_cursor.hdr_valid = 1; + return 0; +} + +static int +get_fe_entry(void) +{ + char * p; + + if (!fe_cursor.hdr_valid) { + if (get_fe_hdr()) { + return 1; + } + } + + p = (char *) fe_cursor.cursor; + p += sizeof(struct callback_state_entry_header); + + memcpy(&fe_cursor.fe, p, sizeof(struct FEDiskEntry)); + + fe_cursor.fe_valid = 1; + p += sizeof(struct FEDiskEntry); + fe_cursor.fcb = (void *)p; + return 0; +} + +static int +get_cb(afs_uint32 idx) +{ + int i; + char * p; + + if (get_fe(fe_cursor.idx)) + return 1; + + if (idx >= fe_cursor.hdr.nCBs) + return 1; + + if (idx == cb_cursor.idx && cb_cursor.cb_valid) + return 0; + + cb_cursor.cb_valid = 0; + + p = (char *)fe_cursor.fcb; + p += idx * sizeof(struct CBDiskEntry); + cb_cursor.cursor = (void *)p; + + cb_cursor.idx = idx; + + if (get_cb_entry()) + return 1; + + return 0; +} + +static int +get_cb_entry(void) +{ + memcpy(&cb_cursor.cb, cb_cursor.cursor, sizeof(struct CBDiskEntry)); + cb_cursor.cb_valid = 1; + return 0; +} + 
+static int +find_he_by_index(afs_uint32 idx) +{ + int i; + + if (get_h_hdr()) { + return 1; + } + + for (i = 0; i < hdrs.h_hdr.records; i++) { + if (get_he(i)) { + fprintf(stderr, "error getting he %d\n", i); + return 1; + } + if (he_cursor.he.index == idx) + break; + } + + if (i < hdrs.h_hdr.records) { + dump_this_he(); + return 0; + } + return 1; +} + +static int +find_fe_by_index(afs_uint32 idx) +{ + int i; + + if (get_cb_hdr()) { + return 1; + } + + for (i = 0; i < hdrs.cb_hdr.nFEs; i++) { + if (get_fe(i)) { + fprintf(stderr, "error getting fe %d\n", i); + return 1; + } + if (fe_cursor.fe.index == idx) + break; + } + + if (i < hdrs.cb_hdr.nFEs) { + dump_this_fe(); + return 0; + } + return 1; +} + +static int +find_fe_by_fid(afs_uint32 volid, afs_uint32 vnode, afs_uint32 unique) +{ + int i; + + if (get_cb_hdr()) { + return 1; + } + + for (i = 0; i < hdrs.cb_hdr.nFEs; i++) { + if (get_fe(i)) { + fprintf(stderr, "error getting fe %d\n", i); + return 1; + } + if ((fe_cursor.fe.fe.unique == unique) && + (fe_cursor.fe.fe.volid == volid) && + (fe_cursor.fe.fe.vnode == vnode)) + break; + } + + if (i < hdrs.cb_hdr.nFEs) { + dump_this_fe(); + return 0; + } + return 1; +} + +static int +find_cb_by_index(afs_uint32 idx) +{ + int i; + + if (get_fe_hdr()) { + return 1; + } + + for (i = 0; i < fe_cursor.hdr.nCBs; i++) { + if (get_cb(i)) { + fprintf(stderr, "error getting cb %d\n", i); + return 1; + } + if (cb_cursor.cb.index == idx) + break; + } + + if (i < fe_cursor.hdr.nCBs) { + dump_this_cb(); + return 0; + } + return 1; +} + +#endif /* AFS_DEMAND_ATTACH_FS */ diff --git a/src/tvolser/Makefile.in b/src/tvolser/Makefile.in index 8b8b1a7578..bfeb3a24af 100644 --- a/src/tvolser/Makefile.in +++ b/src/tvolser/Makefile.in @@ -10,7 +10,7 @@ include @TOP_OBJDIR@/src/config/Makefile.config HELPER_SPLINT=@HELPER_SPLINT@ CC=${MT_CC} -CFLAGS=${COMMON_CFLAGS} -I.. -DNINTERFACE ${MT_CFLAGS} -DRXDEBUG +CFLAGS=${COMMON_CFLAGS} -I.. 
-DNINTERFACE ${MT_CFLAGS} -DRXDEBUG -DFSSYNC_BUILD_CLIENT CCRULE=${CC} ${CFLAGS} -c $? @@ -36,8 +36,9 @@ UTILOBJS=assert.o uuid.o serverLog.o fileutil.o netutils.o dirpath.o volparse.o DIROBJS=buffer.o dir.o salvage.o -VOLOBJS= vnode.o volume.o vutil.o partition.o fssync.o purge.o \ - clone.o devname.o common.o ihandle.o listinodes.o namei_ops.o nuke.o +VOLOBJS= vnode.o volume.o vutil.o partition.o fssync-client.o purge.o \ + clone.o devname.o common.o ihandle.o listinodes.o \ + namei_ops.o nuke.o salvsync-client.o daemon_com.o FSINTOBJS=# afsaux.o afscbint.cs.o afsint.ss.o afsint.xdr.o @@ -138,7 +139,13 @@ partition.o: ${VOL}/partition.c nuke.o: ${VOL}/nuke.c ${COMPILE} -fssync.o: ${VOL}/fssync.c +fssync-client.o: ${VOL}/fssync-client.c + ${COMPILE} + +salvsync-client.o: ${VOL}/salvsync-client.c + ${COMPILE} + +daemon_com.o: ${VOL}/daemon_com.c ${COMPILE} purge.o: ${VOL}/purge.c diff --git a/src/util/Makefile.in b/src/util/Makefile.in index 7b8c36e3ea..ccf3446695 100644 --- a/src/util/Makefile.in +++ b/src/util/Makefile.in @@ -13,7 +13,7 @@ HELPER_SPLINT=@HELPER_SPLINT@ objects = assert.o base64.o casestrcpy.o ktime.o volparse.o hostparse.o \ hputil.o kreltime.o isathing.o get_krbrlm.o uuid.o serverLog.o \ dirpath.o fileutil.o netutils.o flipbase64.o fstab.o \ - afs_atomlist.o afs_lhash.o snprintf.o strlcat.o strlcpy.o \ + afs_atomlist.o afs_lhash.o snprintf.o strlcat.o strlcpy.o strnlen.o \ daemon.o rxkstats.o ${REGEX_OBJ} includes = \ @@ -134,6 +134,9 @@ strlcat.o: ${srcdir}/strlcat.c ${includes} strlcpy.o: ${srcdir}/strlcpy.c ${includes} ${CCOBJ} ${CFLAGS} -c ${srcdir}/strlcpy.c +strnlen.o: ${srcdir}/strnlen.c ${includes} + ${CCOBJ} ${CFLAGS} -c ${srcdir}/strnlen.c + daemon.o: ${srcdir}/daemon.c ${includes} ${CCOBJ} ${CFLAGS} -c ${srcdir}/daemon.c diff --git a/src/util/afsutil_prototypes.h b/src/util/afsutil_prototypes.h index 89f05365e6..2848da3641 100644 --- a/src/util/afsutil_prototypes.h +++ b/src/util/afsutil_prototypes.h @@ -173,6 +173,9 @@ extern size_t 
strlcpy(char *dst, const char *src, size_t siz); extern size_t strlcat(char *dst, const char *src, size_t siz); #endif +/* strn */ +extern size_t afs_strnlen(char * buf, size_t len); + /* sys.c */ @@ -184,6 +187,10 @@ extern void afs_htonuuid(afsUUID * uuidp); extern void afs_ntohuuid(afsUUID * uuidp); extern afs_int32 afs_uuid_create(afsUUID * uuid); extern u_short afs_uuid_hash(afsUUID * uuid); +#if !defined(KERNEL) && !defined(UKERNEL) +extern int afsUUID_from_string(const char *str, afsUUID * uuid); +extern int afsUUID_to_string(const afsUUID * uuid, char *str, size_t strsz); +#endif /* volparse.c */ extern afs_int32 volutil_GetPartitionID(char *aname); diff --git a/src/util/dirpath.c b/src/util/dirpath.c index ff856f9523..1e9d78da76 100644 --- a/src/util/dirpath.c +++ b/src/util/dirpath.c @@ -292,10 +292,17 @@ initDirPathArray(void) pathp = dirPathArray[AFSDIR_SERVER_SLVGLOG_FILEPATH_ID]; AFSDIR_SERVER_FILEPATH(pathp, AFSDIR_LOGS_DIR, AFSDIR_SLVGLOG_FILE); + pathp = dirPathArray[AFSDIR_SERVER_SALSRVLOG_FILEPATH_ID]; + AFSDIR_SERVER_FILEPATH(pathp, AFSDIR_LOGS_DIR, AFSDIR_SALSRVLOG_FILE); + pathp = dirPathArray[AFSDIR_SERVER_SALVAGER_FILEPATH_ID]; AFSDIR_SERVER_FILEPATH(pathp, AFSDIR_SERVER_BIN_DIR, AFSDIR_SALVAGER_FILE); + pathp = dirPathArray[AFSDIR_SERVER_SALSRV_FILEPATH_ID]; + AFSDIR_SERVER_FILEPATH(pathp, AFSDIR_SERVER_BIN_DIR, + AFSDIR_SALSRV_FILE); + pathp = dirPathArray[AFSDIR_SERVER_SLVGLOCK_FILEPATH_ID]; AFSDIR_SERVER_FILEPATH(pathp, AFSDIR_LOCAL_DIR, AFSDIR_SLVGLOCK_FILE); @@ -368,6 +375,9 @@ initDirPathArray(void) pathp = dirPathArray[AFSDIR_SERVER_KRB_EXCL_FILEPATH_ID]; AFSDIR_SERVER_FILEPATH(pathp, AFSDIR_SERVER_ETC_DIR, AFSDIR_KRB_EXCL_FILE); + pathp = dirPathArray[AFSDIR_SERVER_FSSTATE_FILEPATH_ID]; + AFSDIR_SERVER_FILEPATH(pathp, AFSDIR_LOCAL_DIR, AFSDIR_FSSTATE_FILE); + /* client file paths */ #ifdef AFS_NT40_ENV strcpy(dirPathArray[AFSDIR_CLIENT_THISCELL_FILEPATH_ID], diff --git a/src/util/dirpath.hin b/src/util/dirpath.hin index 
23590ad4a9..ae1c46a78e 100644 --- a/src/util/dirpath.hin +++ b/src/util/dirpath.hin @@ -135,7 +135,9 @@ ConstructLocalLogPath(const char *cpath, #define AFSDIR_VLOG_FILE "VLLog" #define AFSDIR_CORE_FILE "core" #define AFSDIR_SLVGLOG_FILE "SalvageLog" +#define AFSDIR_SALSRVLOG_FILE "SalsrvLog" #define AFSDIR_SALVAGER_FILE "salvager" +#define AFSDIR_SALSRV_FILE "salvageserver" #define AFSDIR_SLVGLOCK_FILE "salvage.lock" #define AFSDIR_BOZCONF_FILE "BosConfig" #define AFSDIR_BOZCONFNEW_FILE "BosConfig.new" @@ -155,6 +157,8 @@ ConstructLocalLogPath(const char *cpath, #define AFSDIR_FILELOG_FILE "FileLog" #define AFSDIR_MIGRATE_LOGNAME "wtlog." +#define AFSDIR_FSSTATE_FILE "fsstate.dat" + #define AFSDIR_CELLSERVDB_FILE_NTCLIENT "afsdcell.ini" #define AFSDIR_NETINFO_FILE "NetInfo" @@ -194,9 +198,15 @@ AFSDIR_CANONICAL_SERVER_AFS_DIRPATH "/local" #define AFSDIR_CANONICAL_SERVER_SALVAGER_FILEPATH \ AFSDIR_CANONICAL_SERVER_BIN_DIRPATH "/" AFSDIR_SALVAGER_FILE +#define AFSDIR_CANONICAL_SERVER_SALSRV_FILEPATH \ +AFSDIR_CANONICAL_SERVER_BIN_DIRPATH "/" AFSDIR_SALSRV_FILE + #define AFSDIR_CANONICAL_SERVER_SLVGLOG_FILEPATH \ AFSDIR_CANONICAL_SERVER_LOGS_DIRPATH "/" AFSDIR_SLVGLOG_FILE +#define AFSDIR_CANONICAL_SERVER_SALSRVLOG_FILEPATH \ +AFSDIR_CANONICAL_SERVER_LOGS_DIRPATH "/" AFSDIR_SALSRVLOG_FILE + /* --------------------- Local path macros ---------------------- */ @@ -264,6 +274,9 @@ typedef enum afsdir_id { AFSDIR_SERVER_BIN_FILE_DIRPATH_ID, AFSDIR_CLIENT_CELLALIAS_FILEPATH_ID, AFSDIR_SERVER_KRB_EXCL_FILEPATH_ID, + AFSDIR_SERVER_SALSRV_FILEPATH_ID, + AFSDIR_SERVER_SALSRVLOG_FILEPATH_ID, + AFSDIR_SERVER_FSSTATE_FILEPATH_ID, AFSDIR_PATHSTRING_MAX } afsdir_id_t; /* getDirPath() returns a pointer to a string from an internal array of path strings @@ -310,7 +323,9 @@ const char *getDirPath(afsdir_id_t string_id); #define AFSDIR_SERVER_VLOG_FILEPATH getDirPath(AFSDIR_SERVER_VLOG_FILEPATH_ID) #define AFSDIR_SERVER_CORELOG_FILEPATH getDirPath(AFSDIR_SERVER_CORELOG_FILEPATH_ID) 
#define AFSDIR_SERVER_SLVGLOG_FILEPATH getDirPath(AFSDIR_SERVER_SLVGLOG_FILEPATH_ID) +#define AFSDIR_SERVER_SALSRVLOG_FILEPATH getDirPath(AFSDIR_SERVER_SALSRVLOG_FILEPATH_ID) #define AFSDIR_SERVER_SALVAGER_FILEPATH getDirPath(AFSDIR_SERVER_SALVAGER_FILEPATH_ID) +#define AFSDIR_SERVER_SALSRV_FILEPATH getDirPath(AFSDIR_SERVER_SALSRV_FILEPATH_ID) #define AFSDIR_SERVER_BOZCONF_FILEPATH getDirPath(AFSDIR_SERVER_BOZCONF_FILEPATH_ID) #define AFSDIR_SERVER_BOZCONFNEW_FILEPATH getDirPath(AFSDIR_SERVER_BOZCONFNEW_FILEPATH_ID) #define AFSDIR_SERVER_BOZINIT_FILEPATH getDirPath(AFSDIR_SERVER_BOZINIT_FILEPATH_ID) @@ -332,6 +347,7 @@ const char *getDirPath(afsdir_id_t string_id); #define AFSDIR_SERVER_THRESHOLD_CONSTANTS_FILEPATH getDirPath(AFSDIR_SERVER_THRESHOLD_CONSTANTS_FILEPATH_ID) #define AFSDIR_SERVER_MIGRATELOG_FILEPATH getDirPath(AFSDIR_SERVER_MIGRATELOG_FILEPATH_ID) #define AFSDIR_SERVER_KRB_EXCL_FILEPATH getDirPath(AFSDIR_SERVER_KRB_EXCL_FILEPATH_ID) +#define AFSDIR_SERVER_FSSTATE_FILEPATH getDirPath(AFSDIR_SERVER_FSSTATE_FILEPATH_ID) /* client file paths */ #define AFSDIR_CLIENT_THISCELL_FILEPATH getDirPath(AFSDIR_CLIENT_THISCELL_FILEPATH_ID) diff --git a/src/util/dirpath_nt.h b/src/util/dirpath_nt.h index b0c62bc392..1d49d8155a 100644 --- a/src/util/dirpath_nt.h +++ b/src/util/dirpath_nt.h @@ -126,7 +126,9 @@ extern int #define AFSDIR_VLOG_FILE "VLLog" #define AFSDIR_CORE_FILE "core" #define AFSDIR_SLVGLOG_FILE "SalvageLog" +#define AFSDIR_SALSRVLOG_FILE "SalsrvLog" #define AFSDIR_SALVAGER_FILE "salvager" +#define AFSDIR_SALSRV_FILE "salvageserver" #define AFSDIR_SLVGLOCK_FILE "salvage.lock" #define AFSDIR_BOZCONF_FILE "BosConfig" #define AFSDIR_BOZCONFNEW_FILE "BosConfig.new" @@ -146,6 +148,8 @@ extern int #define AFSDIR_FILELOG_FILE "FileLog" #define AFSDIR_MIGRATE_LOGNAME "wtlog." 
+#define AFSDIR_FSSTATE_FILE "fsstate.dat" + #ifdef COMMENT #define AFSDIR_CELLSERVDB_FILE_NTCLIENT "afsdcell.ini" #else @@ -189,9 +193,15 @@ AFSDIR_LOCAL_DIR #define AFSDIR_CANONICAL_SERVER_SALVAGER_FILEPATH \ AFSDIR_CANONICAL_SERVER_BIN_DIRPATH "/" AFSDIR_SALVAGER_FILE +#define AFSDIR_CANONICAL_SERVER_SALSRV_FILEPATH \ +AFSDIR_CANONICAL_SERVER_BIN_DIRPATH "/" AFSDIR_SALSRV_FILE + #define AFSDIR_CANONICAL_SERVER_SLVGLOG_FILEPATH \ AFSDIR_CANONICAL_SERVER_LOGS_DIRPATH "/" AFSDIR_SLVGLOG_FILE +#define AFSDIR_CANONICAL_SERVER_SALSRVLOG_FILEPATH \ +AFSDIR_CANONICAL_SERVER_LOGS_DIRPATH "/" AFSDIR_SALSRVLOG_FILE + /* --------------------- Local path macros ---------------------- */ @@ -259,6 +269,9 @@ typedef enum afsdir_id { AFSDIR_SERVER_BIN_FILE_DIRPATH_ID, AFSDIR_CLIENT_CELLALIAS_FILEPATH_ID, AFSDIR_SERVER_KRB_EXCL_FILEPATH_ID, + AFSDIR_SERVER_SALSRV_FILEPATH_ID, + AFSDIR_SERVER_SALSRVLOG_FILEPATH_ID, + AFSDIR_SERVER_FSSTATE_FILEPATH_ID, AFSDIR_PATHSTRING_MAX } afsdir_id_t; @@ -306,7 +319,9 @@ const char *getDirPath(afsdir_id_t string_id); #define AFSDIR_SERVER_VLOG_FILEPATH getDirPath(AFSDIR_SERVER_VLOG_FILEPATH_ID) #define AFSDIR_SERVER_CORELOG_FILEPATH getDirPath(AFSDIR_SERVER_CORELOG_FILEPATH_ID) #define AFSDIR_SERVER_SLVGLOG_FILEPATH getDirPath(AFSDIR_SERVER_SLVGLOG_FILEPATH_ID) +#define AFSDIR_SERVER_SALSRVLOG_FILEPATH getDirPath(AFSDIR_SERVER_SALSRVLOG_FILEPATH_ID) #define AFSDIR_SERVER_SALVAGER_FILEPATH getDirPath(AFSDIR_SERVER_SALVAGER_FILEPATH_ID) +#define AFSDIR_SERVER_SALSRV_FILEPATH getDirPath(AFSDIR_SERVER_SALSRV_FILEPATH_ID) #define AFSDIR_SERVER_BOZCONF_FILEPATH getDirPath(AFSDIR_SERVER_BOZCONF_FILEPATH_ID) #define AFSDIR_SERVER_BOZCONFNEW_FILEPATH getDirPath(AFSDIR_SERVER_BOZCONFNEW_FILEPATH_ID) #define AFSDIR_SERVER_BOZINIT_FILEPATH getDirPath(AFSDIR_SERVER_BOZINIT_FILEPATH_ID) @@ -328,6 +343,7 @@ const char *getDirPath(afsdir_id_t string_id); #define AFSDIR_SERVER_THRESHOLD_CONSTANTS_FILEPATH 
getDirPath(AFSDIR_SERVER_THRESHOLD_CONSTANTS_FILEPATH_ID) #define AFSDIR_SERVER_MIGRATELOG_FILEPATH getDirPath(AFSDIR_SERVER_MIGRATELOG_FILEPATH_ID) #define AFSDIR_SERVER_KRB_EXCL_FILEPATH getDirPath(AFSDIR_SERVER_KRB_EXCL_FILEPATH_ID) +#define AFSDIR_SERVER_FSSTATE_FILEPATH getDirPath(AFSDIR_SERVER_FSSTATE_FILEPATH_ID) /* client file paths */ #define AFSDIR_CLIENT_THISCELL_FILEPATH getDirPath(AFSDIR_CLIENT_THISCELL_FILEPATH_ID) diff --git a/src/util/errors.h b/src/util/errors.h index aa805d27ca..bc16dd6eb4 100644 --- a/src/util/errors.h +++ b/src/util/errors.h @@ -50,6 +50,7 @@ * to THIS server to find out where */ #define VIO 112 /* Vnode temporarily unaccessible, but not known * to be permanently bad. */ +#define VSALVAGING 113 /* Volume is being salvaged (demand attach fs) */ #define VRESTRICTED 120 /* Volume is restricted from using one or more * of the given residencies; do a * vos examine to find out the current diff --git a/src/util/strnlen.c b/src/util/strnlen.c new file mode 100644 index 0000000000..6c350df90d --- /dev/null +++ b/src/util/strnlen.c @@ -0,0 +1,35 @@ +/* + * Copyright 2006, Sine Nomine Associates and others. + * All Rights Reserved. + * + * This software has been released under the terms of the IBM Public + * License. 
For details, see the LICENSE file in the top-level source + * directory or online at http://www.openafs.org/dl/license10.html + */ + +/* strnlen.c - fixed length string length */ + +#include +#include + +RCSID + ("$Header$"); + +#include +#include +#include + + +size_t +afs_strnlen(char * buf, size_t len) +{ + size_t i; + + for (i = 0; i < len; i++) { + if (buf[i] == '\0') + break; + } + + return i; +} + diff --git a/src/viced/Makefile.in b/src/viced/Makefile.in index 1b7d23f597..6de76052eb 100644 --- a/src/viced/Makefile.in +++ b/src/viced/Makefile.in @@ -50,6 +50,7 @@ headers=${TOP_INCDIR}/lwp.h \ ${TOP_INCDIR}/afs/afsint.h \ viced.h \ host.h \ + callback.h \ fs_stats.h objects=viced.o \ diff --git a/src/viced/NTMakefile b/src/viced/NTMakefile index 125d1ca6aa..0ffb6b7e95 100644 --- a/src/viced/NTMakefile +++ b/src/viced/NTMakefile @@ -5,6 +5,8 @@ # License. For details, see the LICENSE file in the top-level source # directory or online at http://www.openafs.org/dl/license10.html +AFSDEV_AUXCDEFINES = -DFSSYNC_BUILD_SERVER + RELDIR=viced !INCLUDE ..\config\NTMakefile.$(SYS_NAME) !INCLUDE ..\config\NTMakefile.version diff --git a/src/viced/afsfileprocs.c b/src/viced/afsfileprocs.c index 4743a2cb99..429a7de3a2 100644 --- a/src/viced/afsfileprocs.c +++ b/src/viced/afsfileprocs.c @@ -112,6 +112,7 @@ RCSID #include "viced_prototypes.h" #include "viced.h" #include "host.h" +#include "callback.h" #include #include #include @@ -209,7 +210,7 @@ extern afs_int32 readonlyServer; /* * Externals used by the xstat code. 
*/ -extern int VolumeCacheSize, VolumeGets, VolumeReplacements; +extern VolPkgStats VStats; extern int CEs, CEBlocks; extern int HTs, HTBlocks; @@ -438,7 +439,7 @@ static afs_int32 CheckVnode(AFSFid * fid, Volume ** volptr, Vnode ** vptr, int lock) { int fileCode = 0; - int errorCode = -1; + afs_int32 local_errorCode, errorCode = -1; static struct timeval restartedat = { 0, 0 }; if (fid->Volume == 0 || fid->Vnode == 0) /* not: || fid->Unique == 0) */ @@ -448,7 +449,7 @@ CheckVnode(AFSFid * fid, Volume ** volptr, Vnode ** vptr, int lock) while (1) { errorCode = 0; - *volptr = VGetVolume(&errorCode, (afs_int32) fid->Volume); + *volptr = VGetVolume(&local_errorCode, &errorCode, (afs_int32) fid->Volume); if (!errorCode) { assert(*volptr); break; @@ -525,8 +526,10 @@ CheckVnode(AFSFid * fid, Volume ** volptr, Vnode ** vptr, int lock) } } } - /* allow read operations on busy volume */ - else if (errorCode == VBUSY && lock == READ_LOCK) { + /* allow read operations on busy volume. + * must check local_errorCode because demand attach fs + * can have local_errorCode == VSALVAGING, errorCode == VBUSY */ + else if (local_errorCode == VBUSY && lock == READ_LOCK) { errorCode = 0; break; } else if (errorCode) @@ -1151,6 +1154,8 @@ CopyOnWrite(Vnode * targetptr, Volume * volptr) wrlen, errno)); #ifdef FAST_RESTART /* if running in no-salvage, don't core the server */ ViceLog(0, ("CopyOnWrite failed: taking volume offline\n")); +#elif defined(AFS_DEMAND_ATTACH_FS) + ViceLog(0, ("CopyOnWrite failed: requesting salvage\n")); #else /* Avoid further corruption and try to get a core. 
*/ assert(0); #endif @@ -5564,7 +5569,7 @@ SRXAFS_XStatsVersion(struct rx_call * a_call, afs_int32 * a_versionP) static void FillPerfValues(struct afs_PerfStats *a_perfP) { /*FillPerfValues */ - + afs_uint32 hi, lo; int dir_Buffers; /*# buffers in use by dir package */ int dir_Calls; /*# read calls in dir package */ int dir_IOs; /*# I/O ops in dir package */ @@ -5582,9 +5587,11 @@ FillPerfValues(struct afs_PerfStats *a_perfP) a_perfP->vcache_S_Gets = VnodeClassInfo[vSmall].gets; a_perfP->vcache_S_Reads = VnodeClassInfo[vSmall].reads; a_perfP->vcache_S_Writes = VnodeClassInfo[vSmall].writes; - a_perfP->vcache_H_Entries = VolumeCacheSize; - a_perfP->vcache_H_Gets = VolumeGets; - a_perfP->vcache_H_Replacements = VolumeReplacements; + a_perfP->vcache_H_Entries = VStats.hdr_cache_size; + SplitInt64(VStats.hdr_gets, hi, lo); + a_perfP->vcache_H_Gets = lo; + SplitInt64(VStats.hdr_loads, hi, lo); + a_perfP->vcache_H_Replacements = lo; /* * Directory section. diff --git a/src/viced/callback.c b/src/viced/callback.c index 8c3040dc96..44b4523576 100644 --- a/src/viced/callback.c +++ b/src/viced/callback.c @@ -5,6 +5,8 @@ * This software has been released under the terms of the IBM Public * License. 
For details, see the LICENSE file in the top-level source * directory or online at http://www.openafs.org/dl/license10.html + * + * Portions Copyright (c) 2006 Sine Nomine Associates */ /* @@ -120,94 +122,24 @@ RCSID #include /* need definition of prlist for host.h */ #include "host.h" +#include "callback.h" +#ifdef AFS_DEMAND_ATTACH_FS +#include "../tviced/serialize_state.h" +#endif /* AFS_DEMAND_ATTACH_FS */ + extern afsUUID FS_HostUUID; extern int hostCount; -int ShowProblems = 1; - -/* Maximum number of call backs to break at once, single fid */ -/* There is some debate as to just how large this value should be */ -/* Ideally, it would be very very large, but I am afraid that the */ -/* cache managers will all send in their responses simultaneously, */ -/* thereby swamping the file server. As a result, something like */ -/* 10 or 15 might be a better bet. */ -#define MAX_CB_HOSTS 10 - -/* max time to break a callback, otherwise client is dead or net is hosed */ -#define MAXCBT 25 - -#define u_byte unsigned char +static int ShowProblems = 1; struct cbcounters cbstuff; -struct cbstruct { - struct host *hp; - afs_uint32 thead; -}; +static struct FileEntry * FE = NULL; /* don't use FE[0] */ +static struct CallBack * CB = NULL; /* don't use CB[0] */ -struct FileEntry { - afs_uint32 vnode; - afs_uint32 unique; - afs_uint32 volid; - afs_uint32 fnext; - afs_uint32 ncbs; - afs_uint32 firstcb; - afs_uint32 status; - afs_uint32 spare; -} *FE; /* Don't use FE[0] */ -#define FE_LATER 0x1 +static struct CallBack * CBfree = NULL; +static struct FileEntry * FEfree = NULL; -struct CallBack { - afs_uint32 cnext; /* Next call back entry */ - afs_uint32 fhead; /* Head of this call back chain */ - u_byte thead; /* Head of timeout chain */ - u_byte status; /* Call back status; see definitions, below */ - afs_uint32 hhead; /* Head of host table chain */ - afs_uint32 tprev, tnext; /* Timeout chain */ - afs_uint32 hprev, hnext; /* Chain from host table */ - unsigned short spare; /* 
make it a multiple of 32 bits. */ -} *CB; /* Don't use CB[0] */ - -/* status values for status field of CallBack structure */ -#define CB_NORMAL 1 /* Normal call back */ -#define CB_DELAYED 2 /* Delayed call back due to rpc problems. - * The call back entry will be added back to the - * host list at the END of the list, so that - * searching backwards in the list will find all - * the (consecutive)host. delayed call back entries */ -#define CB_VOLUME 3 /* Callback for a volume */ -#define CB_BULK 4 /* Normal callbacks, handed out from FetchBulkStatus */ - -/* call back indices to pointers, and vice-versa */ -#define itocb(i) ((i)?CB+(i):0) -#define cbtoi(cbp) (!(cbp)?0:(cbp)-CB) - -/* file entry indices to pointers, and vice-versa */ -#define itofe(i) ((i)?FE+(i):0) -#define fetoi(fep) (!(fep)?0:(fep)-FE) - -/* Timeouts: there are 128 possible timeout values in effect at any - * given time. Each timeout represents timeouts in an interval of 128 - * seconds. So the maximum timeout for a call back is 128*128=16384 - * seconds, or 4 1/2 hours. The timeout cleanup stuff is called only - * if space runs out or by the file server every 5 minutes. This 5 - * minute slack should be allowed for--so a maximum time of 4 hours - * is safer. - * - * Timeouts must be chosen to correspond to an exact multiple - * of 128, because all times are truncated to a 128 multiple, and - * timed out if the current truncated time is <= to the truncated time - * corresponding to the timeout queue. - */ - -/* Unix time to Call Back time, and vice-versa. Call back time is - in units of 128 seconds, corresponding to time queues. */ -#define CBtime(uxtime) ((uxtime)>>7) -#define UXtime(cbtime) ((cbtime)<<7) - -/* Given a Unix time, compute the closest Unix time that corresponds to - a time queue, rounding up */ -#define TimeCeiling(uxtime) (((uxtime)+127)&~127) /* Time to live for call backs depends upon number of users of the file. * TimeOuts is indexed by this number/8 (using TimeOut macro). 
Times @@ -229,52 +161,17 @@ static int TimeOuts[] = { /* minimum time given for a call back */ static int MinTimeOut = (7 * 60); -#define TimeOutCutoff ((sizeof(TimeOuts)/sizeof(TimeOuts[0]))*8) -#define TimeOut(nusers) ((nusers)>=TimeOutCutoff? MinTimeOut: TimeOuts[(nusers)>>3]) - -/* time out at server is 3 minutes more than ws */ -#define ServerBias (3*60) - /* Heads of CB queues; a timeout index is 1+index into this array */ -static afs_uint32 timeout[128]; - -/* Convert cbtime to timeout queue index */ -#define TIndex(cbtime) (((cbtime)&127)+1) - -/* Convert cbtime to pointer to timeout queue head */ -#define THead(cbtime) (&timeout[TIndex(cbtime)-1]) +static afs_uint32 timeout[CB_NUM_TIMEOUT_QUEUES]; static afs_int32 tfirst; /* cbtime of oldest unexpired call back time queue */ -/* Normalize index into timeout array so that two such indices will be - ordered correctly, so that they can be compared to see which times - sooner, or so that the difference in time out times between them - can be computed. */ -#define TNorm(index) ((index)Volume, fid->Unique); + hash = FEHash(fid->Volume, fid->Unique); for (fei = HashTable[hash]; fei; fei = fe->fnext) { fe = itofe(fei); if (fe->volid == fid->Volume && fe->unique == fid->Unique @@ -421,11 +317,11 @@ HAdd(register struct CallBack *cb, register struct host *host) if (!host->cblist) { host->cblist = cb->hnext = cb->hprev = cbtoi(cb); } else { - register struct CallBack *hhp = itocb(host->cblist); + register struct CallBack *fcb = itocb(host->cblist); - cb->hprev = hhp->hprev; - cb->hnext = host->cblist; - hhp->hprev = (itocb(hhp->hprev)->hnext = cbtoi(cb)); + cb->hprev = fcb->hprev; + cb->hnext = cbtoi(fcb); + fcb->hprev = (itocb(fcb->hprev)->hnext = cbtoi(cb)); } return 0; } @@ -475,7 +371,7 @@ CDel(struct CallBack *cb, int deletefe) /* N.B. 
This one also deletes the CB, and also possibly parent FE, so * make sure that it is not on any other list before calling this * routine */ -int Ccdelpt = 0, CcdelB = 0; +static int Ccdelpt = 0, CcdelB = 0; static int CDelPtr(register struct FileEntry *fe, register afs_uint32 * cbp, @@ -522,7 +418,7 @@ static int FDel(register struct FileEntry *fe) { register int fei = fetoi(fe); - register afs_uint32 *p = &HashTable[VHash(fe->volid, fe->unique)]; + register afs_uint32 *p = &HashTable[FEHash(fe->volid, fe->unique)]; while (*p && *p != fei) p = &itofe(*p)->fnext; @@ -532,6 +428,7 @@ FDel(register struct FileEntry *fe) return 0; } +/* initialize the callback package */ int InitCallBack(int nblks) { @@ -539,19 +436,21 @@ InitCallBack(int nblks) tfirst = CBtime(FT_ApproxTime()); /* N.B. The "-1", below, is because * FE[0] and CB[0] are not used--and not allocated */ - FE = ((struct FileEntry *)(calloc(nblks, sizeof(struct FileEntry)))) - 1; + FE = ((struct FileEntry *)(calloc(nblks, sizeof(struct FileEntry)))); if (!FE) { ViceLog(0, ("Failed malloc in InitCallBack\n")); assert(0); } + FE--; /* FE[0] is supposed to point to junk */ cbstuff.nFEs = nblks; while (cbstuff.nFEs) FreeFE(&FE[cbstuff.nFEs]); /* This is correct */ - CB = ((struct CallBack *)(calloc(nblks, sizeof(struct CallBack)))) - 1; + CB = ((struct CallBack *)(calloc(nblks, sizeof(struct CallBack)))); if (!CB) { ViceLog(0, ("Failed malloc in InitCallBack\n")); assert(0); } + CB--; /* CB[0] is supposed to point to junk */ cbstuff.nCBs = nblks; while (cbstuff.nCBs) FreeCB(&CB[cbstuff.nCBs]); /* This is correct */ @@ -696,7 +595,7 @@ AddCallBack1_r(struct host *host, AFSFid * fid, afs_uint32 * thead, int type, fe->unique = fid->Unique; fe->ncbs = 0; fe->status = 0; - hash = VHash(fid->Volume, fid->Unique); + hash = FEHash(fid->Volume, fid->Unique); fe->fnext = HashTable[hash]; HashTable[hash] = fetoi(fe); } @@ -1302,7 +1201,7 @@ BreakVolumeCallBacks(afs_uint32 volume) H_LOCK; fid.Volume = volume, fid.Vnode = 
fid.Unique = 0; - for (hash = 0; hash < VHASH; hash++) { + for (hash = 0; hash < FEHASH_SIZE; hash++) { for (feip = &HashTable[hash]; (fe = itofe(*feip));) { if (fe->volid == volume) { register struct CallBack *cbnext; @@ -1360,7 +1259,7 @@ int BreakVolumeCallBacksLater(afs_uint32 volume) { int hash; - afs_int32 *feip; + afs_uint32 *feip; struct FileEntry *fe; struct CallBack *cb; struct host *host; @@ -1368,7 +1267,7 @@ BreakVolumeCallBacksLater(afs_uint32 volume) ViceLog(25, ("Setting later on volume %u\n", volume)); H_LOCK; - for (hash = 0; hash < VHASH; hash++) { + for (hash = 0; hash < FEHASH_SIZE; hash++) { for (feip = &HashTable[hash]; (fe = itofe(*feip)) != NULL; ) { if (fe->volid == volume) { register struct CallBack *cbnext; @@ -1381,7 +1280,7 @@ BreakVolumeCallBacksLater(afs_uint32 volume) FSYNC_LOCK; fe->status |= FE_LATER; FSYNC_UNLOCK; - found++; + found = 1; } feip = &fe->fnext; } @@ -1408,7 +1307,7 @@ BreakLaterCallBacks(void) { struct AFSFid fid; int hash; - afs_int32 *feip; + afs_uint32 *feip; struct CallBack *cb; struct FileEntry *fe = NULL; struct FileEntry *myfe = NULL; @@ -1424,7 +1323,7 @@ BreakLaterCallBacks(void) /* Pick the first volume we see to clean up */ fid.Volume = fid.Vnode = fid.Unique = 0; - for (hash = 0; hash < VHASH; hash++) { + for (hash = 0; hash < FEHASH_SIZE; hash++) { for (feip = &HashTable[hash]; (fe = itofe(*feip)) != NULL; ) { if (fe && (fe->status & FE_LATER) && (fid.Volume == 0 || fid.Volume == fe->volid)) { @@ -1775,6 +1674,973 @@ PrintCallBackStats(void) #ifndef INTERPRET_DUMP +#ifdef AFS_DEMAND_ATTACH_FS +/* + * demand attach fs + * callback state serialization + */ +static int cb_stateSaveTimeouts(struct fs_dump_state * state); +static int cb_stateSaveFEHash(struct fs_dump_state * state); +static int cb_stateSaveFEs(struct fs_dump_state * state); +static int cb_stateSaveFE(struct fs_dump_state * state, struct FileEntry * fe); +static int cb_stateRestoreTimeouts(struct fs_dump_state * state); +static int 
cb_stateRestoreFEHash(struct fs_dump_state * state); +static int cb_stateRestoreFEs(struct fs_dump_state * state); +static int cb_stateRestoreFE(struct fs_dump_state * state); +static int cb_stateRestoreCBs(struct fs_dump_state * state, struct FileEntry * fe, + struct iovec * iov, int niovecs); + +static int cb_stateVerifyFEHash(struct fs_dump_state * state); +static int cb_stateVerifyFE(struct fs_dump_state * state, struct FileEntry * fe); +static int cb_stateVerifyFCBList(struct fs_dump_state * state, struct FileEntry * fe); +static int cb_stateVerifyTimeoutQueues(struct fs_dump_state * state); + +static int cb_stateFEToDiskEntry(struct FileEntry *, struct FEDiskEntry *); +static int cb_stateDiskEntryToFE(struct fs_dump_state * state, + struct FEDiskEntry *, struct FileEntry *); + +static int cb_stateCBToDiskEntry(struct CallBack *, struct CBDiskEntry *); +static int cb_stateDiskEntryToCB(struct fs_dump_state * state, + struct CBDiskEntry *, struct CallBack *); + +static int cb_stateFillHeader(struct callback_state_header * hdr); +static int cb_stateCheckHeader(struct callback_state_header * hdr); + +static int cb_stateAllocMap(struct fs_dump_state * state); + +int +cb_stateSave(struct fs_dump_state * state) +{ + int ret = 0; + + AssignInt64(state->eof_offset, &state->hdr->cb_offset); + + /* invalidate callback state header */ + memset(state->cb_hdr, 0, sizeof(struct callback_state_header)); + if (fs_stateWriteHeader(state, &state->hdr->cb_offset, state->cb_hdr, + sizeof(struct callback_state_header))) { + ret = 1; + goto done; + } + + fs_stateIncEOF(state, sizeof(struct callback_state_header)); + + /* dump timeout state */ + if (cb_stateSaveTimeouts(state)) { + ret = 1; + goto done; + } + + /* dump fe hashtable state */ + if (cb_stateSaveFEHash(state)) { + ret = 1; + goto done; + } + + /* dump callback state */ + if (cb_stateSaveFEs(state)) { + ret = 1; + goto done; + } + + /* write the callback state header to disk */ + cb_stateFillHeader(state->cb_hdr); + if 
(fs_stateWriteHeader(state, &state->hdr->cb_offset, state->cb_hdr, + sizeof(struct callback_state_header))) { + ret = 1; + goto done; + } + + done: + return ret; +} + +int +cb_stateRestore(struct fs_dump_state * state) +{ + int ret = 0; + + if (fs_stateReadHeader(state, &state->hdr->cb_offset, state->cb_hdr, + sizeof(struct callback_state_header))) { + ret = 1; + goto done; + } + + if (cb_stateCheckHeader(state->cb_hdr)) { + ret = 1; + goto done; + } + + if (cb_stateAllocMap(state)) { + ret = 1; + goto done; + } + + if (cb_stateRestoreTimeouts(state)) { + ret = 1; + goto done; + } + + if (cb_stateRestoreFEHash(state)) { + ret = 1; + goto done; + } + + /* restore FEs and CBs from disk */ + if (cb_stateRestoreFEs(state)) { + ret = 1; + goto done; + } + + /* restore the timeout queue heads */ + tfirst = state->cb_hdr->tfirst; + + done: + return ret; +} + +int +cb_stateRestoreIndices(struct fs_dump_state * state) +{ + int i, ret = 0; + struct FileEntry * fe; + struct CallBack * cb; + + /* restore indices in the FileEntry structures */ + for (i = 1; i < state->fe_map.len; i++) { + if (state->fe_map.entries[i].new_idx) { + fe = itofe(state->fe_map.entries[i].new_idx); + + /* restore the fe->fnext entry */ + if (fe_OldToNew(state, fe->fnext, &fe->fnext)) { + ret = 1; + goto done; + } + + /* restore the fe->firstcb entry */ + if (cb_OldToNew(state, fe->firstcb, &fe->firstcb)) { + ret = 1; + goto done; + } + } + } + + /* restore indices in the CallBack structures */ + for (i = 1; i < state->cb_map.len; i++) { + if (state->cb_map.entries[i].new_idx) { + cb = itocb(state->cb_map.entries[i].new_idx); + + /* restore the cb->cnext entry */ + if (cb_OldToNew(state, cb->cnext, &cb->cnext)) { + ret = 1; + goto done; + } + + /* restore the cb->fhead entry */ + if (fe_OldToNew(state, cb->fhead, &cb->fhead)) { + ret = 1; + goto done; + } + + /* restore the cb->hhead entry */ + if (h_OldToNew(state, cb->hhead, &cb->hhead)) { + ret = 1; + goto done; + } + + /* restore the cb->tprev 
entry */ + if (cb_OldToNew(state, cb->tprev, &cb->tprev)) { + ret = 1; + goto done; + } + + /* restore the cb->tnext entry */ + if (cb_OldToNew(state, cb->tnext, &cb->tnext)) { + ret = 1; + goto done; + } + + /* restore the cb->hprev entry */ + if (cb_OldToNew(state, cb->hprev, &cb->hprev)) { + ret = 1; + goto done; + } + + /* restore the cb->hnext entry */ + if (cb_OldToNew(state, cb->hnext, &cb->hnext)) { + ret = 1; + goto done; + } + } + } + + /* restore the timeout queue head indices */ + for (i = 0; i < state->cb_timeout_hdr->records; i++) { + if (cb_OldToNew(state, timeout[i], &timeout[i])) { + ret = 1; + goto done; + } + } + + /* restore the FE hash table queue heads */ + for (i = 0; i < state->cb_fehash_hdr->records; i++) { + if (fe_OldToNew(state, HashTable[i], &HashTable[i])) { + ret = 1; + goto done; + } + } + + done: + return ret; +} + +int +cb_stateVerify(struct fs_dump_state * state) +{ + int ret = 0; + + if (cb_stateVerifyFEHash(state)) { + ret = 1; + } + + if (cb_stateVerifyTimeoutQueues(state)) { + ret = 1; + } + + done: + return ret; +} + +static int +cb_stateVerifyFEHash(struct fs_dump_state * state) +{ + int ret = 0, i; + struct FileEntry * fe; + afs_uint32 fei, chain_len; + + for (i = 0; i < FEHASH_SIZE; i++) { + chain_len = 0; + for (fei = HashTable[i], fe = itofe(fei); + fe; + fei = fe->fnext, fe = itofe(fei)) { + if (fei > cbstuff.nblks) { + ViceLog(0, ("cb_stateVerifyFEHash: error: index out of range (fei=%d)\n", fei)); + ret = 1; + break; + } + if (cb_stateVerifyFE(state, fe)) { + ret = 1; + } + if (chain_len > FS_STATE_FE_MAX_HASH_CHAIN_LEN) { + ViceLog(0, ("cb_stateVerifyFEHash: error: hash chain %d length exceeds %d; assuming there's a loop\n", + i, FS_STATE_FE_MAX_HASH_CHAIN_LEN)); + ret = 1; + break; + } + chain_len++; + } + } + + done: + return ret; +} + +static int +cb_stateVerifyFE(struct fs_dump_state * state, struct FileEntry * fe) +{ + int ret = 0; + + if ((fe->firstcb && !fe->ncbs) || + (!fe->firstcb && fe->ncbs)) { + 
ViceLog(0, ("cb_stateVerifyFE: error: fe->firstcb does not agree with fe->ncbs (fei=%d, fe->firstcb=%d, fe->ncbs=%d)\n", + fetoi(fe), fe->firstcb, fe->ncbs)); + ret = 1; + } + if (cb_stateVerifyFCBList(state, fe)) { + ViceLog(0, ("cb_stateVerifyFE: error: FCBList failed verification (fei=%d)\n", fetoi(fe))); + ret = 1; + } + + done: + return ret; +} + +static int +cb_stateVerifyFCBList(struct fs_dump_state * state, struct FileEntry * fe) +{ + int ret = 0; + afs_uint32 cbi, fei, chain_len = 0; + struct CallBack * cb; + + fei = fetoi(fe); + + for (cbi = fe->firstcb, cb = itocb(cbi); + cb; + cbi = cb->cnext, cb = itocb(cbi)) { + if (cbi > cbstuff.nblks) { + ViceLog(0, ("cb_stateVerifyFCBList: error: list index out of range (cbi=%d, ncbs=%d)\n", + cbi, cbstuff.nblks)); + ret = 1; + goto done; + } + if (cb->fhead != fei) { + ViceLog(0, ("cb_stateVerifyFCBList: error: cb->fhead != fei (fei=%d, cb->fhead=%d)\n", + fei, cb->fhead)); + ret = 1; + } + if (chain_len > FS_STATE_FCB_MAX_LIST_LEN) { + ViceLog(0, ("cb_stateVerifyFCBList: error: list length exceeds %d (fei=%d); assuming there's a loop\n", + FS_STATE_FCB_MAX_LIST_LEN, fei)); + ret = 1; + goto done; + } + chain_len++; + } + + if (fe->ncbs != chain_len) { + ViceLog(0, ("cb_stateVerifyFCBList: error: list length mismatch (len=%d, fe->ncbs=%d)\n", + chain_len, fe->ncbs)); + ret = 1; + } + + done: + return ret; +} + +int +cb_stateVerifyHCBList(struct fs_dump_state * state, struct host * host) +{ + int ret = 0; + afs_uint32 hi, chain_len, cbi; + struct CallBack *cb, *ncb; + + hi = h_htoi(host); + chain_len = 0; + + for (cbi = host->cblist, cb = itocb(cbi); + cb; + cbi = cb->hnext, cb = ncb) { + if (chain_len && (host->cblist == cbi)) { + /* we've wrapped around the circular list, and everything looks ok */ + break; + } + if (cb->hhead != hi) { + ViceLog(0, ("cb_stateVerifyHCBList: error: incorrect cb->hhead (cbi=%d, h->index=%d, cb->hhead=%d)\n", + cbi, hi, cb->hhead)); + ret = 1; + } + if (!cb->hprev || !cb->hnext) { + 
ViceLog(0, ("cb_stateVerifyHCBList: error: null index in circular list (cbi=%d, h->index=%d)\n", + cbi, hi)); + ret = 1; + goto done; + } + if ((cb->hprev > cbstuff.nblks) || + (cb->hnext > cbstuff.nblks)) { + ViceLog(0, ("cb_stateVerifyHCBList: error: list index out of range (cbi=%d, h->index=%d, cb->hprev=%d, cb->hnext=%d, nCBs=%d)\n", + cbi, hi, cb->hprev, cb->hnext, cbstuff.nblks)); + ret = 1; + goto done; + } + ncb = itocb(cb->hnext); + if (cbi != ncb->hprev) { + ViceLog(0, ("cb_stateVerifyHCBList: error: corrupt linked list (cbi=%d, h->index=%d)\n", + cbi, hi)); + ret = 1; + goto done; + } + if (chain_len > FS_STATE_HCB_MAX_LIST_LEN) { /* bound the walk so a cyclic list cannot loop forever */ + ViceLog(0, ("cb_stateVerifyHCBList: error: list length exceeds %d (h->index=%d); assuming there's a loop\n", + FS_STATE_HCB_MAX_LIST_LEN, hi)); + ret = 1; + goto done; + } + chain_len++; + } + + done: + return ret; +} + +static int +cb_stateVerifyTimeoutQueues(struct fs_dump_state * state) +{ + int ret = 0, i; + afs_uint32 cbi, chain_len; + struct CallBack *cb, *ncb; + + for (i = 0; i < CB_NUM_TIMEOUT_QUEUES; i++) { + chain_len = 0; + for (cbi = timeout[i], cb = itocb(cbi); + cb; + cbi = cb->tnext, cb = ncb) { + if (chain_len && (cbi == timeout[i])) { + /* we've wrapped around the circular list, and everything looks ok */ + break; + } + if (cbi > cbstuff.nblks) { + ViceLog(0, ("cb_stateVerifyTimeoutQueues: error: list index out of range (cbi=%d, tindex=%d)\n", + cbi, i)); + ret = 1; + break; + } + if (itot(cb->thead) != &timeout[i]) { + ViceLog(0, ("cb_stateVerifyTimeoutQueues: error: cb->thead points to wrong timeout queue (tindex=%d, cbi=%d, cb->thead=%d)\n", + i, cbi, cb->thead)); + ret = 1; + } + if (!cb->tprev || !cb->tnext) { + ViceLog(0, ("cb_stateVerifyTimeoutQueues: null index in circular list (cbi=%d, tindex=%d)\n", + cbi, i)); + ret = 1; + break; + } + if ((cb->tprev > cbstuff.nblks) || + (cb->tnext > cbstuff.nblks)) { + ViceLog(0, ("cb_stateVerifyTimeoutQueues: list index out of range (cbi=%d, tindex=%d, 
cb->tprev=%d, cb->tnext=%d, nCBs=%d)\n", + cbi, i, cb->tprev, cb->tnext, cbstuff.nblks)); + ret = 1; + break; + } + ncb = itocb(cb->tnext); + if (cbi != ncb->tprev) { + ViceLog(0, ("cb_stateVerifyTimeoutQueues: corrupt linked list (cbi=%d, tindex=%d)\n", + cbi, i)); + ret = 1; + break; + } + if (chain_len > FS_STATE_TCB_MAX_LIST_LEN) { + ViceLog(0, ("cb_stateVerifyTimeoutQueues: list length exceeds %d (tindex=%d); assuming there's a loop\n", + FS_STATE_TCB_MAX_LIST_LEN, i)); + ret = 1; + break; + } + chain_len++; + } + } + + done: + return ret; +} + +static int +cb_stateSaveTimeouts(struct fs_dump_state * state) +{ + int ret = 0; + struct iovec iov[2]; + + AssignInt64(state->eof_offset, &state->cb_hdr->timeout_offset); + + memset(state->cb_timeout_hdr, 0, sizeof(struct callback_state_timeout_header)); /* clear exactly the timeout header that is written below */ + state->cb_timeout_hdr->magic = CALLBACK_STATE_TIMEOUT_MAGIC; + state->cb_timeout_hdr->records = CB_NUM_TIMEOUT_QUEUES; + state->cb_timeout_hdr->len = sizeof(struct callback_state_timeout_header) + + (state->cb_timeout_hdr->records * sizeof(afs_uint32)); + + iov[0].iov_base = (char *)state->cb_timeout_hdr; + iov[0].iov_len = sizeof(struct callback_state_timeout_header); + iov[1].iov_base = (char *)timeout; + iov[1].iov_len = sizeof(timeout); + + if (fs_stateSeek(state, &state->cb_hdr->timeout_offset)) { + ret = 1; + goto done; + } + + if (fs_stateWriteV(state, iov, 2)) { + ret = 1; + goto done; + } + + fs_stateIncEOF(state, state->cb_timeout_hdr->len); + + done: + return ret; +} + +static int +cb_stateRestoreTimeouts(struct fs_dump_state * state) +{ + int ret = 0, len; + + if (fs_stateReadHeader(state, &state->cb_hdr->timeout_offset, + state->cb_timeout_hdr, + sizeof(struct callback_state_timeout_header))) { + ret = 1; + goto done; + } + + if (state->cb_timeout_hdr->magic != CALLBACK_STATE_TIMEOUT_MAGIC) { + ret = 1; + goto done; + } + if (state->cb_timeout_hdr->records != CB_NUM_TIMEOUT_QUEUES) { + ret = 1; + goto done; + } + + len = state->cb_timeout_hdr->records * 
sizeof(afs_uint32); + + if (state->cb_timeout_hdr->len != + (sizeof(struct callback_state_timeout_header) + len)) { + ret = 1; + goto done; + } + + if (fs_stateRead(state, timeout, len)) { + ret = 1; + goto done; + } + + done: + return ret; +} + +static int +cb_stateSaveFEHash(struct fs_dump_state * state) +{ + int ret = 0; + struct iovec iov[2]; + + AssignInt64(state->eof_offset, &state->cb_hdr->fehash_offset); + + memset(state->cb_fehash_hdr, 0, sizeof(struct callback_state_fehash_header)); + state->cb_fehash_hdr->magic = CALLBACK_STATE_FEHASH_MAGIC; + state->cb_fehash_hdr->records = FEHASH_SIZE; + state->cb_fehash_hdr->len = sizeof(struct callback_state_fehash_header) + + (state->cb_fehash_hdr->records * sizeof(afs_uint32)); + + iov[0].iov_base = (char *)state->cb_fehash_hdr; + iov[0].iov_len = sizeof(struct callback_state_fehash_header); + iov[1].iov_base = (char *)HashTable; + iov[1].iov_len = sizeof(HashTable); + + if (fs_stateSeek(state, &state->cb_hdr->fehash_offset)) { + ret = 1; + goto done; + } + + if (fs_stateWriteV(state, iov, 2)) { + ret = 1; + goto done; + } + + fs_stateIncEOF(state, state->cb_fehash_hdr->len); + + done: + return ret; +} + +static int +cb_stateRestoreFEHash(struct fs_dump_state * state) +{ + int ret = 0, len; + + if (fs_stateReadHeader(state, &state->cb_hdr->fehash_offset, + state->cb_fehash_hdr, + sizeof(struct callback_state_fehash_header))) { + ret = 1; + goto done; + } + + if (state->cb_fehash_hdr->magic != CALLBACK_STATE_FEHASH_MAGIC) { + ret = 1; + goto done; + } + if (state->cb_fehash_hdr->records != FEHASH_SIZE) { + ret = 1; + goto done; + } + + len = state->cb_fehash_hdr->records * sizeof(afs_uint32); + + if (state->cb_fehash_hdr->len != + (sizeof(struct callback_state_fehash_header) + len)) { + ret = 1; + goto done; + } + + if (fs_stateRead(state, HashTable, len)) { + ret = 1; + goto done; + } + + done: + return ret; +} + +static int +cb_stateSaveFEs(struct fs_dump_state * state) +{ + int ret = 0; + register int fei, hash; 
+ register struct FileEntry *fe; + + AssignInt64(state->eof_offset, &state->cb_hdr->fe_offset); + + for (hash = 0; hash < FEHASH_SIZE ; hash++) { + for (fei = HashTable[hash]; fei; fei = fe->fnext) { + fe = itofe(fei); + if (cb_stateSaveFE(state, fe)) { + ret = 1; + goto done; + } + } + } + + done: + return ret; +} + +static int +cb_stateRestoreFEs(struct fs_dump_state * state) +{ + int count, nFEs, ret = 0; + + nFEs = state->cb_hdr->nFEs; + + for (count = 0; count < nFEs; count++) { + if (cb_stateRestoreFE(state)) { + ret = 1; + goto done; + } + } + + done: + return ret; +} + +static int +cb_stateSaveFE(struct fs_dump_state * state, struct FileEntry * fe) +{ + int ret = 0, iovcnt, cbi, len, written = 0; + afs_uint32 fei; + struct callback_state_entry_header hdr; + struct FEDiskEntry fedsk; + struct CBDiskEntry cbdsk[16]; + struct iovec iov[16]; + struct CallBack *cb; + + fei = fetoi(fe); + if (fei > state->cb_hdr->fe_max) { + state->cb_hdr->fe_max = fei; + } + + memset(&hdr, 0, sizeof(struct callback_state_entry_header)); + + if (cb_stateFEToDiskEntry(fe, &fedsk)) { + ret = 1; + goto done; + } + + iov[0].iov_base = (char *)&hdr; + len = iov[0].iov_len = sizeof(hdr); + iov[1].iov_base = (char *)&fedsk; + len += iov[1].iov_len = sizeof(struct FEDiskEntry); + iovcnt = 2; + + for (cbi = fe->firstcb, cb = itocb(cbi); + cb != NULL; + cbi = cb->cnext, cb = itocb(cbi), hdr.nCBs++) { + if (cbi > state->cb_hdr->cb_max) { + state->cb_hdr->cb_max = cbi; + } + if (cb_stateCBToDiskEntry(cb, &cbdsk[iovcnt])) { /* iovcnt is the slot index, so it restarts at 0 after every flush */ + ret = 1; + goto done; + } + cbdsk[iovcnt].index = cbi; + iov[iovcnt].iov_base = (char *)&cbdsk[iovcnt]; + len += iov[iovcnt].iov_len = sizeof(struct CBDiskEntry); + iovcnt++; + if ((iovcnt == 16) || (!cb->cnext)) { + if (fs_stateWriteV(state, iov, iovcnt)) { + ret = 1; + goto done; + } + written = 1; + iovcnt = 0; + len = 0; + } + } + + hdr.magic = CALLBACK_STATE_ENTRY_MAGIC; + hdr.len = sizeof(hdr) + sizeof(struct FEDiskEntry) + + (hdr.nCBs * sizeof(struct 
CBDiskEntry)); + + if (!written) { + if (fs_stateWriteV(state, iov, iovcnt)) { + ret = 1; + goto done; + } + } else { + if (fs_stateWriteHeader(state, &state->eof_offset, &hdr, sizeof(hdr))) { + ret = 1; + goto done; + } + } + + fs_stateIncEOF(state, hdr.len); + + if (written) { + if (fs_stateSeek(state, &state->eof_offset)) { + ret = 1; + goto done; + } + } + + state->cb_hdr->nFEs++; + state->cb_hdr->nCBs += hdr.nCBs; + + done: + return ret; +} + +static int +cb_stateRestoreFE(struct fs_dump_state * state) +{ + int ret = 0, iovcnt, len, nCBs; + struct callback_state_entry_header hdr; + struct FEDiskEntry fedsk; + struct CBDiskEntry cbdsk[16]; + struct iovec iov[16]; + struct FileEntry * fe; + struct CallBack * cb; + + iov[0].iov_base = (char *)&hdr; + len = iov[0].iov_len = sizeof(hdr); + iov[1].iov_base = (char *)&fedsk; + len += iov[1].iov_len = sizeof(fedsk); + iovcnt = 2; + + if (fs_stateReadV(state, iov, iovcnt)) { + ret = 1; + goto done; + } + + if (hdr.magic != CALLBACK_STATE_ENTRY_MAGIC) { + ret = 1; + goto done; + } + + fe = GetFE(); + if (fe == NULL) { + ViceLog(0, ("cb_stateRestoreFE: ran out of free FileEntry structures\n")); + ret = 1; + goto done; + } + + if (cb_stateDiskEntryToFE(state, &fedsk, fe)) { + ret = 1; + goto done; + } + + if (hdr.nCBs) { + for (iovcnt = 0, len = 0, nCBs = 0; + nCBs < hdr.nCBs; + nCBs++) { + iov[iovcnt].iov_base = (char *)&cbdsk[iovcnt]; /* iovcnt is the slot index, so each batch reuses slots 0..15 */ + len += iov[iovcnt].iov_len = sizeof(struct CBDiskEntry); + iovcnt++; + if ((iovcnt == 16) || (nCBs == hdr.nCBs - 1)) { + if (fs_stateReadV(state, iov, iovcnt)) { + ret = 1; + goto done; + } + if (cb_stateRestoreCBs(state, fe, iov, iovcnt)) { + ret = 1; + goto done; + } + len = 0; + iovcnt = 0; + } + } + } + + done: + return ret; +} + +static int +cb_stateRestoreCBs(struct fs_dump_state * state, struct FileEntry * fe, + struct iovec * iov, int niovecs) +{ + int ret = 0, idx; + register struct CallBack * cb; + struct CBDiskEntry * cbdsk; + afs_uint32 fei; + + fei = fetoi(fe); + 
+ + for (idx = 0; idx < niovecs; idx++) { + cbdsk = (struct CBDiskEntry *) iov[idx].iov_base; + if ((cb = GetCB()) == NULL) { + ViceLog(0, ("cb_stateRestoreCBs: ran out of free CallBack structures\n")); + ret = 1; + goto done; + } + if (cb_stateDiskEntryToCB(state, cbdsk, cb)) { + ViceLog(0, ("cb_stateRestoreCBs: corrupt CallBack disk entry\n")); + ret = 1; + goto done; + } + } + + done: + return ret; +} + + +static int +cb_stateFillHeader(struct callback_state_header * hdr) +{ + hdr->stamp.magic = CALLBACK_STATE_MAGIC; + hdr->stamp.version = CALLBACK_STATE_VERSION; + hdr->tfirst = tfirst; + return 0; +} + +static int +cb_stateCheckHeader(struct callback_state_header * hdr) +{ + int ret = 0; + + if (hdr->stamp.magic != CALLBACK_STATE_MAGIC) { + ret = 1; + } else if (hdr->stamp.version != CALLBACK_STATE_VERSION) { + ret = 1; + } else if ((hdr->nFEs > cbstuff.nblks) || (hdr->nCBs > cbstuff.nblks)) { + ViceLog(0, ("cb_stateCheckHeader: saved callback state larger than callback memory allocation\n")); + ret = 1; + } + return ret; +} + +/* disk entry conversion routines */ +static int +cb_stateFEToDiskEntry(struct FileEntry * in, struct FEDiskEntry * out) +{ + memcpy(&out->fe, in, sizeof(struct FileEntry)); + out->index = fetoi(in); + return 0; +} + +static int +cb_stateDiskEntryToFE(struct fs_dump_state * state, + struct FEDiskEntry * in, struct FileEntry * out) +{ + int ret = 0; + + memcpy(out, &in->fe, sizeof(struct FileEntry)); + + /* setup FE map entry */ + if (!in->index || (in->index >= state->fe_map.len)) { /* index 0 is never a valid FE, and the map holds fe_map.len slots */ + ViceLog(0, ("cb_stateDiskEntryToFE: index (%d) out of range\n", + in->index)); + ret = 1; + goto done; + } + state->fe_map.entries[in->index].old_idx = in->index; + state->fe_map.entries[in->index].new_idx = fetoi(out); + + done: + return ret; +} + +static int +cb_stateCBToDiskEntry(struct CallBack * in, struct CBDiskEntry * out) +{ + memcpy(&out->cb, in, sizeof(struct CallBack)); + out->index = cbtoi(in); + return 0; +} + +static int 
+cb_stateDiskEntryToCB(struct fs_dump_state * state, + struct CBDiskEntry * in, struct CallBack * out) +{ + int ret = 0; + + memcpy(out, &in->cb, sizeof(struct CallBack)); + + /* setup CB map entry */ + if (!in->index || (in->index >= state->cb_map.len)) { + ViceLog(0, ("cb_stateDiskEntryToCB: index (%d) out of range\n", + in->index)); + ret = 1; + goto done; + } + state->cb_map.entries[in->index].old_idx = in->index; + state->cb_map.entries[in->index].new_idx = cbtoi(out); + + done: + return ret; +} + +/* index map routines */ +static int +cb_stateAllocMap(struct fs_dump_state * state) +{ + state->fe_map.len = state->cb_hdr->fe_max + 1; + state->cb_map.len = state->cb_hdr->cb_max + 1; + state->fe_map.entries = (struct idx_map_entry_t *) + calloc(state->fe_map.len, sizeof(struct idx_map_entry_t)); + state->cb_map.entries = (struct idx_map_entry_t *) + calloc(state->cb_map.len, sizeof(struct idx_map_entry_t)); + return ((state->fe_map.entries != NULL) && (state->cb_map.entries != NULL)) ? 
0 : 1; +} + +int +fe_OldToNew(struct fs_dump_state * state, afs_uint32 old, afs_uint32 * new) +{ + int ret = 0; + + /* FEs use a one-based indexing system, so old==0 implies no mapping */ + if (!old) { + *new = 0; + goto done; + } + + if (old >= state->fe_map.len) { + ViceLog(0, ("fe_OldToNew: index %d is out of range\n", old)); + ret = 1; + } else if (state->fe_map.entries[old].old_idx != old) { /* sanity check */ + ViceLog(0, ("fe_OldToNew: index %d points to an invalid FileEntry record\n", old)); + ret = 1; + } else { + *new = state->fe_map.entries[old].new_idx; + } + + done: + return ret; +} + +int +cb_OldToNew(struct fs_dump_state * state, afs_uint32 old, afs_uint32 * new) +{ + int ret = 0; + + /* CBs use a one-based indexing system, so old==0 implies no mapping */ + if (!old) { + *new = 0; + goto done; + } + + if (old >= state->cb_map.len) { + ViceLog(0, ("cb_OldToNew: index %d is out of range\n", old)); + ret = 1; + } else if (state->cb_map.entries[old].old_idx != old) { /* sanity check */ + ViceLog(0, ("cb_OldToNew: index %d points to an invalid CallBack record\n", old)); + ret = 1; + } else { + *new = state->cb_map.entries[old].new_idx; + } + + done: + return ret; +} +#endif /* AFS_DEMAND_ATTACH_FS */ + int DumpCallBackState(void) { @@ -1807,7 +2673,7 @@ DumpCallBackState(void) return 0; } -#endif +#endif /* !INTERPRET_DUMP */ #ifdef INTERPRET_DUMP @@ -1931,7 +2797,7 @@ main(int argc, char **argv) struct CallBack *cb; struct FileEntry *fe; - for (hash = 0; hash < VHASH; hash++) { + for (hash = 0; hash < FEHASH_SIZE; hash++) { for (feip = &HashTable[hash]; fe = itofe(*feip);) { if (!vol || (fe->volid == vol)) { register struct CallBack *cbnext; @@ -2201,6 +3067,15 @@ MultiProbeAlternateAddress_r(struct host *host) H_UNLOCK; } } +#ifdef AFS_DEMAND_ATTACH_FS + /* try to bail ASAP if the fileserver is shutting down */ + FS_STATE_RDLOCK; + if (fs_state.mode == FS_MODE_SHUTDOWN) { + FS_STATE_UNLOCK; + multi_Abort; + } + FS_STATE_UNLOCK; +#endif } 
multi_End_Ignore; H_LOCK; diff --git a/src/viced/callback.h b/src/viced/callback.h new file mode 100644 index 0000000000..2f4cca8036 --- /dev/null +++ b/src/viced/callback.h @@ -0,0 +1,158 @@ +/* + * Copyright 2000, International Business Machines Corporation and others. + * All Rights Reserved. + * + * This software has been released under the terms of the IBM Public + * License. For details, see the LICENSE file in the top-level source + * directory or online at http://www.openafs.org/dl/license10.html + * + * Portions Copyright (c) 2006 Sine Nomine Associates + */ + +#ifndef _AFS_VICED_CALLBACK_H +#define _AFS_VICED_CALLBACK_H + +/* Maximum number of call backs to break at once, single fid + * There is some debate as to just how large this value should be + * Ideally, it would be very very large, but I am afraid that the + * cache managers will all send in their responses simultaneously, + * thereby swamping the file server. As a result, something like + * 10 or 15 might be a better bet. 
+ */ +#define MAX_CB_HOSTS 10 + +/* max time to break a callback, otherwise client is dead or net is hosed */ +#define MAXCBT 25 + +#define u_byte unsigned char + +struct cbcounters { + afs_int32 DeleteFiles; + afs_int32 DeleteCallBacks; + afs_int32 BreakCallBacks; + afs_int32 AddCallBacks; + afs_int32 GotSomeSpaces; + afs_int32 DeleteAllCallBacks; + afs_int32 nFEs, nCBs, nblks; + afs_int32 CBsTimedOut; + afs_int32 nbreakers; + afs_int32 GSS1, GSS2, GSS3, GSS4, GSS5; +}; +extern struct cbcounters cbstuff; + +struct cbstruct { + struct host *hp; + afs_uint32 thead; +}; + +/* structure MUST be multiple of 8 bytes, otherwise the casts to + * struct object will have alignment issues on *P64 userspaces */ +struct FileEntry { + afs_uint32 vnode; + afs_uint32 unique; + afs_uint32 volid; + afs_uint32 fnext; /* index of next FE in hash chain */ + afs_uint32 ncbs; /* number of callbacks for this FE */ + afs_uint32 firstcb; /* index of first cb in per-FE list */ + afs_uint32 status; /* status bits for this FE */ + afs_uint32 spare; +}; +#define FE_LATER 0x1 + +/* structure MUST be multiple of 8 bytes, otherwise the casts to + * struct object will have alignment issues on *P64 userspaces */ +struct CallBack { + afs_uint32 cnext; /* index of next cb in per-FE list */ + afs_uint32 fhead; /* index of associated FE */ + u_byte thead; /* Head of timeout chain */ + u_byte status; /* Call back status; see definitions, below */ + unsigned short spare; /* ensure proper alignment */ + afs_uint32 hhead; /* Head of host table chain */ + afs_uint32 tprev, tnext; /* per-timeout circular list of callbacks */ + afs_uint32 hprev, hnext; /* per-host circular list of callbacks */ +}; + +struct VCBParams { + struct cbstruct cba[MAX_CB_HOSTS]; /* re-entrant storage */ + unsigned int ncbas; + afs_uint32 thead; /* head of timeout queue for youngest callback */ + struct AFSFid *fid; +}; + + +/* callback hash macros */ +#define FEHASH_SIZE 512 /* Power of 2 */ +#define FEHASH_MASK (FEHASH_SIZE-1) 
+#define FEHash(volume, unique) (((volume)+(unique))&(FEHASH_MASK)) + +#define CB_NUM_TIMEOUT_QUEUES 128 + + +/* status values for status field of CallBack structure */ +#define CB_NORMAL 1 /* Normal call back */ +#define CB_DELAYED 2 /* Delayed call back due to rpc problems. + * The call back entry will be added back to the + * host list at the END of the list, so that + * searching backwards in the list will find all + * the (consecutive)host. delayed call back entries */ +#define CB_VOLUME 3 /* Callback for a volume */ +#define CB_BULK 4 /* Normal callbacks, handed out from FetchBulkStatus */ + +/* call back indices to pointers, and vice-versa */ +#define itocb(i) ((i)?CB+(i):0) +#define cbtoi(cbp) (!(cbp)?0:(cbp)-CB) + +/* file entry indices to pointers, and vice-versa */ +#define itofe(i) ((i)?FE+(i):0) +#define fetoi(fep) (!(fep)?0:(fep)-FE) + +/* Timeouts: there are 128 possible timeout values in effect at any + * given time. Each timeout represents timeouts in an interval of 128 + * seconds. So the maximum timeout for a call back is 128*128=16384 + * seconds, or 4 1/2 hours. The timeout cleanup stuff is called only + * if space runs out or by the file server every 5 minutes. This 5 + * minute slack should be allowed for--so a maximum time of 4 hours + * is safer. + * + * Timeouts must be chosen to correspond to an exact multiple + * of 128, because all times are truncated to a 128 multiple, and + * timed out if the current truncated time is <= to the truncated time + * corresponding to the timeout queue. + */ + +/* Unix time to Call Back time, and vice-versa. Call back time is + in units of 128 seconds, corresponding to time queues. 
*/ +#define CBtime(uxtime) ((uxtime)>>7) +#define UXtime(cbtime) ((cbtime)<<7) + +/* Given a Unix time, compute the closest Unix time that corresponds to + a time queue, rounding up */ +#define TimeCeiling(uxtime) (((uxtime)+127)&~127) + +#define TimeOutCutoff ((sizeof(TimeOuts)/sizeof(TimeOuts[0]))*8) +#define TimeOut(nusers) ((nusers)>=TimeOutCutoff? MinTimeOut: TimeOuts[(nusers)>>3]) + +/* time out at server is 3 minutes more than at the workstation */ +#define ServerBias (3*60) + +/* Convert cbtime to timeout queue index */ +#define TIndex(cbtime) (((cbtime)&127)+1) + +/* Convert cbtime to pointer to timeout queue head */ +#define THead(cbtime) (&timeout[TIndex(cbtime)-1]) + +/* Normalize index into timeout array so that two such indices will be + ordered correctly, so that they can be compared to see which times + out sooner, or so that the difference in time out times between them + can be computed. */ +#define TNorm(index) ((index) @@ -59,7 +61,11 @@ RCSID #include "viced_prototypes.h" #include "viced.h" #include "host.h" - +#include "callback.h" +#ifdef AFS_DEMAND_ATTACH_FS +#include "../util/afsutil_prototypes.h" +#include "../tviced/serialize_state.h" +#endif /* AFS_DEMAND_ATTACH_FS */ #ifdef AFS_PTHREAD_ENV pthread_mutex_t host_glock_mutex; @@ -83,6 +89,13 @@ int hostCount = 0; /* number of hosts in hostList */ int rxcon_ident_key; int rxcon_client_key; +static struct rx_securityClass *sc = NULL; + +static void h_SetupCallbackConn_r(struct host * host); +static void h_AddHostToHashTable_r(afs_uint32 addr, afs_uint16 port, struct host * host); +static void h_AddHostToUuidHashTable_r(afsUUID * uuid, struct host * host); +static int h_DeleteHostFromHashTableByAddr_r(afs_uint32 addr, afs_uint16 port, struct host *host); + #define CESPERBLOCK 73 struct CEBlock { /* block of CESPERBLOCK file entries */ struct client entry[CESPERBLOCK]; @@ -232,9 +245,9 @@ GetHT() { register struct host *entry; - if (HTFree == 0) + if (HTFree == NULL) GetHTBlock(); - assert(HTFree != 0); + 
assert(HTFree != NULL); entry = HTFree; HTFree = entry->next; HTs++; @@ -448,7 +461,7 @@ h_gethostcps_r(register struct host *host, register afs_int32 now) free(host->hcps.prlist_val); /* this is for hostaclRefresh */ host->hcps.prlist_val = NULL; host->hcps.prlist_len = 0; - slept ? (host->cpsCall = FT_ApproxTime()) : (host->cpsCall = now); + host->cpsCall = slept ? (FT_ApproxTime()) : (now); H_UNLOCK; code = pr_GetHostCPS(ntohl(host->host), &host->hcps); @@ -533,7 +546,6 @@ h_Alloc_r(register struct rx_connection *r_con) { struct servent *serverentry; struct host *host; - static struct rx_securityClass *sc = 0; afs_int32 now; #if FS_STATS_DETAILED afs_uint32 newHostAddr_HBO; /*New host IP addr, in host byte order */ @@ -544,7 +556,7 @@ h_Alloc_r(register struct rx_connection *r_con) host->host = rxr_HostOf(r_con); host->port = rxr_PortOf(r_con); - hashInsert_r(host->host, host->port, host); + h_AddHostToHashTable_r(host->host, host->port, host); if (consolePort == 0) { /* find the portal number for console */ #if defined(AFS_OSF_ENV) @@ -561,24 +573,17 @@ h_Alloc_r(register struct rx_connection *r_con) host->Console = 1; /* Make a callback channel even for the console, on the off chance that it * makes a request that causes a break call back. It shouldn't. 
*/ - { - if (!sc) - sc = rxnull_NewClientSecurityObject(); - host->callback_rxcon = - rx_NewConnection(host->host, host->port, 1, sc, 0); - rx_SetConnDeadTime(host->callback_rxcon, 50); - rx_SetConnHardDeadTime(host->callback_rxcon, AFS_HARDDEADTIME); - } + h_SetupCallbackConn_r(host); now = host->LastCall = host->cpsCall = host->ActiveCall = FT_ApproxTime(); host->hostFlags = 0; host->hcps.prlist_val = NULL; host->hcps.prlist_len = 0; - host->interface = 0; + host->interface = NULL; #ifdef undef host->hcpsfailed = 0; /* save cycles */ h_gethostcps(host); /* do this under host hold/lock */ #endif - host->FirstClient = 0; + host->FirstClient = NULL; h_Hold_r(host); h_Lock_r(host); h_InsertList_r(host); /* update global host List */ @@ -596,6 +601,20 @@ h_Alloc_r(register struct rx_connection *r_con) } /*h_Alloc_r */ + +/* Make a callback channel even for the console, on the off chance that it + * makes a request that causes a break call back. It shouldn't. */ +static void +h_SetupCallbackConn_r(struct host * host) +{ + if (!sc) + sc = rxnull_NewClientSecurityObject(); + host->callback_rxcon = + rx_NewConnection(host->host, host->port, 1, sc, 0); + rx_SetConnDeadTime(host->callback_rxcon, 50); + rx_SetConnHardDeadTime(host->callback_rxcon, AFS_HARDDEADTIME); +} + /* Lookup a host given an IP address and UDP port number. 
*/ /* hostaddr and hport are in network order */ /* Note: host should be released by caller if 0 == *heldp and non-null */ @@ -833,7 +852,7 @@ h_FreeConnection(struct rx_connection *tcon) if (client) { H_LOCK; if (client->tcon == tcon) - client->tcon = (struct rx_connection *)0; + client->tcon = NULL; H_UNLOCK; } return 0; @@ -878,8 +897,11 @@ h_Enumerate(int (*proc) (), char *param) H_UNLOCK; for (i = 0; i < count; i++) { held[i] = (*proc) (list[i], held[i], param); - if (!held[i]) + if (!H_ENUMERATE_ISSET_HELD(held[i])) h_Release(list[i]); /* this might free up the host */ + /* bail out of the enumeration early */ + if (H_ENUMERATE_ISSET_BAIL(held[i])) + break; } free((void *)list); free((void *)held); @@ -908,17 +930,19 @@ h_Enumerate_r(int (*proc) (), struct host *enumstart, char *param) h_Hold_r(enumstart); for (host = enumstart; host; host = next, held = nheld) { next = host->next; - if (next && !(nheld = h_Held_r(next))) + if (next && !(nheld = h_Held_r(next)) && !H_ENUMERATE_ISSET_BAIL(held)) h_Hold_r(next); held = (*proc) (host, held, param); - if (!held) + if (!H_ENUMERATE_ISSET_HELD(held)) h_Release_r(host); /* this might free up the host */ + if (H_ENUMERATE_ISSET_BAIL(held)) + break; } } /*h_Enumerate_r */ /* inserts a new HashChain structure corresponding to this UUID */ -void -hashInsertUuid_r(struct afsUUID *uuid, struct host *host) +static void +h_AddHostToUuidHashTable_r(struct afsUUID *uuid, struct host *host) { int index; struct h_hashChain *chain; @@ -929,7 +953,7 @@ hashInsertUuid_r(struct afsUUID *uuid, struct host *host) /* insert into beginning of list for this bucket */ chain = (struct h_hashChain *)malloc(sizeof(struct h_hashChain)); if (!chain) { - ViceLog(0, ("Failed malloc in hashInsertUuid_r\n")); + ViceLog(0, ("Failed malloc in h_AddHostToUuidHashTable_r\n")); assert(0); } assert(chain); @@ -940,8 +964,8 @@ hashInsertUuid_r(struct afsUUID *uuid, struct host *host) /* inserts a new HashChain structure corresponding to this address */ 
-void -hashInsert_r(afs_uint32 addr, afs_uint16 port, struct host *host) +static void +h_AddHostToHashTable_r(afs_uint32 addr, afs_uint16 port, struct host *host) { int index; struct h_hashChain *chain; @@ -952,7 +976,7 @@ hashInsert_r(afs_uint32 addr, afs_uint16 port, struct host *host) /* insert into beginning of list for this bucket */ chain = (struct h_hashChain *)malloc(sizeof(struct h_hashChain)); if (!chain) { - ViceLog(0, ("Failed malloc in hashInsert_r\n")); + ViceLog(0, ("Failed malloc in h_AddHostToHashTable_r\n")); assert(0); } chain->hostPtr = host; @@ -1017,7 +1041,7 @@ addInterfaceAddr_r(struct host *host, afs_uint32 addr, afs_uint16 port) /* * Create a hash table entry for this address */ - hashInsert_r(addr, port, host); + h_AddHostToHashTable_r(addr, port, host); return 0; } @@ -1072,7 +1096,7 @@ removeInterfaceAddr_r(struct host *host, afs_uint32 addr, afs_uint16 port) /* * Remove the hash table entry for this address */ - hashDelete_r(addr, port, host); + h_DeleteHostFromHashTableByAddr_r(addr, port, host); return 0; } @@ -1394,7 +1418,7 @@ h_GetHost_r(struct rx_connection *tcon) /* the new host is held and locked */ } else { /* This really is a new host */ - hashInsertUuid_r(&identP->uuid, host); + h_AddHostToUuidHashTable_r(&identP->uuid, host); cb_conn = host->callback_rxcon; rx_GetConnection(cb_conn); H_UNLOCK; @@ -1735,7 +1759,7 @@ h_FindClient_r(struct rx_connection *tcon) client->authClass = authClass; /* rx only */ client->sid = rxr_CidOf(tcon); client->VenusEpoch = rxr_GetEpoch(tcon); - client->CPS.prlist_val = 0; + client->CPS.prlist_val = NULL; client->CPS.prlist_len = 0; h_Unlock_r(host); } @@ -2134,6 +2158,540 @@ h_DumpHosts() } /*h_DumpHosts */ +#ifdef AFS_DEMAND_ATTACH_FS +/* + * demand attach fs + * host state serialization + */ +static int h_stateFillHeader(struct host_state_header * hdr); +static int h_stateCheckHeader(struct host_state_header * hdr); +static int h_stateAllocMap(struct fs_dump_state * state); +static int 
h_stateSaveHost(register struct host * host, int held, struct fs_dump_state * state); +static int h_stateRestoreHost(struct fs_dump_state * state); +static int h_stateRestoreIndex(struct host * h, int held, struct fs_dump_state * state); +static int h_stateVerifyHost(struct host * h, int held, struct fs_dump_state * state); +static int h_stateVerifyAddrHash(struct fs_dump_state * state, struct host * h, afs_uint32 addr, afs_uint16 port); +static int h_stateVerifyUuidHash(struct fs_dump_state * state, struct host * h); +static void h_hostToDiskEntry_r(struct host * in, struct hostDiskEntry * out); +static void h_diskEntryToHost_r(struct hostDiskEntry * in, struct host * out); + + +/* this procedure saves all host state to disk for fast startup */ +int +h_stateSave(struct fs_dump_state * state) +{ + AssignInt64(state->eof_offset, &state->hdr->h_offset); + + /* XXX debug */ + ViceLog(0, ("h_stateSave: hostCount=%d\n", hostCount)); + + /* invalidate host state header */ + memset(state->h_hdr, 0, sizeof(struct host_state_header)); + + if (fs_stateWriteHeader(state, &state->hdr->h_offset, state->h_hdr, + sizeof(struct host_state_header))) { + state->bail = 1; + goto done; + } + + fs_stateIncEOF(state, sizeof(struct host_state_header)); + + h_Enumerate_r(h_stateSaveHost, hostList, (char *)state); + if (state->bail) { + goto done; + } + + h_stateFillHeader(state->h_hdr); + + /* write the real header to disk */ + state->bail = fs_stateWriteHeader(state, &state->hdr->h_offset, state->h_hdr, + sizeof(struct host_state_header)); + + done: + return state->bail; +} + +/* demand attach fs + * host state serialization + * + * this procedure restores all host state from a disk for fast startup + */ +int +h_stateRestore(struct fs_dump_state * state) +{ + int i, records; + + /* seek to the right position and read in the host state header */ + if (fs_stateReadHeader(state, &state->hdr->h_offset, state->h_hdr, + sizeof(struct host_state_header))) { + state->bail = 1; + goto done; + } + 
+ /* check the validity of the header */ + if (h_stateCheckHeader(state->h_hdr)) { + state->bail = 1; + goto done; + } + + records = state->h_hdr->records; + + if (h_stateAllocMap(state)) { + state->bail = 1; + goto done; + } + + /* iterate over records restoring host state */ + for (i=0; i < records; i++) { + if (h_stateRestoreHost(state) != 0) { + state->bail = 1; + break; + } + } + + done: + return state->bail; +} + +int +h_stateRestoreIndices(struct fs_dump_state * state) +{ + h_Enumerate_r(h_stateRestoreIndex, hostList, (char *)state); + return state->bail; +} + +static int +h_stateRestoreIndex(struct host * h, int held, struct fs_dump_state * state) +{ + if (cb_OldToNew(state, h->cblist, &h->cblist)) { + return H_ENUMERATE_BAIL(held); + } + return held; +} + +int +h_stateVerify(struct fs_dump_state * state) +{ + h_Enumerate_r(h_stateVerifyHost, hostList, (char *)state); + return state->bail; +} + +static int +h_stateVerifyHost(struct host * h, int held, struct fs_dump_state * state) +{ + int i; + + if (h == NULL) { + ViceLog(0, ("h_stateVerifyHost: error: NULL host pointer in linked list\n")); + return H_ENUMERATE_BAIL(held); + } + + if (h->interface) { + for (i = h->interface->numberOfInterfaces-1; i >= 0; i--) { + if (h_stateVerifyAddrHash(state, h, h->interface->interface[i].addr, + h->interface->interface[i].port)) { + state->bail = 1; + } + } + if (h_stateVerifyUuidHash(state, h)) { + state->bail = 1; + } + } else if (h_stateVerifyAddrHash(state, h, h->host, h->port)) { + state->bail = 1; + } + + if (cb_stateVerifyHCBList(state, h)) { + state->bail = 1; + } + + done: + return held; +} + +static int +h_stateVerifyAddrHash(struct fs_dump_state * state, struct host * h, afs_uint32 addr, afs_uint16 port) +{ + int ret = 0, found = 0; + struct host *host = NULL; + struct h_hashChain *chain; + int index = h_HashIndex(addr); + char tmp[16]; + int chain_len = 0; + + for (chain = hostHashTable[index]; chain; chain = chain->next) { + host = chain->hostPtr; + if 
(host == NULL) { + afs_inet_ntoa_r(addr, tmp); + ViceLog(0, ("h_stateVerifyAddrHash: error: addr hash chain has NULL host ptr (lookup addr %s)\n", tmp)); + ret = 1; + goto done; + } + if ((chain->addr == addr) && (chain->port == port)) { + if (host != h) { + ViceLog(0, ("h_stateVerifyAddrHash: warning: addr hash entry points to different host struct (%d, %d)\n", + h->index, host->index)); + state->flags.warnings_generated = 1; + } + found = 1; + break; + } + if (chain_len > FS_STATE_H_MAX_ADDR_HASH_CHAIN_LEN) { + ViceLog(0, ("h_stateVerifyAddrHash: error: hash chain length exceeds %d; assuming there's a loop\n", + FS_STATE_H_MAX_ADDR_HASH_CHAIN_LEN)); + ret = 1; + goto done; + } + chain_len++; + } + + if (!found) { + afs_inet_ntoa_r(addr, tmp); + if (state->mode == FS_STATE_LOAD_MODE) { + ViceLog(0, ("h_stateVerifyAddrHash: error: addr %s not found in hash\n", tmp)); + ret = 1; + goto done; + } else { + ViceLog(0, ("h_stateVerifyAddrHash: warning: addr %s not found in hash\n", tmp)); + state->flags.warnings_generated = 1; + } + } + + done: + return ret; +} + +static int +h_stateVerifyUuidHash(struct fs_dump_state * state, struct host * h) +{ + int ret = 0, found = 0; + struct host *host = NULL; + struct h_hashChain *chain; + afsUUID * uuidp = &h->interface->uuid; + int index = h_UuidHashIndex(uuidp); + char tmp[40]; + int chain_len = 0; + + for (chain = hostUuidHashTable[index]; chain; chain = chain->next) { + host = chain->hostPtr; + if (host == NULL) { + afsUUID_to_string(uuidp, tmp, sizeof(tmp)); + ViceLog(0, ("h_stateVerifyUuidHash: error: uuid hash chain has NULL host ptr (lookup uuid %s)\n", tmp)); + ret = 1; + goto done; + } + if (host->interface && + afs_uuid_equal(&host->interface->uuid, uuidp)) { + if (host != h) { + ViceLog(0, ("h_stateVerifyUuidHash: warning: uuid hash entry points to different host struct (%d, %d)\n", + h->index, host->index)); + state->flags.warnings_generated = 1; + } + found = 1; + goto done; + } + if (chain_len > 
FS_STATE_H_MAX_UUID_HASH_CHAIN_LEN) { + ViceLog(0, ("h_stateVerifyUuidHash: error: hash chain length exceeds %d; assuming there's a loop\n", + FS_STATE_H_MAX_UUID_HASH_CHAIN_LEN)); + ret = 1; + goto done; + } + chain_len++; + } + + if (!found) { + afsUUID_to_string(uuidp, tmp, sizeof(tmp)); + if (state->mode == FS_STATE_LOAD_MODE) { + ViceLog(0, ("h_stateVerifyUuidHash: error: uuid %s not found in hash\n", tmp)); + ret = 1; + goto done; + } else { + ViceLog(0, ("h_stateVerifyUuidHash: warning: uuid %s not found in hash\n", tmp)); + state->flags.warnings_generated = 1; + } + } + + done: + return ret; +} + +/* create the host state header structure */ +static int +h_stateFillHeader(struct host_state_header * hdr) +{ + hdr->stamp.magic = HOST_STATE_MAGIC; + hdr->stamp.version = HOST_STATE_VERSION; +} + +/* check the contents of the host state header structure */ +static int +h_stateCheckHeader(struct host_state_header * hdr) +{ + int ret=0; + + if (hdr->stamp.magic != HOST_STATE_MAGIC) { + ViceLog(0, ("check_host_state_header: invalid state header\n")); + ret = 1; + } + else if (hdr->stamp.version != HOST_STATE_VERSION) { + ViceLog(0, ("check_host_state_header: unknown version number\n")); + ret = 1; + } + return ret; +} + +/* allocate the host id mapping table */ +static int +h_stateAllocMap(struct fs_dump_state * state) +{ + state->h_map.len = state->h_hdr->index_max + 1; + state->h_map.entries = (struct idx_map_entry_t *) + calloc(state->h_map.len, sizeof(struct idx_map_entry_t)); + return (state->h_map.entries != NULL) ? 
0 : 1; +} + +/* function called by h_Enumerate to save a host to disk */ +static int +h_stateSaveHost(register struct host * host, int held, struct fs_dump_state * state) +{ + int i, if_len=0, hcps_len=0; + struct hostDiskEntry hdsk; + struct host_state_entry_header hdr; + struct Interface * ifp = NULL; + afs_int32 * hcps = NULL; + struct iovec iov[4]; + int iovcnt = 2; + + memset(&hdr, 0, sizeof(hdr)); + + if (state->h_hdr->index_max < host->index) { + state->h_hdr->index_max = host->index; + } + + h_hostToDiskEntry_r(host, &hdsk); + if (host->interface) { + if_len = sizeof(struct Interface) + + ((host->interface->numberOfInterfaces-1) * sizeof(struct AddrPort)); + ifp = (struct Interface *) malloc(if_len); + assert(ifp != NULL); + memcpy(ifp, host->interface, if_len); + hdr.interfaces = host->interface->numberOfInterfaces; + iov[iovcnt].iov_base = (char *) ifp; + iov[iovcnt].iov_len = if_len; + iovcnt++; + } + if (host->hcps.prlist_val) { + hdr.hcps = host->hcps.prlist_len; + hcps_len = hdr.hcps * sizeof(afs_int32); + hcps = (afs_int32 *) malloc(hcps_len); + assert(hcps != NULL); + memcpy(hcps, host->hcps.prlist_val, hcps_len); + iov[iovcnt].iov_base = (char *) hcps; + iov[iovcnt].iov_len = hcps_len; + iovcnt++; + } + + if (hdsk.index > state->h_hdr->index_max) + state->h_hdr->index_max = hdsk.index; + + hdr.len = sizeof(struct host_state_entry_header) + + sizeof(struct hostDiskEntry) + if_len + hcps_len; + hdr.magic = HOST_STATE_ENTRY_MAGIC; + + iov[0].iov_base = (char *) &hdr; + iov[0].iov_len = sizeof(hdr); + iov[1].iov_base = (char *) &hdsk; + iov[1].iov_len = sizeof(struct hostDiskEntry); + + if (fs_stateWriteV(state, iov, iovcnt)) { + ViceLog(0, ("h_stateSaveHost: failed to save host %d", host->index)); + state->bail = 1; + } + + fs_stateIncEOF(state, hdr.len); + + state->h_hdr->records++; + + done: + if (ifp) + free(ifp); + if (hcps) + free(hcps); + if (state->bail) { + return H_ENUMERATE_BAIL(held); + } + return held; +} + +/* restores a host from disk */ 
+static int +h_stateRestoreHost(struct fs_dump_state * state) +{ + int ifp_len=0, hcps_len=0, bail=0; + struct host_state_entry_header hdr; + struct hostDiskEntry hdsk; + struct host *host = NULL; + struct Interface *ifp = NULL; + afs_int32 * hcps = NULL; + struct iovec iov[3]; + int iovcnt = 1; + + if (fs_stateRead(state, &hdr, sizeof(hdr))) { + ViceLog(0, ("h_stateRestoreHost: failed to read host entry header from dump file '%s'\n", + state->fn)); + bail = 1; + goto done; + } + + if (hdr.magic != HOST_STATE_ENTRY_MAGIC) { + ViceLog(0, ("h_stateRestoreHost: fileserver state dump file '%s' is corrupt.\n", + state->fn)); + bail = 1; + goto done; + } + + iov[0].iov_base = (char *) &hdsk; + iov[0].iov_len = sizeof(struct hostDiskEntry); + + if (hdr.interfaces) { + ifp_len = sizeof(struct Interface) + + ((hdr.interfaces-1) * sizeof(struct AddrPort)); + ifp = (struct Interface *) malloc(ifp_len); + assert(ifp != NULL); + iov[iovcnt].iov_base = (char *) ifp; + iov[iovcnt].iov_len = ifp_len; + iovcnt++; + } + if (hdr.hcps) { + hcps_len = hdr.hcps * sizeof(afs_int32); + hcps = (afs_int32 *) malloc(hcps_len); + assert(hcps != NULL); + iov[iovcnt].iov_base = (char *) hcps; + iov[iovcnt].iov_len = hcps_len; + iovcnt++; + } + + if ((ifp_len + hcps_len + sizeof(hdsk) + sizeof(hdr)) != hdr.len) { + ViceLog(0, ("h_stateRestoreHost: host entry header length fields are inconsistent\n")); + bail = 1; + goto done; + } + + if (fs_stateReadV(state, iov, iovcnt)) { + ViceLog(0, ("h_stateRestoreHost: failed to read host entry\n")); + bail = 1; + goto done; + } + + if (!hdr.hcps && hdsk.hcps_valid) { + /* valid, zero-length host cps ; does this ever happen? 
*/ + hcps = (afs_int32 *) malloc(sizeof(afs_int32)); + assert(hcps != NULL); + } + + host = GetHT(); + assert(host != NULL); + + if (ifp) { + host->interface = ifp; + } + if (hcps) { + host->hcps.prlist_val = hcps; + host->hcps.prlist_len = hdr.hcps; + } + + h_diskEntryToHost_r(&hdsk, host); + h_SetupCallbackConn_r(host); + + if (ifp) { + int i; + for (i = ifp->numberOfInterfaces-1; i >= 0; i--) { + h_AddHostToHashTable_r(ifp->interface[i].addr, + ifp->interface[i].port, host); + } + h_AddHostToUuidHashTable_r(&ifp->uuid, host); + } else { + h_AddHostToHashTable_r(host->host, host->port, host); + } + h_InsertList_r(host); + + /* setup host id map entry */ + state->h_map.entries[hdsk.index].old_idx = hdsk.index; + state->h_map.entries[hdsk.index].new_idx = host->index; + + done: + if (bail) { + if (ifp) + free(ifp); + if (hcps) + free(hcps); + } + return bail; +} + +/* serialize a host structure to disk */ +static void +h_hostToDiskEntry_r(struct host * in, struct hostDiskEntry * out) +{ + out->host = in->host; + out->port = in->port; + out->hostFlags = in->hostFlags; + out->Console = in->Console; + out->hcpsfailed = in->hcpsfailed; + out->LastCall = in->LastCall; + out->ActiveCall = in->ActiveCall; + out->cpsCall = in->cpsCall; + out->cblist = in->cblist; +#ifdef FS_STATS_DETAILED + out->InSameNetwork = in->InSameNetwork; +#endif + + /* special fields we save, but are not memcpy'd back on restore */ + out->index = in->index; + out->hcps_len = in->hcps.prlist_len; + out->hcps_valid = (in->hcps.prlist_val == NULL) ? 
0 : 1; +} + +/* restore a host structure from disk */ +static void +h_diskEntryToHost_r(struct hostDiskEntry * in, struct host * out) +{ + out->host = in->host; + out->port = in->port; + out->hostFlags = in->hostFlags; + out->Console = in->Console; + out->hcpsfailed = in->hcpsfailed; + out->LastCall = in->LastCall; + out->ActiveCall = in->ActiveCall; + out->cpsCall = in->cpsCall; + out->cblist = in->cblist; +#ifdef FS_STATS_DETAILED + out->InSameNetwork = in->InSameNetwork; +#endif +} + +/* index translation routines */ +int +h_OldToNew(struct fs_dump_state * state, afs_uint32 old, afs_uint32 * new) +{ + int ret = 0; + + /* hosts use a zero-based index, so old==0 is valid */ + + if (old >= state->h_map.len) { + ViceLog(0, ("h_OldToNew: index %d is out of range\n", old)); + ret = 1; + } else if (state->h_map.entries[old].old_idx != old) { /* sanity check */ + ViceLog(0, ("h_OldToNew: index %d points to an invalid host record\n", old)); + ret = 1; + } else { + *new = state->h_map.entries[old].new_idx; + } + + done: + return ret; +} +#endif /* AFS_DEMAND_ATTACH_FS */ + /* * This counts the number of workstations, the number of active workstations, @@ -2348,13 +2906,23 @@ static struct AFSFid zerofid; * Since it can serialize them, and pile up, it should be a separate LWP * from other events. */ -int +static int CheckHost(register struct host *host, int held) { register struct client *client; struct rx_connection *cb_conn = NULL; int code; +#ifdef AFS_DEMAND_ATTACH_FS + /* kill the checkhost lwp ASAP during shutdown */ + FS_STATE_RDLOCK; + if (fs_state.mode == FS_MODE_SHUTDOWN) { + FS_STATE_UNLOCK; + return H_ENUMERATE_BAIL(held); + } + FS_STATE_UNLOCK; +#endif + /* Host is held by h_Enumerate */ H_LOCK; for (client = host->FirstClient; client; client = client->next) { @@ -2455,7 +3023,7 @@ CheckHost(register struct host *host, int held) * This routine is called roughly every 5 minutes. 
*/ void -h_CheckHosts() +h_CheckHosts(void) { afs_uint32 now = FT_ApproxTime(); @@ -2570,7 +3138,7 @@ initInterfaceAddr_r(struct host *host, struct interfaceAddr *interf) /* deleted a HashChain structure for this address and host */ /* returns 1 on success */ static int -hashDelete_r(afs_uint32 addr, afs_uint16 port, struct host *host) +h_DeleteHostFromHashTableByAddr_r(afs_uint32 addr, afs_uint16 port, struct host *host) { int flag; register struct h_hashChain **hp, *th; diff --git a/src/viced/host.h b/src/viced/host.h index bd17cfd156..60df3bcea7 100644 --- a/src/viced/host.h +++ b/src/viced/host.h @@ -5,8 +5,13 @@ * This software has been released under the terms of the IBM Public * License. For details, see the LICENSE file in the top-level source * directory or online at http://www.openafs.org/dl/license10.html + * + * Portions Copyright (c) 2006 Sine Nomine Associates */ +#ifndef _AFS_VICED_HOST_H +#define _AFS_VICED_HOST_H + #include "fs_stats.h" /*File Server stats package */ #ifdef AFS_PTHREAD_ENV @@ -59,6 +64,7 @@ struct Interface { struct AddrPort interface[1];/* there are actually more than one here */ /* in network byte order */ }; + struct host { struct host *next, *prev; /* linked list of all hosts */ struct rx_connection *callback_rxcon; /* rx callback connection */ @@ -85,7 +91,7 @@ struct host { struct client *FirstClient; /* first connection from host */ afs_uint32 cpsCall; /* time of last cps call from this host */ struct Interface *interface; /* all alternate addr for client */ - afs_uint32 cblist; /* Call back list for this host */ + afs_uint32 cblist; /* index of a cb in the per-host circular CB list */ /* * These don't get zeroed, keep them at the end. 
If index doesn't * follow an unsigned short then we need to pad to ensure that @@ -142,6 +148,7 @@ struct client { /* Don't zero the lock */ #define CLIENT_TO_ZERO(C) ((int)(((char *)(&((C)->lock))-(char *)(C)))) + /* * key for the client structure stored in connection specific data */ @@ -245,6 +252,19 @@ extern void h_CheckHosts(); struct Interface *MultiVerifyInterface_r(); extern int initInterfaceAddr_r(struct host *host, struct interfaceAddr *interf); +#ifdef AFS_DEMAND_ATTACH_FS +/* + * demand attach fs + * state serialization + */ +extern int h_SaveState(void); +extern int h_RestoreState(void); +#endif + +#define H_ENUMERATE_BAIL(held) ((held)|0x80000000) +#define H_ENUMERATE_ISSET_BAIL(held) ((held)&0x80000000) +#define H_ENUMERATE_ISSET_HELD(held) ((held)&0x7FFFFFFF) + struct host *(hosttableptrs[h_MAXHOSTTABLES]); /* Used by h_itoh */ #define h_htoi(host) ((host)->index) /* index isn't zeroed, no need to lock */ #define h_itoh(hostindex) (hosttableptrs[(hostindex)>>h_HTSHIFT]+((hostindex)&(h_HTSPERBLOCK-1))) @@ -269,4 +289,4 @@ struct host *(hosttableptrs[h_MAXHOSTTABLES]); /* Used by h_itoh */ #define HFE_LATER 0x80 /* host has FE_LATER callbacks */ #define HERRORTRANS 0x100 /* do error translation */ - +#endif /* _AFS_VICED_HOST_H */ diff --git a/src/viced/viced.c b/src/viced/viced.c index 1202d933a4..1c7296bf22 100644 --- a/src/viced/viced.c +++ b/src/viced/viced.c @@ -5,6 +5,8 @@ * This software has been released under the terms of the IBM Public * License. 
For details, see the LICENSE file in the top-level source * directory or online at http://www.openafs.org/dl/license10.html + * + * Portions Copyright (c) 2006 Sine Nomine Associates */ /* viced.c - File Server main loop */ @@ -215,6 +217,27 @@ afsUUID FS_HostUUID; static void FlagMsg(); +#ifdef AFS_DEMAND_ATTACH_FS +/* + * demand attach fs + * fileserver mode support + * + * during fileserver shutdown, we have to track the graceful shutdown of + * certain background threads before we are allowed to dump state to + * disk + */ +struct fs_state fs_state = + { FS_MODE_NORMAL, + 0, + 0, + 0, + 0, + { 1,1,1,1 }, + PTHREAD_COND_INITIALIZER, + PTHREAD_RWLOCK_INITIALIZER + }; +#endif /* AFS_DEMAND_ATTACH_FS */ + /* * Home for the performance statistics. */ @@ -420,13 +443,31 @@ FiveMinuteCheckLWP() ViceLog(1, ("Starting five minute check process\n")); setThreadId("FiveMinuteCheckLWP"); + +#ifdef AFS_DEMAND_ATTACH_FS + FS_STATE_WRLOCK; + while (fs_state.mode == FS_MODE_NORMAL) { + fs_state.FiveMinuteLWP_tranquil = 1; + FS_STATE_UNLOCK; +#else while (1) { +#endif + #ifdef AFS_PTHREAD_ENV sleep(fiveminutes); #else /* AFS_PTHREAD_ENV */ IOMGR_Sleep(fiveminutes); #endif /* AFS_PTHREAD_ENV */ +#ifdef AFS_DEMAND_ATTACH_FS + FS_STATE_WRLOCK; + if (fs_state.mode != FS_MODE_NORMAL) { + break; + } + fs_state.FiveMinuteLWP_tranquil = 0; + FS_STATE_UNLOCK; +#endif + /* close the log so it can be removed */ ReOpenLog(AFSDIR_SERVER_FILELOG_FILEPATH); /* don't trunc, just append */ ViceLog(2, ("Cleaning up timed out callbacks\n")); @@ -452,7 +493,17 @@ FiveMinuteCheckLWP() afs_ctime(&now, tbuffer, sizeof(tbuffer)))); } } +#ifdef AFS_DEMAND_ATTACH_FS + FS_STATE_WRLOCK; +#endif } +#ifdef AFS_DEMAND_ATTACH_FS + fs_state.FiveMinuteLWP_tranquil = 1; + FS_LOCK; + assert(pthread_cond_broadcast(&fs_state.worker_done_cv)==0); + FS_UNLOCK; + FS_STATE_UNLOCK; +#endif } /*FiveMinuteCheckLWP */ @@ -460,20 +511,50 @@ FiveMinuteCheckLWP() * other 5 minute activities because it may be delayed by 
timeouts when * it probes the workstations */ + static void HostCheckLWP() { ViceLog(1, ("Starting Host check process\n")); setThreadId("HostCheckLWP"); - while (1) { +#ifdef AFS_DEMAND_ATTACH_FS + FS_STATE_WRLOCK; + while (fs_state.mode == FS_MODE_NORMAL) { + fs_state.HostCheckLWP_tranquil = 1; + FS_STATE_UNLOCK; +#else + while(1) { +#endif + #ifdef AFS_PTHREAD_ENV sleep(fiveminutes); #else /* AFS_PTHREAD_ENV */ IOMGR_Sleep(fiveminutes); #endif /* AFS_PTHREAD_ENV */ + +#ifdef AFS_DEMAND_ATTACH_FS + FS_STATE_WRLOCK; + if (fs_state.mode != FS_MODE_NORMAL) { + break; + } + fs_state.HostCheckLWP_tranquil = 0; + FS_STATE_UNLOCK; +#endif + ViceLog(2, ("Checking for dead venii & clients\n")); h_CheckHosts(); + +#ifdef AFS_DEMAND_ATTACH_FS + FS_STATE_WRLOCK; +#endif } +#ifdef AFS_DEMAND_ATTACH_FS + fs_state.HostCheckLWP_tranquil = 1; + FS_LOCK; + assert(pthread_cond_broadcast(&fs_state.worker_done_cv)==0); + FS_UNLOCK; + FS_STATE_UNLOCK; +#endif } /*HostCheckLWP */ /* This LWP does fsync checks every 5 minutes: it should not be used for @@ -496,7 +577,14 @@ FsyncCheckLWP() assert(pthread_mutex_init(&fsync_glock_mutex, NULL) == 0); #endif - while (1) { +#ifdef AFS_DEMAND_ATTACH_FS + FS_STATE_WRLOCK; + while (fs_state.mode == FS_MODE_NORMAL) { + fs_state.FsyncCheckLWP_tranquil = 1; + FS_STATE_UNLOCK; +#else + while(1) { +#endif FSYNC_LOCK; #ifdef AFS_PTHREAD_ENV /* rounding is fine */ @@ -513,11 +601,31 @@ FsyncCheckLWP() ViceLog(0, ("LWP_WaitProcess returned %d\n", code)); #endif /* AFS_PTHREAD_ENV */ FSYNC_UNLOCK; + +#ifdef AFS_DEMAND_ATTACH_FS + FS_STATE_WRLOCK; + if (fs_state.mode != FS_MODE_NORMAL) { + break; + } + fs_state.FsyncCheckLWP_tranquil = 0; + FS_STATE_UNLOCK; +#endif /* AFS_DEMAND_ATTACH_FS */ + ViceLog(2, ("Checking for fsync events\n")); do { code = BreakLaterCallBacks(); } while (code != 0); +#ifdef AFS_DEMAND_ATTACH_FS + FS_STATE_WRLOCK; +#endif } +#ifdef AFS_DEMAND_ATTACH_FS + fs_state.FsyncCheckLWP_tranquil = 1; + FS_LOCK; + 
assert(pthread_cond_broadcast(&fs_state.worker_done_cv)==0); + FS_UNLOCK; + FS_STATE_UNLOCK; +#endif /* AFS_DEMAND_ATTACH_FS */ } /*------------------------------------------------------------------------ @@ -604,6 +712,11 @@ PrintCounters() ("Vice was last started at %s\n", afs_ctime(&StartTime, tbuffer, sizeof(tbuffer)))); +#ifdef AFS_DEMAND_ATTACH_FS + /* XXX perhaps set extended stats verbosity flags + * based upon LogLevel ?? */ + VPrintExtendedCacheStats(VOL_STATS_PER_CHAIN2); +#endif VPrintCacheStats(); VPrintDiskStats(); DStat(&dirbuff, &dircall, &dirio); @@ -656,6 +769,16 @@ ShutDownAndCore(int dopanic) time_t now = time(0); char tbuffer[32]; + /* do not allow new requests to be served from now on; all new requests + * are returned with an error code of RX_RESTARTING (transient failure) */ + rx_SetRxTranquil(); /* dhruba */ + +#ifdef AFS_DEMAND_ATTACH_FS + FS_STATE_WRLOCK; + fs_state.mode = FS_MODE_SHUTDOWN; + FS_STATE_UNLOCK; +#endif + ViceLog(0, ("Shutting down file server at %s", afs_ctime(&now, tbuffer, sizeof(tbuffer)))); @@ -671,11 +794,34 @@ ShutDownAndCore(int dopanic) if (!dopanic) PrintCounters(); - /* do not allows new reqests to be served from now on, all new requests - * are returned with an error code of RX_RESTARTING ( transient failure ) */ - rx_SetRxTranquil(); /* dhruba */ + /* shut down volume package */ VShutdown(); +#ifdef AFS_DEMAND_ATTACH_FS + if (fs_state.options.fs_state_save) { + /* + * demand attach fs + * save fileserver state to disk */ + + /* make sure background threads have finished all of their asynchronous + * work on host and callback structures */ + FS_STATE_RDLOCK; + while (!fs_state.FiveMinuteLWP_tranquil || + !fs_state.HostCheckLWP_tranquil || + !fs_state.FsyncCheckLWP_tranquil) { + FS_LOCK; + FS_STATE_UNLOCK; + ViceLog(0, ("waiting for background host/callback threads to quiesce before saving fileserver state...\n")); + assert(pthread_cond_wait(&fs_state.worker_done_cv, &fileproc_glock_mutex) == 0); + FS_UNLOCK; + 
FS_STATE_RDLOCK; + } + + /* ok. it should now be fairly safe. let's do the state dump */ + fs_stateSave(); + } +#endif /* AFS_DEMAND_ATTACH_FS */ + if (debugFile) { rx_PrintStats(debugFile); fflush(debugFile); @@ -715,7 +861,7 @@ ShutDown(void) static void FlagMsg() { - char buffer[1024]; + char buffer[2048]; /* default supports help flag */ @@ -743,8 +889,18 @@ FlagMsg() strcat(buffer, "[-rxdbg (enable rx debugging)] "); strcat(buffer, "[-rxdbge (enable rxevent debugging)] "); strcat(buffer, "[-rxmaxmtu ] "); -#if AFS_PTHREAD_ENV - strcat(buffer, "[-vattachpar ] "); +#ifdef AFS_DEMAND_ATTACH_FS + strcat(buffer, "[-fs-state-dont-save (disable state save during shutdown)] "); + strcat(buffer, "[-fs-state-dont-restore (disable state restore during startup)] "); + strcat(buffer, "[-fs-state-verify (default is both)] "); + strcat(buffer, "[-vattachpar (default is 1)] "); + strcat(buffer, "[-vhashsize (default is 8)] "); + strcat(buffer, "[-vlrudisable (disable VLRU functionality)] "); + strcat(buffer, "[-vlruthresh (default is 2 hours)] "); + strcat(buffer, "[-vlruinterval (default is 2 minutes)] "); + strcat(buffer, "[-vlrumax (default is 8)] "); +#elif AFS_PTHREAD_ENV + strcat(buffer, "[-vattachpar (default is 1)] "); #endif #ifdef AFS_AIX32_ENV strcat(buffer, "[-m ] "); @@ -945,11 +1101,62 @@ ParseArgs(int argc, char *argv[]) #ifdef AFS_PTHREAD_ENV } else if (!strcmp(argv[i], "-vattachpar")) { if ((i + 1) >= argc) { - fprintf(stderr, "missing argument for -vattachpar\n"); + fprintf(stderr, "missing argument for %s\n", argv[i]); return -1; } vol_attach_threads = atoi(argv[++i]); #endif /* AFS_PTHREAD_ENV */ +#ifdef AFS_DEMAND_ATTACH_FS + } else if (!strcmp(argv[i], "-fs-state-dont-save")) { + fs_state.options.fs_state_save = 0; + } else if (!strcmp(argv[i], "-fs-state-dont-restore")) { + fs_state.options.fs_state_restore = 0; + } else if (!strcmp(argv[i], "-fs-state-verify")) { + if ((i + 1) >= argc) { + fprintf(stderr, "missing argument for %s\n", argv[i]); + return 
-1; + } + i++; + if (!strcmp(argv[i], "none")) { + fs_state.options.fs_state_verify_before_save = 0; + fs_state.options.fs_state_verify_after_restore = 0; + } else if (!strcmp(argv[i], "save")) { + fs_state.options.fs_state_verify_after_restore = 0; + } else if (!strcmp(argv[i], "restore")) { + fs_state.options.fs_state_verify_before_save = 0; + } else if (!strcmp(argv[i], "both")) { + /* default */ + } else { + fprintf(stderr, "invalid argument for %s\n", argv[i-1]); + return -1; + } + } else if (!strcmp(argv[i], "-vhashsize")) { + if ((i + 1) >= argc) { + fprintf(stderr, "missing argument for %s\n", argv[i]); + return -1; + } + VSetVolHashSize(atoi(argv[++i])); + } else if (!strcmp(argv[i], "-vlrudisable")) { + VLRU_SetOptions(VLRU_SET_ENABLED, 0); + } else if (!strcmp(argv[i], "-vlruthresh")) { + if ((i + 1) >= argc) { + fprintf(stderr, "missing argument for %s\n", argv[i]); + return -1; + } + VLRU_SetOptions(VLRU_SET_THRESH, 60*atoi(argv[++i])); + } else if (!strcmp(argv[i], "-vlruinterval")) { + if ((i + 1) >= argc) { + fprintf(stderr, "missing argument for %s\n", argv[i]); + return -1; + } + VLRU_SetOptions(VLRU_SET_INTERVAL, atoi(argv[++i])); + } else if (!strcmp(argv[i], "-vlrumax")) { + if ((i + 1) >= argc) { + fprintf(stderr, "missing argument for %s\n", argv[i]); + return -1; + } + VLRU_SetOptions(VLRU_SET_MAX, atoi(argv[++i])); +#endif /* AFS_DEMAND_ATTACH_FS */ } else if (!strcmp(argv[i], "-s")) { Sawsmall = 1; if ((i + 1) >= argc) { @@ -1923,6 +2130,15 @@ main(int argc, char *argv[]) exit(1); } +#ifdef AFS_DEMAND_ATTACH_FS + if (fs_state.options.fs_state_restore) { + /* + * demand attach fs + * restore fileserver state */ + fs_stateRestore(); + } +#endif /* AFS_DEMAND_ATTACH_FS */ + /* * We are done calling fopen/fdopen. It is safe to use a large * of the file descriptor cache. 
diff --git a/src/viced/viced.h b/src/viced/viced.h index 3b230e5311..d8c837cad8 100644 --- a/src/viced/viced.h +++ b/src/viced/viced.h @@ -5,6 +5,8 @@ * This software has been released under the terms of the IBM Public * License. For details, see the LICENSE file in the top-level source * directory or online at http://www.openafs.org/dl/license10.html + * + * Portions Copyright (c) 2006 Sine Nomine Associates */ /* file.h - include file for the File Server */ @@ -20,6 +22,9 @@ * Start with clean version to sync test and dev trees. * */ +#ifndef _AFS_VICED_VICED_H +#define _AFS_VICED_VICED_H + #include #include #include "fs_stats.h" /*Defs for xstat-based statistics */ @@ -46,18 +51,6 @@ typedef struct DirHandle { } DirHandle; -struct cbcounters { - int DeleteFiles; - int DeleteCallBacks; - int BreakCallBacks; - int AddCallBacks; - int GotSomeSpaces; - int DeleteAllCallBacks; - int nFEs, nCBs, nblks; - int CBsTimedOut; - int nbreakers; - int GSS1, GSS2, GSS3, GSS4, GSS5; -}; #define MAXCNTRS (AFS_HIGHEST_OPCODE+1) @@ -219,3 +212,46 @@ extern pthread_mutex_t fsync_glock_mutex; #define FSYNC_LOCK #define FSYNC_UNLOCK #endif /* AFS_PTHREAD_ENV */ + + +#ifdef AFS_DEMAND_ATTACH_FS +/* + * demand attach fs + * fileserver mode support + */ +struct fs_state { + volatile int mode; + volatile byte FiveMinuteLWP_tranquil; /* five minute check thread is shutdown or sleeping */ + volatile byte HostCheckLWP_tranquil; /* host check thread is shutdown or sleeping */ + volatile byte FsyncCheckLWP_tranquil; /* fsync check thread is shutdown or sleeping */ + volatile byte salvsync_fatal_error; /* fatal error with salvsync comm */ + + /* some command-line options we use in + * various places + * + * these fields are immutable once we + * go multithreaded */ + struct { + byte fs_state_save; + byte fs_state_restore; + byte fs_state_verify_before_save; + byte fs_state_verify_after_restore; + } options; + + pthread_cond_t worker_done_cv; + pthread_rwlock_t state_lock; +}; + +extern struct 
fs_state fs_state; + +/* this lock is defined to be directly above FS_LOCK in the locking hierarchy */ +#define FS_STATE_RDLOCK assert(pthread_rwlock_rdlock(&fs_state.state_lock) == 0) +#define FS_STATE_WRLOCK assert(pthread_rwlock_wrlock(&fs_state.state_lock) == 0) +#define FS_STATE_UNLOCK assert(pthread_rwlock_unlock(&fs_state.state_lock) == 0) + +#define FS_MODE_NORMAL 0 +#define FS_MODE_SHUTDOWN 1 +#endif /* AFS_DEMAND_ATTACH_FS */ + + +#endif /* _AFS_VICED_VICED_H */ diff --git a/src/viced/viced_prototypes.h b/src/viced/viced_prototypes.h index df11f8aa5b..556d3500c5 100644 --- a/src/viced/viced_prototypes.h +++ b/src/viced/viced_prototypes.h @@ -1,4 +1,27 @@ +/* + * Copyright 2000, International Business Machines Corporation and others. + * All Rights Reserved. + * + * This software has been released under the terms of the IBM Public + * License. For details, see the LICENSE file in the top-level source + * directory or online at http://www.openafs.org/dl/license10.html + */ + +#ifndef _AFS_VICED_VICED_PROTOTYPES_H +#define _AFS_VICED_VICED_PROTOTYPES_H + extern int sendBufSize; afs_int32 sys_error_to_et(afs_int32 in); void init_sys_error_to_et(void); + +#ifdef AFS_DEMAND_ATTACH_FS +/* + * demand attach fs + * fileserver state serialization + */ +extern int fs_stateSave(void); +extern int fs_stateRestore(void); +#endif /* AFS_DEMAND_ATTACH_FS */ + +#endif /* _AFS_VICED_VICED_PROTOTYPES_H */ diff --git a/src/vol/Makefile.in b/src/vol/Makefile.in index 114a304997..33131a0600 100644 --- a/src/vol/Makefile.in +++ b/src/vol/Makefile.in @@ -16,22 +16,23 @@ LIBS=${TOP_LIBDIR}/libcmd.a vlib.a ${TOP_LIBDIR}/util.a \ ${TOP_LIBDIR}/libsys.a ${TOP_LIBDIR}/libdir.a \ ${TOP_LIBDIR}/liblwp.a ${TOP_LIBDIR}/libacl.a -CFLAGS = ${COMMON_CFLAGS} -D${SYS_NAME} ${FSINCLUDES} ${XCFLAGS} ${ARCHFLAGS} +CFLAGS = ${COMMON_CFLAGS} -D${SYS_NAME} ${FSINCLUDES} ${XCFLAGS} ${ARCHFLAGS} -DFSSYNC_BUILD_SERVER -DFSSYNC_BUILD_CLIENT -PUBLICHEADERS=nfs.h vnode.h viceinode.h volume.h voldefs.h 
partition.h\ - fssync.h ihandle.h namei_ops.h +PUBLICHEADERS=nfs.h vnode.h viceinode.h volume.h voldefs.h partition.h \ + fssync.h ihandle.h namei_ops.h salvsync.h daemon_com.h -VLIBOBJS=vnode.o volume.o vutil.o partition.o fssync.o purge.o \ - clone.o nuke.o devname.o listinodes.o common.o ihandle.o \ - namei_ops.o +VLIBOBJS=vnode.o volume.o vutil.o partition.o fssync-server.o fssync-client.o \ + clone.o nuke.o devname.o listinodes.o common.o ihandle.o purge.o \ + namei_ops.o salvsync-server.o salvsync-client.o daemon_com.o -OBJECTS=${VLIBOBJS} physio.o vol-salvage.o vol-info.o vol-dump.o vol-bless.o +OBJECTS=${VLIBOBJS} physio.o vol-salvage.o vol-info.o vol-dump.o vol-bless.o fssync-debug.o all: gi \ ${TOP_LIBDIR}/vlib.a \ ${TOP_LIBDIR}/libvlib.a \ salvager \ volinfo \ + fssync-debug \ $(FS_CONV_OSF40D) \ $(XFS_SIZE_CHECK) \ $(FS_CONV_SOL26) \ @@ -42,6 +43,8 @@ all: gi \ ${TOP_INCDIR}/afs/voldefs.h \ ${TOP_INCDIR}/afs/partition.h \ ${TOP_INCDIR}/afs/fssync.h \ + ${TOP_INCDIR}/afs/salvsync.h \ + ${TOP_INCDIR}/afs/daemon_com.h \ ${TOP_INCDIR}/afs/ihandle.h \ ${TOP_INCDIR}/afs/namei_ops.h @@ -53,6 +56,7 @@ install: \ ${DESTDIR}${libdir}/afs/libvlib.a \ ${DESTDIR}${afssrvlibexecdir}/salvager \ ${DESTDIR}${afssrvsbindir}/volinfo \ + ${DESTDIR}${afssrvsbindir}/fssync-debug \ $(install_FS_CONV_OSF40D) \ $(install_XFS_SIZE_CHECK) \ $(install_FS_CONV_SOL26) \ @@ -63,6 +67,8 @@ install: \ ${DESTDIR}${includedir}/afs/voldefs.h \ ${DESTDIR}${includedir}/afs/partition.h \ ${DESTDIR}${includedir}/afs/fssync.h \ + ${DESTDIR}${includedir}/afs/salvsync.h \ + ${DESTDIR}${includedir}/afs/daemon_com.h \ ${DESTDIR}${includedir}/afs/ihandle.h \ ${DESTDIR}${includedir}/afs/namei_ops.h @@ -72,6 +78,11 @@ ${DEST}/root.server/usr/afs/bin/salvager: salvager ${DEST}/root.server/usr/afs/bin/volinfo: volinfo ${INSTALL} -s $? $@ +${DEST}/root.server/usr/afs/bin/fssync-debug: fssync-debug + if test "@DEMAND_ATTACH@" = "no"; then \ + ${INSTALL} -s $? 
$@ ; \ + fi + ${DEST}/lib/afs/vlib.a: vlib.a ${INSTALL} $? $@ @@ -117,6 +128,12 @@ ${DEST}/include/afs/partition.h: partition.h ${DEST}/include/afs/fssync.h: fssync.h ${INSTALL} $? $@ +${DEST}/include/afs/salvsync.h: salvsync.h + ${INSTALL} $? $@ + +${DEST}/include/afs/daemon_com.h: daemon_com.h + ${INSTALL} $? $@ + ${DEST}/include/afs/ihandle.h: ihandle.h ${INSTALL} $? $@ @@ -129,6 +146,8 @@ ${DEST}/include/afs/namei_ops.h: namei_ops.h ${OBJECTS}: ${PUBLICHEADERS} ${TOP_INCDIR}/lwp.h ${TOP_INCDIR}/lock.h ${TOP_INCDIR}/afs/afsint.h vutils.h salvage.h AFS_component_version_number.c vol-salvage.o vutil.o: volinodes.h +vol-salvage.o salvager.o: vol-salvage.h +vol-salvage.o: salvsync.h daemon_com.h vlib.a: ${VLIBOBJS} AFS_component_version_number.o $(RM) -f $@ @@ -136,8 +155,8 @@ vlib.a: ${VLIBOBJS} AFS_component_version_number.o $(RANLIB) $@ # new salvager: remove references to /vice by linking with novice.o -salvager: vol-salvage.o physio.o vlib.a - ${CC} ${LDFLAGS} -o salvager vol-salvage.o physio.o ${LIBS} ${XLIBS} +salvager: vol-salvage.o physio.o vlib.a salvager.o ${LIBS} + ${CC} ${LDFLAGS} -o salvager vol-salvage.o physio.o salvager.o ${LIBS} ${XLIBS} vol-salvage: vol-salvage.o vol-info: vol-info.o physio.o ihandle.o @@ -167,13 +186,16 @@ volinfo: vol-info.o physio.o ihandle.o ${LIBS} ${CC} ${CFLAGS} -o volinfo vol-info.o physio.o \ ihandle.o ${LIBS} ${XLIBS} +fssync-debug: fssync-debug.o physio.o AFS_component_version_number.c ${LIBS} + ${CC} ${LDFLAGS} -o fssync-debug fssync-debug.o physio.o ${LIBS} ${XLIBS} + vol-bless: vol-bless.o physio.o ihandle.o ${LIBS} ${CC} ${CFLAGS} -o vol-bless vol-bless.o physio.o ${LIBS} ${XLIBS} -fs_conv_dux40D: fs_conv_411.o +fs_conv_dux40D: fs_conv_411.o ${LIBS} ${CC} ${CFLAGS} ${TOP_LIBDIR}/libcmd.a -o fs_conv_dux40D fs_conv_411.o ${LIBS} ${XLIBS} -fs_conv_sol26: fs_conv_411.o vlib.a +fs_conv_sol26: fs_conv_411.o ${LIBS} ${CC} ${CFLAGS} ${TOP_LIBDIR}/libcmd.a -o fs_conv_sol26 fs_conv_411.o ${LIBS} ${XLIBS} fs_conv_411.o: 
fs_conv_411.c AFS_component_version_number.c @@ -211,6 +233,11 @@ ${DESTDIR}${afssrvlibexecdir}/salvager: salvager ${DESTDIR}${afssrvsbindir}/volinfo: volinfo ${INSTALL} -s $? $@ +${DESTDIR}${afssrvsbindir}/fssync-debug: fssync-debug + if test "@DEMAND_ATTACH@" = "no" ; then \ + ${INSTALL} -s $? $@ ; \ + fi + ${DESTDIR}${includedir}/afs/nfs.h: nfs.h ${INSTALL} $? $@ @@ -253,6 +280,18 @@ ${DESTDIR}${includedir}/afs/fssync.h: fssync.h ${TOP_INCDIR}/afs/fssync.h: fssync.h ${INSTALL} $? $@ +${DESTDIR}${includedir}/afs/salvsync.h: salvsync.h + ${INSTALL} $? $@ + +${TOP_INCDIR}/afs/salvsync.h: salvsync.h + ${INSTALL} $? $@ + +${DESTDIR}${includedir}/afs/daemon_com.h: daemon_com.h + ${INSTALL} $? $@ + +${TOP_INCDIR}/afs/daemon_com.h: daemon_com.h + ${INSTALL} $? $@ + ${DESTDIR}${includedir}/afs/ihandle.h: ihandle.h ${INSTALL} $? $@ @@ -265,11 +304,24 @@ ${DESTDIR}${includedir}/afs/namei_ops.h: namei_ops.h ${TOP_INCDIR}/afs/namei_ops.h: namei_ops.h ${INSTALL} $? $@ +${DESTDIR}${includedir}/afs/salvage.h: salvage.h + ${INSTALL} $? $@ + +${TOP_INCDIR}/afs/salvage.h: salvage.h + ${INSTALL} $? $@ + +${DESTDIR}${includedir}/afs/vol-salvage.h: vol-salvage.h + ${INSTALL} $? $@ + +${TOP_INCDIR}/afs/vol-salvage.h: vol-salvage.h + ${INSTALL} $? 
$@ + dest: \ ${DEST}/lib/afs/vlib.a \ ${DEST}/lib/afs/libvlib.a \ ${DEST}/root.server/usr/afs/bin/salvager \ ${DEST}/root.server/usr/afs/bin/volinfo \ + ${DEST}/root.server/usr/afs/bin/fssync-debug \ $(dest_FS_CONV_OSF40D) \ $(dest_XFS_SIZE_CHECK) \ $(dest_FS_CONV_SOL26) \ @@ -280,12 +332,14 @@ dest: \ ${DEST}/include/afs/voldefs.h \ ${DEST}/include/afs/partition.h \ ${DEST}/include/afs/fssync.h \ + ${DEST}/include/afs/salvsync.h \ + ${DEST}/include/afs/daemon_com.h \ ${DEST}/include/afs/ihandle.h \ ${DEST}/include/afs/namei_ops.h check-splint:: sh $(HELPER_SPLINT) $(CFLAGS) \ - vnode.c volume.c vutil.c partition.c fssync.c purge.c \ + vnode.c volume.c vutil.c partition.c fssync-server.c fssync-client.c \ clone.c nuke.c devname.c listinodes.c common.c ihandle.c \ - namei_ops.c \ - physio.c vol-salvage.c vol-info.c vol-bless.c + namei_ops.c salvsync-server.c salvsync-client.c daemon_com.c purge.c \ + physio.c vol-salvage.c vol-info.c vol-bless.c fssync-debug.c diff --git a/src/vol/NTMakefile b/src/vol/NTMakefile index e09db2b734..096026fe7a 100644 --- a/src/vol/NTMakefile +++ b/src/vol/NTMakefile @@ -5,6 +5,8 @@ # License. For details, see the LICENSE file in the top-level source # directory or online at http://www.openafs.org/dl/license10.html +AFSDEV_AUXCDEFINES = -DFSSYNC_BUILD_SERVER -DFSSYNC_BUILD_CLIENT + RELDIR=vol !INCLUDE ..\config\NTMakefile.$(SYS_NAME) !INCLUDE ..\config\NTMakefile.version diff --git a/src/vol/daemon_com.c b/src/vol/daemon_com.c new file mode 100644 index 0000000000..26bddbf6c9 --- /dev/null +++ b/src/vol/daemon_com.c @@ -0,0 +1,473 @@ +/* + * Copyright 2006, Sine Nomine Associates and others. + * All Rights Reserved. + * + * This software has been released under the terms of the IBM Public + * License. 
For details, see the LICENSE file in the top-level source + * directory or online at http://www.openafs.org/dl/license10.html + */ + +/* + * localhost interprocess communication for servers + * + * currently handled by a localhost socket + * (yes, this needs to be replaced someday) + */ + +#ifndef _WIN32 +#define FD_SETSIZE 65536 +#endif + +#include +#include + +RCSID + ("$Header$"); + +#include +#include +#ifdef AFS_NT40_ENV +#include +#include +#else +#include +#include +#include +#include +#include +#endif +#include +#include +#include + +#ifdef HAVE_STRING_H +#include +#else +#ifdef HAVE_STRINGS_H +#include +#endif +#endif + + +#include +#include +#include "nfs.h" +#include +#include "daemon_com.h" +#include "lwp.h" +#include "lock.h" +#include +#include "ihandle.h" +#include "vnode.h" +#include "volume.h" +#include "partition.h" +#include + +/*@printflike@*/ extern void Log(const char *format, ...); + +#ifdef osi_Assert +#undef osi_Assert +#endif +#define osi_Assert(e) (void)(e) + +int (*V_BreakVolumeCallbacks) (); + +#define MAXHANDLERS 4 /* Up to 4 clients; must be at least 2, so that + * move = dump+restore can run on single server */ + +#define MAX_BIND_TRIES 5 /* Number of times to retry socket bind */ + +static int getport(SYNC_client_state * state, struct sockaddr_in *addr); +static int SYNC_ask_internal(SYNC_client_state * state, SYNC_command * com, SYNC_response * res); + +/* daemon com SYNC client interface */ + +int +SYNC_connect(SYNC_client_state * state) +{ + struct sockaddr_in addr; + /* I can't believe the following is needed for localhost connections!! 
*/ + static time_t backoff[] = + { 3, 3, 3, 5, 5, 5, 7, 15, 16, 24, 32, 40, 48, 0 }; + time_t *timeout = &backoff[0]; + + if (state->fd >= 0) { + return 1; + } + + for (;;) { + state->fd = getport(state, &addr); + if (connect(state->fd, (struct sockaddr *)&addr, sizeof(addr)) >= 0) + return 1; + if (!*timeout) + break; + if (!(*timeout & 1)) + Log("SYNC_connect temporary failure (will retry)\n"); + SYNC_disconnect(state); + sleep(*timeout++); + } + perror("SYNC_connect failed (giving up!)"); + return 0; +} + +int +SYNC_disconnect(SYNC_client_state * state) +{ +#ifdef AFS_NT40_ENV + closesocket(state->fd); +#else + close(state->fd); +#endif + state->fd = -1; + return 0; +} + +afs_int32 +SYNC_closeChannel(SYNC_client_state * state) +{ + afs_int32 code; + SYNC_command com; + SYNC_response res; + SYNC_PROTO_BUF_DECL(ores); + + if (state->fd == -1) + return SYNC_OK; + + memset(&com, 0, sizeof(com)); + memset(&res, 0, sizeof(res)); + + res.payload.len = SYNC_PROTO_MAX_LEN; + res.payload.buf = ores; + + com.hdr.command = SYNC_COM_CHANNEL_CLOSE; + com.hdr.command_len = sizeof(SYNC_command_hdr); + + /* in case the other end dropped, don't do any retries */ + state->retry_limit = 0; + state->hard_timeout = 0; + + code = SYNC_ask(state, &com, &res); + + if (code == SYNC_OK) { + if (res.hdr.response != SYNC_OK) { + Log("SYNC_closeChannel: channel shutdown request denied; closing socket anyway\n"); + } else if (!(res.hdr.flags & SYNC_FLAG_CHANNEL_SHUTDOWN)) { + Log("SYNC_closeChannel: channel shutdown request mishandled by server\n"); + } + } else { + Log("SYNC_closeChannel: channel communications problem"); + } + + SYNC_disconnect(state); + + return code; +} + +int +SYNC_reconnect(SYNC_client_state * state) +{ + SYNC_disconnect(state); + return SYNC_connect(state); +} + +/* private function to fill in the sockaddr struct for us */ +static int +getport(SYNC_client_state * state, struct sockaddr_in *addr) +{ + int sd; + + memset(addr, 0, sizeof(*addr)); + assert((sd = 
socket(AF_INET, SOCK_STREAM, 0)) >= 0); +#ifdef STRUCT_SOCKADDR_HAS_SA_LEN + addr->sin_len = sizeof(struct sockaddr_in); +#endif + addr->sin_addr.s_addr = htonl(0x7f000001); + addr->sin_family = AF_INET; /* was localhost->h_addrtype */ + addr->sin_port = htons(state->port); /* XXXX htons not _really_ neccessary */ + + return sd; +} + +afs_int32 +SYNC_ask(SYNC_client_state * state, SYNC_command * com, SYNC_response * res) +{ + int tries; + afs_uint32 now, timeout, code=SYNC_OK; + + if (state->fatal_error) { + return SYNC_COM_ERROR; + } + + if (state->fd == -1) { + SYNC_connect(state); + } + + if (state->fd == -1) { + state->fatal_error = 1; + return SYNC_COM_ERROR; + } + +#ifdef AFS_DEMAND_ATTACH_FS + com->hdr.flags |= SYNC_FLAG_DAFS_EXTENSIONS; +#endif + + now = FT_ApproxTime(); + timeout = now + state->hard_timeout; + for (tries = 0; + (tries <= state->retry_limit) && (now <= timeout); + tries++, now = FT_ApproxTime()) { + code = SYNC_ask_internal(state, com, res); + if (code == SYNC_OK) { + break; + } else if (code == SYNC_BAD_COMMAND) { + Log("SYNC_ask: protocol mismatch; make sure fileserver, volserver, salvageserver and salvager are same version\n"); + break; + } else if (code == SYNC_COM_ERROR) { + Log("SYNC_ask: protocol communications failure; attempting reconnect to server\n"); + SYNC_reconnect(state); + /* try again */ + } else { + /* unknown (probably protocol-specific) response code, pass it up to the caller, and let them deal with it */ + break; + } + } + + if (code == SYNC_COM_ERROR) { + Log("SYNC_ask: fatal protocol error; disabling sync protocol to server running on port %d until next server restart\n", + state->port); + state->fatal_error = 1; + } + + return code; +} + +static afs_int32 +SYNC_ask_internal(SYNC_client_state * state, SYNC_command * com, SYNC_response * res) +{ + int n; + SYNC_PROTO_BUF_DECL(buf); +#ifndef AFS_NT40_ENV + int iovcnt; + struct iovec iov[2]; +#endif + + if (state->fd == -1) { + Log("SYNC_ask: invalid sync file 
descriptor\n"); + res->hdr.response = SYNC_COM_ERROR; + goto done; + } + + if (com->hdr.command_len > SYNC_PROTO_MAX_LEN) { + Log("SYNC_ask: internal SYNC buffer too small; please file a bug\n"); + res->hdr.response = SYNC_COM_ERROR; + goto done; + } + + com->hdr.proto_version = state->proto_version; + + memcpy(buf, &com->hdr, sizeof(com->hdr)); + if (com->payload.len) { + memcpy(buf + sizeof(com->hdr), com->payload.buf, + com->hdr.command_len - sizeof(com->hdr)); + } + +#ifdef AFS_NT40_ENV + n = send(state->fd, buf, com->hdr.command_len, 0); + if (n != com->hdr.command_len) { + Log("SYNC_ask: write failed\n"); + res->hdr.response = SYNC_COM_ERROR; + goto done; + } + + n = recv(state->fd, buf, SYNC_PROTO_MAX_LEN, 0); + if (n == 0 || (n < 0 && WSAEINTR != WSAGetLastError())) { + Log("SYNC_ask: No response\n"); + res->hdr.response = SYNC_COM_ERROR; + goto done; + } +#else /* !AFS_NT40_ENV */ + n = write(state->fd, buf, com->hdr.command_len); + if (com->hdr.command_len != n) { + Log("SYNC_ask: write failed\n"); + res->hdr.response = SYNC_COM_ERROR; + goto done; + } + + /* receive the response */ + iov[0].iov_base = (char *)&res->hdr; + iov[0].iov_len = sizeof(res->hdr); + if (res->payload.len) { + iov[1].iov_base = (char *)res->payload.buf; + iov[1].iov_len = res->payload.len; + iovcnt = 2; + } else { + iovcnt = 1; + } + n = readv(state->fd, iov, iovcnt); + if (n == 0 || (n < 0 && errno != EINTR)) { + Log("SYNC_ask: No response\n"); + res->hdr.response = SYNC_COM_ERROR; + goto done; + } +#endif /* !AFS_NT40_ENV */ + + res->recv_len = n; + + if (n < sizeof(res->hdr)) { + Log("SYNC_ask: response too short\n"); + res->hdr.response = SYNC_COM_ERROR; + goto done; + } +#ifdef AFS_NT40_ENV + memcpy(&res->hdr, buf, sizeof(res->hdr)); +#endif + + if ((n - sizeof(res->hdr)) > res->payload.len) { + Log("SYNC_ask: response too long\n"); + res->hdr.response = SYNC_COM_ERROR; + goto done; + } +#ifdef AFS_NT40_ENV + memcpy(res->payload.buf, buf + sizeof(res->hdr), n - 
sizeof(res->hdr)); +#endif + + if (res->hdr.response_len != n) { + Log("SYNC_ask: length field in response inconsistent\n"); + res->hdr.response = SYNC_COM_ERROR; + goto done; + } + if (res->hdr.response == SYNC_DENIED) { + Log("SYNC_ask: negative response\n"); + } + + done: + return res->hdr.response; +} + + +/* + * daemon com SYNC server-side interfaces + */ + +/* get a command */ +afs_int32 +SYNC_getCom(int fd, SYNC_command * com) +{ + int n; + afs_int32 code = SYNC_OK; +#ifdef AFS_NT40_ENV + SYNC_PROTO_BUF_DECL(buf); +#else + struct iovec iov[2]; + int iovcnt; +#endif + +#ifdef AFS_NT40_ENV + n = recv(fd, buf, SYNC_PROTO_MAX_LEN, 0); + + if (n == 0 || (n < 0 && WSAEINTR != WSAGetLastError())) { + Log("SYNC_getCom: error receiving command\n"); + code = SYNC_COM_ERROR; + goto done; + } +#else /* !AFS_NT40_ENV */ + iov[0].iov_base = (char *)&com->hdr; + iov[0].iov_len = sizeof(com->hdr); + if (com->payload.len) { + iov[1].iov_base = (char *)com->payload.buf; + iov[1].iov_len = com->payload.len; + iovcnt = 2; + } else { + iovcnt = 1; + } + + n = readv(fd, iov, iovcnt); + if (n == 0 || (n < 0 && errno != EINTR)) { + Log("SYNC_getCom: error receiving command\n"); + code = SYNC_COM_ERROR; + goto done; + } +#endif /* !AFS_NT40_ENV */ + + com->recv_len = n; + + if (n < sizeof(com->hdr)) { + Log("SYNC_getCom: command too short\n"); + code = SYNC_COM_ERROR; + goto done; + } +#ifdef AFS_NT40_ENV + memcpy(&com->hdr, buf, sizeof(com->hdr)); +#endif + + if ((n - sizeof(com->hdr)) > com->payload.len) { + Log("SYNC_getCom: command too long\n"); + code = SYNC_COM_ERROR; + goto done; + } +#ifdef AFS_NT40_ENV + memcpy(com->payload.buf, buf + sizeof(com->hdr), n - sizeof(com->hdr)); +#endif + + done: + return code; +} + +/* put a response */ +afs_int32 +SYNC_putRes(int fd, SYNC_response * res) +{ + int n; + afs_int32 code = SYNC_OK; + SYNC_PROTO_BUF_DECL(buf); + + if (res->hdr.response_len > (sizeof(res->hdr) + res->payload.len)) { + Log("SYNC_putRes: response_len field in response 
header inconsistent\n"); + code = SYNC_COM_ERROR; + goto done; + } + + if (res->hdr.response_len > SYNC_PROTO_MAX_LEN) { + Log("SYNC_putRes: internal SYNC buffer too small; please file a bug\n"); + code = SYNC_COM_ERROR; + goto done; + } + +#ifdef AFS_DEMAND_ATTACH_FS + res->hdr.flags |= SYNC_FLAG_DAFS_EXTENSIONS; +#endif + + memcpy(buf, &res->hdr, sizeof(res->hdr)); + if (res->payload.len) { + memcpy(buf + sizeof(res->hdr), res->payload.buf, + res->hdr.response_len - sizeof(res->hdr)); + } + +#ifdef AFS_NT40_ENV + n = send(fd, buf, res->hdr.response_len, 0); +#else /* !AFS_NT40_ENV */ + n = write(fd, buf, res->hdr.response_len); +#endif /* !AFS_NT40_ENV */ + + if (res->hdr.response_len != n) { + Log("SYNC_putRes: write failed\n"); + res->hdr.response = SYNC_COM_ERROR; + goto done; + } + + done: + return code; +} + +/* return 0 for legal (null-terminated) string, + * 1 for illegal (unterminated) string */ +int +SYNC_verifyProtocolString(char * buf, size_t len) +{ + int ret = 0; + size_t s_len; + + s_len = afs_strnlen(buf, len); + + return (s_len == len) ? 1 : 0; +} diff --git a/src/vol/daemon_com.h b/src/vol/daemon_com.h new file mode 100644 index 0000000000..846436783f --- /dev/null +++ b/src/vol/daemon_com.h @@ -0,0 +1,141 @@ +/* + * Copyright 2006, Sine Nomine Associates and others. + * All Rights Reserved. + * + * This software has been released under the terms of the IBM Public + * License. 
For details, see the LICENSE file in the top-level source + * directory or online at http://www.openafs.org/dl/license10.html + */ + +#ifndef _AFS_VOL_DAEMON_COM_H +#define _AFS_VOL_DAEMON_COM_H + +/* + * SYNC protocol constants + */ + +/* SYNC protocol command codes + * + * command codes 0-65535 are reserved for + * global SYNC package command codes + */ +#define SYNC_COM_CODE_USER_BASE 65536 +#define SYNC_COM_CODE_DECL(code) (SYNC_COM_CODE_USER_BASE+(code)) + +/* general command codes */ +#define SYNC_COM_CHANNEL_CLOSE 0 + + +/* SYNC protocol response codes + * + * response codes 0-65535 are reserved for + * global SYNC package response codes + */ +#define SYNC_RES_CODE_USER_BASE 65536 +#define SYNC_RES_CODE_DECL(code) (SYNC_RES_CODE_USER_BASE+(code)) + +/* general response codes */ +#define SYNC_OK 0 /* sync call returned ok */ +#define SYNC_DENIED 1 /* sync request denied by server */ +#define SYNC_COM_ERROR 2 /* sync protocol communicaions error */ +#define SYNC_BAD_COMMAND 3 /* sync command code not implemented by server */ +#define SYNC_FAILED 4 /* sync server-side procedure failed */ + + +/* SYNC protocol reason codes + * + * reason codes 0-65535 are reserved for + * global SYNC package reason codes + */ +#define SYNC_REASON_CODE_USER_BASE 65536 +#define SYNC_REASON_CODE_DECL(code) (SYNC_REASON_CODE_USER_BASE+(code)) + +/* general reason codes */ +#define SYNC_REASON_NONE 0 +#define SYNC_REASON_MALFORMED_PACKET 1 + + +/* SYNC protocol flags + * + * flag bits 0-7 are reserved for + * global SYNC package flags + */ +#define SYNC_FLAG_CODE_USER_BASE 8 +#define SYNC_FLAG_CODE_DECL(code) (1 << (SYNC_FLAG_CODE_USER_BASE+(code))) + +/* general flag codes */ +#define SYNC_FLAG_CHANNEL_SHUTDOWN 0x1 +#define SYNC_FLAG_DAFS_EXTENSIONS 0x2 /* signal that other end of socket is compiled + * with demand attach extensions */ + +/* SYNC protocol response buffers */ +#define SYNC_PROTO_MAX_LEN 768 /* maximum size of sync protocol message */ + +/* use a large type to get 
proper buffer alignment so we can safely cast the pointer */ +#define SYNC_PROTO_BUF_DECL(buf) \ + afs_int64 _##buf##_l[SYNC_PROTO_MAX_LEN/sizeof(afs_int64)]; \ + char * buf = (char *)(_##buf##_l) + + +/* client-side state object */ +typedef struct SYNC_client_state { + int fd; + afs_uint16 port; + afs_uint32 proto_version; + int retry_limit; /* max number of times for SYNC_ask to retry */ + afs_int32 hard_timeout; /* upper limit on time to keep trying */ + byte fatal_error; /* fatal error on this client conn */ +} SYNC_client_state; + +/* wire types */ +typedef struct SYNC_command_hdr { + afs_uint32 proto_version; /* sync protocol version */ + afs_int32 programType; /* type of program issuing the request */ + afs_int32 command; /* request type */ + afs_int32 reason; /* reason for request */ + afs_uint32 command_len; /* entire length of command */ + afs_uint32 flags; +} SYNC_command_hdr; + +typedef struct SYNC_response_hdr { + afs_uint32 proto_version; /* sync protocol version */ + afs_uint32 response_len; /* entire length of response */ + afs_int32 response; /* response code */ + afs_int32 reason; /* reason for response */ + afs_uint32 flags; +} SYNC_response_hdr; + + +/* user-visible types */ +typedef struct SYNC_command { + SYNC_command_hdr hdr; + struct { + afs_uint32 len; + void * buf; + } payload; + afs_int32 recv_len; +} SYNC_command; + +typedef struct SYNC_response { + SYNC_response_hdr hdr; + struct { + afs_uint32 len; + void * buf; + } payload; + afs_int32 recv_len; +} SYNC_response; + + +/* client-side prototypes */ +extern afs_int32 SYNC_ask(SYNC_client_state *, SYNC_command * com, SYNC_response * res); +extern int SYNC_connect(SYNC_client_state *); /* setup the channel */ +extern int SYNC_disconnect(SYNC_client_state *); /* just close the socket */ +extern afs_int32 SYNC_closeChannel(SYNC_client_state *); /* do a graceful channel close */ +extern int SYNC_reconnect(SYNC_client_state *); /* do a reconnect after a protocol error, or from a forked child 
*/ + +/* server-side prototypes */ +extern int SYNC_getCom(int fd, SYNC_command * com); +extern int SYNC_putRes(int fd, SYNC_response * res); +extern int SYNC_verifyProtocolString(char * buf, size_t len); + +#endif /* _AFS_VOL_DAEMON_COM_H */ diff --git a/src/vol/fssync-client.c b/src/vol/fssync-client.c new file mode 100644 index 0000000000..205a08953d --- /dev/null +++ b/src/vol/fssync-client.c @@ -0,0 +1,222 @@ +/* + * Copyright 2000, International Business Machines Corporation and others. + * All Rights Reserved. + * + * This software has been released under the terms of the IBM Public + * License. For details, see the LICENSE file in the top-level source + * directory or online at http://www.openafs.org/dl/license10.html + * + * Portions Copyright (c) 2006 Sine Nomine Associates + */ + +/* + System: VICE-TWO + Module: fssync.c + Institution: The Information Technology Center, Carnegie-Mellon University + + */ +#ifdef notdef + +/* All this is going away in early 1989 */ +int newVLDB; /* Compatibility flag */ + +#endif +static int newVLDB = 1; + + +#ifndef AFS_PTHREAD_ENV +#define USUAL_PRIORITY (LWP_MAX_PRIORITY - 2) + +/* + * stack size increased from 8K because the HP machine seemed to have trouble + * with the smaller stack + */ +#define USUAL_STACK_SIZE (24 * 1024) +#endif /* !AFS_PTHREAD_ENV */ + +/* + fssync-client.c + File server synchronization with external volume utilities. 
+ client-side implementation + */ + +#include +#include + +RCSID + ("$Header$"); + +#include +#include +#ifdef AFS_NT40_ENV +#include +#include +#else +#include +#include +#include +#include +#include +#endif +#include +#ifdef AFS_PTHREAD_ENV +#include +#else /* AFS_PTHREAD_ENV */ +#include +#endif /* AFS_PTHREAD_ENV */ +#include + +#ifdef HAVE_STRING_H +#include +#else +#ifdef HAVE_STRINGS_H +#include +#endif +#endif + + +#include +#include +#include "nfs.h" +#include +#include "daemon_com.h" +#include "fssync.h" +#include "lwp.h" +#include "lock.h" +#include +#include "ihandle.h" +#include "vnode.h" +#include "volume.h" +#include "partition.h" + +#ifdef FSSYNC_BUILD_CLIENT + +/*@printflike@*/ extern void Log(const char *format, ...); + +#ifdef osi_Assert +#undef osi_Assert +#endif +#define osi_Assert(e) (void)(e) + +extern int LogLevel; + +static SYNC_client_state fssync_state = { -1, 2040, FSYNC_PROTO_VERSION, 5, 120 }; + +#ifdef AFS_PTHREAD_ENV +static pthread_mutex_t vol_fsync_mutex; +static volatile vol_fsync_mutex_init = 0; +#define VFSYNC_LOCK \ + assert(pthread_mutex_lock(&vol_fsync_mutex) == 0) +#define VFSYNC_UNLOCK \ + assert(pthread_mutex_unlock(&vol_fsync_mutex) == 0) +#else +#define VFSYNC_LOCK +#define VFSYNC_UNLOCK +#endif + +int +FSYNC_clientInit(void) +{ +#ifdef AFS_PTHREAD_ENV + /* this is safe since it gets called with VOL_LOCK held, or before we go multithreaded */ + if (!vol_fsync_mutex_init) { + assert(pthread_mutex_init(&vol_fsync_mutex, NULL) == 0); + vol_fsync_mutex_init = 1; + } +#endif + return SYNC_connect(&fssync_state); +} + +void +FSYNC_clientFinis(void) +{ + SYNC_closeChannel(&fssync_state); +} + +int +FSYNC_clientChildProcReconnect(void) +{ + return SYNC_reconnect(&fssync_state); +} + +/* fsync client interface */ +afs_int32 +FSYNC_askfs(SYNC_command * com, SYNC_response * res) +{ + afs_int32 code; + + VFSYNC_LOCK; + code = SYNC_ask(&fssync_state, com, res); + VFSYNC_UNLOCK; + + switch (code) { + case SYNC_OK: + case SYNC_FAILED: 
+ break; + case SYNC_COM_ERROR: + case SYNC_BAD_COMMAND: + Log("FSYNC_askfs: fatal FSSYNC protocol error; volume management functionality disabled until next fileserver restart\n"); + break; + case SYNC_DENIED: + Log("FSYNC_askfs: FSSYNC request denied for reason=%d\n", res->hdr.reason); + break; + default: + Log("FSYNC_askfs: unknown protocol response %d\n", code); + break; + } + return code; +} + +afs_int32 +FSYNC_GenericOp(void * ext_hdr, size_t ext_len, + int command, int reason, + SYNC_response * res_in) +{ + SYNC_response res_l, *res; + SYNC_command com; + + if (res_in) { + res = res_in; + } else { + res = &res_l; + res_l.payload.buf = NULL; + res_l.payload.len = 0; + } + + memset(&com, 0, sizeof(com)); + + com.hdr.programType = programType; + com.hdr.command = command; + com.hdr.reason = reason; + com.hdr.command_len = sizeof(com.hdr) + ext_len; + com.payload.buf = ext_hdr; + com.payload.len = ext_len; + + return FSYNC_askfs(&com, res); +} + +afs_int32 +FSYNC_VolOp(VolumeId volume, char * partition, + int command, int reason, + SYNC_response * res) +{ + FSSYNC_VolOp_hdr vcom; + + memset(&vcom, 0, sizeof(vcom)); + + vcom.volume = volume; + if (partition) + strlcpy(vcom.partName, partition, sizeof(vcom.partName)); + + return FSYNC_GenericOp(&vcom, sizeof(vcom), command, reason, res); +} + +afs_int32 +FSYNC_StatsOp(FSSYNC_StatsOp_hdr * scom, int command, int reason, + SYNC_response * res) +{ + return FSYNC_GenericOp(scom, sizeof(*scom), command, reason, res); +} + + +#endif /* FSSYNC_BUILD_CLIENT */ diff --git a/src/vol/fssync-debug.c b/src/vol/fssync-debug.c new file mode 100644 index 0000000000..194204e8ba --- /dev/null +++ b/src/vol/fssync-debug.c @@ -0,0 +1,1148 @@ +/* + * Copyright 2006, Sine Nomine Associates and others. + * All Rights Reserved. + * + * This software has been released under the terms of the IBM Public + * License. 
For details, see the LICENSE file in the top-level source + * directory or online at http://www.openafs.org/dl/license10.html + */ + +/* Main program file. Define globals. */ +#define MAIN 1 + +/* + * fssync administration tool + */ + + +#include +#include + +RCSID + ("$Header$"); + +#include +#include +#include +#include +#include +#include +#include +#ifdef AFS_NT40_ENV +#include +#include +#else +#include +#include +#ifndef ITIMER_REAL +#include +#endif /* ITIMER_REAL */ +#endif +#include +#include +#include + + +#include + +#ifndef AFS_NT40_ENV +#include +#endif + +#include +#include +#include + +#include "nfs.h" +#include "lwp.h" +#include "lock.h" +#include "ihandle.h" +#include "vnode.h" +#include "volume.h" +#include "partition.h" +#include "daemon_com.h" +#include "fssync.h" +#ifdef AFS_NT40_ENV +#include +#endif + +int VolumeChanged; /* hack to make dir package happy */ + + +struct volop_state { + afs_uint32 volume; + char partName[16]; +}; + +struct state { + afs_int32 reason; + struct volop_state * vop; +}; + +static int common_prolog(struct cmd_syndesc *, struct state *); +static int common_volop_prolog(struct cmd_syndesc *, struct state *); + +static int do_volop(struct state *, afs_int32 command, SYNC_response * res); + +static char * response_code_to_string(afs_int32); +static char * command_code_to_string(afs_int32); +static char * reason_code_to_string(afs_int32); +static char * program_type_to_string(afs_int32); + +static int VolOnline(struct cmd_syndesc * as, char * rock); +static int VolOffline(struct cmd_syndesc * as, char * rock); +static int VolMode(struct cmd_syndesc * as, char * rock); +static int VolDetach(struct cmd_syndesc * as, char * rock); +static int VolBreakCBKs(struct cmd_syndesc * as, char * rock); +static int VolMove(struct cmd_syndesc * as, char * rock); +static int VolList(struct cmd_syndesc * as, char * rock); +static int VolQuery(struct cmd_syndesc * as, char * rock); +static int VolHdrQuery(struct cmd_syndesc * as, char * 
rock); +static int VolOpQuery(struct cmd_syndesc * as, char * rock); +static int StatsQuery(struct cmd_syndesc * as, char * rock); + + +static void print_vol_stats_general(VolPkgStats * stats); +static void print_vol_stats_viceP(struct DiskPartitionStats * stats); +static void print_vol_stats_hash(struct VolumeHashChainStats * stats); +#ifdef AFS_DEMAND_ATTACH_FS +static void print_vol_stats_hdr(struct volume_hdr_LRU_stats * stats); +#endif + +#ifndef AFS_NT40_ENV +#include "AFS_component_version_number.c" +#endif +#define MAX_ARGS 128 + +#define COMMON_PARMS_OFFSET 12 +#define COMMON_PARMS(ts) \ + cmd_Seek(ts, COMMON_PARMS_OFFSET); \ + cmd_AddParm(ts, "-reason", CMD_SINGLE, CMD_OPTIONAL, "sync protocol reason code"); \ + cmd_AddParm(ts, "-programtype", CMD_SINGLE, CMD_OPTIONAL, "program type code") + +#define COMMON_VOLOP_PARMS_OFFSET 10 +#define COMMON_VOLOP_PARMS(ts) \ + cmd_Seek(ts, COMMON_VOLOP_PARMS_OFFSET); \ + cmd_AddParm(ts, "-volumeid", CMD_SINGLE, 0, "volume id"); \ + cmd_AddParm(ts, "-partition", CMD_SINGLE, CMD_OPTIONAL, "partition name") + +#define CUSTOM_PARMS_OFFSET 1 + + +#define VOLOP_PARMS_DECL(ts) \ + COMMON_VOLOP_PARMS(ts); \ + COMMON_PARMS(ts) +#define COMMON_PARMS_DECL(ts) \ + COMMON_PARMS(ts) + +int +main(int argc, char **argv) +{ + struct cmd_syndesc *ts; + int err = 0; + int i; + extern char cml_version_number[]; + + /* Initialize directory paths */ + if (!(initAFSDirPath() & AFSDIR_SERVER_PATHS_OK)) { +#ifdef AFS_NT40_ENV + ReportErrorEventAlt(AFSEVT_SVR_NO_INSTALL_DIR, 0, argv[0], 0); +#endif + fprintf(stderr, "%s: Unable to obtain AFS server directory.\n", + argv[0]); + exit(2); + } + + + ts = cmd_CreateSyntax("online", VolOnline, 0, "bring a volume online (FSYNC_VOL_ON opcode)"); + VOLOP_PARMS_DECL(ts); + + ts = cmd_CreateSyntax("offline", VolOffline, 0, "take a volume offline (FSYNC_VOL_OFF opcode)"); + VOLOP_PARMS_DECL(ts); + + ts = cmd_CreateSyntax("mode", VolMode, 0, "change volume attach mode (FSYNC_VOL_NEEDVOLUME opcode)"); + 
VOLOP_PARMS_DECL(ts); + cmd_CreateAlias(ts, "needvolume"); + + ts = cmd_CreateSyntax("detach", VolDetach, 0, "detach a volume (FSYNC_VOL_DONE opcode)"); + VOLOP_PARMS_DECL(ts); + + ts = cmd_CreateSyntax("callback", VolBreakCBKs, 0, "break callbacks for volume (FSYNC_VOL_BREAKCBKS opcode)"); + VOLOP_PARMS_DECL(ts); + cmd_CreateAlias(ts, "cbk"); + + ts = cmd_CreateSyntax("move", VolMove, 0, "set volume moved flag (FSYNC_VOL_MOVE opcode)"); + VOLOP_PARMS_DECL(ts); + + ts = cmd_CreateSyntax("list", VolList, 0, "sync local volume list (FSYNC_VOL_LISTVOLUMES opcode)"); + VOLOP_PARMS_DECL(ts); + cmd_CreateAlias(ts, "ls"); + + ts = cmd_CreateSyntax("query", VolQuery, 0, "get volume structure (FSYNC_VOL_QUERY opcode)"); + VOLOP_PARMS_DECL(ts); + cmd_CreateAlias(ts, "qry"); + + ts = cmd_CreateSyntax("header", VolHdrQuery, 0, "get volume disk data structure (FSYNC_VOL_QUERY_HDR opcode)"); + VOLOP_PARMS_DECL(ts); + cmd_CreateAlias(ts, "hdr"); + + ts = cmd_CreateSyntax("volop", VolOpQuery, 0, "get pending volume operation info (FSYNC_VOL_QUERY_VOP opcode)"); + VOLOP_PARMS_DECL(ts); + cmd_CreateAlias(ts, "vop"); + + ts = cmd_CreateSyntax("stats", StatsQuery, 0, "see 'stats help' for more information"); + cmd_Seek(ts, CUSTOM_PARMS_OFFSET); + cmd_AddParm(ts, "-cmd", CMD_SINGLE, 0, "subcommand"); + cmd_AddParm(ts, "-arg1", CMD_SINGLE, CMD_OPTIONAL, "arg1"); + cmd_AddParm(ts, "-arg2", CMD_SINGLE, CMD_OPTIONAL, "arg2"); + COMMON_PARMS_DECL(ts); + + err = cmd_Dispatch(argc, argv); + exit(err); +} + +static int +common_prolog(struct cmd_syndesc * as, struct state * state) +{ + register struct cmd_item *ti; + +#ifdef AFS_NT40_ENV + if (afs_winsockInit() < 0) { + Exit(1); + } +#endif + + VInitVolumePackage(debugUtility, 1, 1, + DONT_CONNECT_FS, 0); + DInit(1); + + if ((ti = as->parms[COMMON_PARMS_OFFSET].items)) { /* -reason */ + state->reason = atoi(ti->data); + } + if ((ti = as->parms[COMMON_PARMS_OFFSET+1].items)) { /* -programtype */ + if (!strcmp(ti->data, "fileServer")) { + 
programType = fileServer; + } else if (!strcmp(ti->data, "volumeUtility")) { + programType = volumeUtility; + } else if (!strcmp(ti->data, "salvager")) { + programType = salvager; + } else if (!strcmp(ti->data, "salvageServer")) { + programType = salvageServer; + } else { + programType = (ProgramType) atoi(ti->data); + } + } + + VConnectFS(); + + return 0; +} + +static int +common_volop_prolog(struct cmd_syndesc * as, struct state * state) +{ + register struct cmd_item *ti; + char pname[100], *temp; + + state->vop = (struct volop_state *) calloc(1, sizeof(struct volop_state)); + assert(state->vop != NULL); + + if ((ti = as->parms[COMMON_VOLOP_PARMS_OFFSET].items)) { /* -volumeid */ + state->vop->volume = atoi(ti->data); + } else { + fprintf(stderr, "required argument -volumeid not given\n"); + } + + if ((ti = as->parms[COMMON_VOLOP_PARMS_OFFSET+1].items)) { /* -partition */ + strlcpy(state->vop->partName, ti->data, sizeof(state->vop->partName)); + } else { + memset(state->vop->partName, 0, sizeof(state->vop->partName)); + } + + return 0; +} + +static int +do_volop(struct state * state, afs_int32 command, SYNC_response * res) +{ + afs_int32 code; + SYNC_PROTO_BUF_DECL(res_buf); + SYNC_response res_l; + + if (!res) { + res = &res_l; + res->payload.len = SYNC_PROTO_MAX_LEN; + res->payload.buf = res_buf; + } + + fprintf(stderr, "calling FSYNC_VolOp with command code %d (%s)\n", + command, command_code_to_string(command)); + + code = FSYNC_VolOp(state->vop->volume, + state->vop->partName, + command, + state->reason, + res); + + switch (code) { + case SYNC_OK: + case SYNC_DENIED: + break; + default: + fprintf(stderr, "possible sync protocol error. 
return code was %d\n", code); + } + + fprintf(stderr, "FSYNC_VolOp returned %d (%s)\n", code, response_code_to_string(code)); + fprintf(stderr, "protocol response code was %d (%s)\n", + res->hdr.response, response_code_to_string(res->hdr.response)); + fprintf(stderr, "protocol reason code was %d (%s)\n", + res->hdr.reason, reason_code_to_string(res->hdr.reason)); + + VDisconnectFS(); +} + +static char * +response_code_to_string(afs_int32 response) +{ + switch (response) { + case SYNC_OK: + return "SYNC_OK"; + case SYNC_DENIED: + return "SYNC_DENIED"; + case SYNC_COM_ERROR: + return "SYNC_COM_ERROR"; + case SYNC_BAD_COMMAND: + return "SYNC_BAD_COMMAND"; + case SYNC_FAILED: + return "SYNC_FAILED"; + default: + return "**UNKNOWN**"; + } +} + +static char * +command_code_to_string(afs_int32 command) +{ + switch (command) { + case SYNC_COM_CHANNEL_CLOSE: + return "SYNC_COM_CHANNEL_CLOSE"; + case FSYNC_VOL_ON: + return "FSYNC_VOL_ON"; + case FSYNC_VOL_OFF: + return "FSYNC_VOL_OFF"; + case FSYNC_VOL_LISTVOLUMES: + return "FSYNC_VOL_LISTVOLUMES"; + case FSYNC_VOL_NEEDVOLUME: + return "FSYNC_VOL_NEEDVOLUME"; + case FSYNC_VOL_MOVE: + return "FSYNC_VOL_MOVE"; + case FSYNC_VOL_BREAKCBKS: + return "FSYNC_VOL_BREAKCBKS"; + case FSYNC_VOL_DONE: + return "FSYNC_VOL_DONE"; + case FSYNC_VOL_QUERY: + return "FSYNC_VOL_QUERY"; + case FSYNC_VOL_QUERY_HDR: + return "FSYNC_VOL_QUERY_HDR"; + case FSYNC_VOL_QUERY_VOP: + return "FSYNC_VOL_QUERY_VOP"; + case FSYNC_VOL_STATS_GENERAL: + return "FSYNC_VOL_STATS_GENERAL"; + case FSYNC_VOL_STATS_VICEP: + return "FSYNC_VOL_STATS_VICEP"; + case FSYNC_VOL_STATS_HASH: + return "FSYNC_VOL_STATS_HASH"; + case FSYNC_VOL_STATS_HDR: + return "FSYNC_VOL_STATS_HDR"; + case FSYNC_VOL_STATS_VLRU: + return "FSYNC_VOL_STATS_VLRU"; + default: + return "**UNKNOWN**"; + } +} + +static char * +reason_code_to_string(afs_int32 reason) +{ + switch (reason) { + case SYNC_REASON_NONE: + return "SYNC_REASON_NONE"; + case SYNC_REASON_MALFORMED_PACKET: + return 
"SYNC_REASON_MALFORMED_PACKET"; + case FSYNC_WHATEVER: + return "FSYNC_WHATEVER"; + case FSYNC_SALVAGE: + return "FSYNC_SALVAGE"; + case FSYNC_MOVE: + return "FSYNC_MOVE"; + case FSYNC_OPERATOR: + return "FSYNC_OPERATOR"; + case FSYNC_EXCLUSIVE: + return "FSYNC_EXCLUSIVE"; + case FSYNC_UNKNOWN_VOLID: + return "FSYNC_UNKNOWN_VOLID"; + case FSYNC_HDR_NOT_ATTACHED: + return "FSYNC_HDR_NOT_ATTACHED"; + case FSYNC_NO_PENDING_VOL_OP: + return "FSYNC_NO_PENDING_VOL_OP"; + case FSYNC_VOL_PKG_ERROR: + return "FSYNC_VOL_PKG_ERROR"; + default: + return "**UNKNOWN**"; + } +} + +static char * +program_type_to_string(afs_int32 type) +{ + switch ((ProgramType)type) { + case fileServer: + return "fileServer"; + case volumeUtility: + return "volumeUtility"; + case salvager: + return "salvager"; + case salvageServer: + return "salvageServer"; + case debugUtility: + return "debugUtility"; + default: + return "**UNKNOWN**"; + } +} + +static int +VolOnline(struct cmd_syndesc * as, char * rock) +{ + struct state state; + + common_prolog(as, &state); + common_volop_prolog(as, &state); + + do_volop(&state, FSYNC_VOL_ON, NULL); + + return 0; +} + +static int +VolOffline(struct cmd_syndesc * as, char * rock) +{ + struct state state; + + common_prolog(as, &state); + common_volop_prolog(as, &state); + + do_volop(&state, FSYNC_VOL_OFF, NULL); + + return 0; +} + +static int +VolMode(struct cmd_syndesc * as, char * rock) +{ + struct state state; + + common_prolog(as, &state); + common_volop_prolog(as, &state); + + do_volop(&state, FSYNC_VOL_NEEDVOLUME, NULL); + + return 0; +} + +static int +VolDetach(struct cmd_syndesc * as, char * rock) +{ + struct state state; + + common_prolog(as, &state); + common_volop_prolog(as, &state); + + do_volop(&state, FSYNC_VOL_DONE, NULL); + + return 0; +} + +static int +VolBreakCBKs(struct cmd_syndesc * as, char * rock) +{ + struct state state; + + common_prolog(as, &state); + common_volop_prolog(as, &state); + + do_volop(&state, FSYNC_VOL_BREAKCBKS, NULL); + + 
return 0; +} + +static int +VolMove(struct cmd_syndesc * as, char * rock) +{ + struct state state; + + common_prolog(as, &state); + common_volop_prolog(as, &state); + + do_volop(&state, FSYNC_VOL_MOVE, NULL); + + return 0; +} + +static int +VolList(struct cmd_syndesc * as, char * rock) +{ + struct state state; + + common_prolog(as, &state); + common_volop_prolog(as, &state); + + do_volop(&state, FSYNC_VOL_LISTVOLUMES, NULL); + + return 0; +} + +#ifdef AFS_DEMAND_ATTACH_FS +static char * +vol_state_to_string(VolState state) +{ + switch (state) { + case VOL_STATE_UNATTACHED: + return "VOL_STATE_UNATTACHED"; + case VOL_STATE_PREATTACHED: + return "VOL_STATE_PREATTACHED"; + case VOL_STATE_ATTACHING: + return "VOL_STATE_ATTACHING"; + case VOL_STATE_ATTACHED: + return "VOL_STATE_ATTACHED"; + case VOL_STATE_UPDATING: + return "VOL_STATE_UPDATING"; + case VOL_STATE_GET_BITMAP: + return "VOL_STATE_GET_BITMAP"; + case VOL_STATE_HDR_LOADING: + return "VOL_STATE_HDR_LOADING"; + case VOL_STATE_HDR_ATTACHING: + return "VOL_STATE_HDR_ATTACHING"; + case VOL_STATE_SHUTTING_DOWN: + return "VOL_STATE_SHUTTING_DOWN"; + case VOL_STATE_GOING_OFFLINE: + return "VOL_STATE_GOING_OFFLINE"; + case VOL_STATE_OFFLINING: + return "VOL_STATE_OFFLINING"; + case VOL_STATE_DETACHING: + return "VOL_STATE_DETACHING"; + case VOL_STATE_SALVSYNC_REQ: + return "VOL_STATE_SALVSYNC_REQ"; + case VOL_STATE_SALVAGING: + return "VOL_STATE_SALVAGING"; + case VOL_STATE_ERROR: + return "VOL_STATE_ERROR"; + case VOL_STATE_FREED: + return "VOL_STATE_FREED"; + default: + return "**UNKNOWN**"; + } +} + +static char * +vol_flags_to_string(afs_uint16 flags) +{ + static char str[128]; + int count = 0; + str[0]='\0'; + + if (flags & VOL_HDR_ATTACHED) { + strlcat(str, "VOL_HDR_ATTACHED", sizeof(str)); + count++; + } + + if (flags & VOL_HDR_LOADED) { + if (count) { + strlcat(str, " | ", sizeof(str)); + } + strlcat(str, "VOL_HDR_LOADED", sizeof(str)); + count++; + } + + if (flags & VOL_HDR_IN_LRU) { + if (count) { + 
strlcat(str, " | ", sizeof(str)); + } + strlcat(str, "VOL_HDR_IN_LRU", sizeof(str)); + count++; + } + + if (flags & VOL_IN_HASH) { + if (count) { + strlcat(str, " | ", sizeof(str)); + } + strlcat(str, "VOL_IN_HASH", sizeof(str)); + count++; + } + + if (flags & VOL_ON_VBYP_LIST) { + if (count) { + strlcat(str, " | ", sizeof(str)); + } + strlcat(str, "VOL_ON_VBYP_LIST", sizeof(str)); + count++; + } + + if (flags & VOL_IS_BUSY) { + if (count) { + strlcat(str, " | ", sizeof(str)); + } + strlcat(str, "VOL_IS_BUSY", sizeof(str)); + count++; + } + + if (flags & VOL_ON_VLRU) { + if (count) { + strlcat(str, " | ", sizeof(str)); + } + strlcat(str, "VOL_ON_VLRU", sizeof(str)); + } + + if (flags & VOL_HDR_DONTSALV) { + if (count) { + strlcat(str, " | ", sizeof(str)); + } + strlcat(str, "VOL_HDR_DONTSALV", sizeof(str)); + } + + return str; +} + +static char * +vlru_idx_to_string(int idx) +{ + switch (idx) { + case VLRU_QUEUE_NEW: + return "VLRU_QUEUE_NEW"; + case VLRU_QUEUE_MID: + return "VLRU_QUEUE_MID"; + case VLRU_QUEUE_OLD: + return "VLRU_QUEUE_OLD"; + case VLRU_QUEUE_CANDIDATE: + return "VLRU_QUEUE_CANDIDATE"; + case VLRU_QUEUE_HELD: + return "VLRU_QUEUE_HELD"; + case VLRU_QUEUE_INVALID: + return "VLRU_QUEUE_INVALID"; + default: + return "**UNKNOWN**"; + } +} +#endif + +static int +VolQuery(struct cmd_syndesc * as, char * rock) +{ + struct state state; + SYNC_PROTO_BUF_DECL(res_buf); + SYNC_response res; + Volume v; + int hi, lo; + + res.hdr.response_len = sizeof(res.hdr); + res.payload.buf = res_buf; + res.payload.len = SYNC_PROTO_MAX_LEN; + + common_prolog(as, &state); + common_volop_prolog(as, &state); + + do_volop(&state, FSYNC_VOL_QUERY, &res); + + if (res.hdr.response == SYNC_OK) { + memcpy(&v, res.payload.buf, sizeof(Volume)); + + printf("volume = {\n"); + printf("\thashid = %u\n", v.hashid); + printf("\theader = 0x%x\n", v.header); + printf("\tdevice = %d\n", v.device); + printf("\tpartition = 0x%x\n", v.partition); + printf("\tlinkHandle = 0x%x\n", v.linkHandle); 
+ printf("\tnextVnodeUnique = %u\n", v.nextVnodeUnique); + printf("\tdiskDataHandle = 0x%x\n", v.diskDataHandle); + printf("\tvnodeHashOffset = %u\n", v.vnodeHashOffset); + printf("\tshuttingDown = %d\n", v.shuttingDown); + printf("\tgoingOffline = %d\n", v.goingOffline); + printf("\tcacheCheck = %u\n", v.cacheCheck); + printf("\tnUsers = %d\n", v.nUsers); + printf("\tneedsPutBack = %d\n", v.needsPutBack); + printf("\tspecialStatus = %d\n", v.specialStatus); + printf("\tupdateTime = %u\n", v.updateTime); + + printf("\tvnodeIndex[vSmall] = {\n"); + printf("\t\thandle = 0x%x\n", v.vnodeIndex[vSmall].handle); + printf("\t\tbitmap = 0x%x\n", v.vnodeIndex[vSmall].bitmap); + printf("\t\tbitmapSize = %u\n", v.vnodeIndex[vSmall].bitmapSize); + printf("\t\tbitmapOffset = %u\n", v.vnodeIndex[vSmall].bitmapOffset); + printf("\t}\n"); + printf("\tvnodeIndex[vLarge] = {\n"); + printf("\t\thandle = 0x%x\n", v.vnodeIndex[vLarge].handle); + printf("\t\tbitmap = 0x%x\n", v.vnodeIndex[vLarge].bitmap); + printf("\t\tbitmapSize = %u\n", v.vnodeIndex[vLarge].bitmapSize); + printf("\t\tbitmapOffset = %u\n", v.vnodeIndex[vLarge].bitmapOffset); + printf("\t}\n"); +#ifdef AFS_DEMAND_ATTACH_FS + if (res.hdr.flags & SYNC_FLAG_DAFS_EXTENSIONS) { + printf("\tupdateTime = %u\n", v.updateTime); + printf("\tattach_state = %s\n", vol_state_to_string(v.attach_state)); + printf("\tattach_flags = %s\n", vol_flags_to_string(v.attach_flags)); + printf("\tnWaiters = %d\n", v.nWaiters); + printf("\tchainCacheCheck = %d\n", v.chainCacheCheck); + + /* online salvage structure */ + printf("\tsalvage = {\n"); + printf("\t\tprio = %u\n", v.salvage.prio); + printf("\t\treason = %d\n", v.salvage.reason); + printf("\t\trequested = %d\n", v.salvage.requested); + printf("\t\tscheduled = %d\n", v.salvage.scheduled); + printf("\t}\n"); + + /* statistics structure */ + printf("\tstats = {\n"); + + printf("\t\thash_lookups = {\n"); + SplitInt64(v.stats.hash_lookups,hi,lo); + printf("\t\t\thi = %u\n", hi); + 
printf("\t\t\tlo = %u\n", lo); + printf("\t\t}\n"); + + printf("\t\thash_short_circuits = {\n"); + SplitInt64(v.stats.hash_short_circuits,hi,lo); + printf("\t\t\thi = %u\n", hi); + printf("\t\t\tlo = %u\n", lo); + printf("\t\t}\n"); + + printf("\t\thdr_loads = {\n"); + SplitInt64(v.stats.hdr_loads,hi,lo); + printf("\t\t\thi = %u\n", hi); + printf("\t\t\tlo = %u\n", lo); + printf("\t\t}\n"); + + printf("\t\thdr_gets = {\n"); + SplitInt64(v.stats.hdr_gets,hi,lo); + printf("\t\t\thi = %u\n", hi); + printf("\t\t\tlo = %u\n", lo); + printf("\t\t}\n"); + + printf("\t\tattaches = %u\n", v.stats.attaches); + printf("\t\tsoft_detaches = %u\n", v.stats.soft_detaches); + printf("\t\tsalvages = %u\n", v.stats.salvages); + printf("\t\tvol_ops = %u\n", v.stats.vol_ops); + + printf("\t\tlast_attach = %u\n", v.stats.last_attach); + printf("\t\tlast_get = %u\n", v.stats.last_get); + printf("\t\tlast_promote = %u\n", v.stats.last_promote); + printf("\t\tlast_hdr_get = %u\n", v.stats.last_hdr_get); + printf("\t\tlast_salvage = %u\n", v.stats.last_salvage); + printf("\t\tlast_salvage_req = %u\n", v.stats.last_salvage_req); + printf("\t\tlast_vol_op = %u\n", v.stats.last_vol_op); + printf("\t}\n"); + + /* VLRU state */ + printf("\tvlru = {\n"); + printf("\t\tidx = %d (%s)\n", + v.vlru.idx, vlru_idx_to_string(v.vlru.idx)); + printf("\t}\n"); + + /* volume op state */ + printf("\tpending_vol_op = 0x%x\n", v.pending_vol_op); + } +#else /* !AFS_DEMAND_ATTACH_FS */ + if (res.hdr.flags & SYNC_FLAG_DAFS_EXTENSIONS) { + printf("*** server asserted demand attach extensions. fssync-debug not built to\n"); + printf("*** recognize those extensions. 
please recompile fssync-debug if you need\n"); + printf("*** to dump dafs extended state\n"); + } +#endif /* !AFS_DEMAND_ATTACH_FS */ + printf("}\n"); + } + + return 0; +} + +static int +VolHdrQuery(struct cmd_syndesc * as, char * rock) +{ + struct state state; + SYNC_PROTO_BUF_DECL(res_buf); + SYNC_response res; + VolumeDiskData v; + int i; + + res.hdr.response_len = sizeof(res.hdr); + res.payload.buf = res_buf; + res.payload.len = SYNC_PROTO_MAX_LEN; + + common_prolog(as, &state); + common_volop_prolog(as, &state); + + do_volop(&state, FSYNC_VOL_QUERY_HDR, &res); + + if (res.hdr.response == SYNC_OK) { + memcpy(&v, res.payload.buf, sizeof(VolumeDiskData)); + + printf("VolumeDiskData = {\n"); + printf("\tstamp = {\n"); + printf("\t\tmagic = 0x%x\n", v.stamp.magic); + printf("\t\tversion = %u\n", v.stamp.version); + printf("\t}\n"); + + printf("\tid = %u\n", v.id); + printf("\tname = '%s'\n", v.name); + printf("\tinUse = %d\n", v.inUse); + printf("\tinService = %d\n", v.inService); + printf("\tblessed = %d\n", v.blessed); + printf("\tneedsSalvaged = %d\n", v.needsSalvaged); + printf("\tuniquifier = %u\n", v.uniquifier); + printf("\ttype = %d\n", v.type); + printf("\tparentId = %u\n", v.parentId); + printf("\tcloneId = %u\n", v.cloneId); + printf("\tbackupId = %u\n", v.backupId); + printf("\trestoredFromId = %u\n", v.restoredFromId); + printf("\tneedsCallback = %d\n", v.needsCallback); + printf("\tdestroyMe = %d\n", v.destroyMe); + printf("\tdontSalvage = %d\n", v.dontSalvage); + printf("\tmaxquota = %d\n", v.maxquota); + printf("\tminquota = %d\n", v.minquota); + printf("\tmaxfiles = %d\n", v.maxfiles); + printf("\taccountNumber = %u\n", v.accountNumber); + printf("\towner = %u\n", v.owner); + printf("\tfilecount = %d\n", v.filecount); + printf("\tdiskused = %d\n", v.diskused); + printf("\tdayUse = %d\n", v.dayUse); + for (i = 0; i < 7; i++) { + printf("\tweekUse[%d] = %d\n", i, v.weekUse[i]); + } + printf("\tdayUseDate = %u\n", v.dayUseDate); + 
printf("\tcreationDate = %u\n", v.creationDate); + printf("\taccessDate = %u\n", v.accessDate); + printf("\tupdateDate = %u\n", v.updateDate); + printf("\texpirationDate = %u\n", v.expirationDate); + printf("\tbackupDate = %u\n", v.backupDate); + printf("\tcopyDate = %u\n", v.copyDate); +#ifdef OPENAFS_VOL_STATS + printf("\tstat_initialized = %d\n", v.stat_initialized); +#else + printf("\tmtd = '%s'\n", v.motd); +#endif + printf("}\n"); + } + + return 0; +} + +static int +VolOpQuery(struct cmd_syndesc * as, char * rock) +{ + struct state state; + SYNC_PROTO_BUF_DECL(res_buf); + SYNC_response res; + FSSYNC_VolOp_info vop; + int i; + + res.hdr.response_len = sizeof(res.hdr); + res.payload.buf = res_buf; + res.payload.len = SYNC_PROTO_MAX_LEN; + + common_prolog(as, &state); + common_volop_prolog(as, &state); + + do_volop(&state, FSYNC_VOL_QUERY_VOP, &res); + + if (!(res.hdr.flags & SYNC_FLAG_DAFS_EXTENSIONS)) { + printf("*** file server not compiled with demand attach extensions.\n"); + printf("*** pending volume operation metadata not available.\n"); + } + + if (res.hdr.response == SYNC_OK) { + memcpy(&vop, res.payload.buf, sizeof(FSSYNC_VolOp_info)); + + printf("pending_vol_op = {\n"); + + printf("\tcom = {\n"); + printf("\t\tproto_version = %u\n", vop.com.proto_version); + printf("\t\tprogramType = %d (%s)\n", + vop.com.programType, program_type_to_string(vop.com.programType)); + printf("\t\tcommand = %d (%s)\n", + vop.com.command, command_code_to_string(vop.com.command)); + printf("\t\treason = %d (%s)\n", + vop.com.reason, reason_code_to_string(vop.com.reason)); + printf("\t\tcommand_len = %u\n", vop.com.command_len); + printf("\t\tflags = 0x%x\n", vop.com.flags); + printf("\t}\n"); + + printf("\tvop = {\n"); + printf("\t\tvolume = %u\n", vop.vop.volume); + if (afs_strnlen(vop.vop.partName, sizeof(vop.vop.partName)) < + sizeof(vop.vop.partName)) { + printf("\t\tpartName = '%s'\n", vop.vop.partName); + } else { + printf("\t\tpartName = (illegal string)\n"); + } + 
printf("\t}\n"); + + printf("}\n"); + } + + return 0; +} + +static int +StatsQuery(struct cmd_syndesc * as, char * rock) +{ + afs_int32 code; + int command; + struct cmd_item *ti; + struct state state; + SYNC_PROTO_BUF_DECL(res_buf); + SYNC_response res; + FSSYNC_StatsOp_hdr scom; + union { + void * ptr; + struct VolPkgStats * vol_stats; + struct VolumeHashChainStats * hash_stats; +#ifdef AFS_DEMAND_ATTACH_FS + struct volume_hdr_LRU_stats * hdr_stats; +#endif + struct DiskPartitionStats * vicep_stats; + } sres; + + sres.ptr = res_buf; + res.hdr.response_len = sizeof(res.hdr); + res.payload.buf = res_buf; + res.payload.len = SYNC_PROTO_MAX_LEN; + + if ((ti = as->parms[CUSTOM_PARMS_OFFSET].items)) { /* -subcommand */ + if (!strcasecmp(ti->data, "vicep")) { + command = FSYNC_VOL_STATS_VICEP; + } else if (!strcasecmp(ti->data, "hash")) { + command = FSYNC_VOL_STATS_HASH; +#ifdef AFS_DEMAND_ATTACH_FS + } else if (!strcasecmp(ti->data, "hdr")) { + command = FSYNC_VOL_STATS_HDR; + } else if (!strcasecmp(ti->data, "vlru")) { + command = FSYNC_VOL_STATS_VLRU; +#endif + } else if (!strcasecmp(ti->data, "pkg")) { + command = FSYNC_VOL_STATS_GENERAL; + } else if (!strcasecmp(ti->data, "help")) { + fprintf(stderr, "fssync-debug stats subcommands:\n"); + fprintf(stderr, "\tpkg\tgeneral volume package stats\n"); + fprintf(stderr, "\tvicep\tvice partition stats\n"); + fprintf(stderr, "\thash\tvolume hash chain stats\n"); +#ifdef AFS_DEMAND_ATTACH_FS + fprintf(stderr, "\thdr\tvolume header cache stats\n"); + fprintf(stderr, "\tvlru\tvlru generation stats\n"); +#endif + exit(0); + } else { + fprintf(stderr, "invalid stats subcommand"); + exit(1); + } + } else { + command = FSYNC_VOL_STATS_GENERAL; + } + + if ((ti = as->parms[CUSTOM_PARMS_OFFSET+1].items)) { /* -arg1 */ + switch (command) { + case FSYNC_VOL_STATS_VICEP: + strlcpy(scom.args.partName, ti->data, sizeof(state.vop->partName)); + break; + case FSYNC_VOL_STATS_HASH: + scom.args.hash_bucket = atoi(ti->data); + break; + case 
FSYNC_VOL_STATS_VLRU: + scom.args.vlru_generation = atoi(ti->data); + break; + default: + fprintf(stderr, "unrecognized arguments\n"); + exit(1); + } + } else { + switch (command) { + case FSYNC_VOL_STATS_VICEP: + case FSYNC_VOL_STATS_HASH: + case FSYNC_VOL_STATS_VLRU: + fprintf(stderr, "this subcommand requires more parameters\n"); + exit(1); + } + } + + common_prolog(as, &state); + + fprintf(stderr, "calling FSYNC_askfs with command code %d (%s)\n", + command, command_code_to_string(command)); + + code = FSYNC_StatsOp(&scom, command, FSYNC_WHATEVER, &res); + + switch (code) { + case SYNC_OK: + case SYNC_DENIED: + break; + default: + fprintf(stderr, "possible sync protocol error. return code was %d\n", code); + } + + fprintf(stderr, "FSYNC_VolOp returned %d (%s)\n", code, response_code_to_string(code)); + fprintf(stderr, "protocol response code was %d (%s)\n", + res.hdr.response, response_code_to_string(res.hdr.response)); + fprintf(stderr, "protocol reason code was %d (%s)\n", + res.hdr.reason, reason_code_to_string(res.hdr.reason)); + + VDisconnectFS(); + + if (res.hdr.response == SYNC_OK) { + switch (command) { + case FSYNC_VOL_STATS_GENERAL: + print_vol_stats_general(sres.vol_stats); + break; + case FSYNC_VOL_STATS_VICEP: + print_vol_stats_viceP(sres.vicep_stats); + break; + case FSYNC_VOL_STATS_HASH: + print_vol_stats_hash(sres.hash_stats); + break; +#ifdef AFS_DEMAND_ATTACH_FS + case FSYNC_VOL_STATS_HDR: + print_vol_stats_hdr(sres.hdr_stats); + break; +#endif /* AFS_DEMAND_ATTACH_FS */ + } + } + + return 0; +} + +static void +print_vol_stats_general(VolPkgStats * stats) +{ + int i; + afs_uint32 hi, lo; + + printf("VolPkgStats = {\n"); +#ifdef AFS_DEMAND_ATTACH_FS + for (i = 0; i < VOL_STATE_COUNT; i++) { + printf("\tvol_state_count[%s] = %d\n", + vol_state_to_string(i), + stats->state_levels[i]); + } + + SplitInt64(stats->hash_looks, hi, lo); + printf("\thash_looks = {\n"); + printf("\t\thi = %u\n", hi); + printf("\t\tlo = %u\n", lo); + printf("\t}\n"); + + 
SplitInt64(stats->hash_reorders, hi, lo); + printf("\thash_reorders = {\n"); + printf("\t\thi = %u\n", hi); + printf("\t\tlo = %u\n", lo); + printf("\t}\n"); + + SplitInt64(stats->salvages, hi, lo); + printf("\tsalvages = {\n"); + printf("\t\thi = %u\n", hi); + printf("\t\tlo = %u\n", lo); + printf("\t}\n"); + + SplitInt64(stats->vol_ops, hi, lo); + printf("\tvol_ops = {\n"); + printf("\t\thi = %u\n", hi); + printf("\t\tlo = %u\n", lo); + printf("\t}\n"); +#endif + SplitInt64(stats->hdr_loads, hi, lo); + printf("\thdr_loads = {\n"); + printf("\t\thi = %u\n", hi); + printf("\t\tlo = %u\n", lo); + printf("\t}\n"); + + SplitInt64(stats->hdr_gets, hi, lo); + printf("\thdr_gets = {\n"); + printf("\t\thi = %u\n", hi); + printf("\t\tlo = %u\n", lo); + printf("\t}\n"); + + SplitInt64(stats->attaches, hi, lo); + printf("\tattaches = {\n"); + printf("\t\thi = %u\n", hi); + printf("\t\tlo = %u\n", lo); + printf("\t}\n"); + + SplitInt64(stats->soft_detaches, hi, lo); + printf("\tsoft_detaches = {\n"); + printf("\t\thi = %u\n", hi); + printf("\t\tlo = %u\n", lo); + printf("\t}\n"); + + printf("\thdr_cache_size = %d\n", stats->hdr_cache_size); + + printf("}\n"); +} + +static void +print_vol_stats_viceP(struct DiskPartitionStats * stats) +{ + printf("DiskPartitionStats = {\n"); + printf("\tfree = %d\n", stats->free); + printf("\tminFree = %d\n", stats->minFree); + printf("\ttotalUsable = %d\n", stats->totalUsable); + printf("\tf_files = %d\n", stats->f_files); +#ifdef AFS_DEMAND_ATTACH_FS + printf("\tvol_list_len = %d\n", stats->vol_list_len); +#endif + printf("}\n"); +} + +static void +print_vol_stats_hash(struct VolumeHashChainStats * stats) +{ + afs_uint32 hi, lo; + + printf("DiskPartitionStats = {\n"); + printf("\ttable_size = %d\n", stats->table_size); + printf("\tchain_len = %d\n", stats->chain_len); + +#ifdef AFS_DEMAND_ATTACH_FS + printf("\tchain_cacheCheck = %d\n", stats->chain_cacheCheck); + printf("\tchain_busy = %d\n", stats->chain_busy); + + 
SplitInt64(stats->chain_looks, hi, lo); + printf("\tchain_looks = {\n"); + printf("\t\thi = %u\n", hi); + printf("\t\tlo = %u\n", lo); + printf("\t}\n"); + + SplitInt64(stats->chain_gets, hi, lo); + printf("\tchain_gets = {\n"); + printf("\t\thi = %u\n", hi); + printf("\t\tlo = %u\n", lo); + printf("\t}\n"); + + SplitInt64(stats->chain_reorders, hi, lo); + printf("\tchain_reorders = {\n"); + printf("\t\thi = %u\n", hi); + printf("\t\tlo = %u\n", lo); + printf("\t}\n"); +#endif /* AFS_DEMAND_ATTACH_FS */ + + printf("}\n"); +} + + +#ifdef AFS_DEMAND_ATTACH_FS +static void +print_vol_stats_hdr(struct volume_hdr_LRU_stats * stats) +{ + printf("volume_hdr_LRU_stats = {\n"); + printf("\tfree = %d\n", stats->free); + printf("\tused = %d\n", stats->used); + printf("\tattached = %d\n", stats->attached); + printf("}\n"); +} +#endif /* AFS_DEMAND_ATTACH_FS */ + diff --git a/src/vol/fssync-server.c b/src/vol/fssync-server.c new file mode 100644 index 0000000000..44494ca739 --- /dev/null +++ b/src/vol/fssync-server.c @@ -0,0 +1,1179 @@ +/* + * Copyright 2000, International Business Machines Corporation and others. + * All Rights Reserved. + * + * This software has been released under the terms of the IBM Public + * License. 
For details, see the LICENSE file in the top-level source + * directory or online at http://www.openafs.org/dl/license10.html + * + * Portions Copyright (c) 2006 Sine Nomine Associates + */ + +/* + System: VICE-TWO + Module: fssync.c + Institution: The Information Technology Center, Carnegie-Mellon University + + */ +#ifdef notdef + +/* All this is going away in early 1989 */ +int newVLDB; /* Compatibility flag */ + +#endif +static int newVLDB = 1; + + +#ifndef AFS_PTHREAD_ENV +#define USUAL_PRIORITY (LWP_MAX_PRIORITY - 2) + +/* + * stack size increased from 8K because the HP machine seemed to have trouble + * with the smaller stack + */ +#define USUAL_STACK_SIZE (24 * 1024) +#endif /* !AFS_PTHREAD_ENV */ + +/* + fssync-server.c + File server synchronization with external volume utilities. + server-side implementation + */ + +/* This controls the size of an fd_set; it must be defined early before + * the system headers define that type and the macros that operate on it. + * Its value should be as large as the maximum file descriptor limit we + * are likely to run into on any platform. 
Right now, that is 65536 + * which is the default hard fd limit on Solaris 9 */ +#ifndef _WIN32 +#define FD_SETSIZE 65536 +#endif + +#include +#include + +RCSID + ("$Header$"); + +#include +#include +#ifdef AFS_NT40_ENV +#include +#include +#else +#include +#include +#include +#include +#include +#endif +#include +#ifdef AFS_PTHREAD_ENV +#include +#else /* AFS_PTHREAD_ENV */ +#include +#endif /* AFS_PTHREAD_ENV */ +#include + +#ifdef HAVE_STRING_H +#include +#else +#ifdef HAVE_STRINGS_H +#include +#endif +#endif + + +#include +#include +#include "nfs.h" +#include +#include "daemon_com.h" +#include "fssync.h" +#include "lwp.h" +#include "lock.h" +#include +#include "ihandle.h" +#include "vnode.h" +#include "volume.h" +#include "partition.h" + + +#ifdef FSSYNC_BUILD_SERVER + +/*@printflike@*/ extern void Log(const char *format, ...); + +#ifdef osi_Assert +#undef osi_Assert +#endif +#define osi_Assert(e) (void)(e) + +int (*V_BreakVolumeCallbacks) (); + +#define MAXHANDLERS 4 /* Up to 4 clients; must be at least 2, so that + * move = dump+restore can run on single server */ +#define MAXOFFLINEVOLUMES 128 /* This needs to be as big as the maximum + * number that would be offline for 1 operation. 
+ * Current winner is salvage, which needs all + * cloned read-only copies offline when salvaging + * a single read-write volume */ + +#define MAX_BIND_TRIES 5 /* Number of times to retry socket bind */ + + + +static struct offlineInfo OfflineVolumes[MAXHANDLERS][MAXOFFLINEVOLUMES]; + +static int AcceptSd = -1; /* Socket used by server for accepting connections */ + +static int getport(); + +/* Forward declarations */ +static void FSYNC_sync(); +static void FSYNC_newconnection(); +static void FSYNC_com(); +static void FSYNC_Drop(); +static void AcceptOn(); +static void AcceptOff(); +static void InitHandler(); +static void CallHandler(fd_set * fdsetp); +static int AddHandler(); +static int FindHandler(); +static int FindHandler_r(); +static int RemoveHandler(); +static void GetHandler(fd_set * fdsetp, int *maxfdp); + +extern int LogLevel; + +static afs_int32 FSYNC_com_VolOp(int fd, SYNC_command * com, SYNC_response * res); + +static afs_int32 FSYNC_com_VolOn(FSSYNC_VolOp_command * com, SYNC_response * res); +static afs_int32 FSYNC_com_VolOff(FSSYNC_VolOp_command * com, SYNC_response * res); +static afs_int32 FSYNC_com_VolMove(FSSYNC_VolOp_command * com, SYNC_response * res); +static afs_int32 FSYNC_com_VolBreakCBKs(FSSYNC_VolOp_command * com, SYNC_response * res); +static afs_int32 FSYNC_com_VolDone(FSSYNC_VolOp_command * com, SYNC_response * res); +static afs_int32 FSYNC_com_VolQuery(FSSYNC_VolOp_command * com, SYNC_response * res); +static afs_int32 FSYNC_com_VolHdrQuery(FSSYNC_VolOp_command * com, SYNC_response * res); +#ifdef AFS_DEMAND_ATTACH_FS +static afs_int32 FSYNC_com_VolOpQuery(FSSYNC_VolOp_command * com, SYNC_response * res); +#endif /* AFS_DEMAND_ATTACH_FS */ + +static afs_int32 FSYNC_com_StatsOp(int fd, SYNC_command * com, SYNC_response * res); + +static afs_int32 FSYNC_com_StatsOpGeneral(FSSYNC_StatsOp_command * scom, SYNC_response * res); +static afs_int32 FSYNC_com_StatsOpViceP(FSSYNC_StatsOp_command * scom, SYNC_response * res); +static afs_int32 
FSYNC_com_StatsOpHash(FSSYNC_StatsOp_command * scom, SYNC_response * res); +static afs_int32 FSYNC_com_StatsOpHdr(FSSYNC_StatsOp_command * scom, SYNC_response * res); +static afs_int32 FSYNC_com_StatsOpVLRU(FSSYNC_StatsOp_command * scom, SYNC_response * res); + + +static void FSYNC_com_to_info(FSSYNC_VolOp_command * vcom, FSSYNC_VolOp_info * info); + + +/* + * This lock controls access to the handler array. The overhead + * is minimal in non-preemptive environments. + */ +struct Lock FSYNC_handler_lock; + +void +FSYNC_fsInit(void) +{ +#ifdef AFS_PTHREAD_ENV + pthread_t tid; + pthread_attr_t tattr; +#else /* AFS_PTHREAD_ENV */ + PROCESS pid; +#endif /* AFS_PTHREAD_ENV */ + + Lock_Init(&FSYNC_handler_lock); + +#ifdef AFS_PTHREAD_ENV + assert(pthread_attr_init(&tattr) == 0); + assert(pthread_attr_setdetachstate(&tattr, PTHREAD_CREATE_DETACHED) == 0); + assert(pthread_create(&tid, &tattr, FSYNC_sync, NULL) == 0); +#else /* AFS_PTHREAD_ENV */ + assert(LWP_CreateProcess + (FSYNC_sync, USUAL_STACK_SIZE, USUAL_PRIORITY, (void *)0, + "FSYNC_sync", &pid) == LWP_SUCCESS); +#endif /* AFS_PTHREAD_ENV */ +} + +static fd_set FSYNC_readfds; + +static int +getport(struct sockaddr_in *addr) +{ + int sd; + + memset(addr, 0, sizeof(*addr)); + assert((sd = socket(AF_INET, SOCK_STREAM, 0)) >= 0); +#ifdef STRUCT_SOCKADDR_HAS_SA_LEN + addr->sin_len = sizeof(struct sockaddr_in); +#endif + addr->sin_addr.s_addr = htonl(0x7f000001); + addr->sin_family = AF_INET; /* was localhost->h_addrtype */ + addr->sin_port = htons(2040); /* XXXX htons not _really_ neccessary */ + + return sd; +} + + +static void +FSYNC_sync() +{ + struct sockaddr_in addr; + int on = 1; + extern int VInit; + int code; + int numTries; +#ifdef AFS_PTHREAD_ENV + int tid; +#endif + +#ifndef AFS_NT40_ENV + (void)signal(SIGPIPE, SIG_IGN); +#endif + +#ifdef AFS_PTHREAD_ENV + /* set our 'thread-id' so that the host hold table works */ + MUTEX_ENTER(&rx_stats_mutex); /* protects rxi_pthread_hinum */ + tid = ++rxi_pthread_hinum; + 
MUTEX_EXIT(&rx_stats_mutex); + pthread_setspecific(rx_thread_id_key, (void *)tid); + Log("Set thread id %d for FSYNC_sync\n", tid); +#endif /* AFS_PTHREAD_ENV */ + + while (!VInit) { + /* Let somebody else run until level > 0. That doesn't mean that + * all volumes have been attached. */ +#ifdef AFS_PTHREAD_ENV + pthread_yield(); +#else /* AFS_PTHREAD_ENV */ + LWP_DispatchProcess(); +#endif /* AFS_PTHREAD_ENV */ + } + AcceptSd = getport(&addr); + /* Reuseaddr needed because system inexplicably leaves crud lying around */ + code = + setsockopt(AcceptSd, SOL_SOCKET, SO_REUSEADDR, (char *)&on, + sizeof(on)); + if (code) + Log("FSYNC_sync: setsockopt failed with (%d)\n", errno); + + for (numTries = 0; numTries < MAX_BIND_TRIES; numTries++) { + if ((code = + bind(AcceptSd, (struct sockaddr *)&addr, sizeof(addr))) == 0) + break; + Log("FSYNC_sync: bind failed with (%d), will sleep and retry\n", + errno); + sleep(5); + } + assert(!code); + listen(AcceptSd, 100); + InitHandler(); + AcceptOn(); + for (;;) { + int maxfd; + GetHandler(&FSYNC_readfds, &maxfd); + /* Note: check for >= 1 below is essential since IOMGR_select + * doesn't have exactly same semantics as select. 
+ */ +#ifdef AFS_PTHREAD_ENV + if (select(maxfd + 1, &FSYNC_readfds, NULL, NULL, NULL) >= 1) +#else /* AFS_PTHREAD_ENV */ + if (IOMGR_Select(maxfd + 1, &FSYNC_readfds, NULL, NULL, NULL) >= 1) +#endif /* AFS_PTHREAD_ENV */ + CallHandler(&FSYNC_readfds); + } +} + +static void +FSYNC_newconnection(int afd) +{ + struct sockaddr_in other; + int junk, fd; + junk = sizeof(other); + fd = accept(afd, (struct sockaddr *)&other, &junk); + if (fd == -1) { + Log("FSYNC_newconnection: accept failed, errno==%d\n", errno); + assert(1 == 2); + } else if (!AddHandler(fd, FSYNC_com)) { + AcceptOff(); + assert(AddHandler(fd, FSYNC_com)); + } +} + +/* this function processes commands from an fssync file descriptor (fd) */ +afs_int32 FS_cnt = 0; +static void +FSYNC_com(int fd) +{ + SYNC_command com; + SYNC_response res; + SYNC_PROTO_BUF_DECL(com_buf); + SYNC_PROTO_BUF_DECL(res_buf); + + memset(&res.hdr, 0, sizeof(res.hdr)); + + com.payload.buf = (void *)com_buf; + com.payload.len = SYNC_PROTO_MAX_LEN; + res.hdr.response_len = sizeof(res.hdr); + res.hdr.proto_version = FSYNC_PROTO_VERSION; + res.payload.len = SYNC_PROTO_MAX_LEN; + res.payload.buf = (void *)res_buf; + + FS_cnt++; + if (SYNC_getCom(fd, &com)) { + Log("FSYNC_com: read failed; dropping connection (cnt=%d)\n", FS_cnt); + FSYNC_Drop(fd); + return; + } + + if (com.hdr.proto_version != FSYNC_PROTO_VERSION) { + Log("FSYNC_com: invalid protocol version (%u)\n", com.hdr.proto_version); + res.hdr.response = SYNC_COM_ERROR; + res.hdr.flags |= SYNC_FLAG_CHANNEL_SHUTDOWN; + goto respond; + } + + VOL_LOCK; + switch (com.hdr.command) { + case FSYNC_VOL_ON: + case FSYNC_VOL_OFF: + case FSYNC_VOL_LISTVOLUMES: + case FSYNC_VOL_NEEDVOLUME: + case FSYNC_VOL_MOVE: + case FSYNC_VOL_BREAKCBKS: + case FSYNC_VOL_DONE: + case FSYNC_VOL_QUERY: + case FSYNC_VOL_QUERY_HDR: + case FSYNC_VOL_QUERY_VOP: + res.hdr.response = FSYNC_com_VolOp(fd, &com, &res); + break; + case FSYNC_VOL_STATS_GENERAL: + case FSYNC_VOL_STATS_VICEP: + case FSYNC_VOL_STATS_HASH: 
+ case FSYNC_VOL_STATS_HDR: + case FSYNC_VOL_STATS_VLRU: + res.hdr.response = FSYNC_com_StatsOp(fd, &com, &res); + break; + case SYNC_COM_CHANNEL_CLOSE: + res.hdr.response = SYNC_OK; + res.hdr.flags |= SYNC_FLAG_CHANNEL_SHUTDOWN; + break; + default: + res.hdr.response = SYNC_BAD_COMMAND; + break; + } + VOL_UNLOCK; + + respond: + SYNC_putRes(fd, &res); + if (res.hdr.flags & SYNC_FLAG_CHANNEL_SHUTDOWN) { + FSYNC_Drop(fd); + } +} + +static afs_int32 +FSYNC_com_VolOp(int fd, SYNC_command * com, SYNC_response * res) +{ + int i; + afs_int32 code = SYNC_OK; + FSSYNC_VolOp_command vcom; + + if (com->recv_len != (sizeof(com->hdr) + sizeof(FSSYNC_VolOp_hdr))) { + res->hdr.reason = SYNC_REASON_MALFORMED_PACKET; + res->hdr.flags |= SYNC_FLAG_CHANNEL_SHUTDOWN; + return SYNC_COM_ERROR; + } + + vcom.hdr = &com->hdr; + vcom.vop = (FSSYNC_VolOp_hdr *) com->payload.buf; + vcom.com = com; + + vcom.volumes = OfflineVolumes[FindHandler(fd)]; + for (vcom.v = NULL, i = 0; i < MAXOFFLINEVOLUMES; i++) { + if ((vcom.volumes[i].volumeID == vcom.vop->volume) && + (strncmp(vcom.volumes[i].partName, vcom.vop->partName, + sizeof(vcom.volumes[i].partName)) == 0)) { + vcom.v = &vcom.volumes[i]; + break; + } + } + + switch (com->hdr.command) { + case FSYNC_VOL_ON: + code = FSYNC_com_VolOn(&vcom, res); + break; + case FSYNC_VOL_OFF: + case FSYNC_VOL_NEEDVOLUME: + code = FSYNC_com_VolOff(&vcom, res); + break; + case FSYNC_VOL_LISTVOLUMES: + code = SYNC_OK; + break; + case FSYNC_VOL_MOVE: + code = FSYNC_com_VolMove(&vcom, res); + break; + case FSYNC_VOL_BREAKCBKS: + code = FSYNC_com_VolBreakCBKs(&vcom, res); + break; + case FSYNC_VOL_DONE: + code = FSYNC_com_VolDone(&vcom, res); + break; + case FSYNC_VOL_QUERY: + code = FSYNC_com_VolQuery(&vcom, res); + break; + case FSYNC_VOL_QUERY_HDR: + code = FSYNC_com_VolHdrQuery(&vcom, res); + break; +#ifdef AFS_DEMAND_ATTACH_FS + case FSYNC_VOL_QUERY_VOP: + code = FSYNC_com_VolOpQuery(&vcom, res); + break; +#endif /* AFS_DEMAND_ATTACH_FS */ + default: + code = 
SYNC_BAD_COMMAND; + } + + return code; +} + +static afs_int32 +FSYNC_com_VolOn(FSSYNC_VolOp_command * vcom, SYNC_response * res) +{ + afs_int32 code = SYNC_OK; + char tvolName[VMAXPATHLEN]; + Volume * vp; + Error error; + + if (SYNC_verifyProtocolString(vcom->vop->partName, sizeof(vcom->vop->partName))) { + res->hdr.reason = SYNC_REASON_MALFORMED_PACKET; + code = SYNC_FAILED; + goto done; + } + + /* + This is where a detatched volume gets reattached. However in the + special case where the volume is merely busy, it is already + attatched and it is only necessary to clear the busy flag. See + defect #2080 for details. + */ + + /* is the volume already attatched? */ +#ifdef notdef + /* + * XXX With the following enabled we had bizarre problems where the backup id would + * be reset to 0; that was due to the interaction between fileserver/volserver in that they + * both keep volumes in memory and the changes wouldn't be made to the fileserver. Some of + * the problems were due to refcnt changes as result of VGetVolume/VPutVolume which would call + * VOffline, etc. when we don't want to; someday the whole #2080 issue should be revisited to + * be done right XXX + */ + vp = VGetVolume_r(&error, vcom->vop->volume); + if (vp) { + /* yep, is the BUSY flag set? 
*/ + if (vp->specialStatus == VBUSY) { + + /* yep, clear BUSY flag */ + + vp->specialStatus = 0; + /* make sure vol is online */ + if (vcom->v) { + vcom->v->volumeID = 0; + V_inUse(vp) = 1; /* online */ + } + VPutVolume_r(vp); + break; + } + VPutVolume_r(vp); + } +#endif /* notdef */ + + /* so, we need to attach the volume */ + + if (vcom->v) + vcom->v->volumeID = 0; + tvolName[0] = '/'; + snprintf(&tvolName[1], sizeof(tvolName)-1, VFORMAT, vcom->vop->volume); + tvolName[sizeof(tvolName)-1] = '\0'; + +#ifdef AFS_DEMAND_ATTACH_FS + vp = VPreAttachVolumeByName_r(&error, vcom->vop->partName, tvolName, + V_VOLUPD); + if (vp && vp->pending_vol_op) { + VDeregisterVolOp_r(vp, vp->pending_vol_op); + } +#else /* AFS_DEMAND_ATTACH_FS */ + vp = VAttachVolumeByName_r(&error, vcom->vop->partName, tvolName, + V_VOLUPD); + if (vp) + VPutVolume_r(vp); +#endif /* AFS_DEMAND_ATTACH_FS */ + + if (error) { + code = SYNC_DENIED; + res->hdr.reason = error; + } + + done: + return code; +} + +static afs_int32 +FSYNC_com_VolOff(FSSYNC_VolOp_command * vcom, SYNC_response * res) +{ + FSSYNC_VolOp_info info; + afs_int32 code = SYNC_OK; + int i; + Volume * vp, * nvp; + Error error; + + if (SYNC_verifyProtocolString(vcom->vop->partName, sizeof(vcom->vop->partName))) { + res->hdr.reason = SYNC_REASON_MALFORMED_PACKET; + code = SYNC_FAILED; + goto done; + } + + /* not already offline, we need to find a slot for newly offline volume */ + if (vcom->hdr->programType == debugUtility) { + /* debug utilities do not have their operations tracked */ + vcom->v = NULL; + } else { + if (!vcom->v) { + for (i = 0; i < MAXOFFLINEVOLUMES; i++) { + if (vcom->volumes[i].volumeID == 0) { + vcom->v = &vcom->volumes[i]; + break; + } + } + } + if (!vcom->v) { + goto deny; + } + } + + FSYNC_com_to_info(vcom, &info); + +#ifdef AFS_DEMAND_ATTACH_FS + vp = VLookupVolume_r(&error, vcom->vop->volume, NULL); +#else + vp = VGetVolume_r(&error, vcom->vop->volume); +#endif + + if (vp) { + if ((vcom->vop->partName[0] != 0) && + 
(strncmp(vcom->vop->partName, vp->partition->name, + sizeof(vcom->vop->partName)) != 0)) { + /* volume on desired partition is not online, so we + * should treat this as an offline volume. + */ +#ifndef AFS_DEMAND_ATTACH_FS + VPutVolume_r(vp); +#endif + vp = NULL; + goto done; + } + } + +#ifdef AFS_DEMAND_ATTACH_FS + if (vp) { + ProgramType type = (ProgramType) vcom->hdr->programType; + + /* do initial filtering of requests */ + + /* enforce mutual exclusion for volume ops */ + if (vp->pending_vol_op) { + if (vp->pending_vol_op->com.programType != type) { + Log("volume %u already checked out\n", vp->hashid); + /* XXX debug */ + Log("vp->vop = { com = { ver=%u, prog=%d, com=%d, reason=%d, len=%u, flags=0x%x }, vop = { vol=%u, part='%s' } }\n", + vp->pending_vol_op->com.proto_version, + vp->pending_vol_op->com.programType, + vp->pending_vol_op->com.command, + vp->pending_vol_op->com.reason, + vp->pending_vol_op->com.command_len, + vp->pending_vol_op->com.flags, + vp->pending_vol_op->vop.volume, + vp->pending_vol_op->vop.partName ); + Log("vcom = { com = { ver=%u, prog=%d, com=%d, reason=%d, len=%u, flags=0x%x } , vop = { vol=%u, part='%s' } }\n", + vcom->hdr->proto_version, + vcom->hdr->programType, + vcom->hdr->command, + vcom->hdr->reason, + vcom->hdr->command_len, + vcom->hdr->flags, + vcom->vop->volume, + vcom->vop->partName); + res->hdr.reason = FSYNC_EXCLUSIVE; + goto deny; + } else { + Log("warning: volume %u recursively checked out by programType id %d\n", + vp->hashid, vcom->hdr->programType); + } + } + + /* filter based upon requestor + * + * volume utilities are not allowed to check out volumes + * which are in an error state + * + * unknown utility programs will be denied on principal + */ + switch (type) { + case salvageServer: + case debugUtility: + /* give the salvageserver lots of liberty */ + break; + case volumeUtility: + if ((V_attachState(vp) == VOL_STATE_ERROR) || + (V_attachState(vp) == VOL_STATE_SALVAGING)) { + goto deny; + } + break; + 
default: + Log("bad program type passed to FSSYNC\n"); + goto deny; + } + + /* short circuit for offline volume states + * so we can avoid I/O penalty of attachment */ + switch (V_attachState(vp)) { + case VOL_STATE_UNATTACHED: + case VOL_STATE_PREATTACHED: + case VOL_STATE_SALVAGING: + case VOL_STATE_ERROR: + /* register the volume operation metadata with the volume + * + * if the volume is currently pre-attached, attach2() + * will evaluate the vol op metadata to determine whether + * attaching the volume would be safe */ + VRegisterVolOp_r(vp, &info); + goto done; + default: + break; + } + + /* convert to heavyweight ref */ + nvp = VGetVolumeByVp_r(&error, vp); + + /* register the volume operation metadata with the volume */ + VRegisterVolOp_r(vp, &info); + + if (!nvp) { + Log("FSYNC_com_VolOff: failed to get heavyweight reference to volume %u\n", + vcom->vop->volume); + res->hdr.reason = FSYNC_VOL_PKG_ERROR; + goto deny; + } + vp = nvp; + } +#endif /* AFS_DEMAND_ATTACH_FS */ + + if (vp) { + if (VVolOpLeaveOnline_r(vp, &info)) { + VUpdateVolume_r(&error, vp, VOL_UPDATE_WAIT); /* At least get volume stats right */ + if (LogLevel) { + Log("FSYNC: Volume %u (%s) was left on line for an external %s request\n", + V_id(vp), V_name(vp), + vcom->hdr->reason == V_CLONE ? "clone" : + vcom->hdr->reason == V_READONLY ? "readonly" : + vcom->hdr->reason == V_DUMP ? "dump" : + "UNKNOWN"); + } + VPutVolume_r(vp); + } else { + if (VVolOpSetVBusy_r(vp, &info)) { + vp->specialStatus = VBUSY; + } + + /* remember what volume we got, so we can keep track of how + * many volumes the volserver or whatever is using. Note that + * vp is valid since leaveonline is only set when vp is valid. 
+ */ + if (vcom->v) { + vcom->v->volumeID = vcom->vop->volume; + strlcpy(vcom->v->partName, vp->partition->name, sizeof(vcom->v->partName)); + } + + VOffline_r(vp, "A volume utility is running."); + vp = NULL; + } + } + + done: + return code; + + deny: + return SYNC_DENIED; +} + +static afs_int32 +FSYNC_com_VolMove(FSSYNC_VolOp_command * vcom, SYNC_response * res) +{ + Error error; + Volume * vp; + + /* Yuch: the "reason" for the move is the site it got moved to... */ + /* still set specialStatus so we stop sending back VBUSY. + * also should still break callbacks. Note that I don't know + * how to tell if we should break all or not, so we just do it + * since it doesn't matter much if we do an extra break + * volume callbacks on a volume move within the same server */ +#ifdef AFS_DEMAND_ATTACH_FS + vp = VLookupVolume_r(&error, vcom->vop->volume, NULL); +#else + vp = VGetVolume_r(&error, vcom->vop->volume); +#endif + if (vp) { + vp->specialStatus = VMOVED; +#ifndef AFS_DEMAND_ATTACH_FS + VPutVolume_r(vp); +#endif + } + + if (V_BreakVolumeCallbacks) { + Log("fssync: volume %u moved to %x; breaking all call backs\n", + vcom->vop->volume, vcom->hdr->reason); + VOL_UNLOCK; + (*V_BreakVolumeCallbacks) (vcom->vop->volume); + VOL_LOCK; + } + + return SYNC_OK; +} + +static afs_int32 +FSYNC_com_VolDone(FSSYNC_VolOp_command * vcom, SYNC_response * res) +{ +#ifdef AFS_DEMAND_ATTACH_FS + Error error; + Volume * vp; +#endif + + /* don't try to put online, this call is made only after deleting + * a volume, in which case we want to remove the vol # from the + * OfflineVolumes array only */ + if (vcom->v) + vcom->v->volumeID = 0; + +#ifdef AFS_DEMAND_ATTACH_FS + vp = VLookupVolume_r(&error, vcom->vop->volume, NULL); + if (vp && vp->pending_vol_op) { + VDeregisterVolOp_r(vp, vp->pending_vol_op); + } +#endif + + return SYNC_OK; +} + +static afs_int32 +FSYNC_com_VolBreakCBKs(FSSYNC_VolOp_command * vcom, SYNC_response * res) +{ + /* if the volume is being restored, break all 
callbacks on it */ + if (V_BreakVolumeCallbacks) { + Log("fssync: breaking all call backs for volume %u\n", + vcom->vop->volume); + VOL_UNLOCK; + (*V_BreakVolumeCallbacks) (vcom->vop->volume); + VOL_LOCK; + } + return SYNC_OK; +} + +static afs_int32 +FSYNC_com_VolQuery(FSSYNC_VolOp_command * vcom, SYNC_response * res) +{ + afs_int32 code = SYNC_OK; + Error error; + Volume * vp; + +#ifdef AFS_DEMAND_ATTACH_FS + vp = VLookupVolume_r(&error, vcom->vop->volume, NULL); +#else /* !AFS_DEMAND_ATTACH_FS */ + vp = VGetVolume_r(&error, vcom->vop->volume); +#endif /* !AFS_DEMAND_ATTACH_FS */ + + if (vp) { + assert(sizeof(Volume) <= res->payload.len); + memcpy(res->payload.buf, vp, sizeof(Volume)); + res->hdr.response_len += sizeof(Volume); +#ifndef AFS_DEMAND_ATTACH_FS + VPutVolume_r(vp); +#endif + } else { + res->hdr.reason = FSYNC_UNKNOWN_VOLID; + code = SYNC_FAILED; + } + return code; +} + +static afs_int32 +FSYNC_com_VolHdrQuery(FSSYNC_VolOp_command * vcom, SYNC_response * res) +{ + afs_int32 code = SYNC_OK; + Error error; + Volume * vp; + int hdr_ok = 0; + +#ifdef AFS_DEMAND_ATTACH_FS + vp = VLookupVolume_r(&error, vcom->vop->volume, NULL); + if (vp && + (vp->header != NULL) && + (V_attachFlags(vp) & VOL_HDR_ATTACHED) && + (V_attachFlags(vp) & VOL_HDR_LOADED)) { + hdr_ok = 1; + } +#else /* !AFS_DEMAND_ATTACH_FS */ + vp = VGetVolume_r(&error, vcom->vop->volume); + if (vp && vp->header) { + hdr_ok = 1; + } +#endif /* !AFS_DEMAND_ATTACH_FS */ + + load_done: + if (hdr_ok) { + assert(sizeof(VolumeDiskData) <= res->payload.len); + memcpy(res->payload.buf, &V_disk(vp), sizeof(VolumeDiskData)); + res->hdr.response_len += sizeof(VolumeDiskData); +#ifndef AFS_DEMAND_ATTACH_FS + VPutVolume_r(vp); +#endif + } else { + if (vp) { + res->hdr.reason = FSYNC_HDR_NOT_ATTACHED; + } else { + res->hdr.reason = FSYNC_UNKNOWN_VOLID; + } + code = SYNC_FAILED; + } + return code; +} + +#ifdef AFS_DEMAND_ATTACH_FS +static afs_int32 +FSYNC_com_VolOpQuery(FSSYNC_VolOp_command * vcom, SYNC_response * 
res) +{ + afs_int32 code = SYNC_OK; + Error error; + Volume * vp; + + vp = VLookupVolume_r(&error, vcom->vop->volume, NULL); + + if (vp && vp->pending_vol_op) { + assert(sizeof(FSSYNC_VolOp_info) <= res->payload.len); + memcpy(res->payload.buf, vp->pending_vol_op, sizeof(FSSYNC_VolOp_info)); + res->hdr.response_len += sizeof(FSSYNC_VolOp_info); + } else { + if (vp) { + res->hdr.reason = FSYNC_NO_PENDING_VOL_OP; + } else { + res->hdr.reason = FSYNC_UNKNOWN_VOLID; + } + code = SYNC_FAILED; + } + return code; +} +#endif /* AFS_DEMAND_ATTACH_FS */ + +static afs_int32 +FSYNC_com_StatsOp(int fd, SYNC_command * com, SYNC_response * res) +{ + int i; + afs_int32 code = SYNC_OK; + FSSYNC_StatsOp_command scom; + + if (com->recv_len != (sizeof(com->hdr) + sizeof(FSSYNC_StatsOp_hdr))) { + res->hdr.reason = SYNC_REASON_MALFORMED_PACKET; + res->hdr.flags |= SYNC_FLAG_CHANNEL_SHUTDOWN; + return SYNC_COM_ERROR; + } + + scom.hdr = &com->hdr; + scom.sop = (FSSYNC_StatsOp_hdr *) com->payload.buf; + scom.com = com; + + switch (com->hdr.command) { + case FSYNC_VOL_STATS_GENERAL: + code = FSYNC_com_StatsOpGeneral(&scom, res); + break; +#ifdef AFS_DEMAND_ATTACH_FS + /* statistics for the following subsystems are only tracked + * for demand attach fileservers */ + case FSYNC_VOL_STATS_VICEP: + code = FSYNC_com_StatsOpViceP(&scom, res); + break; + case FSYNC_VOL_STATS_HASH: + code = FSYNC_com_StatsOpHash(&scom, res); + break; + case FSYNC_VOL_STATS_HDR: + code = FSYNC_com_StatsOpHdr(&scom, res); + break; + case FSYNC_VOL_STATS_VLRU: + code = FSYNC_com_StatsOpVLRU(&scom, res); + break; +#endif /* AFS_DEMAND_ATTACH_FS */ + default: + code = SYNC_BAD_COMMAND; + } + + return code; +} + +static afs_int32 +FSYNC_com_StatsOpGeneral(FSSYNC_StatsOp_command * scom, SYNC_response * res) +{ + afs_int32 code = SYNC_OK; + + memcpy(res->payload.buf, &VStats, sizeof(VStats)); + res->hdr.response_len += sizeof(VStats); + + return code; +} + +#ifdef AFS_DEMAND_ATTACH_FS +static afs_int32 
+FSYNC_com_StatsOpViceP(FSSYNC_StatsOp_command * scom, SYNC_response * res)
+{
+    /*
+     * FSYNC_VOL_STATS_VICEP handler: fill the response payload with a
+     * DiskPartitionStats snapshot of the partition named in the request.
+     * Returns SYNC_FAILED (reason SYNC_REASON_MALFORMED_PACKET) when the
+     * partition name fails SYNC_verifyProtocolString, SYNC_FAILED when
+     * VGetPartition_r finds no such partition, SYNC_OK otherwise.
+     */
+    afs_int32 code = SYNC_OK;
+    struct DiskPartition * dp;
+    struct DiskPartitionStats * stats;
+
+    if (SYNC_verifyProtocolString(scom->sop->args.partName, sizeof(scom->sop->args.partName))) {
+        res->hdr.reason = SYNC_REASON_MALFORMED_PACKET;
+        code = SYNC_FAILED;
+        goto done;
+    }
+
+    dp = VGetPartition_r(scom->sop->args.partName, 0);
+    if (!dp) {
+        code = SYNC_FAILED;
+    } else {
+        /* guard the payload capacity before writing through the cast,
+         * as the volume query handlers do */
+        assert(sizeof(struct DiskPartitionStats) <= res->payload.len);
+        stats = (struct DiskPartitionStats *) res->payload.buf;
+        stats->free = dp->free;
+        stats->totalUsable = dp->totalUsable;
+        stats->minFree = dp->minFree;
+        stats->f_files = dp->f_files;
+        stats->vol_list_len = dp->vol_list.len;
+
+        res->hdr.response_len += sizeof(struct DiskPartitionStats);
+    }
+
+ done:
+    return code;
+}
+
+/*
+ * FSYNC_VOL_STATS_HASH handler: fill the response payload with a
+ * VolumeHashChainStats snapshot of one volume hash chain.
+ * Returns SYNC_FAILED when the requested bucket index is not below
+ * VolumeHashTable.Size, SYNC_OK otherwise.
+ */
+static afs_int32
+FSYNC_com_StatsOpHash(FSSYNC_StatsOp_command * scom, SYNC_response * res)
+{
+    afs_int32 code = SYNC_OK;
+    struct VolumeHashChainStats * stats;
+    struct VolumeHashChainHead * head;
+
+    if (scom->sop->args.hash_bucket >= VolumeHashTable.Size) {
+        return SYNC_FAILED;
+    }
+
+    assert(sizeof(struct VolumeHashChainStats) <= res->payload.len);
+
+    head = &VolumeHashTable.Table[scom->sop->args.hash_bucket];
+    stats = (struct VolumeHashChainStats *) res->payload.buf;
+    stats->table_size = VolumeHashTable.Size;
+    stats->chain_len = head->len;
+    stats->chain_cacheCheck = head->cacheCheck;
+    stats->chain_busy = head->busy;
+    AssignInt64(head->looks, &stats->chain_looks);
+    AssignInt64(head->gets, &stats->chain_gets);
+    AssignInt64(head->reorders, &stats->chain_reorders);
+
+    res->hdr.response_len += sizeof(struct VolumeHashChainStats);
+
+    return code;
+}
+
+/*
+ * FSYNC_VOL_STATS_HDR handler: copy volume_hdr_LRU.stats into the
+ * response payload.  Always returns SYNC_OK.
+ */
+static afs_int32
+FSYNC_com_StatsOpHdr(FSSYNC_StatsOp_command * scom, SYNC_response * res)
+{
+    afs_int32 code = SYNC_OK;
+
+    assert(sizeof(volume_hdr_LRU.stats) <= res->payload.len);
+    memcpy(res->payload.buf, &volume_hdr_LRU.stats, sizeof(volume_hdr_LRU.stats));
+    res->hdr.response_len += sizeof(volume_hdr_LRU.stats);
+
+    return code;
+}
+
+static afs_int32
+FSYNC_com_StatsOpVLRU(FSSYNC_StatsOp_command * scom, SYNC_response * res)
+{
+    afs_int32 code =
SYNC_OK;
+
+    code = SYNC_BAD_COMMAND;
+
+    return code;
+}
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+/*
+ * Snapshot a volume-operation request into an FSSYNC_VolOp_info record
+ * by copying the command header and the VolOp header by value.
+ */
+static void
+FSYNC_com_to_info(FSSYNC_VolOp_command * vcom, FSSYNC_VolOp_info * info)
+{
+    memcpy(&info->com, vcom->hdr, sizeof(SYNC_command_hdr));
+    memcpy(&info->vop, vcom->vop, sizeof(FSSYNC_VolOp_hdr));
+}
+
+/*
+ * Tear down the client connection on fd.
+ *
+ * Under VOL_LOCK, every volume still recorded in this handler's
+ * OfflineVolumes row is re-attached by name and its slot cleared.
+ * Afterwards the handler entry is removed, the socket is closed and
+ * accepting of new connections is (re-)enabled.
+ */
+static void
+FSYNC_Drop(int fd)
+{
+    struct offlineInfo *p;
+    int i;
+    Error error;
+    char tvolName[VMAXPATHLEN];
+
+    VOL_LOCK;
+    p = OfflineVolumes[FindHandler(fd)];
+    for (i = 0; i < MAXOFFLINEVOLUMES; i++) {
+        if (p[i].volumeID) {
+
+            Volume *vp;
+
+            /* build "/<volume header file name>" with the same bounded
+             * formatting FSYNC_com_VolOn uses, instead of sprintf */
+            tvolName[0] = '/';
+            snprintf(&tvolName[1], sizeof(tvolName)-1, VFORMAT, p[i].volumeID);
+            tvolName[sizeof(tvolName)-1] = '\0';
+            vp = VAttachVolumeByName_r(&error, p[i].partName, tvolName,
+                                       V_VOLUPD);
+            if (vp)
+                VPutVolume_r(vp);
+            p[i].volumeID = 0;
+        }
+    }
+    VOL_UNLOCK;
+    RemoveHandler(fd);
+#ifdef AFS_NT40_ENV
+    closesocket(fd);
+#else
+    close(fd);
+#endif
+    AcceptOn();
+}
+
+static int AcceptHandler = -1;	/* handler id for accept, if turned on */
+
+/* Register the listening socket with the handler table, if it is not
+ * registered already, and remember which slot it occupies. */
+static void
+AcceptOn()
+{
+    if (AcceptHandler == -1) {
+        assert(AddHandler(AcceptSd, FSYNC_newconnection));
+        AcceptHandler = FindHandler(AcceptSd);
+    }
+}
+
+/* Unregister the listening socket from the handler table, if it is
+ * currently registered. */
+static void
+AcceptOff()
+{
+    if (AcceptHandler != -1) {
+        assert(RemoveHandler(AcceptSd));
+        AcceptHandler = -1;
+    }
+}
+
+/* The multiple FD handling code. */
+
+/* Slot i is free when HandlerFD[i] == -1; otherwise HandlerProc[i] is
+ * the routine called with HandlerFD[i] when that descriptor is ready.
+ * The routines registered in this file are all "void f(int)", so the
+ * table is typed to match rather than as "int (*)()". */
+static int HandlerFD[MAXHANDLERS];
+static void (*HandlerProc[MAXHANDLERS]) (int);
+
+/* Mark every handler slot free. */
+static void
+InitHandler()
+{
+    register int i;
+    ObtainWriteLock(&FSYNC_handler_lock);
+    for (i = 0; i < MAXHANDLERS; i++) {
+        HandlerFD[i] = -1;
+        HandlerProc[i] = NULL;
+    }
+    ReleaseWriteLock(&FSYNC_handler_lock);
+}
+
+/* Invoke the handler of every registered descriptor that is set in
+ * *fdsetp.  The table lock is dropped around each call because the
+ * handlers themselves add and remove table entries. */
+static void
+CallHandler(fd_set * fdsetp)
+{
+    register int i;
+    ObtainReadLock(&FSYNC_handler_lock);
+    for (i = 0; i < MAXHANDLERS; i++) {
+        if (HandlerFD[i] >= 0 && FD_ISSET(HandlerFD[i], fdsetp)) {
+            ReleaseReadLock(&FSYNC_handler_lock);
+            (*HandlerProc[i]) (HandlerFD[i]);
+            ObtainReadLock(&FSYNC_handler_lock);
+        }
+    }
+    ReleaseReadLock(&FSYNC_handler_lock);
+}
+
+/* Register aproc as the handler for descriptor afd in the first free
+ * slot.  Returns 1 on success, 0 when the table is full. */
+static int
+AddHandler(int afd, void (*aproc) (int))
+{
+    register int i;
+    ObtainWriteLock(&FSYNC_handler_lock);
+    for (i = 0; i < MAXHANDLERS; i++)
+        if (HandlerFD[i] == -1)
+            break;
+    if (i >= MAXHANDLERS) {
+        ReleaseWriteLock(&FSYNC_handler_lock);
+        return 0;
+    }
+    HandlerFD[i] = afd;
+    HandlerProc[i] = aproc;
+    ReleaseWriteLock(&FSYNC_handler_lock);
+    return 1;
+}
+
+/* Return the slot index registered for afd, taking the table read
+ * lock.  Asserts if afd is not registered. */
+static int
+FindHandler(register int afd)
+{
+    register int i;
+    ObtainReadLock(&FSYNC_handler_lock);
+    for (i = 0; i < MAXHANDLERS; i++)
+        if (HandlerFD[i] == afd) {
+            ReleaseReadLock(&FSYNC_handler_lock);
+            return i;
+        }
+    ReleaseReadLock(&FSYNC_handler_lock);	/* just in case */
+    assert(1 == 2);
+    return -1;			/* satisfy compiler */
+}
+
+/* Same lookup as FindHandler but without taking the table lock; used
+ * by RemoveHandler, which already holds the write lock. */
+static int
+FindHandler_r(register int afd)
+{
+    register int i;
+    for (i = 0; i < MAXHANDLERS; i++)
+        if (HandlerFD[i] == afd) {
+            return i;
+        }
+    assert(1 == 2);
+    return -1;			/* satisfy compiler */
+}
+
+/* Free the slot registered for afd.  Always returns 1. */
+static int
+RemoveHandler(register int afd)
+{
+    ObtainWriteLock(&FSYNC_handler_lock);
+    HandlerFD[FindHandler_r(afd)] = -1;
+    ReleaseWriteLock(&FSYNC_handler_lock);
+    return 1;
+}
+
+static void
+GetHandler(fd_set * fdsetp, int *maxfdp)
+{
+    register int i;
+    register int maxfd = -1;
+    FD_ZERO(fdsetp);
+    ObtainReadLock(&FSYNC_handler_lock);	/* just in case */
+    for (i = 0; i < MAXHANDLERS; i++) +
if (HandlerFD[i] != -1) { + FD_SET(HandlerFD[i], fdsetp); + if (maxfd < HandlerFD[i]) + maxfd = HandlerFD[i]; + } + *maxfdp = maxfd; + ReleaseReadLock(&FSYNC_handler_lock); /* just in case */ +} + +#endif /* FSSYNC_BUILD_SERVER */ diff --git a/src/vol/fssync.c b/src/vol/fssync.c deleted file mode 100644 index 714aaf5fea..0000000000 --- a/src/vol/fssync.c +++ /dev/null @@ -1,751 +0,0 @@ -/* - * Copyright 2000, International Business Machines Corporation and others. - * All Rights Reserved. - * - * This software has been released under the terms of the IBM Public - * License. For details, see the LICENSE file in the top-level source - * directory or online at http://www.openafs.org/dl/license10.html - */ - -/* - System: VICE-TWO - Module: fssync.c - Institution: The Information Technology Center, Carnegie-Mellon University - - */ -#ifdef notdef - -/* All this is going away in early 1989 */ -int newVLDB; /* Compatibility flag */ - -#endif -static int newVLDB = 1; - - -#ifndef AFS_PTHREAD_ENV -#define USUAL_PRIORITY (LWP_MAX_PRIORITY - 2) - -/* - * stack size increased from 8K because the HP machine seemed to have trouble - * with the smaller stack - */ -#define USUAL_STACK_SIZE (24 * 1024) -#endif /* !AFS_PTHREAD_ENV */ - -/* - fsync.c - File server synchronization with external volume utilities. - */ - -/* This controls the size of an fd_set; it must be defined early before - * the system headers define that type and the macros that operate on it. - * Its value should be as large as the maximum file descriptor limit we - * are likely to run into on any platform. 
Right now, that is 65536 - * which is the default hard fd limit on Solaris 9 */ -#ifndef _WIN32 -#define FD_SETSIZE 65536 -#endif - -#include -#include - -RCSID - ("$Header$"); - -#include -#include -#ifdef AFS_NT40_ENV -#include -#include -#else -#include -#include -#include -#include -#include -#endif -#include -#ifdef AFS_PTHREAD_ENV -#include -#else /* AFS_PTHREAD_ENV */ -#include -#endif /* AFS_PTHREAD_ENV */ -#include - -#ifdef HAVE_STRING_H -#include -#else -#ifdef HAVE_STRINGS_H -#include -#endif -#endif - - -#include -#include -#include "nfs.h" -#include -#include "fssync.h" -#include "lwp.h" -#include "lock.h" -#include -#include "ihandle.h" -#include "vnode.h" -#include "volume.h" -#include "partition.h" - -/*@printflike@*/ extern void Log(const char *format, ...); - -#ifdef osi_Assert -#undef osi_Assert -#endif -#define osi_Assert(e) (void)(e) - -int (*V_BreakVolumeCallbacks) (); - -#define MAXHANDLERS 4 /* Up to 4 clients; must be at least 2, so that - * move = dump+restore can run on single server */ -#define MAXOFFLINEVOLUMES 128 /* This needs to be as big as the maximum - * number that would be offline for 1 operation. - * Current winner is salvage, which needs all - * cloned read-only copies offline when salvaging - * a single read-write volume */ - -#define MAX_BIND_TRIES 5 /* Number of times to retry socket bind */ - - -struct offlineInfo { - VolumeId volumeID; - char partName[16]; -}; - -static struct offlineInfo OfflineVolumes[MAXHANDLERS][MAXOFFLINEVOLUMES]; - -static FS_sd = -1; /* Client socket for talking to file server */ -static AcceptSd = -1; /* Socket used by server for accepting connections */ - -static int getport(); - -struct command { - bit32 command; - bit32 reason; - VolumeId volume; - char partName[16]; /* partition name, e.g. 
/vicepa */ -}; - -/* Forward declarations */ -static void FSYNC_sync(); -static void FSYNC_newconnection(); -static void FSYNC_com(); -static void FSYNC_Drop(); -static void AcceptOn(); -static void AcceptOff(); -static void InitHandler(); -static void CallHandler(fd_set * fdsetp); -static int AddHandler(); -static int FindHandler(); -static int FindHandler_r(); -static int RemoveHandler(); -static void GetHandler(fd_set * fdsetp, int *maxfdp); - -extern int LogLevel; - -/* - * This lock controls access to the handler array. The overhead - * is minimal in non-preemptive environments. - */ -struct Lock FSYNC_handler_lock; - -int -FSYNC_clientInit(void) -{ - struct sockaddr_in addr; - /* I can't believe the following is needed for localhost connections!! */ - static time_t backoff[] = - { 3, 3, 3, 5, 5, 5, 7, 15, 16, 24, 32, 40, 48, 0 }; - time_t *timeout = &backoff[0]; - - for (;;) { - FS_sd = getport(&addr); - if (connect(FS_sd, (struct sockaddr *)&addr, sizeof(addr)) >= 0) - return 1; - if (!*timeout) - break; - if (!(*timeout & 1)) - Log("FSYNC_clientInit temporary failure (will retry)"); - FSYNC_clientFinis(); - sleep(*timeout++); - } - perror("FSYNC_clientInit failed (giving up!)"); - return 0; -} - -void -FSYNC_clientFinis(void) -{ -#ifdef AFS_NT40_ENV - closesocket(FS_sd); -#else - close(FS_sd); -#endif - FS_sd = -1; -} - -int -FSYNC_askfs(VolumeId volume, char *partName, int com, int reason) -{ - byte response; - struct command command; - int n; - command.volume = volume; - command.command = com; - command.reason = reason; - if (partName) - strcpy(command.partName, partName); - else - command.partName[0] = 0; - assert(FS_sd != -1); - VFSYNC_LOCK; -#ifdef AFS_NT40_ENV - if (send(FS_sd, (char *)&command, sizeof(command), 0) != sizeof(command)) { - printf("FSYNC_askfs: write to file server failed\n"); - response = FSYNC_DENIED; - goto done; - } - while ((n = recv(FS_sd, &response, 1, 0)) != 1) { - if (n == 0 || WSAEINTR != WSAGetLastError()) { - 
printf("FSYNC_askfs: No response from file server\n"); - response = FSYNC_DENIED; - goto done; - } - } -#else - if (write(FS_sd, &command, sizeof(command)) != sizeof(command)) { - printf("FSYNC_askfs: write to file server failed\n"); - response = FSYNC_DENIED; - goto done; - } - while ((n = read(FS_sd, &response, 1)) != 1) { - if (n == 0 || errno != EINTR) { - printf("FSYNC_askfs: No response from file server\n"); - response = FSYNC_DENIED; - goto done; - } - } -#endif - if (response == 0) { - printf - ("FSYNC_askfs: negative response from file server; volume %u, command %d\n", - command.volume, (int)command.command); - } - done: - VFSYNC_UNLOCK; - return response; -} - -void -FSYNC_fsInit(void) -{ -#ifdef AFS_PTHREAD_ENV - pthread_t tid; - pthread_attr_t tattr; - assert(pthread_attr_init(&tattr) == 0); - assert(pthread_attr_setdetachstate(&tattr, PTHREAD_CREATE_DETACHED) == 0); - assert(pthread_create(&tid, &tattr, FSYNC_sync, NULL) == 0); -#else /* AFS_PTHREAD_ENV */ - PROCESS pid; - assert(LWP_CreateProcess - (FSYNC_sync, USUAL_STACK_SIZE, USUAL_PRIORITY, (void *)0, - "FSYNC_sync", &pid) == LWP_SUCCESS); -#endif /* AFS_PTHREAD_ENV */ -} - -static int -getport(struct sockaddr_in *addr) -{ - int sd; - - memset(addr, 0, sizeof(*addr)); - assert((sd = socket(AF_INET, SOCK_STREAM, 0)) >= 0); -#ifdef STRUCT_SOCKADDR_HAS_SA_LEN - addr->sin_len = sizeof(struct sockaddr_in); -#endif - addr->sin_addr.s_addr = htonl(0x7f000001); - addr->sin_family = AF_INET; /* was localhost->h_addrtype */ - addr->sin_port = htons(2040); /* XXXX htons not _really_ neccessary */ - - return sd; -} - -static fd_set FSYNC_readfds; - -static void -FSYNC_sync() -{ - struct sockaddr_in addr; - int on = 1; - extern VInit; - int code; - int numTries; -#ifdef AFS_PTHREAD_ENV - int tid; -#endif - -#ifndef AFS_NT40_ENV - (void)signal(SIGPIPE, SIG_IGN); -#endif - -#ifdef AFS_PTHREAD_ENV - /* set our 'thread-id' so that the host hold table works */ - MUTEX_ENTER(&rx_stats_mutex); /* protects 
rxi_pthread_hinum */ - tid = ++rxi_pthread_hinum; - MUTEX_EXIT(&rx_stats_mutex); - pthread_setspecific(rx_thread_id_key, (void *)tid); - Log("Set thread id %d for FSYNC_sync\n", tid); -#endif /* AFS_PTHREAD_ENV */ - - while (!VInit) { - /* Let somebody else run until level > 0. That doesn't mean that - * all volumes have been attached. */ -#ifdef AFS_PTHREAD_ENV - pthread_yield(); -#else /* AFS_PTHREAD_ENV */ - LWP_DispatchProcess(); -#endif /* AFS_PTHREAD_ENV */ - } - AcceptSd = getport(&addr); - /* Reuseaddr needed because system inexplicably leaves crud lying around */ - code = - setsockopt(AcceptSd, SOL_SOCKET, SO_REUSEADDR, (char *)&on, - sizeof(on)); - if (code) - Log("FSYNC_sync: setsockopt failed with (%d)\n", errno); - - for (numTries = 0; numTries < MAX_BIND_TRIES; numTries++) { - if ((code = - bind(AcceptSd, (struct sockaddr *)&addr, sizeof(addr))) == 0) - break; - Log("FSYNC_sync: bind failed with (%d), will sleep and retry\n", - errno); - sleep(5); - } - assert(!code); - listen(AcceptSd, 100); - InitHandler(); - AcceptOn(); - for (;;) { - int maxfd; - GetHandler(&FSYNC_readfds, &maxfd); - /* Note: check for >= 1 below is essential since IOMGR_select - * doesn't have exactly same semantics as select. 
- */ -#ifdef AFS_PTHREAD_ENV - if (select(maxfd + 1, &FSYNC_readfds, NULL, NULL, NULL) >= 1) -#else /* AFS_PTHREAD_ENV */ - if (IOMGR_Select(maxfd + 1, &FSYNC_readfds, NULL, NULL, NULL) >= 1) -#endif /* AFS_PTHREAD_ENV */ - CallHandler(&FSYNC_readfds); - } -} - -static void -FSYNC_newconnection(int afd) -{ - struct sockaddr_in other; - int junk, fd; - junk = sizeof(other); - fd = accept(afd, (struct sockaddr *)&other, &junk); - if (fd == -1) { - Log("FSYNC_newconnection: accept failed, errno==%d\n", errno); - assert(1 == 2); - } else if (!AddHandler(fd, FSYNC_com)) { - AcceptOff(); - assert(AddHandler(fd, FSYNC_com)); - } -} - -/* -#define TEST2081 -*/ - -afs_int32 FS_cnt = 0; -static void -FSYNC_com(int fd) -{ - byte rc = FSYNC_OK; - int n, i; - Error error; - struct command command; - int leaveonline; - register struct offlineInfo *volumes, *v; - Volume *vp; - char tvolName[VMAXPATHLEN]; - - FS_cnt++; -#ifdef AFS_NT40_ENV - n = recv(fd, &command, sizeof(command), 0); -#else - n = read(fd, &command, sizeof(command)); -#endif - if (n <= 0) { - FSYNC_Drop(fd); - return; - } - if (n < sizeof(command)) { - Log("FSYNC_com: partial read (%d instead of %d); dropping connection (cnt=%d)\n", n, sizeof(command), FS_cnt); - FSYNC_Drop(fd); - return; - } - VATTACH_LOCK; - VOL_LOCK; - volumes = OfflineVolumes[FindHandler(fd)]; - for (v = 0, i = 0; i < MAXOFFLINEVOLUMES; i++) { - if (volumes[i].volumeID == command.volume - && strcmp(volumes[i].partName, command.partName) == 0) { - v = &volumes[i]; - break; - } - } - switch (command.command) { - case FSYNC_DONE: - /* don't try to put online, this call is made only after deleting - * a volume, in which case we want to remove the vol # from the - * OfflineVolumes array only */ - if (v) - v->volumeID = 0; - break; - case FSYNC_ON: - -/* -This is where a detatched volume gets reattached. However in the -special case where the volume is merely busy, it is already -attatched and it is only necessary to clear the busy flag. 
See -defect #2080 for details. -*/ - - /* is the volume already attatched? */ -#ifdef notdef -/* - * XXX With the following enabled we had bizarre problems where the backup id would - * be reset to 0; that was due to the interaction between fileserver/volserver in that they - * both keep volumes in memory and the changes wouldn't be made to the fileserver. Some of - * the problems were due to refcnt changes as result of VGetVolume/VPutVolume which would call - * VOffline, etc. when we don't want to; someday the whole #2080 issue should be revisited to - * be done right XXX - */ - vp = VGetVolume_r(&error, command.volume); - if (vp) { - /* yep, is the BUSY flag set? */ - if (vp->specialStatus == VBUSY) { -/* test harness for defect #2081 */ - -#ifdef TEST2081 - /* - * test #2081 by releasing TEST.2081, - * so leave it alone here, zap it after - */ - - if (strcmp(vp->header->diskstuff.name, "TEST.2081") == 0) - break; -#endif - /* yep, clear BUSY flag */ - - vp->specialStatus = 0; - /* make sure vol is online */ - if (v) { - v->volumeID = 0; - V_inUse(vp) = 1; /* online */ - } - VPutVolume_r(vp); - break; - } - VPutVolume_r(vp); - } -#endif - - /* so, we need to attach the volume */ - - if (v) - v->volumeID = 0; - tvolName[0] = '/'; - sprintf(&tvolName[1], VFORMAT, command.volume); - - vp = VAttachVolumeByName_r(&error, command.partName, tvolName, - V_VOLUPD); - if (vp) - VPutVolume_r(vp); - break; - case FSYNC_OFF: - case FSYNC_NEEDVOLUME:{ - leaveonline = 0; - /* not already offline, we need to find a slot for newly offline volume */ - if (!v) { - for (i = 0; i < MAXOFFLINEVOLUMES; i++) { - if (volumes[i].volumeID == 0) { - v = &volumes[i]; - break; - } - } - } - if (!v) { - rc = FSYNC_DENIED; - break; - } - vp = VGetVolume_r(&error, command.volume); - if (vp) { - if (command.partName[0] != 0 - && strcmp(command.partName, vp->partition->name) != 0) { - /* volume on desired partition is not online, so we - * should treat this as an offline volume. 
- */ - VPutVolume_r(vp); - vp = (Volume *) 0; - } - } - if (vp) { - leaveonline = (command.command == FSYNC_NEEDVOLUME - && (command.reason == V_READONLY - || (!VolumeWriteable(vp) - && (command.reason == V_CLONE - || command.reason == V_DUMP)) - ) - ); - if (!leaveonline) { - if (command.command == FSYNC_NEEDVOLUME - && (command.reason == V_CLONE - || command.reason == V_DUMP)) { - vp->specialStatus = VBUSY; - } - /* remember what volume we got, so we can keep track of how - * many volumes the volserver or whatever is using. Note that - * vp is valid since leaveonline is only set when vp is valid. - */ - v->volumeID = command.volume; - strcpy(v->partName, vp->partition->name); - if (!V_inUse(vp)) { - /* in this case, VOffline just returns sans decrementing - * ref count. We could try to fix it, but it has lots of - * weird callers. - */ - VPutVolume_r(vp); - } else { - VOffline_r(vp, "A volume utility is running."); - } - vp = 0; - } else { - VUpdateVolume_r(&error, vp); /* At least get volume stats right */ - if (LogLevel) { - Log("FSYNC: Volume %u (%s) was left on line for an external %s request\n", V_id(vp), V_name(vp), command.reason == V_CLONE ? "clone" : command.reason == V_READONLY ? "readonly" : command.reason == V_DUMP ? "dump" : "UNKNOWN"); - } - } - if (vp) - VPutVolume_r(vp); - } - rc = FSYNC_OK; - break; - } - case FSYNC_MOVEVOLUME: - /* Yuch: the "reason" for the move is the site it got moved to... */ - /* still set specialStatus so we stop sending back VBUSY. - * also should still break callbacks. 
Note that I don't know - * how to tell if we should break all or not, so we just do it - * since it doesn't matter much if we do an extra break - * volume callbacks on a volume move within the same server */ - vp = VGetVolume_r(&error, command.volume); - if (vp) { - vp->specialStatus = VMOVED; - VPutVolume_r(vp); - } - - if (V_BreakVolumeCallbacks) { - Log("fssync: volume %u moved to %x; breaking all call backs\n", - command.volume, command.reason); - VOL_UNLOCK; - VATTACH_UNLOCK; - (*V_BreakVolumeCallbacks) (command.volume); - VATTACH_LOCK; - VOL_LOCK; - } - break; - case FSYNC_RESTOREVOLUME: - /* if the volume is being restored, break all callbacks on it */ - if (V_BreakVolumeCallbacks) { - Log("fssync: volume %u restored; breaking all call backs\n", - command.volume); - VOL_UNLOCK; - VATTACH_UNLOCK; - (*V_BreakVolumeCallbacks) (command.volume); - VATTACH_LOCK; - VOL_LOCK; - } - break; - default: - rc = FSYNC_DENIED; - break; - } - VOL_UNLOCK; - VATTACH_UNLOCK; -#ifdef AFS_NT40_ENV - (void)send(fd, &rc, 1, 0); -#else - (void)write(fd, &rc, 1); -#endif -} - -static void -FSYNC_Drop(int fd) -{ - struct offlineInfo *p; - register i; - Error error; - char tvolName[VMAXPATHLEN]; - - VATTACH_LOCK; - VOL_LOCK; - p = OfflineVolumes[FindHandler(fd)]; - for (i = 0; i < MAXOFFLINEVOLUMES; i++) { - if (p[i].volumeID) { - Volume *vp; - - tvolName[0] = '/'; - sprintf(&tvolName[1], VFORMAT, p[i].volumeID); - vp = VAttachVolumeByName_r(&error, p[i].partName, tvolName, - V_VOLUPD); - if (vp) - VPutVolume_r(vp); - p[i].volumeID = 0; - } - } - VOL_UNLOCK; - VATTACH_UNLOCK; - RemoveHandler(fd); -#ifdef AFS_NT40_ENV - closesocket(fd); -#else - close(fd); -#endif - AcceptOn(); -} - -static int AcceptHandler = -1; /* handler id for accept, if turned on */ - -static void -AcceptOn() -{ - if (AcceptHandler == -1) { - assert(AddHandler(AcceptSd, FSYNC_newconnection)); - AcceptHandler = FindHandler(AcceptSd); - } -} - -static void -AcceptOff() -{ - if (AcceptHandler != -1) { - 
assert(RemoveHandler(AcceptSd)); - AcceptHandler = -1; - } -} - -/* The multiple FD handling code. */ - -static int HandlerFD[MAXHANDLERS]; -static int (*HandlerProc[MAXHANDLERS]) (); - -static void -InitHandler() -{ - register int i; - ObtainWriteLock(&FSYNC_handler_lock); - for (i = 0; i < MAXHANDLERS; i++) { - HandlerFD[i] = -1; - HandlerProc[i] = 0; - } - ReleaseWriteLock(&FSYNC_handler_lock); -} - -static void -CallHandler(fd_set * fdsetp) -{ - register int i; - ObtainReadLock(&FSYNC_handler_lock); - for (i = 0; i < MAXHANDLERS; i++) { - if (HandlerFD[i] >= 0 && FD_ISSET(HandlerFD[i], fdsetp)) { - ReleaseReadLock(&FSYNC_handler_lock); - (*HandlerProc[i]) (HandlerFD[i]); - ObtainReadLock(&FSYNC_handler_lock); - } - } - ReleaseReadLock(&FSYNC_handler_lock); -} - -static int -AddHandler(int afd, int (*aproc) ()) -{ - register int i; - ObtainWriteLock(&FSYNC_handler_lock); - for (i = 0; i < MAXHANDLERS; i++) - if (HandlerFD[i] == -1) - break; - if (i >= MAXHANDLERS) { - ReleaseWriteLock(&FSYNC_handler_lock); - return 0; - } - HandlerFD[i] = afd; - HandlerProc[i] = aproc; - ReleaseWriteLock(&FSYNC_handler_lock); - return 1; -} - -static int -FindHandler(register int afd) -{ - register int i; - ObtainReadLock(&FSYNC_handler_lock); - for (i = 0; i < MAXHANDLERS; i++) - if (HandlerFD[i] == afd) { - ReleaseReadLock(&FSYNC_handler_lock); - return i; - } - ReleaseReadLock(&FSYNC_handler_lock); /* just in case */ - assert(1 == 2); - return -1; /* satisfy compiler */ -} - -static int -FindHandler_r(register int afd) -{ - register int i; - for (i = 0; i < MAXHANDLERS; i++) - if (HandlerFD[i] == afd) { - return i; - } - assert(1 == 2); - return -1; /* satisfy compiler */ -} - -static int -RemoveHandler(register int afd) -{ - ObtainWriteLock(&FSYNC_handler_lock); - HandlerFD[FindHandler_r(afd)] = -1; - ReleaseWriteLock(&FSYNC_handler_lock); - return 1; -} - -static void -GetHandler(fd_set * fdsetp, int *maxfdp) -{ - register int i; - register int maxfd = -1; - 
FD_ZERO(fdsetp); - ObtainReadLock(&FSYNC_handler_lock); /* just in case */ - for (i = 0; i < MAXHANDLERS; i++) - if (HandlerFD[i] != -1) { - FD_SET(HandlerFD[i], fdsetp); - if (maxfd < HandlerFD[i]) - maxfd = HandlerFD[i]; - } - *maxfdp = maxfd; - ReleaseReadLock(&FSYNC_handler_lock); /* just in case */ -} diff --git a/src/vol/fssync.h b/src/vol/fssync.h index af5ab02c71..873b274970 100644 --- a/src/vol/fssync.h +++ b/src/vol/fssync.h @@ -5,6 +5,8 @@ * This software has been released under the terms of the IBM Public * License. For details, see the LICENSE file in the top-level source * directory or online at http://www.openafs.org/dl/license10.html + * + * Portions Copyright (c) 2006 Sine Nomine Associates */ /* @@ -14,38 +16,117 @@ */ - -/* FSYNC commands */ - -#define FSYNC_ON 1 /* Volume online */ -#define FSYNC_OFF 2 /* Volume offline */ -#define FSYNC_LISTVOLUMES 3 /* Update local volume list */ -#define FSYNC_NEEDVOLUME 4 /* Put volume in whatever mode (offline, or whatever) - * best fits the attachment mode provided in reason */ -#define FSYNC_MOVEVOLUME 5 /* Generate temporary relocation information - * for this volume to another site, to be used - * if this volume disappears */ -#define FSYNC_RESTOREVOLUME 6 /* Break all the callbacks on this volume since it is being restored */ -#define FSYNC_DONE 7 /* Done with this volume (used after a delete). 
- * Don't put online, but remove from list */ +#ifndef __fssync_h_ +#define __fssync_h_ -/* Reasons (these could be communicated to venus or converted to messages) */ - -#define FSYNC_WHATEVER 0 /* XXXX */ -#define FSYNC_SALVAGE 1 /* volume is being salvaged */ -#define FSYNC_MOVE 2 /* volume is being moved */ -#define FSYNC_OPERATOR 3 /* operator forced volume offline */ +#define FSYNC_PROTO_VERSION 2 -/* Replies (1 byte) */ +/* FSYNC command codes */ +#define FSYNC_VOL_ON SYNC_COM_CODE_DECL(0) /* Volume online */ +#define FSYNC_VOL_OFF SYNC_COM_CODE_DECL(1) /* Volume offline */ +#define FSYNC_VOL_LISTVOLUMES SYNC_COM_CODE_DECL(2) /* Update local volume list */ +#define FSYNC_VOL_NEEDVOLUME SYNC_COM_CODE_DECL(3) /* Put volume in whatever mode (offline, or whatever) + * best fits the attachment mode provided in reason */ +#define FSYNC_VOL_MOVE SYNC_COM_CODE_DECL(4) /* Generate temporary relocation information + * for this volume to another site, to be used + * if this volume disappears */ +#define FSYNC_VOL_BREAKCBKS SYNC_COM_CODE_DECL(5) /* Break all the callbacks on this volume */ +#define FSYNC_VOL_DONE SYNC_COM_CODE_DECL(6) /* Done with this volume (used after a delete). 
+ * Don't put online, but remove from list */ +#define FSYNC_VOL_QUERY SYNC_COM_CODE_DECL(7) /* query the volume state */ +#define FSYNC_VOL_QUERY_HDR SYNC_COM_CODE_DECL(8) /* query the volume disk data structure */ +#define FSYNC_VOL_QUERY_VOP SYNC_COM_CODE_DECL(9) /* query the volume for pending vol op info */ +#define FSYNC_VOL_STATS_GENERAL SYNC_COM_CODE_DECL(10) /* query the general volume package statistics */ +#define FSYNC_VOL_STATS_VICEP SYNC_COM_CODE_DECL(11) /* query the per-partition volume package stats */ +#define FSYNC_VOL_STATS_HASH SYNC_COM_CODE_DECL(12) /* query the per hash-chain volume package stats */ +#define FSYNC_VOL_STATS_HDR SYNC_COM_CODE_DECL(13) /* query the volume header cache statistics */ +#define FSYNC_VOL_STATS_VLRU SYNC_COM_CODE_DECL(14) /* query the VLRU statistics */ -#define FSYNC_DENIED 0 -#define FSYNC_OK 1 +/* FSYNC reason codes */ +#define FSYNC_WHATEVER SYNC_REASON_CODE_DECL(0) /* XXXX */ +#define FSYNC_SALVAGE SYNC_REASON_CODE_DECL(1) /* volume is being salvaged */ +#define FSYNC_MOVE SYNC_REASON_CODE_DECL(2) /* volume is being moved */ +#define FSYNC_OPERATOR SYNC_REASON_CODE_DECL(3) /* operator forced volume offline */ +#define FSYNC_EXCLUSIVE SYNC_REASON_CODE_DECL(4) /* somebody else has the volume offline */ +#define FSYNC_UNKNOWN_VOLID SYNC_REASON_CODE_DECL(5) /* volume id not known by fileserver */ +#define FSYNC_HDR_NOT_ATTACHED SYNC_REASON_CODE_DECL(6) /* volume header not currently attached */ +#define FSYNC_NO_PENDING_VOL_OP SYNC_REASON_CODE_DECL(7) /* no volume operation pending */ +#define FSYNC_VOL_PKG_ERROR SYNC_REASON_CODE_DECL(8) /* error in the volume package */ + +/* FSYNC response codes */ + +/* FSYNC flag codes */ -/* Prototypes from fssync.c */ -void FSYNC_clientFinis(void); -int FSYNC_clientInit(void); -void FSYNC_fsInit(void); -int FSYNC_askfs(VolumeId volume, char *partName, int com, int reason); + +struct offlineInfo { + afs_uint32 volumeID; + char partName[16]; +}; + +typedef struct 
FSSYNC_VolOp_hdr { + afs_uint32 volume; /* volume id associated with request */ + char partName[16]; /* partition name, e.g. /vicepa */ +} FSSYNC_VolOp_hdr; + +typedef struct FSSYNC_VolOp_command { + SYNC_command_hdr * hdr; + FSSYNC_VolOp_hdr * vop; + SYNC_command * com; + struct offlineInfo * v; + struct offlineInfo * volumes; +} FSSYNC_VolOp_command; + +typedef struct FSSYNC_VolOp_info { + SYNC_command_hdr com; + FSSYNC_VolOp_hdr vop; +} FSSYNC_VolOp_info; + + +typedef struct FSSYNC_StatsOp_hdr { + union { + afs_uint32 vlru_generation; + afs_uint32 hash_bucket; + char partName[16]; + } args; +} FSSYNC_StatsOp_hdr; + +typedef struct FSSYNC_StatsOp_command { + SYNC_command_hdr * hdr; + FSSYNC_StatsOp_hdr * sop; + SYNC_command * com; +} FSSYNC_StatsOp_command; + + + +/* + * common interfaces + */ +extern void FSYNC_Init(void); + +/* + * fsync client interfaces + */ +extern void FSYNC_clientFinis(void); +extern int FSYNC_clientInit(void); +extern int FSYNC_clientChildProcReconnect(void); + +/* generic low-level interface */ +extern afs_int32 FSYNC_askfs(SYNC_command * com, SYNC_response * res); + +/* generic higher-level interface */ +extern afs_int32 FSYNC_GenericOp(void * ext_hdr, size_t ext_len, + int command, int reason, + SYNC_response * res); + +/* volume operations interface */ +extern afs_int32 FSYNC_VolOp(VolumeId volume, char *partName, int com, int reason, + SYNC_response * res); + +/* statistics query interface */ +extern afs_int32 FSYNC_StatsOp(FSSYNC_StatsOp_hdr * scom, int command, int reason, + SYNC_response * res_in); + +#endif /* __fssync_h_ */ diff --git a/src/vol/nuke.c b/src/vol/nuke.c index f787b5ae39..5b52e46a06 100644 --- a/src/vol/nuke.c +++ b/src/vol/nuke.c @@ -41,6 +41,7 @@ RCSID #include "partition.h" #include "viceinode.h" #include "salvage.h" +#include "daemon_com.h" #include "fssync.h" #ifdef O_LARGEFILE diff --git a/src/vol/partition.c b/src/vol/partition.c index f8aa3a81dd..9eea9f577d 100644 --- a/src/vol/partition.c +++ 
b/src/vol/partition.c @@ -7,6 +7,7 @@ * directory or online at http://www.openafs.org/dl/license10.html * * Portions Copyright (c) 2003 Apple Computer, Inc. + * Portions Copyright (c) 2006 Sine Nomine Associates */ /* @@ -189,6 +190,14 @@ RCSID int aixlow_water = 8; /* default 8% */ struct DiskPartition *DiskPartitionList; +#ifdef AFS_DEMAND_ATTACH_FS +static struct DiskPartition *DiskPartitionTable[VOLMAXPARTS+1]; + +static struct DiskPartition * VLookupPartition_r(char * path); +static void AddPartitionToTable_r(struct DiskPartition *); +static void DeletePartitionFromTable_r(struct DiskPartition *); +#endif /* AFS_DEMAND_ATTACH_FS */ + #ifdef AFS_SGI_XFS_IOPS_ENV /* Verify that the on disk XFS inodes on the partition are large enough to * hold the AFS attribute. Returns -1 if the attribute can't be set or is @@ -225,8 +234,16 @@ VerifyXFSInodeSize(char *part, char *fstype) } return code; } -#endif +#endif /* AFS_SGI_XFS_IOPS_ENV */ +int +VInitPartitionPackage(void) +{ +#ifdef AFS_DEMAND_ATTACH_FS + memset(&DiskPartitionTable, 0, sizeof(DiskPartitionTable)); +#endif /* AFS_DEMAND_ATTACH_FS */ + return 0; +} static void VInitPartition_r(char *path, char *devname, Device dev) @@ -245,6 +262,7 @@ VInitPartition_r(char *path, char *devname, Device dev) dp->next = 0; dp->name = (char *)malloc(strlen(path) + 1); strncpy(dp->name, path, strlen(path) + 1); + dp->index = volutil_GetPartitionID(path); #if defined(AFS_NAMEI_ENV) && !defined(AFS_NT40_ENV) /* Create a lockfile for the partition, of the form /vicepa/Lock/vicepa */ dp->devName = (char *)malloc(2 * strlen(path) + 6); @@ -254,7 +272,7 @@ VInitPartition_r(char *path, char *devname, Device dev) mkdir(dp->devName, 0700); strcat(dp->devName, path); close(afs_open(dp->devName, O_RDWR | O_CREAT, 0600)); - dp->device = volutil_GetPartitionID(path); + dp->device = dp->index; #else dp->devName = (char *)malloc(strlen(devname) + 1); strncpy(dp->devName, devname, strlen(devname) + 1); @@ -268,6 +286,11 @@
VInitPartition_r(char *path, char *devname, Device dev) (void)namei_ViceREADME(VPartitionPath(dp)); #endif VSetPartitionDiskUsage_r(dp); +#ifdef AFS_DEMAND_ATTACH_FS + AddPartitionToTable_r(dp); + queue_Init(&dp->vol_list); + assert(pthread_cond_init(&dp->vol_list.cv, NULL) == 0); +#endif /* AFS_DEMAND_ATTACH_FS */ } static void @@ -352,7 +375,7 @@ VCheckPartition(char *part, char *devname) return -1; #endif #endif /* AFS_NAMEI_ENV */ -#endif +#endif /* !AFS_LINUX20_ENV && !AFS_NT40_ENV */ #if defined(AFS_DUX40_ENV) && !defined(AFS_NAMEI_ENV) if (status.st_ino != ROOTINO) { @@ -825,10 +848,14 @@ struct DiskPartition * VGetPartition_r(char *name, int abortp) { register struct DiskPartition *dp; +#ifdef AFS_DEMAND_ATTACH_FS + dp = VLookupPartition_r(name); +#else /* AFS_DEMAND_ATTACH_FS */ for (dp = DiskPartitionList; dp; dp = dp->next) { if (strcmp(dp->name, name) == 0) break; } +#endif /* AFS_DEMAND_ATTACH_FS */ if (abortp) assert(dp != NULL); return dp; @@ -1234,3 +1261,60 @@ VUnlockPartition(char *name) VUnlockPartition_r(name); VOL_UNLOCK; } + +#ifdef AFS_DEMAND_ATTACH_FS +/* XXX not sure this will work on AFS_NT40_ENV + * needs to be tested! 
+ */ +struct DiskPartition * +VGetPartitionById_r(afs_int32 id, int abortp) +{ + struct DiskPartition * dp = NULL; + + if ((id >= 0) && (id <= VOLMAXPARTS)) { + dp = DiskPartitionTable[id]; + } + + if (abortp) { + assert(dp != NULL); + } + return dp; +} + +struct DiskPartition * +VGetPartitionById(afs_int32 id, int abortp) +{ + struct DiskPartition * dp; + + VOL_LOCK; + dp = VGetPartitionById_r(id, abortp); + VOL_UNLOCK; + + return dp; +} + +static struct DiskPartition * +VLookupPartition_r(char * path) +{ + afs_int32 id = volutil_GetPartitionID(path); + + if (id < 0 || id > VOLMAXPARTS) + return NULL; + + return DiskPartitionTable[id]; +} + +static void +AddPartitionToTable_r(struct DiskPartition * dp) +{ + assert(dp->index >= 0 && dp->index <= VOLMAXPARTS); + DiskPartitionTable[dp->index] = dp; +} + +static void +DeletePartitionFromTable_r(struct DiskPartition * dp) +{ + assert(dp->index >= 0 && dp->index <= VOLMAXPARTS); + DiskPartitionTable[dp->index] = NULL; +} +#endif /* AFS_DEMAND_ATTACH_FS */ diff --git a/src/vol/partition.h b/src/vol/partition.h index 547ec94c18..7d869dfae9 100644 --- a/src/vol/partition.h +++ b/src/vol/partition.h @@ -5,6 +5,8 @@ * This software has been released under the terms of the IBM Public * License.
For details, see the LICENSE file in the top-level source * directory or online at http://www.openafs.org/dl/license10.html + * + * Portions Copyright (c) 2006 Sine Nomine Associates */ /* @@ -27,6 +29,7 @@ #define AFS_RDSKDEV "/dev/r" #endif + /* All Vice partitions on a server will have the following name prefix */ #define VICE_PARTITION_PREFIX "/vicep" #define VICE_PREFIX_SIZE (sizeof(VICE_PARTITION_PREFIX)-1) @@ -53,6 +56,7 @@ struct DiskPartition { char *name; /* Mounted partition name */ char *devName; /* Device mounted on */ Device device; /* device number */ + afs_int32 index; /* partition index (0<=x<=VOLMAXPARTS) */ int lock_fd; /* File descriptor of this partition if locked; otherwise -1; * Not used by the file server */ int free; /* Total number of blocks (1K) presumed @@ -77,7 +81,26 @@ struct DiskPartition { * from the superblock */ int flags; int f_files; /* total number of files in this partition */ +#ifdef AFS_DEMAND_ATTACH_FS + struct { + struct rx_queue head; /* list of volumes on this partition (VByPList) */ + afs_uint32 len; /* length of volume list */ + int busy; /* asynch vol list op in progress */ + pthread_cond_t cv; /* vol_list.busy change cond var */ + } vol_list; +#endif /* AFS_DEMAND_ATTACH_FS */ }; + +struct DiskPartitionStats { + afs_int32 free; + afs_int32 totalUsable; + afs_int32 minFree; + afs_int32 f_files; +#ifdef AFS_DEMAND_ATTACH_FS + afs_int32 vol_list_len; +#endif +}; + #define PART_DONTUPDATE 1 #define PART_DUPLICATE 2 /* NT - used if we find more than one partition * using the same drive. 
Will be dumped before @@ -93,7 +116,12 @@ extern int VValidVPTEntry(struct vptab *vptp); struct Volume; /* Potentially forward definition */ extern struct DiskPartition *DiskPartitionList; -extern struct DiskPartition *VGetPartition(); +extern struct DiskPartition *VGetPartition(char * name, int abortp); +extern struct DiskPartition *VGetPartition_r(char * name, int abortp); +#ifdef AFS_DEMAND_ATTACH_FS +extern struct DiskPartition *VGetPartitionById(afs_int32 index, int abortp); +extern struct DiskPartition *VGetPartitionById_r(afs_int32 index, int abortp); +#endif extern int VAttachPartitions(void); extern void VLockPartition(char *name); extern void VLockPartition_r(char *name); @@ -108,3 +136,4 @@ extern void VAdjustDiskUsage(Error * ec, struct Volume *vp, afs_sfsize_t blocks, afs_sfsize_t checkBlocks); extern int VDiskUsage(struct Volume *vp, afs_sfsize_t blocks); extern void VPrintDiskStats(void); +extern int VInitPartitionPackage(void); diff --git a/src/vol/purge.c b/src/vol/purge.c index 01bb22efa3..4b13fcf2bc 100644 --- a/src/vol/purge.c +++ b/src/vol/purge.c @@ -52,11 +52,16 @@ RCSID #include "volume.h" #include "viceinode.h" #include "partition.h" +#include "daemon_com.h" #include "fssync.h" /* forward declarations */ -void PurgeIndex_r(Volume * vp, VnodeClass class); -void PurgeHeader_r(Volume * vp); +static int ObliterateRegion(Volume * avp, VnodeClass aclass, StreamHandle_t * afile, + afs_int32 * aoffset); +static void PurgeIndex(Volume * vp, VnodeClass class); +static void PurgeIndex_r(Volume * vp, VnodeClass class); +static void PurgeHeader_r(Volume * vp); +static void PurgeHeader(Volume * vp); void VPurgeVolume_r(Error * ec, Volume * vp) @@ -78,7 +83,7 @@ VPurgeVolume_r(Error * ec, Volume * vp) /* * Call the fileserver to break all call backs for that volume */ - FSYNC_askfs(V_id(vp), tpartp->name, FSYNC_RESTOREVOLUME, 0); + FSYNC_VolOp(V_id(vp), tpartp->name, FSYNC_VOL_BREAKCBKS, 0, NULL); } void @@ -161,7 +166,7 @@ ObliterateRegion(Volume * avp, 
VnodeClass aclass, StreamHandle_t * afile, return -1; } -void +static void PurgeIndex(Volume * vp, VnodeClass class) { VOL_LOCK; @@ -169,7 +174,7 @@ PurgeIndex(Volume * vp, VnodeClass class) VOL_UNLOCK; } -void +static void PurgeIndex_r(Volume * vp, VnodeClass class) { StreamHandle_t *ifile; @@ -199,7 +204,7 @@ PurgeIndex_r(Volume * vp, VnodeClass class) FDH_CLOSE(fdP); } -void +static void PurgeHeader(Volume * vp) { VOL_LOCK; @@ -207,7 +212,7 @@ PurgeHeader(Volume * vp) VOL_UNLOCK; } -void +static void PurgeHeader_r(Volume * vp) { IH_REALLYCLOSE(V_diskDataHandle(vp)); diff --git a/src/vol/salvage.h b/src/vol/salvage.h index a18a24574c..ce53539070 100644 --- a/src/vol/salvage.h +++ b/src/vol/salvage.h @@ -14,6 +14,9 @@ */ +#ifndef __salvage_h_ +#define __salvage_h_ + #include /* Definition of DirHandle for salvager. Not the same as for the file server */ @@ -24,3 +27,5 @@ typedef struct DirHandle { IHandle_t *dirh_handle; afs_int32 dirh_cacheCheck; } DirHandle; + +#endif /* __salvage_h_ */ diff --git a/src/vol/salvaged.c b/src/vol/salvaged.c new file mode 100644 index 0000000000..d5b318b39e --- /dev/null +++ b/src/vol/salvaged.c @@ -0,0 +1,738 @@ +/* + * Copyright 2006, Sine Nomine Associates and others. + * All Rights Reserved. + * + * This software has been released under the terms of the IBM Public + * License. For details, see the LICENSE file in the top-level source + * directory or online at http://www.openafs.org/dl/license10.html + */ + +/* + * demand attach fs + * online salvager daemon + */ + +/* Main program file. Define globals. 
*/ +#define MAIN 1 + +#include +#include + +RCSID + ("$Header$"); + +#include +#include +#include +#include +#include +#include +#include +#ifdef AFS_NT40_ENV +#include +#include +#else +#include +#include +#ifndef ITIMER_REAL +#include +#endif /* ITIMER_REAL */ +#endif +#if defined(AFS_AIX_ENV) || defined(AFS_SUN4_ENV) +#define WCOREDUMP(x) (x & 0200) +#endif +#include +#include +#include +#if !defined(AFS_SGI_ENV) && !defined(AFS_NT40_ENV) +#if defined(AFS_VFSINCL_ENV) +#include +#ifdef AFS_SUN5_ENV +#include +#else +#if defined(AFS_DARWIN_ENV) || defined(AFS_XBSD_ENV) +#include +#include +#else +#include +#endif +#endif +#else /* AFS_VFSINCL_ENV */ +#ifdef AFS_OSF_ENV +#include +#else /* AFS_OSF_ENV */ +#if !defined(AFS_LINUX20_ENV) && !defined(AFS_XBSD_ENV) +#include +#endif +#endif +#endif /* AFS_VFSINCL_ENV */ +#endif /* AFS_SGI_ENV */ +#ifdef AFS_AIX_ENV +#include +#include +#else +#ifdef AFS_HPUX_ENV +#include +#include +#else +#if defined(AFS_SGI_ENV) +#include +#include +#include +#else +#if defined(AFS_SUN_ENV) || defined(AFS_SUN5_ENV) +#ifdef AFS_SUN5_ENV +#include +#include +#include +#else +#include +#endif +#else +#endif /* AFS_SGI_ENV */ +#endif /* AFS_HPUX_ENV */ +#endif +#endif +#include +#ifndef AFS_NT40_ENV +#include +#endif +#include +#include +#include +#include /* signal(), kill(), wait(), etc. */ +#ifndef AFS_NT40_ENV +#include +#endif + +#include "nfs.h" +#include "lwp.h" +#include "lock.h" +#include +#include "ihandle.h" +#include "vnode.h" +#include "volume.h" +#include "partition.h" +#include "daemon_com.h" +#include "fssync.h" +#include "salvsync.h" +#include "viceinode.h" +#include "salvage.h" +#include "volinodes.h" /* header magic number, etc. 
stuff */ +#include "vol-salvage.h" +#ifdef AFS_NT40_ENV +#include +#endif + + +#if !defined(AFS_DEMAND_ATTACH_FS) +#error "online salvager only supported for demand attach fileserver" +#endif /* AFS_DEMAND_ATTACH_FS */ + +#if defined(AFS_NT40_ENV) +#error "online salvager not supported on NT" +#endif /* AFS_NT40_ENV */ + + +/* Forward declarations */ +/*@printflike@*/ void Log(const char *format, ...); +/*@printflike@*/ void Abort(const char *format, ...); + + +/*@+fcnmacros +macrofcndecl@*/ +#ifdef O_LARGEFILE +#define afs_fopen fopen64 +#else /* !O_LARGEFILE */ +#define afs_fopen fopen +#endif /* !O_LARGEFILE */ +/*@=fcnmacros =macrofcndecl@*/ + + + +static volatile int current_workers = 0; +static volatile struct rx_queue pending_q; +static pthread_mutex_t worker_lock; +static pthread_cond_t worker_cv; + +static void * SalvageChildReaperThread(void *); +static int DoSalvageVolume(struct SalvageQueueNode * node, int slot); + +static void SalvageServer(void); +static void SalvageClient(VolumeId vid, char * pname); + +static int Reap_Child(char * prog, int * pid, int * status); + +static void * SalvageLogCleanupThread(void *); +static int SalvageLogCleanup(int pid); + +struct log_cleanup_node { + struct rx_queue q; + int pid; +}; + +struct { + struct rx_queue queue_head; + pthread_cond_t queue_change_cv; +} log_cleanup_queue; + + +#define DEFAULT_PARALLELISM 4 /* allow 4 parallel salvage workers by default */ + +static int +handleit(struct cmd_syndesc *as) +{ + register struct cmd_item *ti; + char pname[100], *temp; + afs_int32 seenpart = 0, seenvol = 0, vid = 0, seenany = 0; + struct DiskPartition *partP; + + +#ifdef AFS_SGI_VNODE_GLUE + if (afs_init_kernel_config(-1) < 0) { + printf + ("Can't determine NUMA configuration, not starting salvager.\n"); + exit(1); + } +#endif + + if (as->parms[2].items) /* -debug */ + debug = 1; + if (as->parms[3].items) /* -nowrite */ + Testing = 1; + if (as->parms[4].items) /* -inodes */ + ListInodeOption = 1; + if 
(as->parms[5].items) /* -oktozap */ + OKToZap = 1; + if (as->parms[6].items) /* -rootinodes */ + ShowRootFiles = 1; + if (as->parms[8].items) /* -ForceReads */ + forceR = 1; + if ((ti = as->parms[9].items)) { /* -Parallel # */ + temp = ti->data; + if (strncmp(temp, "all", 3) == 0) { + PartsPerDisk = 1; + temp += 3; + } + if (strlen(temp) != 0) { + Parallel = atoi(temp); + if (Parallel < 1) + Parallel = 1; + if (Parallel > MAXPARALLEL) { + printf("Setting parallel salvages to maximum of %d \n", + MAXPARALLEL); + Parallel = MAXPARALLEL; + } + } + } else { + Parallel = MIN(DEFAULT_PARALLELISM, MAXPARALLEL); + } + if ((ti = as->parms[10].items)) { /* -tmpdir */ + DIR *dirp; + + tmpdir = ti->data; + dirp = opendir(tmpdir); + if (!dirp) { + printf + ("Can't open temporary placeholder dir %s; using current partition \n", + tmpdir); + tmpdir = NULL; + } else + closedir(dirp); + } + if ((ti = as->parms[11].items)) /* -showlog */ + ShowLog = 1; + if ((ti = as->parms[12].items)) { /* -orphans */ + if (Testing) + orphans = ORPH_IGNORE; + else if (strcmp(ti->data, "remove") == 0 + || strcmp(ti->data, "r") == 0) + orphans = ORPH_REMOVE; + else if (strcmp(ti->data, "attach") == 0 + || strcmp(ti->data, "a") == 0) + orphans = ORPH_ATTACH; + } +#ifndef AFS_NT40_ENV /* ignore options on NT */ + if ((ti = as->parms[13].items)) { /* -syslog */ + useSyslog = 1; + ShowLog = 0; + } + if ((ti = as->parms[14].items)) { /* -syslogfacility */ + useSyslogFacility = atoi(ti->data); + } + + if ((ti = as->parms[15].items)) { /* -datelogs */ + TimeStampLogFile(AFSDIR_SERVER_SALSRVLOG_FILEPATH); + } +#endif + + if ((ti = as->parms[16].items)) { /* -client */ + if ((ti = as->parms[0].items)) { /* -partition */ + seenpart = 1; + strlcpy(pname, ti->data, sizeof(pname)); + } + if ((ti = as->parms[1].items)) { /* -volumeid */ + seenvol = 1; + vid = atoi(ti->data); + } + + if (!seenpart || !seenvol) { + printf("You must specify '-partition' and '-volumeid' with the '-client' option\n"); + exit(-1); + } + 
+ SalvageClient(vid, pname); + + } else { /* salvageserver mode */ + SalvageServer(); + } + return (0); +} + + +#ifndef AFS_NT40_ENV +#include "AFS_component_version_number.c" +#endif +#define MAX_ARGS 128 +#ifdef AFS_NT40_ENV +char *save_args[MAX_ARGS]; +int n_save_args = 0; +pthread_t main_thread; +#endif + +static char commandLine[150]; + +int +main(int argc, char **argv) +{ + struct cmd_syndesc *ts; + int err = 0; + + int i; + extern char cml_version_number[]; + +#ifdef AFS_AIX32_ENV + /* + * The following signal action for AIX is necessary so that in case of a + * crash (i.e. core is generated) we can include the user's data section + * in the core dump. Unfortunately, by default, only a partial core is + * generated which, in many cases, isn't too useful. + */ + struct sigaction nsa; + + sigemptyset(&nsa.sa_mask); + nsa.sa_handler = SIG_DFL; + nsa.sa_flags = SA_FULLDUMP; + sigaction(SIGABRT, &nsa, NULL); + sigaction(SIGSEGV, &nsa, NULL); +#endif + + /* Initialize directory paths */ + if (!(initAFSDirPath() & AFSDIR_SERVER_PATHS_OK)) { +#ifdef AFS_NT40_ENV + ReportErrorEventAlt(AFSEVT_SVR_NO_INSTALL_DIR, 0, argv[0], 0); +#endif + fprintf(stderr, "%s: Unable to obtain AFS server directory.\n", + argv[0]); + exit(2); + } +#ifdef AFS_NT40_ENV + main_thread = pthread_self(); + if (spawnDatap && spawnDataLen) { + /* This is a child per partition salvager. Don't setup log or + * try to lock the salvager lock. 
+ */ + if (nt_SetupPartitionSalvage(spawnDatap, spawnDataLen) < 0) + exit(3); + } else { +#endif + for (commandLine[0] = '\0', i = 0; i < argc; i++) { + if (i > 0) + strlcat(commandLine, " ", sizeof(commandLine)); + strlcat(commandLine, argv[i], sizeof(commandLine)); + } + +#ifndef AFS_NT40_ENV + if (geteuid() != 0) { + printf("Salvager must be run as root.\n"); + fflush(stdout); + Exit(0); + } +#endif + + /* bad for normal help flag processing, but can do nada */ + +#ifdef AFS_NT40_ENV + } +#endif + + ts = cmd_CreateSyntax("initcmd", handleit, 0, "initialize the program"); + cmd_AddParm(ts, "-partition", CMD_SINGLE, CMD_OPTIONAL, + "Name of partition to salvage"); + cmd_AddParm(ts, "-volumeid", CMD_SINGLE, CMD_OPTIONAL, + "Volume Id to salvage"); + cmd_AddParm(ts, "-debug", CMD_FLAG, CMD_OPTIONAL, + "Run in Debugging mode"); + cmd_AddParm(ts, "-nowrite", CMD_FLAG, CMD_OPTIONAL, + "Run readonly/test mode"); + cmd_AddParm(ts, "-inodes", CMD_FLAG, CMD_OPTIONAL, + "Just list affected afs inodes - debugging flag"); + cmd_AddParm(ts, "-oktozap", CMD_FLAG, CMD_OPTIONAL, + "Give permission to destroy bogus inodes/volumes - debugging flag"); + cmd_AddParm(ts, "-rootinodes", CMD_FLAG, CMD_OPTIONAL, + "Show inodes owned by root - debugging flag"); + cmd_AddParm(ts, "-salvagedirs", CMD_FLAG, CMD_OPTIONAL, + "Force rebuild/salvage of all directories"); + cmd_AddParm(ts, "-blockreads", CMD_FLAG, CMD_OPTIONAL, + "Read smaller blocks to handle IO/bad blocks"); + cmd_AddParm(ts, "-parallel", CMD_SINGLE, CMD_OPTIONAL, + "# of max parallel partition salvaging"); + cmd_AddParm(ts, "-tmpdir", CMD_SINGLE, CMD_OPTIONAL, + "Name of dir to place tmp files "); + cmd_AddParm(ts, "-showlog", CMD_FLAG, CMD_OPTIONAL, + "Show log file upon completion"); + cmd_AddParm(ts, "-orphans", CMD_SINGLE, CMD_OPTIONAL, + "ignore | remove | attach"); + + /* note - syslog isn't avail on NT, but if we make it conditional, have + * to deal with screwy offsets for cmd params */ + cmd_AddParm(ts, "-syslog", 
CMD_FLAG, CMD_OPTIONAL, + "Write salvage log to syslogs"); + cmd_AddParm(ts, "-syslogfacility", CMD_SINGLE, CMD_OPTIONAL, + "Syslog facility number to use"); + cmd_AddParm(ts, "-datelogs", CMD_FLAG, CMD_OPTIONAL, + "Include timestamp in logfile filename"); + + cmd_AddParm(ts, "-client", CMD_FLAG, CMD_OPTIONAL, + "Use SALVSYNC to ask salvageserver to salvage a volume"); + + err = cmd_Dispatch(argc, argv); + Exit(err); +} + +static void +SalvageClient(VolumeId vid, char * pname) +{ + int done = 0; + afs_int32 code; + SYNC_response res; + SALVSYNC_response_hdr sres; + + VInitVolumePackage(volumeUtility, 5, 5, DONT_CONNECT_FS, 0); + SALVSYNC_clientInit(); + + code = SALVSYNC_SalvageVolume(vid, pname, SALVSYNC_SALVAGE, SALVSYNC_OPERATOR, 0, NULL); + if (code != SYNC_OK) { + goto sync_error; + } + + res.payload.buf = (void *) &sres; + res.payload.len = sizeof(sres); + + while(!done) { + sleep(2); + code = SALVSYNC_SalvageVolume(vid, pname, SALVSYNC_QUERY, SALVSYNC_WHATEVER, 0, &res); + if (code != SYNC_OK) { + goto sync_error; + } + switch (sres.state) { + case SALVSYNC_STATE_ERROR: + printf("salvageserver reports salvage ended in an error; check log files for more details\n"); + case SALVSYNC_STATE_DONE: + case SALVSYNC_STATE_UNKNOWN: + done = 1; + } + } + SALVSYNC_clientFinis(); + return; + + sync_error: + if (code == SYNC_DENIED) { + printf("salvageserver refused to salvage volume %u on partition %s\n", + vid, pname); + } else if (code == SYNC_BAD_COMMAND) { + printf("SALVSYNC protocol mismatch; please make sure fileserver, volserver, salvageserver and salvager are same version\n"); + } else if (code == SYNC_COM_ERROR) { + printf("SALVSYNC communications error\n"); + } + SALVSYNC_clientFinis(); + exit(-1); +} + +static int * child_slot; + +static void +SalvageServer(void) +{ + int pid, ret; + struct SalvageQueueNode * node; + pthread_t tid; + pthread_attr_t attrs; + int slot; + + /* All entries to the log will be appended. 
Useful if there are + * multiple salvagers appending to the log. + */ + + CheckLogFile(AFSDIR_SERVER_SALSRVLOG_FILEPATH); +#ifndef AFS_NT40_ENV +#ifdef AFS_LINUX20_ENV + fcntl(fileno(logFile), F_SETFL, O_APPEND); /* Isn't this redundant? */ +#else + fcntl(fileno(logFile), F_SETFL, FAPPEND); /* Isn't this redundant? */ +#endif +#endif + setlinebuf(logFile); + + fprintf(logFile, "%s\n", cml_version_number); + Log("Starting OpenAFS Online Salvage Server %s (%s)\n", SalvageVersion, commandLine); + + /* Get and hold a lock for the duration of the salvage to make sure + * that no other salvage runs at the same time. The routine + * VInitVolumePackage (called below) makes sure that a file server or + * other volume utilities don't interfere with the salvage. + */ + + /* even demand attach online salvager + * still needs this because we don't want + * a stand-alone salvager to conflict with + * the salvager daemon */ + ObtainSalvageLock(); + + child_slot = (int *) malloc(Parallel * sizeof(int)); + assert(child_slot != NULL); + memset(child_slot, 0, Parallel * sizeof(int)); + + /* initialize things */ + VInitVolumePackage(salvageServer, 5, 5, + 1, 0); + DInit(10); + queue_Init(&pending_q); + queue_Init(&log_cleanup_queue); + assert(pthread_mutex_init(&worker_lock, NULL) == 0); + assert(pthread_cond_init(&worker_cv, NULL) == 0); + assert(pthread_cond_init(&log_cleanup_queue.queue_change_cv, NULL) == 0); + assert(pthread_attr_init(&attrs) == 0); + + /* start up the reaper and log cleaner threads */ + assert(pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED) == 0); + assert(pthread_create(&tid, + &attrs, + &SalvageChildReaperThread, + NULL) == 0); + assert(pthread_create(&tid, + &attrs, + &SalvageLogCleanupThread, + NULL) == 0); + + /* loop forever serving requests */ + while (1) { + node = SALVSYNC_getWork(); + assert(node != NULL); + + VOL_LOCK; + /* find a slot */ + for (slot = 0; slot < Parallel; slot++) { + if (!child_slot[slot]) + break; + } + assert (slot < 
Parallel); + + pid = Fork(); + if (pid == 0) { + VOL_UNLOCK; + ret = DoSalvageVolume(node, slot); + Exit(ret); + } else if (pid < 0) { + VOL_UNLOCK; + SALVSYNC_doneWork(node, 1); + } else { + child_slot[slot] = pid; + node->pid = pid; + VOL_UNLOCK; + + assert(pthread_mutex_lock(&worker_lock) == 0); + current_workers++; + + /* let the reaper thread know another worker was spawned */ + assert(pthread_cond_broadcast(&worker_cv) == 0); + + /* if we're overquota, wait for the reaper */ + while (current_workers >= Parallel) { + assert(pthread_cond_wait(&worker_cv, &worker_lock) == 0); + } + assert(pthread_mutex_unlock(&worker_lock) == 0); + } + } +} + +static int +DoSalvageVolume(struct SalvageQueueNode * node, int slot) +{ + char childLog[AFSDIR_PATH_MAX]; + int ret; + struct DiskPartition * partP; + + VChildProcReconnectFS(); + + /* do not attempt to close parent's logFile handle as + * another thread may have held the lock on the FILE + * structure when fork was called! */ + + afs_snprintf(childLog, sizeof(childLog), "%s.%d", + AFSDIR_SERVER_SLVGLOG_FILEPATH, getpid()); + + logFile = afs_fopen(childLog, "a"); + if (!logFile) { /* still nothing, use stdout */ + logFile = stdout; + ShowLog = 0; + } + + if (node->command.sop.volume <= 0) { + Log("salvageServer: invalid volume id specified; salvage aborted\n"); + return 1; + } + + partP = VGetPartition(node->command.sop.partName, 0); + if (!partP) { + Log("salvageServer: Unknown or unmounted partition %s; salvage aborted\n", + node->command.sop.partName); + return 1; + } + + /* Salvage individual volume; don't notify fs */ + SalvageFileSys1(partP, node->command.sop.volume); + + VDisconnectFS(); + + fclose(logFile); + return 0; +} + + +static void * +SalvageChildReaperThread(void * args) +{ + int slot, pid, status, code, found; + struct SalvageQueueNode *qp, *nqp; + struct log_cleanup_node * cleanup; + + assert(pthread_mutex_lock(&worker_lock) == 0); + + /* loop reaping our children */ + while (1) { + /* wait() won't block 
unless we have children, so + * block on the cond var if we're childless */ + while (current_workers == 0) { + assert(pthread_cond_wait(&worker_cv, &worker_lock) == 0); + } + + assert(pthread_mutex_unlock(&worker_lock) == 0); + + cleanup = (struct log_cleanup_node *) malloc(sizeof(struct log_cleanup_node)); + + while (Reap_Child("salvageserver", &pid, &status) < 0) { + /* try to prevent livelock if something goes wrong */ + sleep(1); + } + + VOL_LOCK; + for (slot = 0; slot < Parallel; slot++) { + if (child_slot[slot] == pid) + break; + } + assert(slot < Parallel); + child_slot[slot] = 0; + VOL_UNLOCK; + + assert(pthread_mutex_lock(&worker_lock) == 0); + + if (cleanup) { + cleanup->pid = pid; + queue_Append(&log_cleanup_queue, cleanup); + assert(pthread_cond_signal(&log_cleanup_queue.queue_change_cv) == 0); + } + + /* ok, we've reaped a child */ + current_workers--; + SALVSYNC_doneWorkByPid(pid, 0); + assert(pthread_cond_broadcast(&worker_cv) == 0); + } + + return NULL; +} + +static int +Reap_Child(char *prog, int * pid, int * status) +{ + int ret; + ret = wait(status); + + if (ret >= 0) { + *pid = ret; + if (WCOREDUMP(*status)) + Log("\"%s\" core dumped!\n", prog); + if (WIFSIGNALED(*status) != 0 || WEXITSTATUS(*status) != 0) + Log("\"%s\" (pid=%d) terminated abnormally!\n", prog, ret); + } else { + Log("wait returned -1\n"); + } + return ret; +} + +/* + * thread to combine salvager child logs + * back into the main salvageserver log + */ +static void * +SalvageLogCleanupThread(void * arg) +{ + struct log_cleanup_node * cleanup; + + assert(pthread_mutex_lock(&worker_lock) == 0); + + while (1) { + while (queue_IsEmpty(&log_cleanup_queue)) { + assert(pthread_cond_wait(&log_cleanup_queue.queue_change_cv, &worker_lock) == 0); + } + + while (queue_IsNotEmpty(&log_cleanup_queue)) { + cleanup = queue_First(&log_cleanup_queue, log_cleanup_node); + queue_Remove(cleanup); + assert(pthread_mutex_unlock(&worker_lock) == 0); + SalvageLogCleanup(cleanup->pid); + free(cleanup); + 
assert(pthread_mutex_lock(&worker_lock) == 0);
+        }
+    }
+
+    assert(pthread_mutex_unlock(&worker_lock) == 0);
+    return NULL;
+}
+
+#define LOG_XFER_BUF_SIZE 65536
+/*
+ * Fold one salvage child's private log back into the main log.
+ *
+ * Opens "<AFSDIR_SERVER_SLVGLOG_FILEPATH>.<pid>", unlinks it, and copies
+ * its contents to logFile in chunks of at most LOG_XFER_BUF_SIZE bytes.
+ *
+ * pid - process id that forms the suffix of the per-child log file name
+ *
+ * Returns 0 if the file was copied through to end-of-file, 1 if it could
+ * not be opened or a read() failed part way through.
+ *
+ * The transfer buffer is static, so this function is not reentrant.
+ */
+static int
+SalvageLogCleanup(int pid)
+{
+    int pidlog, len;
+    char fn[AFSDIR_PATH_MAX];
+    static char buf[LOG_XFER_BUF_SIZE];
+
+    afs_snprintf(fn, sizeof(fn), "%s.%d",
+                 AFSDIR_SERVER_SLVGLOG_FILEPATH, pid);
+
+
+    pidlog = open(fn, O_RDONLY);
+    /* the name is removed whether or not the open succeeded */
+    unlink(fn);
+    if (pidlog < 0)
+        return 1;
+
+    /* read() returns -1 on error; only a positive count may be handed to
+     * fwrite() as a length, anything else ends the copy */
+    len = read(pidlog, buf, LOG_XFER_BUF_SIZE);
+    while (len > 0) {
+        fwrite(buf, len, 1, logFile);
+        len = read(pidlog, buf, LOG_XFER_BUF_SIZE);
+    }
+
+    close(pidlog);
+
+    /* len is 0 at end-of-file and negative after a failed read() */
+    return (len < 0) ? 1 : 0;
+}
diff --git a/src/vol/salvager.c b/src/vol/salvager.c
new file mode 100644
index 0000000000..4af0daa21c
--- /dev/null
+++ b/src/vol/salvager.c
@@ -0,0 +1,499 @@
+/*
+ * Copyright 2000, International Business Machines Corporation and others.
+ * All Rights Reserved.
+ *
+ * This software has been released under the terms of the IBM Public
+ * License. For details, see the LICENSE file in the top-level source
+ * directory or online at http://www.openafs.org/dl/license10.html
+ */
+
+/*
+ * System: VICE-TWO
+ * Module: salvager.c
+ * Institution: The Information Technology Center, Carnegie-Mellon University
+ */
+
+
+/* Main program file. Define globals. 
*/ +#define MAIN 1 + +#include +#include + +RCSID + ("$Header$"); + +#include +#include +#include +#include +#include +#include +#include +#ifdef AFS_NT40_ENV +#include +#include +#else +#include +#include +#ifndef ITIMER_REAL +#include +#endif /* ITIMER_REAL */ +#endif +#if defined(AFS_AIX_ENV) || defined(AFS_SUN4_ENV) +#define WCOREDUMP(x) (x & 0200) +#endif +#include +#include +#include +#if !defined(AFS_SGI_ENV) && !defined(AFS_NT40_ENV) +#if defined(AFS_VFSINCL_ENV) +#include +#ifdef AFS_SUN5_ENV +#include +#else +#if defined(AFS_DARWIN_ENV) || defined(AFS_XBSD_ENV) +#include +#include +#else +#include +#endif +#endif +#else /* AFS_VFSINCL_ENV */ +#ifdef AFS_OSF_ENV +#include +#else /* AFS_OSF_ENV */ +#if !defined(AFS_LINUX20_ENV) && !defined(AFS_XBSD_ENV) +#include +#endif +#endif +#endif /* AFS_VFSINCL_ENV */ +#endif /* AFS_SGI_ENV */ +#ifdef AFS_AIX_ENV +#include +#include +#else +#ifdef AFS_HPUX_ENV +#include +#include +#else +#if defined(AFS_SGI_ENV) +#include +#include +#include +#else +#if defined(AFS_SUN_ENV) || defined(AFS_SUN5_ENV) +#ifdef AFS_SUN5_ENV +#include +#include +#include +#else +#include +#endif +#else +#endif /* AFS_SGI_ENV */ +#endif /* AFS_HPUX_ENV */ +#endif +#endif +#include +#ifndef AFS_NT40_ENV +#include +#endif +#include +#include +#include +#include /* signal(), kill(), wait(), etc. */ +#ifndef AFS_NT40_ENV +#include +#endif + +#include "nfs.h" +#include "lwp.h" +#include "lock.h" +#include +#include "ihandle.h" +#include "vnode.h" +#include "volume.h" +#include "partition.h" +#include "daemon_com.h" +#include "fssync.h" +#include "salvsync.h" +#include "viceinode.h" +#include "salvage.h" +#include "volinodes.h" /* header magic number, etc. 
stuff */
+#include "vol-salvage.h"
+#ifdef AFS_NT40_ENV
+#include
+#endif
+
+
+static int get_salvage_lock = 0;
+
+
+/* Forward declarations */
+/*@printflike@*/ void Log(const char *format, ...);
+/*@printflike@*/ void Abort(const char *format, ...);
+
+
+/*
+ * Command handler registered through cmd_CreateSyntax() in main().
+ *
+ * Copies the parsed options in as->parms[] into the salvager's global
+ * settings, initializes the volume package, and then salvages every
+ * partition, one partition, or one volume depending on whether
+ * -partition and -volumeid were given.  The as->parms[] indices follow
+ * the order of the cmd_AddParm() calls in main().
+ *
+ * as - parsed command line
+ *
+ * Returns 0; fatal conditions leave through exit()/Exit() instead.
+ */
+static int
+handleit(struct cmd_syndesc *as)
+{
+    register struct cmd_item *ti;
+    char pname[100], *temp;
+    afs_int32 seenpart = 0, seenvol = 0, vid = 0, seenany = 0;
+    struct DiskPartition *partP;
+
+#ifdef AFS_SGI_VNODE_GLUE
+    if (afs_init_kernel_config(-1) < 0) {
+        printf
+            ("Can't determine NUMA configuration, not starting salvager.\n");
+        exit(1);
+    }
+#endif
+
+#ifdef FAST_RESTART
+    {
+        afs_int32 i;
+        for (i = 0; i < CMD_MAXPARMS; i++) {
+            if (as->parms[i].items) {
+                seenany = 1;
+                break;
+            }
+        }
+    }
+    if (!seenany) {
+        char *msg =
+            "Exiting immediately without salvage. Look into the FileLog to find volumes which really need to be salvaged!";
+
+        /* Log() is printf-like: pass the text as an argument rather than
+         * as the format, and end the line as the printf branch does */
+        if (useSyslog)
+            Log("%s\n", msg);
+        else
+            printf("%s\n", msg);
+
+        Exit(0);
+    }
+#endif /* FAST_RESTART */
+    if ((ti = as->parms[0].items)) {    /* -partition */
+        seenpart = 1;
+        /* strncpy() writes no terminator when the source fills the
+         * buffer, so copy one byte less and terminate explicitly */
+        strncpy(pname, ti->data, sizeof(pname) - 1);
+        pname[sizeof(pname) - 1] = '\0';
+    }
+    if ((ti = as->parms[1].items)) {    /* -volumeid */
+        if (!seenpart) {
+            printf
+                ("You must also specify '-partition' option with the '-volumeid' option\n");
+            exit(-1);
+        }
+        seenvol = 1;
+        vid = atoi(ti->data);
+    }
+    if (as->parms[2].items)     /* -debug */
+        debug = 1;
+    if (as->parms[3].items)     /* -nowrite */
+        Testing = 1;
+    if (as->parms[4].items)     /* -inodes */
+        ListInodeOption = 1;
+    if (as->parms[5].items)     /* -force */
+        ForceSalvage = 1;
+    if (as->parms[6].items)     /* -oktozap */
+        OKToZap = 1;
+    if (as->parms[7].items)     /* -rootinodes */
+        ShowRootFiles = 1;
+    if (as->parms[8].items)     /* -RebuildDirs */
+        RebuildDirs = 1;
+    if (as->parms[9].items)     /* -ForceReads */
+        forceR = 1;
+    if ((ti = as->parms[10].items)) {   /* -Parallel # */
+        temp = ti->data;
+        /* a leading "all" sets PartsPerDisk; any digits after it are
+         * still parsed as the parallelism count */
+        if (strncmp(temp, "all", 3) == 0) {
+            PartsPerDisk = 1;
+            temp += 3;
+        }
+        if (strlen(temp) != 0) {
+ 
Parallel = atoi(temp); + if (Parallel < 1) + Parallel = 1; + if (Parallel > MAXPARALLEL) { + printf("Setting parallel salvages to maximum of %d \n", + MAXPARALLEL); + Parallel = MAXPARALLEL; + } + } + } + if ((ti = as->parms[11].items)) { /* -tmpdir */ + DIR *dirp; + + tmpdir = ti->data; + dirp = opendir(tmpdir); + if (!dirp) { + printf + ("Can't open temporary placeholder dir %s; using current partition \n", + tmpdir); + tmpdir = NULL; + } else + closedir(dirp); + } + if ((ti = as->parms[12].items)) /* -showlog */ + ShowLog = 1; + if ((ti = as->parms[13].items)) { /* -log */ + Testing = 1; + ShowSuid = 1; + Showmode = 1; + } + if ((ti = as->parms[14].items)) { /* -showmounts */ + Testing = 1; + Showmode = 1; + ShowMounts = 1; + } + if ((ti = as->parms[15].items)) { /* -orphans */ + if (Testing) + orphans = ORPH_IGNORE; + else if (strcmp(ti->data, "remove") == 0 + || strcmp(ti->data, "r") == 0) + orphans = ORPH_REMOVE; + else if (strcmp(ti->data, "attach") == 0 + || strcmp(ti->data, "a") == 0) + orphans = ORPH_ATTACH; + } +#ifndef AFS_NT40_ENV /* ignore options on NT */ + if ((ti = as->parms[16].items)) { /* -syslog */ + useSyslog = 1; + ShowLog = 0; + } + if ((ti = as->parms[17].items)) { /* -syslogfacility */ + useSyslogFacility = atoi(ti->data); + } + + if ((ti = as->parms[18].items)) { /* -datelogs */ + TimeStampLogFile(AFSDIR_SERVER_SLVGLOG_FILEPATH); + } +#endif + +#ifdef FAST_RESTART + if (ti = as->parms[19].items) { /* -DontSalvage */ + char *msg = + "Exiting immediately without salvage. Look into the FileLog to find volumes which really need to be salvaged!"; + + if (useSyslog) + Log(msg); + else + printf("%s\n", msg); + Exit(0); + } +#elif defined(DEMAND_ATTACH_ENABLE) + if (seenvol && !as->parms[19].items) { + char * msg = + "The standalone salvager cannot be run concurrently with a Demand Attach Fileserver. 
Please use 'salvageserver -client ' to manually schedule volume salvages with the salvageserver (new versions of 'bos salvage' automatically do this for you). Or, if you insist on using the standalone salvager, add the -forceDAFS flag to your salvager command line."; + + if (useSyslog) + Log(msg); + else + printf("%s\n", msg); + Exit(1); + } +#endif + + if (get_salvage_lock) { + ObtainSalvageLock(); + } + + /* Note: if seenvol we initialize this as a standard volume utility: this has the + * implication that the file server may be running; negotations have to be made with + * the file server in this case to take the read write volume and associated read-only + * volumes off line before salvaging */ +#ifdef AFS_NT40_ENV + if (seenvol) { + if (afs_winsockInit() < 0) { + ReportErrorEventAlt(AFSEVT_SVR_WINSOCK_INIT_FAILED, 0, + AFSDIR_SALVAGER_FILE, 0); + Log("Failed to initailize winsock, exiting.\n"); + Exit(1); + } + } +#endif + VInitVolumePackage(seenvol ? volumeUtility : salvager, 5, 5, + DONT_CONNECT_FS, 0); + DInit(10); +#ifdef AFS_NT40_ENV + if (myjob.cj_number != NOT_CHILD) { + if (!seenpart) { + seenpart = 1; + (void)strcpy(pname, myjob.cj_part); + } + } +#endif + if (seenpart == 0) { + for (partP = DiskPartitionList; partP; partP = partP->next) { + SalvageFileSysParallel(partP); + } + SalvageFileSysParallel(0); + } else { + partP = VGetPartition(pname, 0); + if (!partP) { + Log("salvage: Unknown or unmounted partition %s; salvage aborted\n", pname); + Exit(1); + } + if (!seenvol) + SalvageFileSys(partP, 0); + else { + /* Salvage individual volume */ + if (vid <= 0) { + Log("salvage: invalid volume id specified; salvage aborted\n"); + Exit(1); + } + SalvageFileSys(partP, vid); + } + } + return (0); +} + + +#ifndef AFS_NT40_ENV +#include "AFS_component_version_number.c" +#endif +#define MAX_ARGS 128 +#ifdef AFS_NT40_ENV +char *save_args[MAX_ARGS]; +int n_save_args = 0; +pthread_t main_thread; +#endif + +int +main(int argc, char **argv) +{ + struct cmd_syndesc 
*ts;
+    int err = 0;
+    char commandLine[150];
+
+    int i;
+    extern char cml_version_number[];
+
+#ifdef AFS_AIX32_ENV
+    /*
+     * The following signal action for AIX is necessary so that in case of a
+     * crash (i.e. core is generated) we can include the user's data section
+     * in the core dump. Unfortunately, by default, only a partial core is
+     * generated which, in many cases, isn't too useful.
+     */
+    struct sigaction nsa;
+
+    sigemptyset(&nsa.sa_mask);
+    nsa.sa_handler = SIG_DFL;
+    nsa.sa_flags = SA_FULLDUMP;
+    sigaction(SIGABRT, &nsa, NULL);
+    sigaction(SIGSEGV, &nsa, NULL);
+#endif
+
+    /* Initialize directory paths */
+    if (!(initAFSDirPath() & AFSDIR_SERVER_PATHS_OK)) {
+#ifdef AFS_NT40_ENV
+        ReportErrorEventAlt(AFSEVT_SVR_NO_INSTALL_DIR, 0, argv[0], 0);
+#endif
+        fprintf(stderr, "%s: Unable to obtain AFS server directory.\n",
+                argv[0]);
+        exit(2);
+    }
+#ifdef AFS_NT40_ENV
+    main_thread = pthread_self();
+    if (spawnDatap && spawnDataLen) {
+        /* This is a child per partition salvager. Don't setup log or
+         * try to lock the salvager lock.
+         */
+        if (nt_SetupPartitionSalvage(spawnDatap, spawnDataLen) < 0)
+            exit(3);
+    } else {
+#endif
+        /* Rebuild the command line for the startup log message.  The
+         * buffer is fixed-size and argv is caller-controlled, so each
+         * append is limited to the space still free (less the
+         * terminator); an over-long command line is truncated. */
+        for (commandLine[0] = '\0', i = 0; i < argc; i++) {
+            if (i > 0)
+                strncat(commandLine, " ",
+                        sizeof(commandLine) - strlen(commandLine) - 1);
+            strncat(commandLine, argv[i],
+                    sizeof(commandLine) - strlen(commandLine) - 1);
+        }
+
+        /* All entries to the log will be appended. Useful if there are
+         * multiple salvagers appending to the log.
+         */
+
+        CheckLogFile(AFSDIR_SERVER_SLVGLOG_FILEPATH);
+#ifndef AFS_NT40_ENV
+#ifdef AFS_LINUX20_ENV
+        fcntl(fileno(logFile), F_SETFL, O_APPEND);      /* Isn't this redundant? */
+#else
+        fcntl(fileno(logFile), F_SETFL, FAPPEND);       /* Isn't this redundant? 
*/ +#endif +#endif + setlinebuf(logFile); + +#ifndef AFS_NT40_ENV + if (geteuid() != 0) { + printf("Salvager must be run as root.\n"); + fflush(stdout); + Exit(0); + } +#endif + + /* bad for normal help flag processing, but can do nada */ + + fprintf(logFile, "%s\n", cml_version_number); + Log("STARTING AFS SALVAGER %s (%s)\n", SalvageVersion, commandLine); + + /* Get and hold a lock for the duration of the salvage to make sure + * that no other salvage runs at the same time. The routine + * VInitVolumePackage (called below) makes sure that a file server or + * other volume utilities don't interfere with the salvage. + */ + get_salvage_lock = 1; +#ifdef AFS_NT40_ENV + } +#endif + + ts = cmd_CreateSyntax("initcmd", handleit, 0, "initialize the program"); + cmd_AddParm(ts, "-partition", CMD_SINGLE, CMD_OPTIONAL, + "Name of partition to salvage"); + cmd_AddParm(ts, "-volumeid", CMD_SINGLE, CMD_OPTIONAL, + "Volume Id to salvage"); + cmd_AddParm(ts, "-debug", CMD_FLAG, CMD_OPTIONAL, + "Run in Debugging mode"); + cmd_AddParm(ts, "-nowrite", CMD_FLAG, CMD_OPTIONAL, + "Run readonly/test mode"); + cmd_AddParm(ts, "-inodes", CMD_FLAG, CMD_OPTIONAL, + "Just list affected afs inodes - debugging flag"); + cmd_AddParm(ts, "-force", CMD_FLAG, CMD_OPTIONAL, "Force full salvaging"); + cmd_AddParm(ts, "-oktozap", CMD_FLAG, CMD_OPTIONAL, + "Give permission to destroy bogus inodes/volumes - debugging flag"); + cmd_AddParm(ts, "-rootinodes", CMD_FLAG, CMD_OPTIONAL, + "Show inodes owned by root - debugging flag"); + cmd_AddParm(ts, "-salvagedirs", CMD_FLAG, CMD_OPTIONAL, + "Force rebuild/salvage of all directories"); + cmd_AddParm(ts, "-blockreads", CMD_FLAG, CMD_OPTIONAL, + "Read smaller blocks to handle IO/bad blocks"); + cmd_AddParm(ts, "-parallel", CMD_SINGLE, CMD_OPTIONAL, + "# of max parallel partition salvaging"); + cmd_AddParm(ts, "-tmpdir", CMD_SINGLE, CMD_OPTIONAL, + "Name of dir to place tmp files "); + cmd_AddParm(ts, "-showlog", CMD_FLAG, CMD_OPTIONAL, + "Show log file upon 
completion"); + cmd_AddParm(ts, "-showsuid", CMD_FLAG, CMD_OPTIONAL, + "Report on suid/sgid files"); + cmd_AddParm(ts, "-showmounts", CMD_FLAG, CMD_OPTIONAL, + "Report on mountpoints"); + cmd_AddParm(ts, "-orphans", CMD_SINGLE, CMD_OPTIONAL, + "ignore | remove | attach"); + + /* note - syslog isn't avail on NT, but if we make it conditional, have + * to deal with screwy offsets for cmd params */ + cmd_AddParm(ts, "-syslog", CMD_FLAG, CMD_OPTIONAL, + "Write salvage log to syslogs"); + cmd_AddParm(ts, "-syslogfacility", CMD_SINGLE, CMD_OPTIONAL, + "Syslog facility number to use"); + cmd_AddParm(ts, "-datelogs", CMD_FLAG, CMD_OPTIONAL, + "Include timestamp in logfile filename"); +#ifdef FAST_RESTART + cmd_AddParm(ts, "-DontSalvage", CMD_FLAG, CMD_OPTIONAL, + "Don't salvage. This my be set in BosConfig to let the fileserver restart immediately after a crash. Bad volumes will be taken offline"); +#elif defined(DEMAND_ATTACH_ENABLE) + cmd_AddParm(ts, "-forceDAFS", CMD_FLAG, CMD_OPTIONAL, + "For Demand Attach Fileserver, permit a manual volume salvage outside of the salvageserver"); +#endif /* FAST_RESTART */ + err = cmd_Dispatch(argc, argv); + Exit(err); +} + diff --git a/src/vol/salvsync-client.c b/src/vol/salvsync-client.c new file mode 100644 index 0000000000..7ed96d6ee0 --- /dev/null +++ b/src/vol/salvsync-client.c @@ -0,0 +1,172 @@ +/* + * Copyright 2006, Sine Nomine Associates and others. + * All Rights Reserved. + * + * This software has been released under the terms of the IBM Public + * License. For details, see the LICENSE file in the top-level source + * directory or online at http://www.openafs.org/dl/license10.html + */ + +/* + * salvsync-client.c + * + * OpenAFS demand attach fileserver + * Salvage server synchronization with fileserver. 
+ */ + +#include +#include + +RCSID + ("$Header$"); + +#include +#include +#ifdef AFS_NT40_ENV +#include +#include +#else +#include +#include +#include +#include +#include +#endif +#include +#include +#include + +#ifdef HAVE_STRING_H +#include +#else +#ifdef HAVE_STRINGS_H +#include +#endif +#endif + + +#include +#include +#include "nfs.h" +#include +#include "salvsync.h" +#include "lwp.h" +#include "lock.h" +#include +#include "ihandle.h" +#include "vnode.h" +#include "volume.h" +#include "partition.h" +#include + +/*@printflike@*/ extern void Log(const char *format, ...); + +#ifdef osi_Assert +#undef osi_Assert +#endif +#define osi_Assert(e) (void)(e) + + +#ifdef AFS_DEMAND_ATTACH_FS +/* + * SALVSYNC is a feature specific to the demand attach fileserver + */ + +extern int LogLevel; +extern int VInit; +extern pthread_mutex_t vol_salvsync_mutex; + +static SYNC_client_state salvsync_client_state = { -1, 2041, SALVSYNC_PROTO_VERSION, 5, 120 }; + +/* + * client-side routines + */ + +int +SALVSYNC_clientInit(void) +{ + return SYNC_connect(&salvsync_client_state); +} + +int +SALVSYNC_clientFinis(void) +{ + SYNC_closeChannel(&salvsync_client_state); + return 1; +} + +int +SALVSYNC_clientReconnect(void) +{ + return SYNC_reconnect(&salvsync_client_state); +} + +afs_int32 +SALVSYNC_askSalv(SYNC_command * com, SYNC_response * res) +{ + afs_int32 code; + + VSALVSYNC_LOCK; + code = SYNC_ask(&salvsync_client_state, com, res); + VSALVSYNC_UNLOCK; + + switch (code) { + case SYNC_OK: + case SYNC_FAILED: + break; + case SYNC_COM_ERROR: + case SYNC_BAD_COMMAND: + Log("SALVSYNC_askSalv: fatal SALVSYNC protocol error; online salvager functionality disabled until next fileserver restart\n"); + break; + case SYNC_DENIED: + Log("SALVSYNC_askSalv: SALVSYNC request denied for reason=%d\n", res->hdr.reason); + break; + default: + Log("SALVSYNC_askSalv: unknown protocol response %d\n", code); + break; + } + + return code; +} + +afs_int32 +SALVSYNC_SalvageVolume(VolumeId volume, char 
*partName, int command, int reason, + afs_uint32 prio, SYNC_response * res_in) +{ + SYNC_command com; + SYNC_response res_l, *res; + SALVSYNC_command_hdr scom; + SALVSYNC_response_hdr sres; + int n, tot; + + memset(&com, 0, sizeof(com)); + memset(&scom, 0, sizeof(scom)); + + if (res_in) { + res = res_in; + } else { + memset(&res_l, 0, sizeof(res_l)); + memset(&sres, 0, sizeof(sres)); + res_l.payload.buf = (void *) &sres; + res_l.payload.len = sizeof(sres); + res = &res_l; + } + + com.payload.buf = (void *) &scom; + com.payload.len = sizeof(scom); + com.hdr.command = command; + com.hdr.reason = reason; + com.hdr.command_len = sizeof(com.hdr) + sizeof(scom); + scom.volume = volume; + scom.prio = prio; + + if (partName) { + strlcpy(scom.partName, partName, sizeof(scom.partName)); + } else { + scom.partName[0] = '\0'; + } + + return SALVSYNC_askSalv(&com, res); +} + +#endif /* AFS_DEMAND_ATTACH_FS */ diff --git a/src/vol/salvsync-server.c b/src/vol/salvsync-server.c new file mode 100644 index 0000000000..d9e083b23f --- /dev/null +++ b/src/vol/salvsync-server.c @@ -0,0 +1,1009 @@ +/* + * Copyright 2006, Sine Nomine Associates and others. + * All Rights Reserved. + * + * This software has been released under the terms of the IBM Public + * License. For details, see the LICENSE file in the top-level source + * directory or online at http://www.openafs.org/dl/license10.html + */ + +/* + * salvsync-server.c + * + * OpenAFS demand attach fileserver + * Salvage server synchronization with fileserver. + */ + +/* This controls the size of an fd_set; it must be defined early before + * the system headers define that type and the macros that operate on it. + * Its value should be as large as the maximum file descriptor limit we + * are likely to run into on any platform. 
Right now, that is 65536 + * which is the default hard fd limit on Solaris 9 */ +#ifndef _WIN32 +#define FD_SETSIZE 65536 +#endif + +#include +#include + +RCSID + ("$Header$"); + +#include +#include +#ifdef AFS_NT40_ENV +#include +#include +#else +#include +#include +#include +#include +#include +#endif +#include +#include +#include + +#ifdef HAVE_STRING_H +#include +#else +#ifdef HAVE_STRINGS_H +#include +#endif +#endif + + +#include +#include +#include "nfs.h" +#include +#include "salvsync.h" +#include "lwp.h" +#include "lock.h" +#include +#include "ihandle.h" +#include "vnode.h" +#include "volume.h" +#include "partition.h" +#include + +#if !defined(offsetof) +#include +#endif + +/*@printflike@*/ extern void Log(const char *format, ...); + +#ifdef osi_Assert +#undef osi_Assert +#endif +#define osi_Assert(e) (void)(e) + +#define MAXHANDLERS 4 /* Up to 4 clients; must be at least 2, so that + * move = dump+restore can run on single server */ + +#define MAX_BIND_TRIES 5 /* Number of times to retry socket bind */ + + + +/* Forward declarations */ +static void * SALVSYNC_syncThread(void *); +static void SALVSYNC_newconnection(int fd); +static void SALVSYNC_com(int fd); +static void SALVSYNC_Drop(int fd); +static void AcceptOn(void); +static void AcceptOff(void); +static void InitHandler(void); +static void CallHandler(fd_set * fdsetp); +static int AddHandler(int afd, void (*aproc) (int)); +static int FindHandler(register int afd); +static int FindHandler_r(register int afd); +static int RemoveHandler(register int afd); +static void GetHandler(fd_set * fdsetp, int *maxfdp); + + +/* + * This lock controls access to the handler array. 
+ */ +struct Lock SALVSYNC_handler_lock; + + +#ifdef AFS_DEMAND_ATTACH_FS +/* + * SALVSYNC is a feature specific to the demand attach fileserver + */ + +static int AddToSalvageQueue(struct SalvageQueueNode * node); +static void DeleteFromSalvageQueue(struct SalvageQueueNode * node); +static void AddToPendingQueue(struct SalvageQueueNode * node); +static void DeleteFromPendingQueue(struct SalvageQueueNode * node); +static struct SalvageQueueNode * LookupPendingCommand(SALVSYNC_command_hdr * qry); +static struct SalvageQueueNode * LookupPendingCommandByPid(int pid); +static void RaiseCommandPrio(struct SalvageQueueNode * node, SALVSYNC_command_hdr * com); + +static struct SalvageQueueNode * LookupNode(VolumeId vid, char * partName); +static struct SalvageQueueNode * LookupNodeByCommand(SALVSYNC_command_hdr * qry); +static void AddNodeToHash(struct SalvageQueueNode * node); +static void DeleteNodeFromHash(struct SalvageQueueNode * node); + +static afs_int32 SALVSYNC_com_Salvage(SALVSYNC_command * com, SALVSYNC_response * res); +static afs_int32 SALVSYNC_com_Cancel(SALVSYNC_command * com, SALVSYNC_response * res); +static afs_int32 SALVSYNC_com_RaisePrio(SALVSYNC_command * com, SALVSYNC_response * res); +static afs_int32 SALVSYNC_com_Query(SALVSYNC_command * com, SALVSYNC_response * res); +static afs_int32 SALVSYNC_com_CancelAll(SALVSYNC_command * com, SALVSYNC_response * res); + + +extern int LogLevel; +extern int VInit; +extern pthread_mutex_t vol_salvsync_mutex; + +static int AcceptSd = -1; /* Socket used by server for accepting connections */ + + +/* be careful about rearranging elements in this structure. 
+ * element placement has been optimized for locality of reference + * in SALVSYNC_getWork() */ +struct SalvageQueue { + volatile int total_len; + volatile afs_int32 last_insert; /* id of last partition to have a salvage node insert */ + volatile int len[VOLMAXPARTS+1]; + volatile struct rx_queue part[VOLMAXPARTS+1]; + pthread_cond_t cv; +}; +static struct SalvageQueue salvageQueue; /* volumes waiting to be salvaged */ + +struct QueueHead { + volatile struct rx_queue q; + volatile int len; + pthread_cond_t queue_change_cv; +}; +static struct QueueHead pendingQueue; /* volumes being salvaged */ + +/* XXX + * whether a partition has a salvage in progress + * + * the salvager code only permits one salvage per partition at a time + * + * the following hack tries to keep salvaged parallelism high by + * only permitting one salvage dispatch per partition at a time + * + * unfortunately, the parallel salvager currently + * has a rather braindead routine that won't permit + * multiple salvages on the same "device". this + * function happens to break pretty badly on lvm, raid luns, etc. + * + * this hack isn't good enough to stop the device limiting code from + * crippling performance. 
someday that code needs to be rewritten + */ +static int partition_salvaging[VOLMAXPARTS+1]; + +#define VSHASH_SIZE 64 +#define VSHASH_MASK (VSHASH_SIZE-1) +#define VSHASH(vid) ((vid)&VSHASH_MASK) + +static struct QueueHead SalvageHashTable[VSHASH_SIZE]; + +static struct SalvageQueueNode * +LookupNode(afs_uint32 vid, char * partName) +{ + struct rx_queue *qp, *nqp; + struct SalvageQueueNode *vsp; + int idx = VSHASH(vid); + + for (queue_Scan(&SalvageHashTable[idx], qp, nqp, rx_queue)) { + vsp = (struct SalvageQueueNode *)((char *)qp - offsetof(struct SalvageQueueNode, hash_chain)); + if ((vsp->command.sop.volume == vid) && + !strncmp(vsp->command.sop.partName, partName, sizeof(vsp->command.sop.partName))) { + break; + } + } + + if (queue_IsEnd(&SalvageHashTable[idx], qp)) { + vsp = NULL; + } + return vsp; +} + +static struct SalvageQueueNode * +LookupNodeByCommand(SALVSYNC_command_hdr * qry) +{ + return LookupNode(qry->volume, qry->partName); +} + +static void +AddNodeToHash(struct SalvageQueueNode * node) +{ + int idx = VSHASH(node->command.sop.volume); + + if (queue_IsOnQueue(&node->hash_chain)) { + return; + } + + queue_Append(&SalvageHashTable[idx], &node->hash_chain); + SalvageHashTable[idx].len++; +} + +static void +DeleteNodeFromHash(struct SalvageQueueNode * node) +{ + int idx = VSHASH(node->command.sop.volume); + + if (queue_IsNotOnQueue(&node->hash_chain)) { + return; + } + + queue_Remove(&node->hash_chain); + SalvageHashTable[idx].len--; +} + +void +SALVSYNC_salvInit(void) +{ + int i; + pthread_t tid; + pthread_attr_t tattr; + + /* initialize the queues */ + assert(pthread_cond_init(&salvageQueue.cv, NULL) == 0); + for (i = 0; i <= VOLMAXPARTS; i++) { + queue_Init(&salvageQueue.part[i]); + salvageQueue.len[i] = 0; + } + assert(pthread_cond_init(&pendingQueue.queue_change_cv, NULL) == 0); + queue_Init(&pendingQueue); + salvageQueue.total_len = pendingQueue.len = 0; + salvageQueue.last_insert = -1; + memset(partition_salvaging, 0, 
sizeof(partition_salvaging)); + + for (i = 0; i < VSHASH_SIZE; i++) { + assert(pthread_cond_init(&SalvageHashTable[i].queue_change_cv, NULL) == 0); + SalvageHashTable[i].len = 0; + queue_Init(&SalvageHashTable[i]); + } + + /* start the salvsync thread (created detached, runs SALVSYNC_syncThread) */ + assert(pthread_attr_init(&tattr) == 0); + assert(pthread_attr_setdetachstate(&tattr, PTHREAD_CREATE_DETACHED) == 0); + assert(pthread_create(&tid, &tattr, SALVSYNC_syncThread, NULL) == 0); +} + /* getport: create a TCP socket and fill *addr with the loopback address 127.0.0.1 and port 2041; returns the socket descriptor (a socket() failure trips the assert) */ +static int +getport(struct sockaddr_in *addr) +{ + int sd; + + memset(addr, 0, sizeof(*addr)); + assert((sd = socket(AF_INET, SOCK_STREAM, 0)) >= 0); +#ifdef STRUCT_SOCKADDR_HAS_SA_LEN + addr->sin_len = sizeof(struct sockaddr_in); +#endif + addr->sin_addr.s_addr = htonl(0x7f000001); + addr->sin_family = AF_INET; /* was localhost->h_addrtype */ + addr->sin_port = htons(2041); /* SALVSYNC TCP port in network byte order; htons is a no-op only on big-endian hosts */ + + return sd; +} + +static fd_set SALVSYNC_readfds; + /* SALVSYNC_syncThread: server thread started by SALVSYNC_salvInit; ignores SIGPIPE (non-NT builds), assigns itself an rx thread id, then binds (up to MAX_BIND_TRIES attempts, sleeping 5 seconds between them) and listens on the socket from getport() */ +static void * +SALVSYNC_syncThread(void * args) +{ + struct sockaddr_in addr; + int on = 1; + int code; + int numTries; + int tid; + +#ifndef AFS_NT40_ENV + (void)signal(SIGPIPE, SIG_IGN); +#endif + + /* set our 'thread-id' so that the host hold table works */ + MUTEX_ENTER(&rx_stats_mutex); /* protects rxi_pthread_hinum */ + tid = ++rxi_pthread_hinum; + MUTEX_EXIT(&rx_stats_mutex); + pthread_setspecific(rx_thread_id_key, (void *)tid); + Log("Set thread id %d for SALVSYNC_syncThread\n", tid); + + AcceptSd = getport(&addr); + /* Reuseaddr needed because system inexplicably leaves crud lying around */ + code = + setsockopt(AcceptSd, SOL_SOCKET, SO_REUSEADDR, (char *)&on, + sizeof(on)); + if (code) + Log("SALVSYNC_sync: setsockopt failed with (%d)\n", errno); + + for (numTries = 0; numTries < MAX_BIND_TRIES; numTries++) { + if ((code = + bind(AcceptSd, (struct sockaddr *)&addr, sizeof(addr))) == 0) + break; + Log("SALVSYNC_sync: bind failed with (%d), will sleep and retry\n", + errno); + sleep(5); + } + assert(!code); /* every bind attempt failed */ + listen(AcceptSd, 100); + InitHandler();
+ AcceptOn(); + + /* wait forever for readable descriptors; each ready one is dispatched to its registered handler */ + for (;;) { + int maxfd; + GetHandler(&SALVSYNC_readfds, &maxfd); + /* Note: check for >= 1 below is essential since IOMGR_select + * doesn't have exactly same semantics as select. + */ + if (select(maxfd + 1, &SALVSYNC_readfds, NULL, NULL, NULL) >= 1) + CallHandler(&SALVSYNC_readfds); + } + + return NULL; +} + +/* + * accept a pending connection on the listening socket afd and register + * SALVSYNC_com as its handler. if the handler table is full, the accept + * handler is removed to free a slot and the registration is retried. + * an accept() failure is logged and then trips an assert. + */ +static void +SALVSYNC_newconnection(int afd) +{ + struct sockaddr_in other; + int junk, fd; + junk = sizeof(other); + fd = accept(afd, (struct sockaddr *)&other, &junk); + if (fd == -1) { + Log("SALVSYNC_newconnection: accept failed, errno==%d\n", errno); + assert(1 == 2); + } else if (!AddHandler(fd, SALVSYNC_com)) { + AcceptOff(); + assert(AddHandler(fd, SALVSYNC_com)); + } +} + +/* + * process one command from a salvsync client file descriptor (fd): + * read the request, validate its protocol version and length, dispatch + * it, and send the response. the connection is dropped when the read + * fails or when the response carries SYNC_FLAG_CHANNEL_SHUTDOWN. + */ +static afs_int32 SALV_cnt = 0; +static void +SALVSYNC_com(int fd) +{ + SYNC_command com; + SYNC_response res; + SALVSYNC_response_hdr sres_hdr; + SALVSYNC_command scom; + SALVSYNC_response sres; + SYNC_PROTO_BUF_DECL(buf); + + /* + * start from zeroed structures: res.hdr.flags is only ever OR-ed into, + * and several paths (SALVSYNC_NOP, protocol errors) never set the + * response code, reason or volume stats, so without this the reply + * would carry uninitialized stack contents -- possibly including a + * stray SYNC_FLAG_CHANNEL_SHUTDOWN bit that drops the connection. + */ + memset(&com, 0, sizeof(com)); + memset(&res, 0, sizeof(res)); + memset(&sres_hdr, 0, sizeof(sres_hdr)); + + com.payload.buf = (void *)buf; + com.payload.len = SYNC_PROTO_MAX_LEN; + res.payload.buf = (void *) &sres_hdr; + res.payload.len = sizeof(sres_hdr); + res.hdr.response_len = sizeof(res.hdr) + sizeof(sres_hdr); + res.hdr.proto_version = SALVSYNC_PROTO_VERSION; + res.hdr.response = SYNC_OK; /* default; every path that can fail overwrites it */ + + scom.hdr = &com.hdr; + scom.sop = (SALVSYNC_command_hdr *) buf; + scom.com = &com; + sres.hdr = &res.hdr; + sres.sop = &sres_hdr; + sres.res = &res; + + SALV_cnt++; + if (SYNC_getCom(fd, &com)) { + Log("SALVSYNC_com: read failed; dropping connection (cnt=%d)\n", SALV_cnt); + SALVSYNC_Drop(fd); + return; + } + + if (com.hdr.proto_version != SALVSYNC_PROTO_VERSION) { + Log("SALVSYNC_com: invalid protocol version (%u)\n", com.hdr.proto_version); + res.hdr.response = SYNC_COM_ERROR; + res.hdr.flags |= SYNC_FLAG_CHANNEL_SHUTDOWN; + goto respond; + } + + /* every SALVSYNC request must carry exactly one SALVSYNC_command_hdr payload */ + if (com.recv_len != (sizeof(com.hdr) + sizeof(SALVSYNC_command_hdr))) { + Log("SALVSYNC_com: invalid protocol message length (%u)\n", com.recv_len); +
res.hdr.response = SYNC_COM_ERROR; + res.hdr.reason = SYNC_REASON_MALFORMED_PACKET; + res.hdr.flags |= SYNC_FLAG_CHANNEL_SHUTDOWN; + goto respond; + } + + /* the salvage queues and hash table are manipulated under VOL_LOCK */ + VOL_LOCK; + switch (com.hdr.command) { + case SALVSYNC_NOP: + break; + case SALVSYNC_SALVAGE: + res.hdr.response = SALVSYNC_com_Salvage(&scom, &sres); + break; + case SALVSYNC_CANCEL: + /* cancel a salvage */ + res.hdr.response = SALVSYNC_com_Cancel(&scom, &sres); + break; + case SALVSYNC_CANCELALL: + /* cancel all queued salvages */ + res.hdr.response = SALVSYNC_com_CancelAll(&scom, &sres); + break; + case SALVSYNC_RAISEPRIO: + /* raise the priority of a salvage */ + res.hdr.response = SALVSYNC_com_RaisePrio(&scom, &sres); + break; + case SALVSYNC_QUERY: + /* query whether a volume is done salvaging */ + res.hdr.response = SALVSYNC_com_Query(&scom, &sres); + break; + case SYNC_COM_CHANNEL_CLOSE: + res.hdr.response = SYNC_OK; + res.hdr.flags |= SYNC_FLAG_CHANNEL_SHUTDOWN; + break; + default: + res.hdr.response = SYNC_BAD_COMMAND; + break; + } + + /* queue lengths are reported with every dispatched command */ + sres_hdr.sq_len = salvageQueue.total_len; + sres_hdr.pq_len = pendingQueue.len; + VOL_UNLOCK; + + respond: + SYNC_putRes(fd, &res); + if (res.hdr.flags & SYNC_FLAG_CHANNEL_SHUTDOWN) { + SALVSYNC_Drop(fd); + } +} + +/* + * schedule a salvage for the volume/partition named in com->sop. + * a node that is already queued or being salvaged is left alone; a node + * whose last salvage finished (DONE/ERROR) or that was cancelled (UNKNOWN) + * is put back on the salvage queue; otherwise a new node is allocated. + * returns SYNC_OK, SYNC_FAILED (malformed partition name) or SYNC_DENIED + * (allocation failure or the request could not be queued). on success + * the node's state and priority are copied into the response. + */ +static afs_int32 +SALVSYNC_com_Salvage(SALVSYNC_command * com, SALVSYNC_response * res) +{ + afs_int32 code = SYNC_OK; + struct SalvageQueueNode * node; + + if (SYNC_verifyProtocolString(com->sop->partName, sizeof(com->sop->partName))) { + code = SYNC_FAILED; + res->hdr->reason = SYNC_REASON_MALFORMED_PACKET; + goto done; + } + + node = LookupNodeByCommand(com->sop); + + /* schedule a salvage for this volume */ + if (node != NULL) { + switch (node->state) { + case SALVSYNC_STATE_ERROR: + case SALVSYNC_STATE_DONE: + case SALVSYNC_STATE_UNKNOWN: + /* + * UNKNOWN is the state DeleteFromSalvageQueue leaves a cancelled + * node in: it is still in the hash table but on no queue, so it + * has to be re-queued here or the request would be acknowledged + * with SYNC_OK and never scheduled. + */ + memcpy(&node->command.com, com->hdr, sizeof(SYNC_command_hdr)); + memcpy(&node->command.sop, com->sop, sizeof(SALVSYNC_command_hdr)); + node->command.sop.prio = 0; + if (AddToSalvageQueue(node)) { + code = SYNC_DENIED; + } + break; + default: + /* already queued or being salvaged; just report its state */ +
break; + } + } else { + node = (struct SalvageQueueNode *) malloc(sizeof(struct SalvageQueueNode)); + if (node == NULL) { + code = SYNC_DENIED; + goto done; + } + memset(node, 0, sizeof(struct SalvageQueueNode)); + memcpy(&node->command.com, com->hdr, sizeof(SYNC_command_hdr)); + memcpy(&node->command.sop, com->sop, sizeof(SALVSYNC_command_hdr)); + AddNodeToHash(node); + if (AddToSalvageQueue(node)) { + /* roll back */ + DeleteNodeFromHash(node); + free(node); + node = NULL; + code = SYNC_DENIED; + goto done; + } + } + + res->hdr->flags |= SALVSYNC_FLAG_VOL_STATS_VALID; + res->sop->state = node->state; + res->sop->prio = node->command.sop.prio; + + done: + return code; +} + +/* + * cancel a salvage that is still waiting on the salvage queue. + * the response reports the state and priority the node had before the + * cancellation; a salvage that is not in the QUEUED state is not touched. + * the node itself remains in the hash table. + */ +static afs_int32 +SALVSYNC_com_Cancel(SALVSYNC_command * com, SALVSYNC_response * res) +{ + afs_int32 code = SYNC_OK; + struct SalvageQueueNode * node; + + if (SYNC_verifyProtocolString(com->sop->partName, sizeof(com->sop->partName))) { + code = SYNC_FAILED; + res->hdr->reason = SYNC_REASON_MALFORMED_PACKET; + goto done; + } + + node = LookupNodeByCommand(com->sop); + + if (node == NULL) { + res->sop->state = SALVSYNC_STATE_UNKNOWN; + res->sop->prio = 0; + } else { + res->hdr->flags |= SALVSYNC_FLAG_VOL_STATS_VALID; + res->sop->prio = node->command.sop.prio; + res->sop->state = node->state; + if (node->state == SALVSYNC_STATE_QUEUED) { + DeleteFromSalvageQueue(node); + } + } + + done: + return code; +} + +/* + * cancel every queued salvage on every partition in DiskPartitionList. + * always returns SYNC_OK. + */ +static afs_int32 +SALVSYNC_com_CancelAll(SALVSYNC_command * com, SALVSYNC_response * res) +{ + struct SalvageQueueNode * np, *nnp; + struct DiskPartition * dp; + + for (dp = DiskPartitionList ; dp ; dp = dp->next) { + for (queue_Scan(&salvageQueue.part[dp->index], np, nnp, SalvageQueueNode)) { + DeleteFromSalvageQueue(np); + } + } + + return SYNC_OK; +} + +/* + * raise the priority of a salvage. a queued node is repositioned within + * its partition queue; a node being salvaged is left alone; a volume that + * is not currently scheduled (no node, or a node in the DONE, ERROR or + * UNKNOWN state) is scheduled through SALVSYNC_com_Salvage. + */ +static afs_int32 +SALVSYNC_com_RaisePrio(SALVSYNC_command * com, SALVSYNC_response * res) +{ + afs_int32 code = SYNC_OK; + struct SalvageQueueNode * node; + + if (SYNC_verifyProtocolString(com->sop->partName, sizeof(com->sop->partName))) { +
code = SYNC_FAILED; + res->hdr->reason = SYNC_REASON_MALFORMED_PACKET; + goto done; + } + + node = LookupNodeByCommand(com->sop); + + /* raise the priority of a salvage */ + if (node == NULL) { + code = SALVSYNC_com_Salvage(com, res); + node = LookupNodeByCommand(com->sop); + } else { + switch (node->state) { + case SALVSYNC_STATE_QUEUED: + RaiseCommandPrio(node, com->sop); + break; + case SALVSYNC_STATE_SALVAGING: + break; + case SALVSYNC_STATE_ERROR: + case SALVSYNC_STATE_DONE: + case SALVSYNC_STATE_UNKNOWN: + /* not currently scheduled (finished or cancelled): schedule it again */ + code = SALVSYNC_com_Salvage(com, res); + break; + default: + break; + } + } + + if (node == NULL) { + res->sop->prio = 0; + res->sop->state = SALVSYNC_STATE_UNKNOWN; + } else { + res->hdr->flags |= SALVSYNC_FLAG_VOL_STATS_VALID; + res->sop->prio = node->command.sop.prio; + res->sop->state = node->state; + } + + done: + return code; +} + +/* + * report the state and priority of the salvage for the volume/partition + * named in com->sop; state is SALVSYNC_STATE_UNKNOWN when no node exists. + */ +static afs_int32 +SALVSYNC_com_Query(SALVSYNC_command * com, SALVSYNC_response * res) +{ + afs_int32 code = SYNC_OK; + struct SalvageQueueNode * node; + + if (SYNC_verifyProtocolString(com->sop->partName, sizeof(com->sop->partName))) { + code = SYNC_FAILED; + res->hdr->reason = SYNC_REASON_MALFORMED_PACKET; + goto done; + } + + node = LookupNodeByCommand(com->sop); + + /* query whether a volume is done salvaging */ + if (node == NULL) { + res->sop->state = SALVSYNC_STATE_UNKNOWN; + res->sop->prio = 0; + } else { + res->hdr->flags |= SALVSYNC_FLAG_VOL_STATS_VALID; + res->sop->state = node->state; + res->sop->prio = node->command.sop.prio; + } + + done: + return code; +} + +/* + * drop a client connection: unregister its handler, close the socket, + * and make sure the accept handler is installed again now that a + * handler slot is free. + */ +static void +SALVSYNC_Drop(int fd) +{ + RemoveHandler(fd); +#ifdef AFS_NT40_ENV + closesocket(fd); +#else + close(fd); +#endif + AcceptOn(); +} + +static int AcceptHandler = -1; /* handler id for accept, if turned on */ + +/* install the accept handler for AcceptSd unless it is already installed */ +static void +AcceptOn(void) +{ + if (AcceptHandler == -1) { + assert(AddHandler(AcceptSd, SALVSYNC_newconnection)); + AcceptHandler = FindHandler(AcceptSd); + } +} + +/* remove the accept handler for AcceptSd if it is installed */ +static void +AcceptOff(void) +{ + if (AcceptHandler != -1) { + assert(RemoveHandler(AcceptSd)); +
AcceptHandler = -1; + } +} + +/* The multiple FD handling code. */ + +static int HandlerFD[MAXHANDLERS]; +static void (*HandlerProc[MAXHANDLERS]) (int); + +static void +InitHandler(void) +{ + register int i; + ObtainWriteLock(&SALVSYNC_handler_lock); + for (i = 0; i < MAXHANDLERS; i++) { + HandlerFD[i] = -1; + HandlerProc[i] = NULL; + } + ReleaseWriteLock(&SALVSYNC_handler_lock); +} + +static void +CallHandler(fd_set * fdsetp) +{ + register int i; + ObtainReadLock(&SALVSYNC_handler_lock); + for (i = 0; i < MAXHANDLERS; i++) { + if (HandlerFD[i] >= 0 && FD_ISSET(HandlerFD[i], fdsetp)) { + ReleaseReadLock(&SALVSYNC_handler_lock); + (*HandlerProc[i]) (HandlerFD[i]); + ObtainReadLock(&SALVSYNC_handler_lock); + } + } + ReleaseReadLock(&SALVSYNC_handler_lock); +} + +static int +AddHandler(int afd, void (*aproc) (int)) +{ + register int i; + ObtainWriteLock(&SALVSYNC_handler_lock); + for (i = 0; i < MAXHANDLERS; i++) + if (HandlerFD[i] == -1) + break; + if (i >= MAXHANDLERS) { + ReleaseWriteLock(&SALVSYNC_handler_lock); + return 0; + } + HandlerFD[i] = afd; + HandlerProc[i] = aproc; + ReleaseWriteLock(&SALVSYNC_handler_lock); + return 1; +} + +static int +FindHandler(register int afd) +{ + register int i; + ObtainReadLock(&SALVSYNC_handler_lock); + for (i = 0; i < MAXHANDLERS; i++) + if (HandlerFD[i] == afd) { + ReleaseReadLock(&SALVSYNC_handler_lock); + return i; + } + ReleaseReadLock(&SALVSYNC_handler_lock); /* just in case */ + assert(1 == 2); + return -1; /* satisfy compiler */ +} + +static int +FindHandler_r(register int afd) +{ + register int i; + for (i = 0; i < MAXHANDLERS; i++) + if (HandlerFD[i] == afd) { + return i; + } + assert(1 == 2); + return -1; /* satisfy compiler */ +} + +static int +RemoveHandler(register int afd) +{ + ObtainWriteLock(&SALVSYNC_handler_lock); + HandlerFD[FindHandler_r(afd)] = -1; + ReleaseWriteLock(&SALVSYNC_handler_lock); + return 1; +} + +static void +GetHandler(fd_set * fdsetp, int *maxfdp) +{ + register int i; + register int maxfd = 
-1; + FD_ZERO(fdsetp); + ObtainReadLock(&SALVSYNC_handler_lock); /* just in case */ + for (i = 0; i < MAXHANDLERS; i++) + if (HandlerFD[i] != -1) { + FD_SET(HandlerFD[i], fdsetp); + if (maxfd < HandlerFD[i]) + maxfd = HandlerFD[i]; + } + *maxfdp = maxfd; + ReleaseReadLock(&SALVSYNC_handler_lock); /* just in case */ +} + +static int +AddToSalvageQueue(struct SalvageQueueNode * node) +{ + afs_int32 id; + + id = volutil_GetPartitionID(node->command.sop.partName); + if (id < 0 || id > VOLMAXPARTS) { + return 1; + } + if (!VGetPartitionById_r(id, 0)) { + /* don't enqueue salvage requests for unmounted partitions */ + return 1; + } + queue_Append(&salvageQueue.part[id], node); + salvageQueue.len[id]++; + salvageQueue.total_len++; + salvageQueue.last_insert = id; + node->partition_id = id; + node->state = SALVSYNC_STATE_QUEUED; + assert(pthread_cond_broadcast(&salvageQueue.cv) == 0); + return 0; +} + +static void +DeleteFromSalvageQueue(struct SalvageQueueNode * node) +{ + if (queue_IsOnQueue(node)) { + queue_Remove(node); + salvageQueue.len[node->partition_id]--; + salvageQueue.total_len--; + node->state = SALVSYNC_STATE_UNKNOWN; + assert(pthread_cond_broadcast(&salvageQueue.cv) == 0); + } +} + +static void +AddToPendingQueue(struct SalvageQueueNode * node) +{ + queue_Append(&pendingQueue, node); + pendingQueue.len++; + node->state = SALVSYNC_STATE_SALVAGING; + assert(pthread_cond_broadcast(&pendingQueue.queue_change_cv) == 0); +} + +static void +DeleteFromPendingQueue(struct SalvageQueueNode * node) +{ + if (queue_IsOnQueue(node)) { + queue_Remove(node); + pendingQueue.len--; + node->state = SALVSYNC_STATE_UNKNOWN; + assert(pthread_cond_broadcast(&pendingQueue.queue_change_cv) == 0); + } +} + +static struct SalvageQueueNode * +LookupPendingCommand(SALVSYNC_command_hdr * qry) +{ + struct SalvageQueueNode * np, * nnp; + + for (queue_Scan(&pendingQueue, np, nnp, SalvageQueueNode)) { + if ((np->command.sop.volume == qry->volume) && + !strncmp(np->command.sop.partName, 
qry->partName, + sizeof(qry->partName))) + break; + } + + if (queue_IsEnd(&pendingQueue, np)) + np = NULL; + return np; +} + +static struct SalvageQueueNode * +LookupPendingCommandByPid(int pid) +{ + struct SalvageQueueNode * np, * nnp; + + for (queue_Scan(&pendingQueue, np, nnp, SalvageQueueNode)) { + if (np->pid == pid) + break; + } + + if (queue_IsEnd(&pendingQueue, np)) + np = NULL; + return np; +} + + +/* raise the priority of a previously scheduled salvage */ +static void +RaiseCommandPrio(struct SalvageQueueNode * node, SALVSYNC_command_hdr * com) +{ + struct SalvageQueueNode *np, *nnp; + afs_int32 id; + + assert(queue_IsOnQueue(node)); + + node->command.sop.prio = com->prio; + id = node->partition_id; + if (queue_First(&salvageQueue.part[id], SalvageQueueNode)->command.sop.prio < com->prio) { + queue_Remove(node); + queue_Prepend(&salvageQueue.part[id], node); + } else { + for (queue_ScanBackwardsFrom(&salvageQueue.part[id], node, np, nnp, SalvageQueueNode)) { + if (np->command.sop.prio > com->prio) + break; + } + if (queue_IsEnd(&salvageQueue.part[id], np)) { + queue_Remove(node); + queue_Prepend(&salvageQueue.part[id], node); + } else if (node != np) { + queue_Remove(node); + queue_InsertAfter(np, node); + } + } +} + +/* this will need to be rearchitected if we ever want more than one thread + * to wait for new salvage nodes */ +struct SalvageQueueNode * +SALVSYNC_getWork(void) +{ + int i, ret; + struct DiskPartition * dp = NULL, * fdp; + static afs_int32 next_part_sched = 0; + struct SalvageQueueNode *node = NULL, *np; + + VOL_LOCK; + + /* + * wait for work to be scheduled + * if there are no disk partitions, just sit in this wait loop forever + */ + while (!salvageQueue.total_len || !DiskPartitionList) { + assert(pthread_cond_wait(&salvageQueue.cv, &vol_glock_mutex) == 0); + } + + + /* + * short circuit for simple case where only one partition has + * scheduled salvages + */ + if (salvageQueue.last_insert >= 0 && salvageQueue.last_insert <= VOLMAXPARTS 
&& + (salvageQueue.total_len == salvageQueue.len[salvageQueue.last_insert])) { + node = queue_First(&salvageQueue.part[salvageQueue.last_insert], SalvageQueueNode); + goto have_node; + } + + + /* + * ok, more than one partition has scheduled salvages. + * now search for partitions with scheduled salvages, but no pending salvages. + */ + dp = VGetPartitionById_r(next_part_sched, 0); + if (!dp) { + dp = DiskPartitionList; + } + fdp = dp; + + for (i=0 ; + !i || dp != fdp ; + dp = (dp->next) ? dp->next : DiskPartitionList, i++ ) { + if (!partition_salvaging[dp->index] && salvageQueue.len[dp->index]) { + node = queue_First(&salvageQueue.part[dp->index], SalvageQueueNode); + goto have_node; + } + } + + + /* + * all partitions with scheduled salvages have at least one pending. + * now do an exhaustive search for a scheduled salvage. + */ + dp = fdp; + + for (i=0 ; + !i || dp != fdp ; + dp = (dp->next) ? dp->next : DiskPartitionList, i++ ) { + if (salvageQueue.len[dp->index]) { + node = queue_First(&salvageQueue.part[dp->index], SalvageQueueNode); + goto have_node; + } + } + + /* we should never reach this line */ + assert(1==2); + + have_node: + assert(node != NULL); + node->pid = 0; + partition_salvaging[node->partition_id]++; + DeleteFromSalvageQueue(node); + AddToPendingQueue(node); + + if (dp) { + /* update next_part_sched field */ + if (dp->next) { + next_part_sched = dp->next->index; + } else if (DiskPartitionList) { + next_part_sched = DiskPartitionList->index; + } else { + next_part_sched = -1; + } + } + + bail: + VOL_UNLOCK; + return node; +} + +static void +SALVSYNC_doneWork_r(struct SalvageQueueNode * node, int result) +{ + afs_int32 partid; + DeleteFromPendingQueue(node); + partid = node->partition_id; + if (partid >=0 && partid <= VOLMAXPARTS) { + partition_salvaging[partid]--; + } + if (result == 0) { + node->state = SALVSYNC_STATE_DONE; + } else { + node->state = SALVSYNC_STATE_ERROR; + } +} + +void +SALVSYNC_doneWork(struct SalvageQueueNode * node, int 
result) +{ + VOL_LOCK; + SALVSYNC_doneWork_r(node, result); + VOL_UNLOCK; +} + +void +SALVSYNC_doneWorkByPid(int pid, int result) +{ + struct SalvageQueueNode * node; + + VOL_LOCK; + node = LookupPendingCommandByPid(pid); + if (node != NULL) { + SALVSYNC_doneWork_r(node, result); + } + VOL_UNLOCK; +} + +#endif /* AFS_DEMAND_ATTACH_FS */ diff --git a/src/vol/salvsync.h b/src/vol/salvsync.h new file mode 100644 index 0000000000..6611df6589 --- /dev/null +++ b/src/vol/salvsync.h @@ -0,0 +1,111 @@ +/* + * Copyright 2006, Sine Nomine Associates and others. + * All Rights Reserved. + * + * This software has been released under the terms of the IBM Public + * License. For details, see the LICENSE file in the top-level source + * directory or online at http://www.openafs.org/dl/license10.html + */ + +/* + * demand attach fs + * salvage server interface + */ +#ifndef _AFS_VOL_SALVSYNC_H +#define _AFS_VOL_SALVSYNC_H + +#ifdef AFS_DEMAND_ATTACH_FS +#include "daemon_com.h" + + +#define SALVSYNC_PROTO_VERSION 1 + + +/* SALVSYNC command codes */ +#define SALVSYNC_NOP SYNC_COM_CODE_DECL(0) /* just return stats */ +#define SALVSYNC_SALVAGE SYNC_COM_CODE_DECL(1) /* schedule a salvage */ +#define SALVSYNC_CANCEL SYNC_COM_CODE_DECL(2) /* Cancel a salvage */ +#define SALVSYNC_RAISEPRIO SYNC_COM_CODE_DECL(3) /* move a salvage operation to + * the head of the work queue */ +#define SALVSYNC_QUERY SYNC_COM_CODE_DECL(4) /* query the status of a salvage */ +#define SALVSYNC_CANCELALL SYNC_COM_CODE_DECL(5) /* cancel all pending salvages */ + +/* SALVSYNC reason codes */ +#define SALVSYNC_WHATEVER SYNC_REASON_CODE_DECL(0) /* XXXX */ +#define SALVSYNC_ERROR SYNC_REASON_CODE_DECL(1) /* volume is in error state */ +#define SALVSYNC_OPERATOR SYNC_REASON_CODE_DECL(2) /* operator forced salvage */ +#define SALVSYNC_SHUTDOWN SYNC_REASON_CODE_DECL(3) /* cancel due to shutdown */ +#define SALVSYNC_NEEDED SYNC_REASON_CODE_DECL(4) /* needsSalvaged flag set */ + +/* SALVSYNC response codes */ + +/* 
SALVSYNC flags */ +#define SALVSYNC_FLAG_VOL_STATS_VALID SYNC_FLAG_CODE_DECL(0) /* volume stats in response are valid */ + +/* SALVSYNC command state fields */ +#define SALVSYNC_STATE_UNKNOWN 0 /* unknown state */ +#define SALVSYNC_STATE_QUEUED 1 /* salvage request on queue */ +#define SALVSYNC_STATE_SALVAGING 2 /* salvage is happening now */ +#define SALVSYNC_STATE_ERROR 3 /* salvage ended in an error */ +#define SALVSYNC_STATE_DONE 4 /* last salvage ended successfully */ + + +typedef struct SALVSYNC_command_hdr { + afs_uint32 prio; + afs_uint32 volume; + char partName[16]; /* partition name, e.g. /vicepa */ +} SALVSYNC_command_hdr; + +typedef struct SALVSYNC_response_hdr { + afs_int32 state; + afs_int32 prio; + afs_int32 sq_len; + afs_int32 pq_len; +} SALVSYNC_response_hdr; + +typedef struct SALVSYNC_command { + SYNC_command_hdr * hdr; + SALVSYNC_command_hdr * sop; + SYNC_command * com; +} SALVSYNC_command; + +typedef struct SALVSYNC_response { + SYNC_response_hdr * hdr; + SALVSYNC_response_hdr * sop; + SYNC_response * res; +} SALVSYNC_response; + +typedef struct SALVSYNC_command_info { + SYNC_command_hdr com; + SALVSYNC_command_hdr sop; +} SALVSYNC_command_info; + +struct SalvageQueueNode { + struct rx_queue q; + struct rx_queue hash_chain; + afs_uint32 state; + struct SALVSYNC_command_info command; + afs_int32 partition_id; + int pid; +}; + + +/* Prototypes from salvsync.c */ + +/* online salvager client interfaces */ +extern int SALVSYNC_clientFinis(void); +extern int SALVSYNC_clientInit(void); +extern int SALVSYNC_clientReconnect(void); +extern afs_int32 SALVSYNC_askSalv(SYNC_command * com, SYNC_response * res); +extern afs_int32 SALVSYNC_SalvageVolume(VolumeId volume, char *partName, int com, int reason, + afs_uint32 prio, SYNC_response * res); + +/* salvage server interfaces */ +extern void SALVSYNC_salvInit(void); +extern struct SalvageQueueNode * SALVSYNC_getWork(void); +extern void SALVSYNC_doneWork(struct SalvageQueueNode *, int result); +extern void 
SALVSYNC_doneWorkByPid(int pid, int result); + +#endif /* AFS_DEMAND_ATTACH_FS */ + +#endif /* _AFS_VOL_SALVSYNC_H */ diff --git a/src/vol/test/listVicepx.c b/src/vol/test/listVicepx.c index 7cb53d7d42..7e9307ee1a 100644 --- a/src/vol/test/listVicepx.c +++ b/src/vol/test/listVicepx.c @@ -102,6 +102,7 @@ RCSID #include "afs/assert.h" #include "filesignal.h" #include "vutils.h" +#include "daemon_com.h" #include "fssync.h" #include #include diff --git a/src/vol/test/updateDirInode.c b/src/vol/test/updateDirInode.c index 1ebbcda15c..ff2d6b27d0 100644 --- a/src/vol/test/updateDirInode.c +++ b/src/vol/test/updateDirInode.c @@ -102,6 +102,7 @@ RCSID #include "afs/assert.h" #include "filesignal.h" #include "vutils.h" +#include "daemon_com.h" #include "fssync.h" #include #include diff --git a/src/vol/vnode.c b/src/vol/vnode.c index c9a6c0c58c..75e90bd6ac 100644 --- a/src/vol/vnode.c +++ b/src/vol/vnode.c @@ -5,6 +5,8 @@ * This software has been released under the terms of the IBM Public * License. 
For details, see the LICENSE file in the top-level source * directory or online at http://www.openafs.org/dl/license10.html + * + * Portions Copyright (c) 2006 Sine Nomine Associates */ /* @@ -46,6 +48,7 @@ RCSID #include "vnode.h" #include "volume.h" #include "partition.h" +#include "salvsync.h" #if defined(AFS_SGI_ENV) #include "sys/types.h" #include "fcntl.h" @@ -73,8 +76,8 @@ RCSID struct VnodeClassInfo VnodeClassInfo[nVNODECLASSES]; private int moveHash(register Vnode * vnp, bit32 newHash); -void StickOnLruChain_r(register Vnode * vnp, - register struct VnodeClassInfo *vcp); +private void StickOnLruChain_r(register Vnode * vnp, + register struct VnodeClassInfo *vcp); #define BAD_IGET -1000 @@ -162,6 +165,83 @@ private Vnode *VnodeHashTable[VNODE_HASH_TABLE_SIZE]; #define VNODE_HASH(volumeptr,vnodenumber)\ ((volumeptr->vnodeHashOffset + vnodenumber)&(VNODE_HASH_TABLE_SIZE-1)) +/* + * new support to secondarily hash vnodes by volume id + */ +#define VNVOLUME_HASH(volumeId) (volumeId&(VolumeHashTable.Mask)) + +#include "rx/rx_queue.h" +typedef struct VnodeHashByVolumeChainHead { + struct rx_queue queue; + int len; + /* someday we could put a per-chain lock here... 
*/ +#ifdef AFS_DEMAND_ATTACH_FS + int busy; + pthread_cond_t chain_busy_cv; +#endif /* AFS_DEMAND_ATTACH_FS */ +} VnodeHashByVolumeChainHead; +private VnodeHashByVolumeChainHead *VnodeHashByVolumeTable = NULL; + +void +VInitVnHashByVolume(void) +{ + register int i; + + VnodeHashByVolumeTable = (VnodeHashByVolumeChainHead *) calloc(VolumeHashTable.Size, + sizeof(VnodeHashByVolumeChainHead)); + assert(VnodeHashByVolumeTable != NULL); + + for (i=0; i < VolumeHashTable.Size; i++) { + queue_Init(&VnodeHashByVolumeTable[i]); +#ifdef AFS_DEMAND_ATTACH_FS + assert(pthread_cond_init(&VnodeHashByVolumeTable[i].chain_busy_cv, NULL) == 0); +#endif /* AFS_DEMAND_ATTACH_FS */ + } +} + +static void +AddToVnHashByVolumeTable(register Vnode * vnp) +{ + VnodeHashByVolumeChainHead * head; + + if (queue_IsOnQueue(vnp)) + return; + + head = &VnodeHashByVolumeTable[VNVOLUME_HASH(vnp->volumePtr->hashid)]; + +#ifdef AFS_DEMAND_ATTACH_FS + while (head->busy) { + /* if the hash table is busy, wait */ + assert(pthread_cond_wait(&head->chain_busy_cv, &vol_glock_mutex) == 0); + } +#endif /* AFS_DEMAND_ATTACH_FS */ + + head->len++; + queue_Append(head, vnp); +} + +/* for demand-attach, caller MUST hold a ref count on vp */ +static void +DeleteFromVnHashByVolumeTable(register Vnode * vnp) +{ + VnodeHashByVolumeChainHead * head; + + if (!queue_IsOnQueue(vnp)) + return; + + head = &VnodeHashByVolumeTable[VNVOLUME_HASH(vnp->volumePtr->hashid)]; + +#ifdef AFS_DEMAND_ATTACH_FS + while (head->busy) { + /* if the hash table is busy, wait */ + assert(pthread_cond_wait(&head->chain_busy_cv, &vol_glock_mutex) == 0); + } +#endif /* AFS_DEMAND_ATTACH_FS */ + + head->len--; + queue_Remove(vnp); +} + /* Code to invalidate a vnode entry. Called when we've damaged a vnode, and want to prevent future VGetVnode's from applying to it. Leaves it in the same hash bucket but that shouldn't be important. 
*/ @@ -305,7 +385,7 @@ VAllocVnode_r(Error * ec, Volume * vp, VnodeType type) unique = vp->nextVnodeUnique++; if (vp->nextVnodeUnique > V_uniquifier(vp)) { - VUpdateVolume_r(ec, vp); + VUpdateVolume_r(ec, vp, VOL_UPDATE_WAIT); if (*ec) return NULL; } @@ -317,7 +397,8 @@ VAllocVnode_r(Error * ec, Volume * vp, VnodeType type) } /* Find a slot in the bit map */ - bitNumber = VAllocBitmapEntry_r(ec, vp, &vp->vnodeIndex[class]); + bitNumber = VAllocBitmapEntry_r(ec, vp, &vp->vnodeIndex[class], + VOL_ALLOC_BITMAP_WAIT); if (*ec) return NULL; vnodeNumber = bitNumberToVnodeNumber(bitNumber, class); @@ -376,7 +457,6 @@ VAllocVnode_r(Error * ec, Volume * vp, VnodeType type) vnp->volumePtr = vp; vnp->cacheCheck = vp->cacheCheck; vnp->nUsers = 1; - moveHash(vnp, newHash); /* This will never block */ ObtainWriteLock(&vnp->lock); #ifdef AFS_PTHREAD_ENV @@ -391,18 +471,33 @@ VAllocVnode_r(Error * ec, Volume * vp, VnodeType type) FdHandle_t *fdP; off_t off = vnodeIndexOffset(vcp, vnodeNumber); + /* XXX we have a potential race here if two threads + * allocate new vnodes at the same time, and they + * both decide it's time to extend the index + * file size... 
*/ + VOL_UNLOCK; fdP = IH_OPEN(ihP); - if (fdP == NULL) - Abort("VAllocVnode: can't open index file!\n"); - if ((size = FDH_SIZE(fdP)) < 0) - Abort("VAllocVnode: can't stat index file!\n"); - if (FDH_SEEK(fdP, off, SEEK_SET) < 0) - Abort("VAllocVnode: can't seek on index file!\n"); - if (off < size) { - if (FDH_READ(fdP, &vnp->disk, vcp->diskSize) == vcp->diskSize) { - if (vnp->disk.type != vNull) - Abort("VAllocVnode: addled bitmap or index!\n"); + if (fdP == NULL) { + Log("VAllocVnode: can't open index file!\n"); + goto error_encountered; + } + if ((size = FDH_SIZE(fdP)) < 0) { + Log("VAllocVnode: can't stat index file!\n"); + goto error_encountered; + } + if (FDH_SEEK(fdP, off, SEEK_SET) < 0) { + Log("VAllocVnode: can't seek on index file!\n"); + goto error_encountered; + } + if (off + vcp->diskSize <= size) { + if (FDH_READ(fdP, &vnp->disk, vcp->diskSize) != vcp->diskSize) { + Log("VAllocVnode: can't read index file!\n"); + goto error_encountered; + } + if (vnp->disk.type != vNull) { + Log("VAllocVnode: addled bitmap or index!\n"); + goto error_encountered; } } else { /* growing file - grow in a reasonable increment */ @@ -414,9 +509,28 @@ VAllocVnode_r(Error * ec, Volume * vp, VnodeType type) free(buf); } FDH_CLOSE(fdP); + fdP = NULL; VOL_LOCK; + goto sane; + + error_encountered: +#ifdef AFS_DEMAND_ATTACH_FS + VOL_LOCK; + VRequestSalvage_r(vp, SALVSYNC_ERROR, 0); + if (fdP) + FDH_CLOSE(fdP); + VInvalidateVnode_r(vnp); + StickOnLruChain_r(vnp, vcp); + return NULL; +#else + assert(1 == 2); +#endif + } + sane: VNLog(4, 2, vnodeNumber, (afs_int32) vnp); + AddToVnHashByVolumeTable(vnp); + moveHash(vnp, newHash); } VNLog(5, 1, (afs_int32) vnp); @@ -510,6 +624,8 @@ VGetVnode_r(Error * ec, Volume * vp, VnodeId vnodeNumber, int locktype) vcp->reads++; vnp = VGetFreeVnode_r(vcp); /* Remove it from the old hash chain */ + if (vnp->volumePtr) + DeleteFromVnHashByVolumeTable(vnp); moveHash(vnp, newHash); /* Remove it from the LRU chain */ if (vnp == vcp->lruHead) @@ -525,6 
+641,7 @@ VGetVnode_r(Error * ec, Volume * vp, VnodeId vnodeNumber, int locktype) vnp->volumePtr = vp; vnp->cacheCheck = vp->cacheCheck; vnp->nUsers = 1; + AddToVnHashByVolumeTable(vnp); /* This will never block */ ObtainWriteLock(&vnp->lock); @@ -540,11 +657,21 @@ VGetVnode_r(Error * ec, Volume * vp, VnodeId vnodeNumber, int locktype) if (fdP == NULL) { Log("VGetVnode: can't open index dev=%u, i=%s\n", vp->device, PrintInode(NULL, vp->vnodeIndex[class].handle->ih_ino)); +#ifdef AFS_DEMAND_ATTACH_FS + VOL_LOCK; + VRequestSalvage_r(vp, SALVSYNC_ERROR, 0); + VOL_UNLOCK; +#endif *ec = VIO; mlkReason = 9; } else if (FDH_SEEK(fdP, vnodeIndexOffset(vcp, vnodeNumber), SEEK_SET) < 0) { Log("VGetVnode: can't seek on index file vn=%u\n", vnodeNumber); +#ifdef AFS_DEMAND_ATTACH_FS + VOL_LOCK; + VRequestSalvage_r(vp, SALVSYNC_ERROR, 0); + VOL_UNLOCK; +#endif *ec = VIO; mlkReason = 10; FDH_REALLYCLOSE(fdP); @@ -564,8 +691,18 @@ VGetVnode_r(Error * ec, Volume * vp, VnodeId vnodeNumber, int locktype) * is not allocated */ if (n == -1 && errno == EIO) { Log("VGetVnode: Couldn't read vnode %u, volume %u (%s); volume needs salvage\n", vnodeNumber, V_id(vp), V_name(vp)); - VForceOffline_r(vp); +#ifdef AFS_DEMAND_ATTACH_FS + if (programType == fileServer) { + VRequestSalvage_r(vp, SALVSYNC_ERROR, 0); + *ec = VSALVAGING; + } else { + VForceOffline_r(vp, 0); + *ec = VSALVAGE; + } +#else + VForceOffline_r(vp, 0); *ec = VSALVAGE; +#endif mlkReason = 4; } else { mlkReason = 5; @@ -603,9 +740,19 @@ VGetVnode_r(Error * ec, Volume * vp, VnodeId vnodeNumber, int locktype) *ec = VNOVNODE; } else { Log("VGetVnode: Bad magic number, vnode %u, volume %u (%s); volume needs salvage\n", vnodeNumber, V_id(vp), V_name(vp)); +#ifdef AFS_DEMAND_ATTACH_FS + if (programType == fileServer) { + VRequestSalvage_r(vp, SALVSYNC_ERROR, 0); + *ec = VSALVAGING; + } else { + vp->goingOffline = 1; + *ec = VSALVAGE; + } +#else vp->goingOffline = 1; /* used to call VOffline, but that would mess * up the volume ref 
count if called here */ *ec = VSALVAGE; +#endif mlkReason = 7; } VInvalidateVnode_r(vnp); @@ -728,20 +875,27 @@ VPutVnode_r(Error * ec, register Vnode * vnp) /* The vnode has been changed. Write it out to disk */ if (!V_inUse(vp)) { +#ifdef AFS_DEMAND_ATTACH_FS + VRequestSalvage_r(vp, SALVSYNC_ERROR, 0); + *ec = VSALVAGING; +#else assert(V_needsSalvaged(vp)); *ec = VSALVAGE; +#endif } else { IHandle_t *ihP = vp->vnodeIndex[class].handle; FdHandle_t *fdP; VOL_UNLOCK; fdP = IH_OPEN(ihP); - if (fdP == NULL) - Abort("VPutVnode: can't open index file!\n"); + if (fdP == NULL) { + Log("VPutVnode: can't open index file!\n"); + goto error_encountered; + } offset = vnodeIndexOffset(vcp, vnp->vnodeNumber); if (FDH_SEEK(fdP, offset, SEEK_SET) < 0) { - Abort - ("VPutVnode: can't seek on index file! fdp=0x%x offset=%d, errno=%d\n", - fdP, offset, errno); + Log("VPutVnode: can't seek on index file! fdp=0x%x offset=%d, errno=%d\n", + fdP, offset, errno); + goto error_encountered; } code = FDH_WRITE(fdP, &vnp->disk, vcp->diskSize); if (code != vcp->diskSize) { @@ -756,8 +910,13 @@ VPutVnode_r(Error * ec, register Vnode * vnp) *ec = VIO; } else { Log("VPutVnode: Couldn't write vnode %u, volume %u (%s) (error %d)\n", vnp->vnodeNumber, V_id(vnp->volumePtr), V_name(vnp->volumePtr), code); - VForceOffline_r(vp); +#ifdef AFS_DEMAND_ATTACH_FS + VRequestSalvage_r(vp, SALVSYNC_ERROR, 0); + *ec = VSALVAGING; +#else + VForceOffline_r(vp, 0); *ec = VSALVAGE; +#endif } VOL_UNLOCK; FDH_REALLYCLOSE(fdP); @@ -765,6 +924,23 @@ VPutVnode_r(Error * ec, register Vnode * vnp) FDH_CLOSE(fdP); } VOL_LOCK; + goto sane; + + error_encountered: +#ifdef AFS_DEMAND_ATTACH_FS + /* XXX instead of dumping core, let's try to request a salvage + * and just fail the putvnode */ + if (fdP) + FDH_CLOSE(fdP); + VOL_LOCK; + VRequestSalvage_r(vp, SALVSYNC_ERROR, 0); + *ec = VSALVAGING; + goto done; +#else + assert(1 == 2); +#endif + + sane: /* If the vnode is to be deleted, and we wrote the vnode out, * free its bitmap 
entry. Do after the vnode is written so we * don't allocate from bitmap before the vnode is written @@ -787,6 +963,7 @@ VPutVnode_r(Error * ec, register Vnode * vnp) vnp); } + done: /* Do not look at disk portion of vnode after this point; it may * have been deleted above */ if (vnp->nUsers-- == 1) @@ -865,19 +1042,28 @@ VVnodeWriteToRead_r(Error * ec, register Vnode * vnp) /* The inode has been changed. Write it out to disk */ if (!V_inUse(vp)) { +#ifdef AFS_DEMAND_ATTACH_FS + VRequestSalvage_r(vp, SALVSYNC_ERROR, 0); + *ec = VSALVAGING; +#else assert(V_needsSalvaged(vp)); *ec = VSALVAGE; +#endif } else { IHandle_t *ihP = vp->vnodeIndex[class].handle; FdHandle_t *fdP; off_t off = vnodeIndexOffset(vcp, vnp->vnodeNumber); VOL_UNLOCK; fdP = IH_OPEN(ihP); - if (fdP == NULL) - Abort("VPutVnode: can't open index file!\n"); + if (fdP == NULL) { + Log("VPutVnode: can't open index file!\n"); + goto error_encountered; + } code = FDH_SEEK(fdP, off, SEEK_SET); - if (code < 0) - Abort("VPutVnode: can't seek on index file!\n"); + if (code < 0) { + Log("VPutVnode: can't seek on index file!\n"); + goto error_encountered; + } code = FDH_WRITE(fdP, &vnp->disk, vcp->diskSize); if (code != vcp->diskSize) { /* @@ -892,14 +1078,33 @@ VVnodeWriteToRead_r(Error * ec, register Vnode * vnp) *ec = VIO; } else { Log("VPutVnode: Couldn't write vnode %u, volume %u (%s)\n", vnp->vnodeNumber, V_id(vnp->volumePtr), V_name(vnp->volumePtr)); - VForceOffline_r(vp); +#ifdef AFS_DEMAND_ATTACH_FS + VRequestSalvage_r(vp, SALVSYNC_ERROR, 0); + *ec = VSALVAGING; +#else + VForceOffline_r(vp, 0); *ec = VSALVAGE; +#endif } VOL_UNLOCK; } FDH_CLOSE(fdP); VOL_LOCK; + goto sane; + + error_encountered: +#ifdef AFS_DEMAND_ATTACH_FS + if (fdP) + FDH_CLOSE(fdP); + VOL_LOCK; + VRequestSalvage_r(vp, SALVSYNC_ERROR, 0); + *ec = VSALVAGING; +#else + assert(1 == 2); +#endif + } + sane: vcp->writes++; vnp->changed_newTime = vnp->changed_oldTime = 0; } @@ -931,7 +1136,7 @@ moveHash(register Vnode * vnp, bit32 newHash) 
return 0; } -void +private void StickOnLruChain_r(register Vnode * vnp, register struct VnodeClassInfo *vcp) { /* Add it to the circular LRU list */ @@ -950,8 +1155,10 @@ StickOnLruChain_r(register Vnode * vnp, register struct VnodeClassInfo *vcp) vcp->lruHead = vnp->lruNext; /* If caching is turned off, set volumeptr to NULL to invalidate the * entry */ - if (!TrustVnodeCacheEntry) + if (!TrustVnodeCacheEntry) { + DeleteFromVnHashByVolumeTable(vnp); vnp->volumePtr = NULL; + } } /* VCloseVnodeFiles - called when a volume is going off line. All open @@ -962,15 +1169,30 @@ void VCloseVnodeFiles_r(Volume * vp) { int i; - Vnode *vnp; + Vnode *vnp, *nvnp; + VnodeHashByVolumeChainHead * head; - for (i = 0; i < VNODE_HASH_TABLE_SIZE; i++) { - for (vnp = VnodeHashTable[i]; vnp; vnp = vnp->hashNext) { - if (vnp->volumePtr == vp) { - IH_REALLYCLOSE(vnp->handle); - } + head = &VnodeHashByVolumeTable[VNVOLUME_HASH(vp->hashid)]; +#ifdef AFS_DEMAND_ATTACH_FS + while (head->busy) { + assert(pthread_cond_wait(&head->chain_busy_cv, &vol_glock_mutex) == 0); + } + + head->busy = 1; + VOL_UNLOCK; +#endif /* AFS_DEMAND_ATTACH_FS */ + + for (queue_Scan(head, vnp, nvnp, Vnode)) { + if (vnp->volumePtr == vp) { + IH_REALLYCLOSE(vnp->handle); } } + +#ifdef AFS_DEMAND_ATTACH_FS + VOL_LOCK; + head->busy = 0; + assert(pthread_cond_broadcast(&head->chain_busy_cv) == 0); +#endif /* AFS_DEMAND_ATTACH_FS */ } /* VReleaseVnodeFiles - called when a volume is going detached. 
All open @@ -981,13 +1203,29 @@ void VReleaseVnodeFiles_r(Volume * vp) { int i; - Vnode *vnp; + Vnode *vnp, *nvnp; + VnodeHashByVolumeChainHead * head; - for (i = 0; i < VNODE_HASH_TABLE_SIZE; i++) { - for (vnp = VnodeHashTable[i]; vnp; vnp = vnp->hashNext) { - if (vnp->volumePtr == vp) { - IH_RELEASE(vnp->handle); - } + head = &VnodeHashByVolumeTable[VNVOLUME_HASH(vp->hashid)]; + +#ifdef AFS_DEMAND_ATTACH_FS + while (head->busy) { + assert(pthread_cond_wait(&head->chain_busy_cv, &vol_glock_mutex) == 0); + } + + head->busy = 1; + VOL_UNLOCK; +#endif /* AFS_DEMAND_ATTACH_FS */ + + for (queue_Scan(head, vnp, nvnp, Vnode)) { + if (vnp->volumePtr == vp) { + IH_RELEASE(vnp->handle); } } + +#ifdef AFS_DEMAND_ATTACH_FS + VOL_LOCK; + head->busy = 0; + assert(pthread_cond_broadcast(&head->chain_busy_cv) == 0); +#endif /* AFS_DEMAND_ATTACH_FS */ } diff --git a/src/vol/vnode.h b/src/vol/vnode.h index 9446f79320..618cb83635 100644 --- a/src/vol/vnode.h +++ b/src/vol/vnode.h @@ -118,6 +118,7 @@ typedef struct VnodeDiskObject { #define SIZEOF_LARGEDISKVNODE 256 typedef struct Vnode { + struct rx_queue vid_hash; /* for vnode by volume id hash */ struct Vnode *hashNext; /* Next vnode on hash conflict chain */ struct Vnode *lruNext; /* Less recently used vnode than this one */ struct Vnode *lruPrev; /* More recently used vnode than this one */ @@ -216,3 +217,4 @@ extern Vnode *VAllocVnode(Error * ec, struct Volume *vp, VnodeType type); extern Vnode *VAllocVnode_r(Error * ec, struct Volume *vp, VnodeType type); /*extern VFreeVnode();*/ extern Vnode *VGetFreeVnode_r(struct VnodeClassInfo *vcp); +extern void VInitVnHashByVolume(void); diff --git a/src/vol/vol-salvage.c b/src/vol/vol-salvage.c index 04eb2694f9..eaaf6b96e2 100644 --- a/src/vol/vol-salvage.c +++ b/src/vol/vol-salvage.c @@ -83,11 +83,6 @@ Vnodes with 0 inode pointers in RW volumes are now deleted. */ -#define SalvageVersion "2.4" - -/* Main program file. Define globals. 
*/ -#define MAIN 1 - #include #include @@ -186,10 +181,13 @@ RCSID #include "vnode.h" #include "volume.h" #include "partition.h" +#include "daemon_com.h" #include "fssync.h" +#include "salvsync.h" #include "viceinode.h" #include "salvage.h" #include "volinodes.h" /* header magic number, etc. stuff */ +#include "vol-salvage.h" #ifdef AFS_NT40_ENV #include #endif @@ -221,10 +219,6 @@ extern void *calloc(); #endif static char *TimeStamp(time_t clock, int precision); -#define ORPH_IGNORE 0 -#define ORPH_REMOVE 1 -#define ORPH_ATTACH 2 - int debug; /* -d flag */ int Testing = 0; /* -n flag */ @@ -251,7 +245,7 @@ int OKToZap; /* -o flag */ int ForceSalvage; /* If salvage should occur despite the DONT_SALVAGE flag * in the volume header */ -static FILE *logFile = 0; /* one of {/usr/afs/logs,/vice/file}/SalvageLog */ +FILE *logFile = 0; /* one of {/usr/afs/logs,/vice/file}/SalvageLog */ #define ROOTINODE 2 /* Root inode of a 4.2 Unix file system * partition */ @@ -279,201 +273,30 @@ int VolumeChanged; /* Set by any routine which would change the volume in VolumeDiskData VolInfo; /* A copy of the last good or salvaged volume header dealt with */ -struct InodeSummary { /* Inode summary file--an entry for each - * volume in the inode file for a partition */ - VolId volumeId; /* Volume id */ - VolId RWvolumeId; /* RW volume associated */ - int index; /* index into inode file (0, 1, 2 ...) */ - int nInodes; /* Number of inodes for this volume */ - int nSpecialInodes; /* Number of special inodes, i.e. volume - * header, index, etc. These are all - * marked (viceinode.h) and will all be sorted - * to the beginning of the information for - * this volume. Read-only volumes should - * ONLY have special inodes (all the other - * inodes look as if they belong to the - * original RW volume). */ - Unique maxUniquifier; /* The maximum uniquifier found in all the inodes. 
- * This is only useful for RW volumes and is used - * to compute a new volume uniquifier in the event - * that the header needs to be recreated. The inode - * uniquifier may be a truncated version of vnode - * uniquifier (AFS_3DISPARES). The real maxUniquifer - * is from the vnodes and later calcuated from it */ - struct VolumeSummary *volSummary; - /* Either a pointer to the original volume - * header summary, or constructed summary - * information */ -} *inodeSummary; -#define readOnly(isp) ((isp)->volumeId != (isp)->RWvolumeId) int nVolumesInInodeFile; /* Number of read-write volumes summarized */ int inodeFd; /* File descriptor for inode file */ -struct VolumeSummary { /* Volume summary an entry for each - * volume in a volume directory. - * Assumption: one volume directory per - * partition */ - char *fileName; /* File name on the partition for the volume - * header */ - struct VolumeHeader header; - /* volume number, rw volume number, inode - * numbers of each major component of - * the volume */ - IHandle_t *volumeInfoHandle; - byte wouldNeedCallback; /* set if the file server should issue - * call backs for all the files in this volume when - * the volume goes back on line */ -}; - -struct VnodeInfo { - IHandle_t *handle; /* Inode containing this index */ - int nVnodes; /* Total number of vnodes in index */ - int nAllocatedVnodes; /* Total number actually used */ - int volumeBlockCount; /* Total number of blocks used by volume */ - Inode *inodes; /* Directory only */ - struct VnodeEssence { - short count; /* Number of references to vnode; MUST BE SIGNED */ - unsigned claimed:1; /* Set when a parent directory containing an entry - * referencing this vnode is found. The claim - * is that the parent in "parent" can point to - * this vnode, and no other */ - unsigned changed:1; /* Set if any parameters (other than the count) - * in the vnode change. 
It is determined if the - * link count has changed by noting whether it is - * 0 after scanning all directories */ - unsigned salvaged:1; /* Set if this directory vnode has already been salvaged. */ - unsigned todelete:1; /* Set if this vnode is to be deleted (should not be claimed) */ - afs_fsize_t blockCount; - /* Number of blocks (1K) used by this vnode, - * approximately */ - VnodeId parent; /* parent in vnode */ - Unique unique; /* Must match entry! */ - char *name; /* Name of directory entry */ - int modeBits; /* File mode bits */ - Inode InodeNumber; /* file's inode */ - int type; /* File type */ - int author; /* File author */ - int owner; /* File owner */ - int group; /* File group */ - } *vnodes; -} vnodeInfo[nVNODECLASSES]; - -struct DirSummary { - struct DirHandle dirHandle; - VnodeId vnodeNumber; - Unique unique; - unsigned haveDot, haveDotDot; - VolumeId rwVid; - int copied; /* If the copy-on-write stuff has been applied */ - VnodeId parent; - char *name; - char *vname; - IHandle_t *ds_linkH; -}; +struct VnodeInfo vnodeInfo[nVNODECLASSES]; struct VolumeSummary *volumeSummaryp; /* Holds all the volumes in a part */ int nVolumes; /* Number of volumes (read-write and read-only) * in volume summary */ -#ifdef AFS_NT40_ENV -/* For NT, we can fork the per partition salvagers to gain the required - * safety against Aborts. But there's too many complex data structures at - * the per volume salvager layer to easilty copy the data across. - * childJobNumber is resset from -1 to the job number if this is a - * per partition child of the main salvager. This information is passed - * out-of-band in the extra data area setup for the now unused parent/child - * data transfer. - */ -#define SALVAGER_MAGIC 0x00BBaaDD -#define NOT_CHILD -1 /* job numbers start at 0 */ -/* If new options need to be passed to child, add them here. 
*/ -typedef struct { - int cj_magic; - int cj_number; - char cj_part[32]; -} childJob_t; +extern char * tmpdir = 0; + +#ifdef AFS_NT40_ENV /* Child job this process is running. */ childJob_t myjob = { SALVAGER_MAGIC, NOT_CHILD, "" }; - -int nt_SalvagePartition(char *partName, int jobn); -int nt_SetupPartitionSalvage(void *datap, int len); - -typedef struct { - struct InodeSummary *svgp_inodeSummaryp; - int svgp_count; -} SVGParms_t; -#define canfork 0 -#else -#define canfork 1 -#endif +#endif /* AFS_NT40_ENV */ /* Forward declarations */ /*@printflike@*/ void Log(const char *format, ...); /*@printflike@*/ void Abort(const char *format, ...); -void Exit(int code); -int Fork(void); -int Wait(char *prog); -char *ToString(char *s); -void AskOffline(VolumeId volumeId); -void AskOnline(VolumeId volumeId, char *partition); -void CheckLogFile(void); -#ifndef AFS_NT40_ENV -void TimeStampLogFile(void); -#endif -void ClearROInUseBit(struct VolumeSummary *summary); -void CopyAndSalvage(register struct DirSummary *dir); -int CopyInode(Device device, Inode inode1, Inode inode2, int rwvolume); -void CopyOnWrite(register struct DirSummary *dir); -void CountVolumeInodes(register struct ViceInodeInfo *ip, int maxInodes, - register struct InodeSummary *summary); -void DeleteExtraVolumeHeaderFile(register struct VolumeSummary *vsp); -void DistilVnodeEssence(VolumeId vid, VnodeClass class, Inode ino, - Unique * maxu); -int GetInodeSummary(char *path, VolumeId singleVolumeNumber); -void GetVolumeSummary(VolumeId singleVolumeNumber); -void JudgeEntry(struct DirSummary *dir, char *name, VnodeId vnodeNumber, - Unique unique); -void MaybeZapVolume(register struct InodeSummary *isp, char *message, - int deleteMe, int check); -void ObtainSalvageLock(void); -void PrintInodeList(void); -void PrintInodeSummary(void); -void PrintVolumeSummary(void); -int QuickCheck(register struct InodeSummary *isp, int nVols); -void RemoveTheForce(char *path); -void SalvageDir(char *name, VolumeId rwVid, struct 
VnodeInfo *dirVnodeInfo, - IHandle_t * alinkH, int i, struct DirSummary *rootdir, - int *rootdirfound); -void SalvageFileSysParallel(struct DiskPartition *partP); -void SalvageFileSys(struct DiskPartition *partP, VolumeId singleVolumeNumber); -void SalvageFileSys1(struct DiskPartition *partP, - VolumeId singleVolumeNumber); -int SalvageHeader(register struct stuff *sp, struct InodeSummary *isp, - int check, int *deleteMe); -int SalvageIndex(Inode ino, VnodeClass class, int RW, - register struct ViceInodeInfo *ip, int nInodes, - struct VolumeSummary *volSummary, int check); -int SalvageVnodes(register struct InodeSummary *rwIsp, - register struct InodeSummary *thisIsp, - register struct ViceInodeInfo *inodes, int check); -int SalvageVolume(register struct InodeSummary *rwIsp, IHandle_t * alinkH); -void DoSalvageVolumeGroup(register struct InodeSummary *isp, int nVols); -#ifdef AFS_NT40_ENV -void SalvageVolumeGroup(register struct InodeSummary *isp, int nVols); -#else -#define SalvageVolumeGroup DoSalvageVolumeGroup -#endif -int SalvageVolumeHeaderFile(register struct InodeSummary *isp, - register struct ViceInodeInfo *inodes, int RW, - int check, int *deleteMe); -void showlog(void); -int UseTheForceLuke(char *path); - static int IsVnodeOrphaned(VnodeId vnode); /* Uniquifier stored in the Inode */ @@ -500,207 +323,6 @@ BadError(register int aerror) } -char *tmpdir = 0; -static int -handleit(struct cmd_syndesc *as) -{ - register struct cmd_item *ti; - char pname[100], *temp; - afs_int32 seenpart = 0, seenvol = 0, vid = 0, seenany = 0; - struct DiskPartition *partP; - -#ifdef AFS_SGI_VNODE_GLUE - if (afs_init_kernel_config(-1) < 0) { - printf - ("Can't determine NUMA configuration, not starting salvager.\n"); - exit(1); - } -#endif - -#ifdef FAST_RESTART - { - afs_int32 i; - for (i = 0; i < CMD_MAXPARMS; i++) { - if (as->parms[i].items) { - seenany = 1; - break; - } - } - } - if (!seenany) { - char *msg = - "Exiting immediately without salvage. 
Look into the FileLog to find volumes which really need to be salvaged!"; - - if (useSyslog) - Log(msg); - else - printf("%s\n", msg); - - Exit(0); - } -#endif /* FAST_RESTART */ - if ((ti = as->parms[0].items)) { /* -partition */ - seenpart = 1; - strncpy(pname, ti->data, 100); - } - if ((ti = as->parms[1].items)) { /* -volumeid */ - if (!seenpart) { - printf - ("You must also specify '-partition' option with the '-volumeid' option\n"); - exit(-1); - } - seenvol = 1; - vid = atoi(ti->data); - } - if (as->parms[2].items) /* -debug */ - debug = 1; - if (as->parms[3].items) /* -nowrite */ - Testing = 1; - if (as->parms[4].items) /* -inodes */ - ListInodeOption = 1; - if (as->parms[5].items) /* -force */ - ForceSalvage = 1; - if (as->parms[6].items) /* -oktozap */ - OKToZap = 1; - if (as->parms[7].items) /* -rootinodes */ - ShowRootFiles = 1; - if (as->parms[8].items) /* -RebuildDirs */ - RebuildDirs = 1; - if (as->parms[9].items) /* -ForceReads */ - forceR = 1; - if ((ti = as->parms[10].items)) { /* -Parallel # */ - temp = ti->data; - if (strncmp(temp, "all", 3) == 0) { - PartsPerDisk = 1; - temp += 3; - } - if (strlen(temp) != 0) { - Parallel = atoi(temp); - if (Parallel < 1) - Parallel = 1; - if (Parallel > MAXPARALLEL) { - printf("Setting parallel salvages to maximum of %d \n", - MAXPARALLEL); - Parallel = MAXPARALLEL; - } - } - } - if ((ti = as->parms[11].items)) { /* -tmpdir */ - DIR *dirp; - - tmpdir = ti->data; - dirp = opendir(tmpdir); - if (!dirp) { - printf - ("Can't open temporary placeholder dir %s; using current partition \n", - tmpdir); - tmpdir = NULL; - } else - closedir(dirp); - } - if ((ti = as->parms[12].items)) /* -showlog */ - ShowLog = 1; - if ((ti = as->parms[13].items)) { /* -log */ - Testing = 1; - ShowSuid = 1; - Showmode = 1; - } - if ((ti = as->parms[14].items)) { /* -showmounts */ - Testing = 1; - Showmode = 1; - ShowMounts = 1; - } - if ((ti = as->parms[15].items)) { /* -orphans */ - if (Testing) - orphans = ORPH_IGNORE; - else if 
(strcmp(ti->data, "remove") == 0 - || strcmp(ti->data, "r") == 0) - orphans = ORPH_REMOVE; - else if (strcmp(ti->data, "attach") == 0 - || strcmp(ti->data, "a") == 0) - orphans = ORPH_ATTACH; - } -#ifndef AFS_NT40_ENV /* ignore options on NT */ - if ((ti = as->parms[16].items)) { /* -syslog */ - useSyslog = 1; - ShowLog = 0; - } - if ((ti = as->parms[17].items)) { /* -syslogfacility */ - useSyslogFacility = atoi(ti->data); - } - - if ((ti = as->parms[18].items)) { /* -datelogs */ - TimeStampLogFile(); - } -#endif - -#ifdef FAST_RESTART - if (ti = as->parms[19].items) { /* -DontSalvage */ - char *msg = - "Exiting immediately without salvage. Look into the FileLog to find volumes which really need to be salvaged!"; - - if (useSyslog) - Log(msg); - else - printf("%s\n", msg); - Exit(0); - } -#endif /* FAST_RESTART */ - - /* Note: if seemvol we initialize this as a standard volume utility: this has the - * implication that the file server may be running; negotations have to be made with - * the file server in this case to take the read write volume and associated read-only - * volumes off line before salvaging */ -#ifdef AFS_NT40_ENV - if (seenvol) { - if (afs_winsockInit() < 0) { - ReportErrorEventAlt(AFSEVT_SVR_WINSOCK_INIT_FAILED, 0, - AFSDIR_SALVAGER_FILE, 0); - Log("Failed to initailize winsock, exiting.\n"); - Exit(1); - } - } -#endif - VInitVolumePackage(seenvol ? 
volumeUtility : salvager, 5, 5, - DONT_CONNECT_FS, 0); - DInit(10); -#ifdef AFS_NT40_ENV - if (myjob.cj_number != NOT_CHILD) { - if (!seenpart) { - seenpart = 1; - (void)strcpy(pname, myjob.cj_part); - } - } -#endif - if (seenpart == 0) { - for (partP = DiskPartitionList; partP; partP = partP->next) { - SalvageFileSysParallel(partP); - } - SalvageFileSysParallel(0); - } else { - partP = VGetPartition(pname, 0); - if (!partP) { - Log("salvage: Unknown or unmounted partition %s; salvage aborted\n", pname); - Exit(1); - } - if (!seenvol) - SalvageFileSys(partP, 0); - else { - /* Salvage individual volume */ - if (vid <= 0) { - Log("salvage: invalid volume id specified; salvage aborted\n"); - Exit(1); - } - SalvageFileSys(partP, vid); - } - } - return (0); -} - - -#ifndef AFS_NT40_ENV -#include "AFS_component_version_number.c" -#endif #define MAX_ARGS 128 #ifdef AFS_NT40_ENV char *save_args[MAX_ARGS]; @@ -708,143 +330,6 @@ int n_save_args = 0; pthread_t main_thread; #endif -int -main(int argc, char **argv) -{ - struct cmd_syndesc *ts; - int err = 0; - char commandLine[150]; - - int i; - extern char cml_version_number[]; - -#ifdef AFS_AIX32_ENV - /* - * The following signal action for AIX is necessary so that in case of a - * crash (i.e. core is generated) we can include the user's data section - * in the core dump. Unfortunately, by default, only a partial core is - * generated which, in many cases, isn't too useful. 
- */ - struct sigaction nsa; - - sigemptyset(&nsa.sa_mask); - nsa.sa_handler = SIG_DFL; - nsa.sa_flags = SA_FULLDUMP; - sigaction(SIGABRT, &nsa, NULL); - sigaction(SIGSEGV, &nsa, NULL); -#endif - - /* Initialize directory paths */ - if (!(initAFSDirPath() & AFSDIR_SERVER_PATHS_OK)) { -#ifdef AFS_NT40_ENV - ReportErrorEventAlt(AFSEVT_SVR_NO_INSTALL_DIR, 0, argv[0], 0); -#endif - fprintf(stderr, "%s: Unable to obtain AFS server directory.\n", - argv[0]); - exit(2); - } -#ifdef AFS_NT40_ENV - main_thread = pthread_self(); - if (spawnDatap && spawnDataLen) { - /* This is a child per partition salvager. Don't setup log or - * try to lock the salvager lock. - */ - if (nt_SetupPartitionSalvage(spawnDatap, spawnDataLen) < 0) - exit(3); - } else { -#endif - for (commandLine[0] = '\0', i = 0; i < argc; i++) { - if (i > 0) - strcat(commandLine, " "); - strcat(commandLine, argv[i]); - } - - /* All entries to the log will be appended. Useful if there are - * multiple salvagers appending to the log. - */ - - CheckLogFile(); -#ifndef AFS_NT40_ENV -#ifdef AFS_LINUX20_ENV - fcntl(fileno(logFile), F_SETFL, O_APPEND); /* Isn't this redundant? */ -#else - fcntl(fileno(logFile), F_SETFL, FAPPEND); /* Isn't this redundant? */ -#endif -#endif - setlinebuf(logFile); - -#ifndef AFS_NT40_ENV - if (geteuid() != 0) { - printf("Salvager must be run as root.\n"); - fflush(stdout); - Exit(0); - } -#endif - - /* bad for normal help flag processing, but can do nada */ - - fprintf(logFile, "%s\n", cml_version_number); - Log("STARTING AFS SALVAGER %s (%s)\n", SalvageVersion, commandLine); - - /* Get and hold a lock for the duration of the salvage to make sure - * that no other salvage runs at the same time. The routine - * VInitVolumePackage (called below) makes sure that a file server or - * other volume utilities don't interfere with the salvage. 
- */ - ObtainSalvageLock(); -#ifdef AFS_NT40_ENV - } -#endif - - ts = cmd_CreateSyntax("initcmd", handleit, 0, "initialize the program"); - cmd_AddParm(ts, "-partition", CMD_SINGLE, CMD_OPTIONAL, - "Name of partition to salvage"); - cmd_AddParm(ts, "-volumeid", CMD_SINGLE, CMD_OPTIONAL, - "Volume Id to salvage"); - cmd_AddParm(ts, "-debug", CMD_FLAG, CMD_OPTIONAL, - "Run in Debugging mode"); - cmd_AddParm(ts, "-nowrite", CMD_FLAG, CMD_OPTIONAL, - "Run readonly/test mode"); - cmd_AddParm(ts, "-inodes", CMD_FLAG, CMD_OPTIONAL, - "Just list affected afs inodes - debugging flag"); - cmd_AddParm(ts, "-force", CMD_FLAG, CMD_OPTIONAL, "Force full salvaging"); - cmd_AddParm(ts, "-oktozap", CMD_FLAG, CMD_OPTIONAL, - "Give permission to destroy bogus inodes/volumes - debugging flag"); - cmd_AddParm(ts, "-rootinodes", CMD_FLAG, CMD_OPTIONAL, - "Show inodes owned by root - debugging flag"); - cmd_AddParm(ts, "-salvagedirs", CMD_FLAG, CMD_OPTIONAL, - "Force rebuild/salvage of all directories"); - cmd_AddParm(ts, "-blockreads", CMD_FLAG, CMD_OPTIONAL, - "Read smaller blocks to handle IO/bad blocks"); - cmd_AddParm(ts, "-parallel", CMD_SINGLE, CMD_OPTIONAL, - "# of max parallel partition salvaging"); - cmd_AddParm(ts, "-tmpdir", CMD_SINGLE, CMD_OPTIONAL, - "Name of dir to place tmp files "); - cmd_AddParm(ts, "-showlog", CMD_FLAG, CMD_OPTIONAL, - "Show log file upon completion"); - cmd_AddParm(ts, "-showsuid", CMD_FLAG, CMD_OPTIONAL, - "Report on suid/sgid files"); - cmd_AddParm(ts, "-showmounts", CMD_FLAG, CMD_OPTIONAL, - "Report on mountpoints"); - cmd_AddParm(ts, "-orphans", CMD_SINGLE, CMD_OPTIONAL, - "ignore | remove | attach"); - - /* note - syslog isn't avail on NT, but if we make it conditional, have - * to deal with screwy offsets for cmd params */ - cmd_AddParm(ts, "-syslog", CMD_FLAG, CMD_OPTIONAL, - "Write salvage log to syslogs"); - cmd_AddParm(ts, "-syslogfacility", CMD_SINGLE, CMD_OPTIONAL, - "Syslog facility number to use"); - cmd_AddParm(ts, "-datelogs", 
CMD_FLAG, CMD_OPTIONAL, - "Include timestamp in logfile filename"); - -#ifdef FAST_RESTART - cmd_AddParm(ts, "-DontSalvage", CMD_FLAG, CMD_OPTIONAL, - "Don't salvage. This my be set in BosConfig to let the fileserver restart immediately after a crash. Bad volumes will be taken offline"); -#endif /* FAST_RESTART */ - err = cmd_Dispatch(argc, argv); - Exit(err); -} /* Get the salvage lock if not already held. Hold until process exits. */ void @@ -1249,7 +734,8 @@ SalvageFileSys1(struct DiskPartition *partP, VolumeId singleVolumeNumber) ForceSalvage = UseTheForceLuke(fileSysPath); if (singleVolumeNumber) { - if (!VConnectFS()) { + /* salvageserver already set up fssync conn for us */ + if ((programType != salvageServer) && !VConnectFS()) { Abort("Couldn't connect to file server\n"); } AskOffline(singleVolumeNumber); @@ -2554,7 +2040,7 @@ SalvageIndex(Inode ino, VnodeClass class, int RW, * if no such match, take the first determined by our sort * order */ register struct ViceInodeInfo *lip = ip; - register lnInodes = nInodes; + register int lnInodes = nInodes; while (lnInodes && lip->u.vnode.vnodeNumber == vnodeNumber) { if (VNDISK_GET_INO(vnode) == lip->inodeNumber) { @@ -3628,8 +3114,38 @@ MaybeZapVolume(register struct InodeSummary *isp, char *message, int deleteMe, void AskOffline(VolumeId volumeId) { - if (FSYNC_askfs(volumeId, NULL, FSYNC_OFF, FSYNC_SALVAGE) == FSYNC_DENIED) { - Log("AskOffline: file server denied offline request; a general salvage is required.\n"); + afs_int32 code, i; + + for (i = 0; i < 3; i++) { + code = FSYNC_VolOp(volumeId, NULL, FSYNC_VOL_OFF, FSYNC_SALVAGE, NULL); + + if (code == SYNC_OK) { + break; + } else if (code == SYNC_DENIED) { +#ifdef DEMAND_ATTACH_ENABLE + Log("AskOffline: file server denied offline request; a general salvage may be required.\n"); +#else + Log("AskOffline: file server denied offline request; a general salvage is required.\n"); +#endif + Abort("Salvage aborted\n"); + } else if (code == SYNC_BAD_COMMAND) { +
Log("AskOffline: fssync protocol mismatch (bad command word '%d'); salvage aborting.\n", + FSYNC_VOL_OFF); +#ifdef DEMAND_ATTACH_ENABLE + Log("AskOffline: please make sure fileserver, volserver, salvageserver and salvager binaries are same version.\n"); +#else + Log("AskOffline: please make sure fileserver, volserver and salvager binaries are same version.\n"); +#endif + Abort("Salvage aborted\n"); + } else if (i < 2) { + /* try it again */ + Log("AskOffline: request for fileserver to take volume offline failed; trying again...\n"); + FSYNC_clientFinis(); + FSYNC_clientInit(); + } + } + if (code != SYNC_OK) { + Log("AskOffline: request for fileserver to take volume offline failed; salvage aborting.\n"); Abort("Salvage aborted\n"); } } @@ -3637,8 +3153,30 @@ AskOffline(VolumeId volumeId) void AskOnline(VolumeId volumeId, char *partition) { - if (FSYNC_askfs(volumeId, partition, FSYNC_ON, 0) == FSYNC_DENIED) { - Log("AskOnline: file server denied online request to volume %u partition %s\n", volumeId, partition); + afs_int32 code, i; + + for (i = 0; i < 3; i++) { + code = FSYNC_VolOp(volumeId, partition, FSYNC_VOL_ON, FSYNC_WHATEVER, NULL); + + if (code == SYNC_OK) { + break; + } else if (code == SYNC_DENIED) { + Log("AskOnline: file server denied online request to volume %u partition %s; trying again...\n", volumeId, partition); + } else if (code == SYNC_BAD_COMMAND) { + Log("AskOnline: fssync protocol mismatch (bad command word '%d')\n", + FSYNC_VOL_ON); +#ifdef DEMAND_ATTACH_ENABLE + Log("AskOnline: please make sure fileserver, volserver, salvageserver and salvager binaries are same version.\n"); +#else + Log("AskOnline: please make sure fileserver, volserver and salvager binaries are same version.\n"); +#endif + break; + } else if (i < 2) { + /* try it again */ + Log("AskOnline: request for fileserver to take volume offline failed; trying again...\n"); + FSYNC_clientFinis(); + FSYNC_clientInit(); + } } } @@ -3772,7 +3310,7 @@ TimeStamp(time_t clock, int precision) 
} void -CheckLogFile(void) +CheckLogFile(char * log_path) { char oldSlvgLog[AFSDIR_PATH_MAX]; @@ -3783,11 +3321,11 @@ CheckLogFile(void) } #endif - strcpy(oldSlvgLog, AFSDIR_SERVER_SLVGLOG_FILEPATH); + strcpy(oldSlvgLog, log_path); strcat(oldSlvgLog, ".old"); if (!logFile) { - renamefile(AFSDIR_SERVER_SLVGLOG_FILEPATH, oldSlvgLog); - logFile = afs_fopen(AFSDIR_SERVER_SLVGLOG_FILEPATH, "a"); + renamefile(log_path, oldSlvgLog); + logFile = afs_fopen(log_path, "a"); if (!logFile) { /* still nothing, use stdout */ logFile = stdout; @@ -3801,7 +3339,7 @@ CheckLogFile(void) #ifndef AFS_NT40_ENV void -TimeStampLogFile(void) +TimeStampLogFile(char * log_path) { char stampSlvgLog[AFSDIR_PATH_MAX]; struct tm *lt; @@ -3811,13 +3349,13 @@ TimeStampLogFile(void) lt = localtime(&now); (void)afs_snprintf(stampSlvgLog, sizeof stampSlvgLog, "%s.%04d-%02d-%02d.%02d:%02d:%02d", - AFSDIR_SERVER_SLVGLOG_FILEPATH, lt->tm_year + 1900, + log_path, lt->tm_year + 1900, lt->tm_mon + 1, lt->tm_mday, lt->tm_hour, lt->tm_min, lt->tm_sec); /* try to link the logfile to a timestamped filename */ /* if it fails, oh well, nothing we can do */ - link(AFSDIR_SERVER_SLVGLOG_FILEPATH, stampSlvgLog); + link(log_path, stampSlvgLog); } #endif @@ -3937,7 +3475,7 @@ UseTheForceLuke(char *path) * * NOTE: * The VRMIX fsck will not muck with the filesystem it is supposedly - * fixing and create a "FORCESAVAGE" file (by design). Instead, we + * fixing and create a "FORCESALVAGE" file (by design). Instead, we * muck directly with the root inode, which is within the normal * domain of fsck. * ListViceInodes() has a side effect of setting ForceSalvage if diff --git a/src/vol/vol-salvage.h b/src/vol/vol-salvage.h new file mode 100644 index 0000000000..c95ce249dc --- /dev/null +++ b/src/vol/vol-salvage.h @@ -0,0 +1,282 @@ +/* + * Copyright 2000, International Business Machines Corporation and others. + * All Rights Reserved. + * + * This software has been released under the terms of the IBM Public + * License. 
For details, see the LICENSE file in the top-level source + * directory or online at http://www.openafs.org/dl/license10.html + */ + +/* + * Module: vol-salvage.h + */ + +#ifndef __vol_salvage_h_ +#define __vol_salvage_h_ + +#define SalvageVersion "2.4" + +#include "salvage.h" +#include "volinodes.h" + +/* salvager data structures */ +struct InodeSummary { /* Inode summary file--an entry for each + * volume in the inode file for a partition */ + VolId volumeId; /* Volume id */ + VolId RWvolumeId; /* RW volume associated */ + int index; /* index into inode file (0, 1, 2 ...) */ + int nInodes; /* Number of inodes for this volume */ + int nSpecialInodes; /* Number of special inodes, i.e. volume + * header, index, etc. These are all + * marked (viceinode.h) and will all be sorted + * to the beginning of the information for + * this volume. Read-only volumes should + * ONLY have special inodes (all the other + * inodes look as if they belong to the + * original RW volume). */ + Unique maxUniquifier; /* The maximum uniquifier found in all the inodes. + * This is only useful for RW volumes and is used + * to compute a new volume uniquifier in the event + * that the header needs to be recreated. The inode + * uniquifier may be a truncated version of vnode + * uniquifier (AFS_3DISPARES). The real maxUniquifer + * is from the vnodes and later calcuated from it */ + struct VolumeSummary *volSummary; + /* Either a pointer to the original volume + * header summary, or constructed summary + * information */ +} *inodeSummary; +#define readOnly(isp) ((isp)->volumeId != (isp)->RWvolumeId) + +struct VolumeSummary { /* Volume summary an entry for each + * volume in a volume directory. 
+ * Assumption: one volume directory per + * partition */ + char *fileName; /* File name on the partition for the volume + * header */ + struct VolumeHeader header; + /* volume number, rw volume number, inode + * numbers of each major component of + * the volume */ + IHandle_t *volumeInfoHandle; + byte wouldNeedCallback; /* set if the file server should issue + * call backs for all the files in this volume when + * the volume goes back on line */ +}; + +struct VnodeInfo { + IHandle_t *handle; /* Inode containing this index */ + int nVnodes; /* Total number of vnodes in index */ + int nAllocatedVnodes; /* Total number actually used */ + int volumeBlockCount; /* Total number of blocks used by volume */ + Inode *inodes; /* Directory only */ + struct VnodeEssence { + short count; /* Number of references to vnode; MUST BE SIGNED */ + unsigned claimed:1; /* Set when a parent directory containing an entry + * referencing this vnode is found. The claim + * is that the parent in "parent" can point to + * this vnode, and no other */ + unsigned changed:1; /* Set if any parameters (other than the count) + * in the vnode change. It is determined if the + * link count has changed by noting whether it is + * 0 after scanning all directories */ + unsigned salvaged:1; /* Set if this directory vnode has already been salvaged. */ + unsigned todelete:1; /* Set if this vnode is to be deleted (should not be claimed) */ + afs_fsize_t blockCount; + /* Number of blocks (1K) used by this vnode, + * approximately */ + VnodeId parent; /* parent in vnode */ + Unique unique; /* Must match entry! 
*/ + char *name; /* Name of directory entry */ + int modeBits; /* File mode bits */ + Inode InodeNumber; /* file's inode */ + int type; /* File type */ + int author; /* File author */ + int owner; /* File owner */ + int group; /* File group */ + } *vnodes; +}; + +struct DirSummary { + struct DirHandle dirHandle; + VnodeId vnodeNumber; + Unique unique; + unsigned haveDot, haveDotDot; + VolumeId rwVid; + int copied; /* If the copy-on-write stuff has been applied */ + VnodeId parent; + char *name; + char *vname; + IHandle_t *ds_linkH; +}; + +#define ORPH_IGNORE 0 +#define ORPH_REMOVE 1 +#define ORPH_ATTACH 2 + + +/* command line options */ +extern int debug; /* -d flag */ +extern int Testing; /* -n flag */ +extern int ListInodeOption; /* -i flag */ +extern int ShowRootFiles; /* -r flag */ +extern int RebuildDirs; /* -sal flag */ +extern int Parallel; /* -para X flag */ +extern int PartsPerDisk; /* Salvage up to 8 partitions on same disk sequentially */ +extern int forceR; /* -b flag */ +extern int ShowLog; /* -showlog flag */ +extern int ShowSuid; /* -showsuid flag */ +extern int ShowMounts; /* -showmounts flag */ +extern int orphans; /* -orphans option */ +extern int Showmode; + +#ifndef AFS_NT40_ENV +extern int useSyslog; /* -syslog flag */ +extern int useSyslogFacility; /* -syslogfacility option */ +#endif + +#define MAXPARALLEL 32 + +extern int OKToZap; /* -o flag */ +extern int ForceSalvage; /* If salvage should occur despite the DONT_SALVAGE flag + * in the volume header */ + + +#define ROOTINODE 2 /* Root inode of a 4.2 Unix file system + * partition */ +extern Device fileSysDevice; /* The device number of the current + * partition being salvaged */ +#ifdef AFS_NT40_ENV +extern char fileSysPath[8]; +#else +extern char *fileSysPath; /* The path of the mounted partition currently + * being salvaged, i.e. the directory + * containing the volume headers */ +#endif /* AFS_NT40_ENV */ +extern char *fileSysPathName; /* NT needs this to make name pretty in log. 
*/ +extern IHandle_t *VGLinkH; /* Link handle for current volume group. */ +extern int VGLinkH_cnt; /* # of references to lnk handle. */ +extern struct DiskPartition *fileSysPartition; /* Partition being salvaged */ +#ifndef AFS_NT40_ENV +extern char *fileSysDeviceName; /* The block device where the file system + * being salvaged was mounted */ +extern char *filesysfulldev; +#endif /* AFS_NT40_ENV */ +extern int VolumeChanged; /* Set by any routine which would change the volume in + * a way which would require callback is to be broken if the + * volume was put back on line by an active file server */ + +extern VolumeDiskData VolInfo; /* A copy of the last good or salvaged volume header dealt with */ + +extern int nVolumesInInodeFile; /* Number of read-write volumes summarized */ +extern int inodeFd; /* File descriptor for inode file */ + + +extern struct VnodeInfo vnodeInfo[nVNODECLASSES]; + + +extern struct VolumeSummary *volumeSummaryp; /* Holds all the volumes in a part */ +extern int nVolumes; /* Number of volumes (read-write and read-only) + * in volume summary */ + +extern char * tmpdir; +extern FILE *logFile; /* one of {/usr/afs/logs,/vice/file}/SalvageLog */ + + +#ifdef AFS_NT40_ENV +/* For NT, we can fork the per partition salvagers to gain the required + * safety against Aborts. But there's too many complex data structures at + * the per volume salvager layer to easilty copy the data across. + * childJobNumber is resset from -1 to the job number if this is a + * per partition child of the main salvager. This information is passed + * out-of-band in the extra data area setup for the now unused parent/child + * data transfer. + */ +#define SALVAGER_MAGIC 0x00BBaaDD +#define NOT_CHILD -1 /* job numbers start at 0 */ +/* If new options need to be passed to child, add them here. */ +typedef struct { + int cj_magic; + int cj_number; + char cj_part[32]; +} childJob_t; + +/* Child job this process is running. 
*/ +extern childJob_t myjob = { SALVAGER_MAGIC, NOT_CHILD, "" }; + +extern int nt_SalvagePartition(char *partName, int jobn); +extern int nt_SetupPartitionSalvage(void *datap, int len); + +typedef struct { + struct InodeSummary *svgp_inodeSummaryp; + int svgp_count; +} SVGParms_t; +#define canfork 0 +#else /* AFS_NT40_ENV */ +#define canfork 1 +#endif /* AFS_NT40_ENV */ + + +/* prototypes */ +extern void Exit(int code); +extern int Fork(void); +extern int Wait(char *prog); +extern char *ToString(char *s); +extern void AskOffline(VolumeId volumeId); +extern void AskOnline(VolumeId volumeId, char *partition); +extern void CheckLogFile(char * log_path); +#ifndef AFS_NT40_ENV +extern void TimeStampLogFile(char * log_path); +#endif +extern void ClearROInUseBit(struct VolumeSummary *summary); +extern void CopyAndSalvage(register struct DirSummary *dir); +extern int CopyInode(Device device, Inode inode1, Inode inode2, int rwvolume); +extern void CopyOnWrite(register struct DirSummary *dir); +extern void CountVolumeInodes(register struct ViceInodeInfo *ip, int maxInodes, + register struct InodeSummary *summary); +extern void DeleteExtraVolumeHeaderFile(register struct VolumeSummary *vsp); +extern void DistilVnodeEssence(VolumeId vid, VnodeClass class, Inode ino, + Unique * maxu); +extern int GetInodeSummary(char *path, VolumeId singleVolumeNumber); +extern void GetVolumeSummary(VolumeId singleVolumeNumber); +extern void JudgeEntry(struct DirSummary *dir, char *name, VnodeId vnodeNumber, + Unique unique); +extern void MaybeZapVolume(register struct InodeSummary *isp, char *message, + int deleteMe, int check); +extern void ObtainSalvageLock(void); +extern void PrintInodeList(void); +extern void PrintInodeSummary(void); +extern void PrintVolumeSummary(void); +extern int QuickCheck(register struct InodeSummary *isp, int nVols); +extern void RemoveTheForce(char *path); +extern void SalvageDir(char *name, VolumeId rwVid, struct VnodeInfo *dirVnodeInfo, + IHandle_t * alinkH, int 
i, struct DirSummary *rootdir, + int *rootdirfound); +extern void SalvageFileSysParallel(struct DiskPartition *partP); +extern void SalvageFileSys(struct DiskPartition *partP, VolumeId singleVolumeNumber); +extern void SalvageFileSys1(struct DiskPartition *partP, + VolumeId singleVolumeNumber); +extern int SalvageHeader(register struct stuff *sp, struct InodeSummary *isp, + int check, int *deleteMe); +extern int SalvageIndex(Inode ino, VnodeClass class, int RW, + register struct ViceInodeInfo *ip, int nInodes, + struct VolumeSummary *volSummary, int check); +extern int SalvageVnodes(register struct InodeSummary *rwIsp, + register struct InodeSummary *thisIsp, + register struct ViceInodeInfo *inodes, int check); +extern int SalvageVolume(register struct InodeSummary *rwIsp, IHandle_t * alinkH); +extern void DoSalvageVolumeGroup(register struct InodeSummary *isp, int nVols); +#ifdef AFS_NT40_ENV +extern void SalvageVolumeGroup(register struct InodeSummary *isp, int nVols); +#else +#define SalvageVolumeGroup DoSalvageVolumeGroup +#endif +extern int SalvageVolumeHeaderFile(register struct InodeSummary *isp, + register struct ViceInodeInfo *inodes, int RW, + int check, int *deleteMe); +extern void showlog(void); +extern int UseTheForceLuke(char *path); + + + +#endif /* __vol_salvage_h_ */ diff --git a/src/vol/voldefs.h b/src/vol/voldefs.h index 2094a0ca04..b546be24f3 100644 --- a/src/vol/voldefs.h +++ b/src/vol/voldefs.h @@ -25,6 +25,9 @@ #define ROVOL 1 #define BACKVOL 2 +/* maximum numbe of Vice partitions */ +#define VOLMAXPARTS 255 + /* All volumes will have a volume header name in this format */ #if defined(AFS_AIX_ENV) || defined(AFS_HPUX_ENV) /* Note that must have been included before we get here... 
*/ diff --git a/src/vol/volinodes.h b/src/vol/volinodes.h index cb72b9c0b6..37b00fef6b 100644 --- a/src/vol/volinodes.h +++ b/src/vol/volinodes.h @@ -14,6 +14,9 @@ */ +#ifndef __volinodes_h_ +#define __volinodes_h_ + /* Used by vutil.c and salvager.c */ private struct VolumeHeader tempHeader; @@ -56,3 +59,5 @@ LINKTABLEMAGIC, LINKTABLEVERSION}, VI_LINKTABLE, #define MAXINODETYPE VI_LINKTABLE Volume *VWaitAttachVolume(); + +#endif /* __volinodes_h_ */ diff --git a/src/vol/volume.c b/src/vol/volume.c index 7eb8854e86..fae9f87b56 100644 --- a/src/vol/volume.c +++ b/src/vol/volume.c @@ -5,6 +5,8 @@ * This software has been released under the terms of the IBM Public * License. For details, see the LICENSE file in the top-level source * directory or online at http://www.openafs.org/dl/license10.html + * + * Portions Copyright (c) 2006 Sine Nomine Associates */ /* 1/1/89: NB: this stuff is all going to be replaced. Don't take it too seriously */ @@ -121,6 +123,9 @@ RCSID #ifdef AFS_NT40_ENV #include #endif +#include "daemon_com.h" +#include "fssync.h" +#include "salvsync.h" #include "vnode.h" #include "volume.h" #include "partition.h" @@ -130,11 +135,15 @@ RCSID #include "afs/assert.h" #endif /* AFS_PTHREAD_ENV */ #include "vutils.h" -#include "fssync.h" +#include #ifndef AFS_NT40_ENV #include #endif +#if !defined(offsetof) +#include +#endif + #ifdef O_LARGEFILE #define afs_stat stat64 #define afs_fstat fstat64 @@ -147,14 +156,16 @@ RCSID #ifdef AFS_PTHREAD_ENV pthread_mutex_t vol_glock_mutex; -pthread_mutex_t vol_attach_mutex; -pthread_mutex_t vol_fsync_mutex; pthread_mutex_t vol_trans_mutex; pthread_cond_t vol_put_volume_cond; pthread_cond_t vol_sleep_cond; int vol_attach_threads = 1; #endif /* AFS_PTHREAD_ENV */ +#ifdef AFS_DEMAND_ATTACH_FS +pthread_mutex_t vol_salvsync_mutex; +#endif /* AFS_DEMAND_ATTACH_FS */ + #ifdef AFS_OSF_ENV extern void *calloc(), *realloc(); #endif @@ -162,12 +173,18 @@ extern void *calloc(), *realloc(); /*@printflike@*/ extern void Log(const 
char *format, ...); /* Forward declarations */ -static Volume *attach2(Error * ec, char *path, +static Volume *attach2(Error * ec, VolId vid, char *path, register struct VolumeHeader *header, - struct DiskPartition *partp, int isbusy); + struct DiskPartition *partp, Volume * vp, + int isbusy, int mode); +static void ReallyFreeVolume(Volume * vp); +#ifdef AFS_DEMAND_ATTACH_FS static void FreeVolume(Volume * vp); +#else /* !AFS_DEMAND_ATTACH_FS */ +#define FreeVolume(vp) ReallyFreeVolume(vp) static void VScanUpdateList(void); -static void InitLRU(int howMany); +#endif /* !AFS_DEMAND_ATTACH_FS */ +static void VInitVolumeHeaderCache(afs_uint32 howMany); static int GetVolumeHeader(register Volume * vp); static void ReleaseVolumeHeader(register struct volHeader *hd); static void FreeVolumeHeader(register Volume * vp); @@ -175,22 +192,72 @@ static void AddVolumeToHashTable(register Volume * vp, int hashid); static void DeleteVolumeFromHashTable(register Volume * vp); static int VHold(Volume * vp); static int VHold_r(Volume * vp); -static void GetBitmap(Error * ec, Volume * vp, VnodeClass class); +static void VGetBitmap_r(Error * ec, Volume * vp, VnodeClass class); static void GetVolumePath(Error * ec, VolId volumeId, char **partitionp, char **namep); static void VReleaseVolumeHandles_r(Volume * vp); static void VCloseVolumeHandles_r(Volume * vp); +static void LoadVolumeHeader(Error * ec, Volume * vp); +static int VCheckOffline(register Volume * vp); +static int VCheckDetach(register Volume * vp); +static Volume * GetVolume(Error * ec, Error * client_ec, VolId volumeId, Volume * hint, int flags); +static int VolumeExternalName_r(VolumeId volumeId, char * name, size_t len); int LogLevel; /* Vice loglevel--not defined as extern so that it will be * defined when not linked with vice, XXXX */ ProgramType programType; /* The type of program using the package */ +/* extended volume package statistics */ +VolPkgStats VStats; + + #define VOLUME_BITMAP_GROWSIZE 16 /* bytes, => 
128vnodes */ /* Must be a multiple of 4 (1 word) !! */ -#define VOLUME_HASH_TABLE_SIZE 128 /* Must be a power of 2!! */ -#define VOLUME_HASH(volumeId) (volumeId&(VOLUME_HASH_TABLE_SIZE-1)) -private Volume *VolumeHashTable[VOLUME_HASH_TABLE_SIZE]; + +/* this parameter needs to be tunable at runtime. + * 128 was really inadequate for largish servers -- at 16384 volumes this + * puts average chain length at 128, thus an average 65 deref's to find a volptr. + * talk about bad spatial locality... + * + * an AVL or splay tree might work a lot better, but we'll just increase + * the default hash table size for now + */ +#define DEFAULT_VOLUME_HASH_SIZE 256 /* Must be a power of 2!! */ +#define DEFAULT_VOLUME_HASH_MASK (DEFAULT_VOLUME_HASH_SIZE-1) +#define VOLUME_HASH(volumeId) (volumeId&(VolumeHashTable.Mask)) + +/* + * turn volume hash chains into partially ordered lists. + * when the threshold is exceeded between two adjacent elements, + * perform a chain rebalancing operation. + * + * keep the threshold high in order to keep cache line invalidates + * low "enough" on SMPs + */ +#define VOLUME_HASH_REORDER_THRESHOLD 200 + +/* + * when possible, don't just reorder single elements, but reorder + * entire chains of elements at once. a chain of elements that + * exceed the element previous to the pivot by at least CHAIN_THRESH + * accesses are moved in front of the chain whose elements have at + * least CHAIN_THRESH less accesses than the pivot element + */ +#define VOLUME_HASH_REORDER_CHAIN_THRESH (VOLUME_HASH_REORDER_THRESHOLD / 2) + +#include "rx/rx_queue.h" + + +VolumeHashTable_t VolumeHashTable = { + DEFAULT_VOLUME_HASH_SIZE, + DEFAULT_VOLUME_HASH_MASK, + NULL +}; + + +static void VInitVolumeHash(void); + #ifndef AFS_HAVE_FFS /* This macro is used where an ffs() call does not exist. 
Was in util/ffs.c */ @@ -211,7 +278,6 @@ ffs(x) #endif /* !AFS_HAVE_FFS */ #ifdef AFS_PTHREAD_ENV -#include "rx/rx_queue.h" typedef struct diskpartition_queue_t { struct rx_queue queue; struct DiskPartition * diskP; @@ -224,9 +290,120 @@ typedef struct vinitvolumepackage_thread_t { static void * VInitVolumePackageThread(void * args); #endif /* AFS_PTHREAD_ENV */ -struct Lock vol_listLock; /* Lock obtained when listing volumes: prevents a volume from being missed if the volume is attached during a list volumes */ +static int VAttachVolumesByPartition(struct DiskPartition *diskP, + int * nAttached, int * nUnattached); + + +#ifdef AFS_DEMAND_ATTACH_FS +/* demand attach fileserver extensions */ + +/* XXX + * in the future we will support serialization of VLRU state into the fs_state + * disk dumps + * + * these structures are the beginning of that effort + */ +struct VLRU_DiskHeader { + struct versionStamp stamp; /* magic and structure version number */ + afs_uint32 mtime; /* time of dump to disk */ + afs_uint32 num_records; /* number of VLRU_DiskEntry records */ +}; + +struct VLRU_DiskEntry { + afs_uint32 vid; /* volume ID */ + afs_uint32 idx; /* generation */ + afs_uint32 last_get; /* timestamp of last get */ +}; + +struct VLRU_StartupQueue { + struct VLRU_DiskEntry * entry; + int num_entries; + int next_idx; +}; + +typedef struct vshutdown_thread_t { + struct rx_queue q; + pthread_mutex_t lock; + pthread_cond_t cv; + pthread_cond_t master_cv; + int n_threads; + int n_threads_complete; + int vol_remaining; + int schedule_version; + int pass; + byte n_parts; + byte n_parts_done_pass; + byte part_thread_target[VOLMAXPARTS+1]; + byte part_done_pass[VOLMAXPARTS+1]; + struct rx_queue * part_pass_head[VOLMAXPARTS+1]; + int stats[4][VOLMAXPARTS+1]; +} vshutdown_thread_t; +static void * VShutdownThread(void * args); + + +static Volume * VAttachVolumeByVp_r(Error * ec, Volume * vp, int mode); +static int VCheckFree(Volume * vp); + +/* VByP List */ +static void 
AddVolumeToVByPList_r(Volume * vp); +static void DeleteVolumeFromVByPList_r(Volume * vp); +static void VVByPListBeginExclusive_r(struct DiskPartition * dp); +static void VVByPListEndExclusive_r(struct DiskPartition * dp); +static void VVByPListWait_r(struct DiskPartition * dp); + +/* online salvager */ +static int VCheckSalvage(register Volume * vp); +static int VUpdateSalvagePriority_r(Volume * vp); +static int VScheduleSalvage_r(Volume * vp); +static int VCancelSalvage_r(Volume * vp, int reason); + +/* Volume hash table */ +static void VReorderHash_r(VolumeHashChainHead * head, Volume * pp, Volume * vp); +static void VHashBeginExclusive_r(VolumeHashChainHead * head); +static void VHashEndExclusive_r(VolumeHashChainHead * head); +static void VHashWait_r(VolumeHashChainHead * head); + +/* Volume state machine */ +static void VCreateReservation_r(Volume * vp); +static void VCancelReservation_r(Volume * vp); +static void VWaitStateChange_r(Volume * vp); +static void VWaitExclusiveState_r(Volume * vp); +static int IsExclusiveState(VolState state); +static int IsErrorState(VolState state); +static int IsValidState(VolState state); + +/* shutdown */ +static int ShutdownVByPForPass_r(struct DiskPartition * dp, int pass); +static int ShutdownVolumeWalk_r(struct DiskPartition * dp, int pass, + struct rx_queue ** idx); +static void ShutdownController(vshutdown_thread_t * params); +static void ShutdownCreateSchedule(vshutdown_thread_t * params); + +/* VLRU */ +static void VLRU_ComputeConstants(void); +static void VInitVLRU(void); +static void VLRU_Init_Node_r(volatile Volume * vp); +static void VLRU_Add_r(volatile Volume * vp); +static void VLRU_Delete_r(volatile Volume * vp); +static void VLRU_UpdateAccess_r(volatile Volume * vp); +static void * VLRU_ScannerThread(void * args); +static void VLRU_Scan_r(int idx); +static void VLRU_Promote_r(int idx); +static void VLRU_Demote_r(int idx); +static void VLRU_SwitchQueues(volatile Volume * vp, int new_idx, int append); + +/* soft 
detach */ +static int VCheckSoftDetach(volatile Volume * vp, afs_uint32 thresh); +static int VCheckSoftDetachCandidate(volatile Volume * vp, afs_uint32 thresh); +static int VSoftDetachVolume_r(volatile Volume * vp, afs_uint32 thresh); +#endif /* AFS_DEMAND_ATTACH_FS */ + + +struct Lock vol_listLock; /* Lock obtained when listing volumes: + * prevents a volume from being missed + * if the volume is attached during a + * list volumes */ -extern struct Lock FSYNC_handler_lock; static int TimeZoneCorrection; /* Number of seconds west of GMT */ @@ -247,12 +424,16 @@ bit32 VolumeCacheCheck; /* Incremented everytime a volume goes on line-- * vnode will be invalidated * access only with VOL_LOCK held */ -int VolumeCacheSize = 200, VolumeGets = 0, VolumeReplacements = 0, Vlooks = 0; + +/***************************************************/ +/* Startup routines */ +/***************************************************/ + int -VInitVolumePackage(ProgramType pt, int nLargeVnodes, int nSmallVnodes, - int connect, int volcache) +VInitVolumePackage(ProgramType pt, afs_uint32 nLargeVnodes, afs_uint32 nSmallVnodes, + int connect, afs_uint32 volcache) { int errors = 0; /* Number of errors while finding vice partitions. 
*/ struct timeval tv; @@ -260,10 +441,24 @@ VInitVolumePackage(ProgramType pt, int nLargeVnodes, int nSmallVnodes, programType = pt; +#ifdef AFS_DEMAND_ATTACH_FS + memset(&VStats, 0, sizeof(VStats)); + VStats.hdr_cache_size = 200; +#endif + + VInitPartitionPackage(); + VInitVolumeHash(); + VInitVnHashByVolume(); +#ifdef AFS_DEMAND_ATTACH_FS + if (programType == fileServer) { + VInitVLRU(); + } else { + VLRU_SetOptions(VLRU_SET_ENABLED, 0); + } +#endif + #ifdef AFS_PTHREAD_ENV assert(pthread_mutex_init(&vol_glock_mutex, NULL) == 0); - assert(pthread_mutex_init(&vol_attach_mutex, NULL) == 0); - assert(pthread_mutex_init(&vol_fsync_mutex, NULL) == 0); assert(pthread_mutex_init(&vol_trans_mutex, NULL) == 0); assert(pthread_cond_init(&vol_put_volume_cond, NULL) == 0); assert(pthread_cond_init(&vol_sleep_cond, NULL) == 0); @@ -271,25 +466,41 @@ VInitVolumePackage(ProgramType pt, int nLargeVnodes, int nSmallVnodes, IOMGR_Initialize(); #endif /* AFS_PTHREAD_ENV */ Lock_Init(&vol_listLock); - Lock_Init(&FSYNC_handler_lock); + srandom(time(0)); /* For VGetVolumeInfo */ gettimeofday(&tv, &tz); TimeZoneCorrection = tz.tz_minuteswest * 60; +#ifdef AFS_DEMAND_ATTACH_FS + assert(pthread_mutex_init(&vol_salvsync_mutex, NULL) == 0); +#endif /* AFS_DEMAND_ATTACH_FS */ + /* Ok, we have done enough initialization that fileserver can * start accepting calls, even though the volumes may not be * available just yet. 
*/ VInit = 1; +#if defined(AFS_DEMAND_ATTACH_FS) && defined(SALVSYNC_BUILD_SERVER) + if (programType == salvageServer) { + SALVSYNC_salvInit(); + } +#endif /* AFS_DEMAND_ATTACH_FS */ +#ifdef FSSYNC_BUILD_SERVER if (programType == fileServer) { - /* File server or "stand" */ FSYNC_fsInit(); } +#endif +#if defined(AFS_DEMAND_ATTACH_FS) && defined(SALVSYNC_BUILD_CLIENT) + if (programType == fileServer) { + /* establish a connection to the salvager at this point */ + assert(VConnectSALV() != 0); + } +#endif /* AFS_DEMAND_ATTACH_FS */ - if (volcache > VolumeCacheSize) - VolumeCacheSize = volcache; - InitLRU(VolumeCacheSize); + if (volcache > VStats.hdr_cache_size) + VStats.hdr_cache_size = volcache; + VInitVolumeHeaderCache(VStats.hdr_cache_size); VInitVnodes(vLarge, nLargeVnodes); VInitVnodes(vSmall, nSmallVnodes); @@ -304,7 +515,7 @@ VInitVolumePackage(ProgramType pt, int nLargeVnodes, int nSmallVnodes, #ifdef AFS_PTHREAD_ENV struct vinitvolumepackage_thread_t params; struct diskpartition_queue_t * dpq; - int i, len; + int i, threads, parts; pthread_t tid; pthread_attr_t attrs; @@ -313,29 +524,56 @@ VInitVolumePackage(ProgramType pt, int nLargeVnodes, int nSmallVnodes, params.n_threads_complete = 0; /* create partition work queue */ - for (len=0, diskP = DiskPartitionList; diskP; diskP = diskP->next, len++) { + for (parts=0, diskP = DiskPartitionList; diskP; diskP = diskP->next, parts++) { dpq = (diskpartition_queue_t *) malloc(sizeof(struct diskpartition_queue_t)); assert(dpq != NULL); dpq->diskP = diskP; queue_Prepend(&params,dpq); } - assert(pthread_attr_init(&attrs) == 0); - assert(pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED) == 0); + threads = MIN(parts, vol_attach_threads); - len = MIN(len, vol_attach_threads); - - VOL_LOCK; - for (i=0; i < len; i++) { - assert(pthread_create - (&tid, &attrs, &VInitVolumePackageThread, - &params) == 0); - } + if (threads > 1) { + /* spawn off a bunch of initialization threads */ + assert(pthread_attr_init(&attrs) == 0);
+ assert(pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED) == 0); - while(params.n_threads_complete < len) { - pthread_cond_wait(&params.thread_done_cv,&vol_glock_mutex); + Log("VInitVolumePackage: beginning parallel fileserver startup\n"); +#ifdef AFS_DEMAND_ATTACH_FS + Log("VInitVolumePackage: using %d threads to pre-attach volumes on %d partitions\n", + threads, parts); +#else /* AFS_DEMAND_ATTACH_FS */ + Log("VInitVolumePackage: using %d threads to attach volumes on %d partitions\n", + threads, parts); +#endif /* AFS_DEMAND_ATTACH_FS */ + + VOL_LOCK; + for (i=0; i < threads; i++) { + assert(pthread_create + (&tid, &attrs, &VInitVolumePackageThread, + &params) == 0); + } + + while(params.n_threads_complete < threads) { + pthread_cond_wait(&params.thread_done_cv,&vol_glock_mutex); + } + VOL_UNLOCK; + + assert(pthread_attr_destroy(&attrs) == 0); + } else { + /* if we're only going to run one init thread, don't bother creating + * another LWP */ + Log("VInitVolumePackage: beginning single-threaded fileserver startup\n"); +#ifdef AFS_DEMAND_ATTACH_FS + Log("VInitVolumePackage: using 1 thread to pre-attach volumes on %d partition(s)\n", + parts); +#else /* AFS_DEMAND_ATTACH_FS */ + Log("VInitVolumePackage: using 1 thread to attach volumes on %d partition(s)\n", + parts); +#endif /* AFS_DEMAND_ATTACH_FS */ + + VInitVolumePackageThread(&params); } - VOL_UNLOCK; assert(pthread_cond_destroy(&params.thread_done_cv) == 0); @@ -346,44 +584,28 @@ VInitVolumePackage(ProgramType pt, int nLargeVnodes, int nSmallVnodes, /* Attach all the volumes in this partition */ for (diskP = DiskPartitionList; diskP; diskP = diskP->next) { int nAttached = 0, nUnattached = 0; - Log("Partition %s: attaching volumes\n", diskP->name); - dirp = opendir(VPartitionPath(diskP)); - assert(dirp); - while ((dp = readdir(dirp))) { - char *p; - p = strrchr(dp->d_name, '.'); - if (p != NULL && strcmp(p, VHDREXT) == 0) { - Error error; - Volume *vp; - vp = VAttachVolumeByName(&error, diskP->name, dp->d_name, -
V_VOLUPD); - (*(vp ? &nAttached : &nUnattached))++; - if (error == VOFFLINE) - Log("Volume %d stays offline (/vice/offline/%s exists)\n", VolumeNumber(dp->d_name), dp->d_name); - else if (LogLevel >= 5) { - Log("Partition %s: attached volume %d (%s)\n", - diskP->name, VolumeNumber(dp->d_name), - dp->d_name); - } - if (vp) { - VPutVolume(vp); - } - } - } - Log("Partition %s: attached %d volumes; %d volumes not attached\n", diskP->name, nAttached, nUnattached); - closedir(dirp); + assert(VAttachVolumesByPartition(diskP, &nAttached, &nUnattached) == 0); } #endif /* AFS_PTHREAD_ENV */ } VInit = 2; /* Initialized, and all volumes have been attached */ +#ifdef FSSYNC_BUILD_CLIENT if (programType == volumeUtility && connect) { if (!VConnectFS()) { Log("Unable to connect to file server; aborted\n"); - Lock_Destroy(&FSYNC_handler_lock); exit(1); } } +#ifdef AFS_DEMAND_ATTACH_FS + else if (programType == salvageServer) { + if (!VConnectFS()) { + Log("Unable to connect to file server; aborted\n"); + exit(1); + } + } +#endif /* AFS_DEMAND_ATTACH_FS */ +#endif /* FSSYNC_BUILD_CLIENT */ return 0; } @@ -412,32 +634,8 @@ VInitVolumePackageThread(void * args) { diskP = dpq->diskP; free(dpq); - Log("Partition %s: attaching volumes\n", diskP->name); - dirp = opendir(VPartitionPath(diskP)); - assert(dirp); - while ((dp = readdir(dirp))) { - char *p; - p = strrchr(dp->d_name, '.'); - if (p != NULL && strcmp(p, VHDREXT) == 0) { - Error error; - Volume *vp; - vp = VAttachVolumeByName(&error, diskP->name, dp->d_name, - V_VOLUPD); - (*(vp ? 
&nAttached : &nUnattached))++; - if (error == VOFFLINE) - Log("Volume %d stays offline (/vice/offline/%s exists)\n", VolumeNumber(dp->d_name), dp->d_name); - else if (LogLevel >= 5) { - Log("Partition %s: attached volume %d (%s)\n", - diskP->name, VolumeNumber(dp->d_name), - dp->d_name); - } - if (vp) { - VPutVolume(vp); - } - } - } - Log("Partition %s: attached %d volumes; %d volumes not attached\n", diskP->name, nAttached, nUnattached); - closedir(dirp); + assert(VAttachVolumesByPartition(diskP, &nAttached, &nUnattached) == 0); + VOL_LOCK; } @@ -448,46 +646,114 @@ VInitVolumePackageThread(void * args) { } #endif /* AFS_PTHREAD_ENV */ -/* This must be called by any volume utility which needs to run while the - file server is also running. This is separated from VInitVolumePackage so - that a utility can fork--and each of the children can independently - initialize communication with the file server */ -int -VConnectFS(void) +/* + * attach all volumes on a given disk partition + */ +static int +VAttachVolumesByPartition(struct DiskPartition *diskP, int * nAttached, int * nUnattached) { - int retVal; - VOL_LOCK; - retVal = VConnectFS_r(); - VOL_UNLOCK; - return retVal; + DIR * dirp; + struct dirent * dp; + int ret = 0; + + Log("Partition %s: attaching volumes\n", diskP->name); + dirp = opendir(VPartitionPath(diskP)); + if (!dirp) { + Log("opendir on Partition %s failed!\n", diskP->name); + return 1; + } + + while ((dp = readdir(dirp))) { + char *p; + p = strrchr(dp->d_name, '.'); + if (p != NULL && strcmp(p, VHDREXT) == 0) { + Error error; + Volume *vp; +#ifdef AFS_DEMAND_ATTACH_FS + vp = VPreAttachVolumeByName(&error, diskP->name, dp->d_name, + V_VOLUPD); +#else /* AFS_DEMAND_ATTACH_FS */ + vp = VAttachVolumeByName(&error, diskP->name, dp->d_name, + V_VOLUPD); +#endif /* AFS_DEMAND_ATTACH_FS */ + (*(vp ? 
nAttached : nUnattached))++; + if (error == VOFFLINE) + Log("Volume %d stays offline (/vice/offline/%s exists)\n", VolumeNumber(dp->d_name), dp->d_name); + else if (LogLevel >= 5) { + Log("Partition %s: attached volume %d (%s)\n", + diskP->name, VolumeNumber(dp->d_name), + dp->d_name); + } +#if !defined(AFS_DEMAND_ATTACH_FS) + if (vp) { + VPutVolume(vp); + } +#endif /* AFS_DEMAND_ATTACH_FS */ + } + } + + Log("Partition %s: attached %d volumes; %d volumes not attached\n", diskP->name, *nAttached, *nUnattached); + closedir(dirp); + return ret; } -int -VConnectFS_r(void) -{ - int rc; - assert(VInit == 2 && programType == volumeUtility); - rc = FSYNC_clientInit(); - if (rc) - VInit = 3; - return rc; -} -void -VDisconnectFS_r(void) -{ - assert(programType == volumeUtility); - FSYNC_clientFinis(); - VInit = 2; -} +/***************************************************/ +/* Shutdown routines */ +/***************************************************/ -void -VDisconnectFS(void) -{ - VOL_LOCK; - VDisconnectFS_r(); - VOL_UNLOCK; -} +/* + * demand attach fs + * highly multithreaded volume package shutdown + * + * with the demand attach fileserver extensions, + * VShutdown has been modified to be multithreaded. + * In order to achieve optimal use of many threads, + * the shutdown code involves one control thread and + * n shutdown worker threads. The control thread + * periodically examines the number of volumes available + * for shutdown on each partition, and produces a worker + * thread allocation schedule. The idea is to eliminate + * redundant scheduling computation on the workers by + * having a single master scheduler. + * + * The scheduler's objectives are: + * (1) fairness + * each partition with volumes remaining gets allocated + * at least 1 thread (assuming sufficient threads) + * (2) performance + * threads are allocated proportional to the number of + * volumes remaining to be offlined. 
This ensures that + * the OS I/O scheduler has many requests to elevator + * seek on partitions that will (presumably) take the + * longest amount of time (from now) to finish shutdown + * (3) keep threads busy + * when there are extra threads, they are assigned to + * partitions using a simple round-robin algorithm + * + * In the future, we may wish to add the ability to adapt + * to the relative performance patterns of each disk + * partition. + * + * + * demand attach fs + * multi-step shutdown process + * + * demand attach shutdown is a four-step process. Each + * shutdown "pass" shuts down increasingly more difficult + * volumes. The main purpose is to achieve better cache + * utilization during shutdown. + * + * pass 0 + * shutdown volumes in the unattached, pre-attached + * and error states + * pass 1 + * shutdown attached volumes with cached volume headers + * pass 2 + * shutdown all volumes in non-exclusive states + * pass 3 + * shutdown all remaining volumes + */ void VShutdown_r(void) @@ -495,36 +761,139 @@ VShutdown_r(void) int i; register Volume *vp, *np; register afs_int32 code; +#ifdef AFS_DEMAND_ATTACH_FS + struct DiskPartition * diskP; + struct diskpartition_queue_t * dpq; + vshutdown_thread_t params; + pthread_t tid; + pthread_attr_t attrs; - Log("VShutdown: shutting down on-line volumes...\n"); - for (i = 0; i < VOLUME_HASH_TABLE_SIZE; i++) { - /* try to hold first volume in the hash table */ - for (vp = VolumeHashTable[i]; vp; vp = vp->hashNext) { - code = VHold_r(vp); - if (code == 0) - break; /* got it */ - /* otherwise we go around again, trying another volume */ - } - while (vp) { - if (LogLevel >= 5) - Log("VShutdown: Attempting to take volume %u offline.\n", - vp->hashid); - /* first compute np before releasing vp, in case vp disappears - * after releasing. Hold it, so it doesn't disapear. If we - * can't hold it, try the next one in the chain. Invariant - * at the top of this loop is that vp is held (has extra ref count). 
- */ - for (np = vp->hashNext; np; np = np->hashNext) { - code = VHold_r(np); - if (code == 0) - break; /* got it */ + memset(¶ms, 0, sizeof(vshutdown_thread_t)); + + for (params.n_parts=0, diskP = DiskPartitionList; + diskP; diskP = diskP->next, params.n_parts++); + + Log("VShutdown: shutting down on-line volumes on %d partition%s...\n", + params.n_parts, params.n_parts > 1 ? "s" : ""); + + if (vol_attach_threads > 1) { + /* prepare for parallel shutdown */ + params.n_threads = vol_attach_threads; + assert(pthread_mutex_init(¶ms.lock, NULL) == 0); + assert(pthread_cond_init(¶ms.cv, NULL) == 0); + assert(pthread_cond_init(¶ms.master_cv, NULL) == 0); + assert(pthread_attr_init(&attrs) == 0); + assert(pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED) == 0); + queue_Init(¶ms); + + /* setup the basic partition information structures for + * parallel shutdown */ + for (diskP = DiskPartitionList; diskP; diskP = diskP->next) { + /* XXX debug */ + struct rx_queue * qp, * nqp; + Volume * vp; + int count = 0; + + VVByPListWait_r(diskP); + VVByPListBeginExclusive_r(diskP); + + /* XXX debug */ + for (queue_Scan(&diskP->vol_list, qp, nqp, rx_queue)) { + vp = (Volume *)((char *)qp - offsetof(Volume, vol_list)); + if (vp->header) + count++; + } + Log("VShutdown: partition %s has %d volumes with attached headers\n", + VPartitionPath(diskP), count); + + + /* build up the pass 0 shutdown work queue */ + dpq = (struct diskpartition_queue_t *) malloc(sizeof(struct diskpartition_queue_t)); + assert(dpq != NULL); + dpq->diskP = diskP; + queue_Prepend(¶ms, dpq); + + params.part_pass_head[diskP->device] = queue_First(&diskP->vol_list, rx_queue); + } + + Log("VShutdown: beginning parallel fileserver shutdown\n"); + Log("VShutdown: using %d threads to offline volumes on %d partition%s\n", + vol_attach_threads, params.n_parts, params.n_parts > 1 ? 
"s" : "" ); + + /* do pass 0 shutdown */ + assert(pthread_mutex_lock(¶ms.lock) == 0); + for (i=0; i < params.n_threads; i++) { + assert(pthread_create + (&tid, &attrs, &VShutdownThread, + ¶ms) == 0); + } + + /* wait for all the pass 0 shutdowns to complete */ + while (params.n_threads_complete < params.n_threads) { + assert(pthread_cond_wait(¶ms.master_cv, ¶ms.lock) == 0); + } + params.n_threads_complete = 0; + params.pass = 1; + assert(pthread_cond_broadcast(¶ms.cv) == 0); + assert(pthread_mutex_unlock(¶ms.lock) == 0); + + Log("VShutdown: pass 0 completed using the 1 thread per partition algorithm\n"); + Log("VShutdown: starting passes 1 through 3 using finely-granular mp-fast algorithm\n"); + + /* run the parallel shutdown scheduler. it will drop the glock internally */ + ShutdownController(¶ms); + + /* wait for all the workers to finish pass 3 and terminate */ + while (params.pass < 4) { + assert(pthread_cond_wait(¶ms.cv, &vol_glock_mutex) == 0); + } + + assert(pthread_attr_destroy(&attrs) == 0); + assert(pthread_cond_destroy(¶ms.cv) == 0); + assert(pthread_cond_destroy(¶ms.master_cv) == 0); + assert(pthread_mutex_destroy(¶ms.lock) == 0); + + /* drop the VByPList exclusive reservations */ + for (diskP = DiskPartitionList; diskP; diskP = diskP->next) { + VVByPListEndExclusive_r(diskP); + Log("VShutdown: %s stats : (pass[0]=%d, pass[1]=%d, pass[2]=%d, pass[3]=%d)\n", + VPartitionPath(diskP), + params.stats[0][diskP->device], + params.stats[1][diskP->device], + params.stats[2][diskP->device], + params.stats[3][diskP->device]); + } + + Log("VShutdown: shutdown finished using %d threads\n", params.n_threads); + } else { + /* if we're only going to run one shutdown thread, don't bother creating + * another LWP */ + Log("VShutdown: beginning single-threaded fileserver shutdown\n"); + + for (diskP = DiskPartitionList; diskP; diskP = diskP->next) { + VShutdownByPartition_r(diskP); + } + } + + Log("VShutdown: complete.\n"); +#else /* AFS_DEMAND_ATTACH_FS */ + 
Log("VShutdown: shutting down on-line volumes...\n"); + for (i = 0; i < VolumeHashTable.Size; i++) { + /* try to hold first volume in the hash table */ + for (queue_Scan(&VolumeHashTable.Table[i],vp,np,Volume)) { + code = VHold_r(vp); + if (code == 0) { + if (LogLevel >= 5) + Log("VShutdown: Attempting to take volume %u offline.\n", + vp->hashid); + + /* next, take the volume offline (drops reference count) */ + VOffline_r(vp, "File server was shut down"); } - /* next, take the volume offline (drops reference count) */ - VOffline_r(vp, "File server was shut down"); - vp = np; /* next guy to try */ } } Log("VShutdown: complete.\n"); +#endif /* AFS_DEMAND_ATTACH_FS */ } void @@ -535,7 +904,498 @@ VShutdown(void) VOL_UNLOCK; } +#ifdef AFS_DEMAND_ATTACH_FS +/* + * demand attach fs + * shutdown control thread + */ +static void +ShutdownController(vshutdown_thread_t * params) +{ + /* XXX debug */ + struct DiskPartition * diskP; + Device id; + vshutdown_thread_t shadow; + ShutdownCreateSchedule(params); + + while ((params->pass < 4) && + (params->n_threads_complete < params->n_threads)) { + /* recompute schedule once per second */ + + memcpy(&shadow, params, sizeof(vshutdown_thread_t)); + + VOL_UNLOCK; + /* XXX debug */ + Log("ShutdownController: schedule version=%d, vol_remaining=%d, pass=%d\n", + shadow.schedule_version, shadow.vol_remaining, shadow.pass); + Log("ShutdownController: n_threads_complete=%d, n_parts_done_pass=%d\n", + shadow.n_threads_complete, shadow.n_parts_done_pass); + for (diskP = DiskPartitionList; diskP; diskP=diskP->next) { + id = diskP->device; + Log("ShutdownController: part[%d] : (len=%d, thread_target=%d, done_pass=%d, pass_head=%p)\n", + id, + diskP->vol_list.len, + shadow.part_thread_target[id], + shadow.part_done_pass[id], + shadow.part_pass_head[id]); + } + + sleep(1); + VOL_LOCK; + + ShutdownCreateSchedule(params); + } +} + +/* create the shutdown thread work schedule. 
+ * this scheduler tries to implement fairness + * by allocating at least 1 thread to each + * partition with volumes to be shutdown, + * and then it attempts to allocate remaining + * threads based upon the amount of work left + */ +static void +ShutdownCreateSchedule(vshutdown_thread_t * params) +{ + struct DiskPartition * diskP; + int sum, thr_workload, thr_left; + int part_residue[VOLMAXPARTS+1]; + Device id; + + /* compute the total number of outstanding volumes */ + sum = 0; + for (diskP = DiskPartitionList; diskP; diskP = diskP->next) { + sum += diskP->vol_list.len; + } + + params->schedule_version++; + params->vol_remaining = sum; + + if (!sum) + return; + + /* compute average per-thread workload */ + thr_workload = sum / params->n_threads; + if (sum % params->n_threads) + thr_workload++; + + thr_left = params->n_threads; + memset(&part_residue, 0, sizeof(part_residue)); + + /* for fairness, give every partition with volumes remaining + * at least one thread */ + for (diskP = DiskPartitionList; diskP && thr_left; diskP = diskP->next) { + id = diskP->device; + if (diskP->vol_list.len) { + params->part_thread_target[id] = 1; + thr_left--; + } else { + params->part_thread_target[id] = 0; + } + } + + if (thr_left && thr_workload) { + /* compute length-weighted workloads */ + int delta; + + for (diskP = DiskPartitionList; diskP && thr_left; diskP = diskP->next) { + id = diskP->device; + delta = (diskP->vol_list.len / thr_workload) - + params->part_thread_target[id]; + if (delta < 0) { + continue; + } + if (delta < thr_left) { + params->part_thread_target[id] += delta; + thr_left -= delta; + } else { + params->part_thread_target[id] += thr_left; + thr_left = 0; + break; + } + } + } + + if (thr_left) { + /* try to assign any leftover threads to partitions that + * had volume lengths closer to needing thread_target+1 */ + int max_residue, max_id; + + /* compute the residues */ + for (diskP = DiskPartitionList; diskP; diskP = diskP->next) { + id = diskP->device; + 
part_residue[id] = diskP->vol_list.len - + (params->part_thread_target[id] * thr_workload); + } + + /* now try to allocate remaining threads to partitions with the + * highest residues */ + while (thr_left) { + max_residue = 0; + for (diskP = DiskPartitionList; diskP; diskP = diskP->next) { + id = diskP->device; + if (part_residue[id] > max_residue) { + max_residue = part_residue[id]; + max_id = id; + } + } + + if (!max_residue) { + break; + } + + params->part_thread_target[max_id]++; + thr_left--; + part_residue[max_id] = 0; + } + } + + if (thr_left) { + /* punt and give any remaining threads equally to each partition */ + int alloc; + if (thr_left >= params->n_parts) { + alloc = thr_left / params->n_parts; + for (diskP = DiskPartitionList; diskP; diskP = diskP->next) { + id = diskP->device; + params->part_thread_target[id] += alloc; + thr_left -= alloc; + } + } + + /* finish off the last of the threads */ + for (diskP = DiskPartitionList; thr_left && diskP; diskP = diskP->next) { + id = diskP->device; + params->part_thread_target[id]++; + thr_left--; + } + } +} + +/* worker thread for parallel shutdown */ +static void * +VShutdownThread(void * args) +{ + struct rx_queue *qp; + Volume * vp; + vshutdown_thread_t * params; + int part, code, found, pass, schedule_version_save, count; + struct DiskPartition *diskP; + struct diskpartition_queue_t * dpq; + Device id; + + params = (vshutdown_thread_t *) args; + + /* acquire the shutdown pass 0 lock */ + assert(pthread_mutex_lock(¶ms->lock) == 0); + + /* if there's still pass 0 work to be done, + * get a work entry, and do a pass 0 shutdown */ + if (queue_IsNotEmpty(params)) { + dpq = queue_First(params, diskpartition_queue_t); + queue_Remove(dpq); + assert(pthread_mutex_unlock(¶ms->lock) == 0); + diskP = dpq->diskP; + free(dpq); + id = diskP->device; + + count = 0; + while (ShutdownVolumeWalk_r(diskP, 0, ¶ms->part_pass_head[id])) + count++; + params->stats[0][diskP->device] = count; + assert(pthread_mutex_lock(¶ms->lock) 
== 0); + } + + params->n_threads_complete++; + if (params->n_threads_complete == params->n_threads) { + /* notify control thread that all workers have completed pass 0 */ + assert(pthread_cond_signal(¶ms->master_cv) == 0); + } + while (params->pass == 0) { + assert(pthread_cond_wait(¶ms->cv, ¶ms->lock) == 0); + } + + /* switch locks */ + assert(pthread_mutex_unlock(¶ms->lock) == 0); + VOL_LOCK; + + pass = params->pass; + assert(pass > 0); + + /* now escalate through the more complicated shutdowns */ + while (pass <= 3) { + schedule_version_save = params->schedule_version; + found = 0; + /* find a disk partition to work on */ + for (diskP = DiskPartitionList; diskP; diskP = diskP->next) { + id = diskP->device; + if (params->part_thread_target[id] && !params->part_done_pass[id]) { + params->part_thread_target[id]--; + found = 1; + break; + } + } + + if (!found) { + /* hmm. for some reason the controller thread couldn't find anything for + * us to do. let's see if there's anything we can do */ + for (diskP = DiskPartitionList; diskP; diskP = diskP->next) { + id = diskP->device; + if (diskP->vol_list.len && !params->part_done_pass[id]) { + found = 1; + break; + } else if (!params->part_done_pass[id]) { + params->part_done_pass[id] = 1; + params->n_parts_done_pass++; + if (pass == 3) { + Log("VShutdown: done shutting down volumes on partition %s.\n", + VPartitionPath(diskP)); + } + } + } + } + + /* do work on this partition until either the controller + * creates a new schedule, or we run out of things to do + * on this partition */ + if (found) { + count = 0; + while (!params->part_done_pass[id] && + (schedule_version_save == params->schedule_version)) { + /* ShutdownVolumeWalk_r will drop the glock internally */ + if (!ShutdownVolumeWalk_r(diskP, pass, ¶ms->part_pass_head[id])) { + if (!params->part_done_pass[id]) { + params->part_done_pass[id] = 1; + params->n_parts_done_pass++; + if (pass == 3) { + Log("VShutdown: done shutting down volumes on partition %s.\n", + 
VPartitionPath(diskP)); + } + } + break; + } + count++; + } + + params->stats[pass][id] += count; + } else { + /* ok, everyone is done this pass, proceed */ + + /* barrier lock */ + params->n_threads_complete++; + while (params->pass == pass) { + if (params->n_threads_complete == params->n_threads) { + /* we are the last thread to complete, so we will + * reinitialize worker pool state for the next pass */ + params->n_threads_complete = 0; + params->n_parts_done_pass = 0; + params->pass++; + for (diskP = DiskPartitionList; diskP; diskP = diskP->next) { + id = diskP->device; + params->part_done_pass[id] = 0; + params->part_pass_head[id] = queue_First(&diskP->vol_list, rx_queue); + } + + /* compute a new thread schedule before releasing all the workers */ + ShutdownCreateSchedule(params); + + /* wake up all the workers */ + assert(pthread_cond_broadcast(¶ms->cv) == 0); + + VOL_UNLOCK; + Log("VShutdown: pass %d completed using %d threads on %d partitions\n", + pass, params->n_threads, params->n_parts); + VOL_LOCK; + } else { + assert(pthread_cond_wait(¶ms->cv, &vol_glock_mutex) == 0); + } + } + pass = params->pass; + } + + /* for fairness */ + VOL_UNLOCK; + pthread_yield(); + VOL_LOCK; + } + + VOL_UNLOCK; + + return NULL; +} + +/* shut down all volumes on a given disk partition + * + * note that this function will not allow mp-fast + * shutdown of a partition */ +int +VShutdownByPartition_r(struct DiskPartition * dp) +{ + int pass, retVal; + int pass_stats[4]; + int total; + + /* wait for other exclusive ops to finish */ + VVByPListWait_r(dp); + + /* begin exclusive access */ + VVByPListBeginExclusive_r(dp); + + /* pick the low-hanging fruit first, + * then do the complicated ones last + * (has the advantage of keeping + * in-use volumes up until the bitter end) */ + for (pass = 0, total=0; pass < 4; pass++) { + pass_stats[pass] = ShutdownVByPForPass_r(dp, pass); + total += pass_stats[pass]; + } + + /* end exclusive access */ + VVByPListEndExclusive_r(dp); + + 
Log("VShutdownByPartition: shut down %d volumes on %s (pass[0]=%d, pass[1]=%d, pass[2]=%d, pass[3]=%d)\n", + total, VPartitionPath(dp), pass_stats[0], pass_stats[1], pass_stats[2], pass_stats[3]); + + return retVal; +} + +/* internal shutdown functionality + * + * for multi-pass shutdown: + * 0 to only "shutdown" {pre,un}attached and error state volumes + * 1 to also shutdown attached volumes w/ volume header loaded + * 2 to also shutdown attached volumes w/o volume header loaded + * 3 to also shutdown exclusive state volumes + * + * caller MUST hold exclusive access on the hash chain + * because we drop vol_glock_mutex internally + * + * this function is reentrant for passes 1--3 + * (e.g. multiple threads can cooperate to + * shutdown a partition mp-fast) + * + * pass 0 is not scaleable because the volume state data is + * synchronized by vol_glock mutex, and the locking overhead + * is too high to drop the lock long enough to do linked list + * traversal + */ +static int +ShutdownVByPForPass_r(struct DiskPartition * dp, int pass) +{ + struct rx_queue * q = queue_First(&dp->vol_list, rx_queue); + register int i = 0; + + while (ShutdownVolumeWalk_r(dp, pass, &q)) + i++; + + return i; +} + +/* conditionally shutdown one volume on partition dp + * returns 1 if a volume was shutdown in this pass, + * 0 otherwise */ +static int +ShutdownVolumeWalk_r(struct DiskPartition * dp, int pass, + struct rx_queue ** idx) +{ + struct rx_queue *qp, *nqp; + Volume * vp; + + qp = *idx; + + for (queue_ScanFrom(&dp->vol_list, qp, qp, nqp, rx_queue)) { + vp = (Volume *) (((char *)qp) - offsetof(Volume, vol_list)); + + switch (pass) { + case 0: + if ((V_attachState(vp) != VOL_STATE_UNATTACHED) && + (V_attachState(vp) != VOL_STATE_ERROR) && + (V_attachState(vp) != VOL_STATE_PREATTACHED)) { + break; + } + case 1: + if ((V_attachState(vp) == VOL_STATE_ATTACHED) && + (vp->header == NULL)) { + break; + } + case 2: + if (IsExclusiveState(V_attachState(vp))) { + break; + } + case 3: + *idx = 
nqp; + DeleteVolumeFromVByPList_r(vp); + VShutdownVolume_r(vp); + vp = NULL; + return 1; + } + } + + return 0; +} + +/* + * shutdown a specific volume + */ +/* caller MUST NOT hold a heavyweight ref on vp */ +int +VShutdownVolume_r(Volume * vp) +{ + int code; + + VCreateReservation_r(vp); + + if (LogLevel >= 5) { + Log("VShutdownVolume_r: vid=%u, device=%d, state=%hu\n", + vp->hashid, vp->partition->device, V_attachState(vp)); + } + + /* wait for other blocking ops to finish */ + VWaitExclusiveState_r(vp); + + assert(IsValidState(V_attachState(vp))); + + switch(V_attachState(vp)) { + case VOL_STATE_SALVAGING: + /* make sure salvager knows we don't want + * the volume back */ + VCancelSalvage_r(vp, SALVSYNC_SHUTDOWN); + case VOL_STATE_PREATTACHED: + case VOL_STATE_ERROR: + VChangeState_r(vp, VOL_STATE_UNATTACHED); + case VOL_STATE_UNATTACHED: + break; + case VOL_STATE_GOING_OFFLINE: + case VOL_STATE_SHUTTING_DOWN: + case VOL_STATE_ATTACHED: + code = VHold_r(vp); + if (!code) { + if (LogLevel >= 5) + Log("VShutdown: Attempting to take volume %u offline.\n", + vp->hashid); + + /* take the volume offline (drops reference count) */ + VOffline_r(vp, "File server was shut down"); + } + break; + } + + VCancelReservation_r(vp); + vp = NULL; + return 0; +} +#endif /* AFS_DEMAND_ATTACH_FS */ + + +/***************************************************/ +/* Header I/O routines */ +/***************************************************/ + +/* open a descriptor for the inode (h), + * read in an on-disk structure into buffer (to) of size (size), + * verify versionstamp in structure has magic (magic) and + * optionally verify version (version) if (version) is nonzero + */ static void ReadHeader(Error * ec, IHandle_t * h, char *to, int size, bit32 magic, bit32 version) @@ -574,10 +1434,39 @@ ReadHeader(Error * ec, IHandle_t * h, char *to, int size, bit32 magic, } } +void +WriteVolumeHeader_r(Error * ec, Volume * vp) +{ + IHandle_t *h = V_diskDataHandle(vp); + FdHandle_t *fdP; + + *ec = 
0; + + fdP = IH_OPEN(h); + if (fdP == NULL) { + *ec = VSALVAGE; + return; + } + if (FDH_SEEK(fdP, 0, SEEK_SET) < 0) { + *ec = VSALVAGE; + FDH_REALLYCLOSE(fdP); + return; + } + if (FDH_WRITE(fdP, (char *)&V_disk(vp), sizeof(V_disk(vp))) + != sizeof(V_disk(vp))) { + *ec = VSALVAGE; + FDH_REALLYCLOSE(fdP); + return; + } + FDH_CLOSE(fdP); +} + /* VolumeHeaderToDisk * Allows for storing 64 bit inode numbers in on-disk volume header * file. */ +/* convert in-memory representation of a volume header to the + * on-disk representation of a volume header */ void VolumeHeaderToDisk(VolumeDiskHeader_t * dh, VolumeHeader_t * h) { @@ -607,8 +1496,10 @@ VolumeHeaderToDisk(VolumeDiskHeader_t * dh, VolumeHeader_t * h) } /* DiskToVolumeHeader - * Reads volume header file from disk, convering 64 bit inodes - * if required. Makes the assumption that AFS has *always* + * Converts an on-disk representation of a volume header to + * the in-memory representation of a volume header. + * + * Makes the assumption that AFS has *always* * zero'd the volume header file so that high parts of inode * numbers are 0 in older (SGI EFS) volume header files. 
*/ @@ -642,34 +1533,137 @@ DiskToVolumeHeader(VolumeHeader_t * h, VolumeDiskHeader_t * dh) } -void -WriteVolumeHeader_r(ec, vp) - Error *ec; - Volume *vp; +/***************************************************/ +/* Volume Attachment routines */ +/***************************************************/ + +#ifdef AFS_DEMAND_ATTACH_FS +/* pre-attach a volume given its path + * + * a pre-attached volume will only have its partition + * and hashid fields initialized + * + * at first call to VGetVolume, the volume will be + * fully attached + */ +Volume * +VPreAttachVolumeByName(Error * ec, char *partition, char *name, int mode) { - IHandle_t *h = V_diskDataHandle(vp); - FdHandle_t *fdP; + Volume * vp; + VOL_LOCK; + vp = VPreAttachVolumeByName_r(ec, partition, name, mode); + VOL_UNLOCK; + return vp; +} + +Volume * +VPreAttachVolumeByName_r(Error * ec, char *partition, char *name, int mode) +{ + register Volume *vp = NULL; + int fd, n; + struct afs_stat status; + struct DiskPartition *partp; + char path[64]; + int isbusy = 0; + VolId volumeId; + *ec = 0; + + assert(programType == fileServer); + + if (!(partp = VGetPartition_r(partition, 0))) { + *ec = VNOVOL; + Log("VPreAttachVolume: Error getting partition (%s)\n", partition); + return NULL; + } + + volumeId = VolumeNumber(name); + + vp = VLookupVolume_r(ec, volumeId, NULL); + if (*ec) { + return NULL; + } + + return VPreAttachVolumeById_r(ec, partp, vp, volumeId); +} + +/* pre-attach a volume given its partition and volume id + * + * if vp == NULL, then a new vp is created + * if vp != NULL, then we assumed it is already on the hash chain + */ +Volume * +VPreAttachVolumeById_r(Error * ec, struct DiskPartition * partp, + Volume * vp, int vid) +{ + Volume *nvp = NULL; *ec = 0; - fdP = IH_OPEN(h); - if (fdP == NULL) { - *ec = VSALVAGE; - return; + /* check to see if pre-attach already happened */ + if (vp && + (V_attachState(vp) != VOL_STATE_UNATTACHED) && + !IsErrorState(V_attachState(vp))) { + goto done; + } else if (vp) { + 
/* we're re-attaching a volume; clear out some old state */ + memset(&vp->salvage, 0, sizeof(struct VolumeOnlineSalvage)); + } else { + /* if we need to allocate a new Volume struct, + * go ahead and drop the vol glock, otherwise + * do the basic setup synchronised, as it's + * probably not worth dropping the lock */ + VOL_UNLOCK; + + /* allocate the volume structure */ + vp = nvp = (Volume *) malloc(sizeof(Volume)); + assert(vp != NULL); + memset(vp, 0, sizeof(Volume)); + assert(pthread_cond_init(&V_attachCV(vp), NULL) == 0); } - if (FDH_SEEK(fdP, 0, SEEK_SET) < 0) { - *ec = VSALVAGE; - FDH_REALLYCLOSE(fdP); - return; + + /* link the volume with its associated vice partition */ + vp->device = partp->device; + vp->partition = partp; + vp->hashid = vid; + + /* if we dropped the lock, reacquire the lock, + * check for pre-attach races, and then add + * the volume to the hash table */ + if (nvp) { + VOL_LOCK; + nvp = VLookupVolume_r(ec, vid, NULL); + if (*ec) { + free(vp); + vp = NULL; + goto done; + } else if (nvp) { /* race detected */ + free(vp); + vp = nvp; + goto done; + } else { + /* hack to make up for VChangeState_r() decrementing + * the old state counter */ + VStats.state_levels[0]++; + } } - if (FDH_WRITE(fdP, (char *)&V_disk(vp), sizeof(V_disk(vp))) - != sizeof(V_disk(vp))) { - *ec = VSALVAGE; - FDH_REALLYCLOSE(fdP); - return; - } - FDH_CLOSE(fdP); + + /* put pre-attached volume onto the hash table + * and bring it up to the pre-attached state */ + AddVolumeToHashTable(vp, vp->hashid); + AddVolumeToVByPList_r(vp); + VLRU_Init_Node_r(vp); + VChangeState_r(vp, VOL_STATE_PREATTACHED); + + if (LogLevel >= 5) + Log("VPreAttachVolumeById_r: volume %u pre-attached\n", vp->hashid); + + done: + if (*ec) + return NULL; + else + return vp; } +#endif /* AFS_DEMAND_ATTACH_FS */ /* Attach an existing volume, given its pathname, and return a pointer to the volume header information. 
The volume also @@ -679,18 +1673,16 @@ Volume * VAttachVolumeByName(Error * ec, char *partition, char *name, int mode) { Volume *retVal; - VATTACH_LOCK; VOL_LOCK; retVal = VAttachVolumeByName_r(ec, partition, name, mode); VOL_UNLOCK; - VATTACH_UNLOCK; return retVal; } Volume * VAttachVolumeByName_r(Error * ec, char *partition, char *name, int mode) { - register Volume *vp; + register Volume *vp = NULL, *svp = NULL; int fd, n; struct afs_stat status; struct VolumeDiskHeader diskHeader; @@ -698,13 +1690,132 @@ VAttachVolumeByName_r(Error * ec, char *partition, char *name, int mode) struct DiskPartition *partp; char path[64]; int isbusy = 0; + VolId volumeId; +#ifdef AFS_DEMAND_ATTACH_FS + VolumeStats stats_save; +#endif /* AFS_DEMAND_ATTACH_FS */ + *ec = 0; + + volumeId = VolumeNumber(name); + + if (!(partp = VGetPartition_r(partition, 0))) { + *ec = VNOVOL; + Log("VAttachVolume: Error getting partition (%s)\n", partition); + goto done; + } + if (programType == volumeUtility) { assert(VInit == 3); VLockPartition_r(partition); - } - if (programType == fileServer) { - vp = VGetVolume_r(ec, VolumeNumber(name)); + } else if (programType == fileServer) { +#ifdef AFS_DEMAND_ATTACH_FS + /* lookup the volume in the hash table */ + vp = VLookupVolume_r(ec, volumeId, NULL); + if (*ec) { + return NULL; + } + + if (vp) { + /* save any counters that are supposed to + * be monotonically increasing over the + * lifetime of the fileserver */ + memcpy(&stats_save, &vp->stats, sizeof(VolumeStats)); + } else { + memset(&stats_save, 0, sizeof(VolumeStats)); + } + + /* if there's something in the hash table, and it's not + * in the pre-attach state, then we may need to detach + * it before proceeding */ + if (vp && (V_attachState(vp) != VOL_STATE_PREATTACHED)) { + VCreateReservation_r(vp); + VWaitExclusiveState_r(vp); + + /* at this point state must be one of: + * UNATTACHED, + * ATTACHED, + * SHUTTING_DOWN, + * GOING_OFFLINE, + * SALVAGING, + * ERROR + */ + + if (vp->specialStatus == 
VBUSY) + isbusy = 1; + + /* if it's already attached, see if we can return it */ + if (V_attachState(vp) == VOL_STATE_ATTACHED) { + VGetVolumeByVp_r(ec, vp); + if (V_inUse(vp)) { + VCancelReservation_r(vp); + return vp; + } + + /* otherwise, we need to detach, and attempt to re-attach */ + VDetachVolume_r(ec, vp); + if (*ec) { + Log("VAttachVolume: Error detaching old volume instance (%s)\n", name); + } + } else { + /* if it isn't fully attached, delete from the hash tables, + and let the refcounter handle the rest */ + DeleteVolumeFromHashTable(vp); + DeleteVolumeFromVByPList_r(vp); + } + + VCancelReservation_r(vp); + vp = NULL; + } + + /* pre-attach volume if it hasn't been done yet */ + if (!vp || + (V_attachState(vp) == VOL_STATE_UNATTACHED) || + (V_attachState(vp) == VOL_STATE_ERROR)) { + svp = vp; + vp = VPreAttachVolumeById_r(ec, partp, vp, volumeId); + if (*ec) { + return NULL; + } + } + + assert(vp != NULL); + + /* handle pre-attach races + * + * multiple threads can race to pre-attach a volume, + * but we can't let them race beyond that + * + * our solution is to let the first thread to bring + * the volume into an exclusive state win; the other + * threads just wait until it finishes bringing the + * volume online, and then they do a vgetvolumebyvp + */ + if (svp && (svp != vp)) { + /* wait for other exclusive ops to finish */ + VCreateReservation_r(vp); + VWaitExclusiveState_r(vp); + + /* get a heavyweight ref, kill the lightweight ref, and return */ + VGetVolumeByVp_r(ec, vp); + VCancelReservation_r(vp); + return vp; + } + + /* at this point, we are chosen as the thread to do + * demand attachment for this volume. 
all other threads + * doing a getvolume on vp->hashid will block until we finish */ + + /* make sure any old header cache entries are invalidated + * before proceeding */ + FreeVolumeHeader(vp); + + VChangeState_r(vp, VOL_STATE_ATTACHING); + + /* restore any saved counters */ + memcpy(&vp->stats, &stats_save, sizeof(VolumeStats)); +#else /* AFS_DEMAND_ATTACH_FS */ + vp = VGetVolume_r(ec, volumeId); if (vp) { if (V_inUse(vp)) return vp; @@ -714,55 +1825,80 @@ VAttachVolumeByName_r(Error * ec, char *partition, char *name, int mode) if (*ec) { Log("VAttachVolume: Error detaching volume (%s)\n", name); } + vp = NULL; } - } - - if (!(partp = VGetPartition_r(partition, 0))) { - *ec = VNOVOL; - Log("VAttachVolume: Error getting partition (%s)\n", partition); - goto done; +#endif /* AFS_DEMAND_ATTACH_FS */ } *ec = 0; strcpy(path, VPartitionPath(partp)); + + VOL_UNLOCK; + strcat(path, "/"); strcat(path, name); - VOL_UNLOCK; if ((fd = afs_open(path, O_RDONLY)) == -1 || afs_fstat(fd, &status) == -1) { Log("VAttachVolume: Failed to open %s (errno %d)\n", path, errno); if (fd > -1) close(fd); - VOL_LOCK; *ec = VNOVOL; + VOL_LOCK; goto done; } n = read(fd, &diskHeader, sizeof(diskHeader)); close(fd); - VOL_LOCK; if (n != sizeof(diskHeader) || diskHeader.stamp.magic != VOLUMEHEADERMAGIC) { Log("VAttachVolume: Error reading volume header %s\n", path); *ec = VSALVAGE; + VOL_LOCK; goto done; } if (diskHeader.stamp.version != VOLUMEHEADERVERSION) { Log("VAttachVolume: Volume %s, version number is incorrect; volume needs salvaged\n", path); *ec = VSALVAGE; + VOL_LOCK; goto done; } DiskToVolumeHeader(&iheader, &diskHeader); +#ifdef FSSYNC_BUILD_CLIENT if (programType == volumeUtility && mode != V_SECRETLY && mode != V_PEEK) { - if (FSYNC_askfs(iheader.id, partition, FSYNC_NEEDVOLUME, mode) - == FSYNC_DENIED) { + VOL_LOCK; + if (FSYNC_VolOp(iheader.id, partition, FSYNC_VOL_NEEDVOLUME, mode, NULL) + != SYNC_OK) { Log("VAttachVolume: attach of volume %u apparently denied by file 
server\n", iheader.id); *ec = VNOVOL; /* XXXX */ goto done; } + VOL_UNLOCK; + } +#endif + + if (!vp) { + vp = (Volume *) calloc(1, sizeof(Volume)); + assert(vp != NULL); + vp->device = partp->device; + vp->partition = partp; +#ifdef AFS_DEMAND_ATTACH_FS + assert(pthread_cond_init(&V_attachCV(vp), NULL) == 0); +#endif /* AFS_DEMAND_ATTACH_FS */ } - vp = attach2(ec, path, &iheader, partp, isbusy); + /* attach2 is entered without any locks, and returns + * with vol_glock_mutex held */ + vp = attach2(ec, volumeId, path, &iheader, partp, vp, isbusy, mode); + if (programType == volumeUtility && vp) { +#ifdef AFS_DEMAND_ATTACH_FS + /* for dafs, we should tell the fileserver, except for V_PEEK + * where we know it is not necessary */ + if (mode == V_PEEK) { + vp->needsPutBack = 0; + } else { + vp->needsPutBack = 1; + } +#else /* !AFS_DEMAND_ATTACH_FS */ /* duplicate computation in fssync.c about whether the server * takes the volume offline or not. If the volume isn't * offline, we must not return it when we detach the volume, @@ -772,6 +1908,7 @@ VAttachVolumeByName_r(Error * ec, char *partition, char *name, int mode) vp->needsPutBack = 0; else vp->needsPutBack = 1; +#endif /* !AFS_DEMAND_ATTACH_FS */ } /* OK, there's a problem here, but one that I don't know how to * fix right now, and that I don't think should arise often. @@ -784,10 +1921,13 @@ VAttachVolumeByName_r(Error * ec, char *partition, char *name, int mode) * for all of that to happen, but if it does, probably the right * fix is for the server to allow the return of readonly volumes * that it doesn't think are really checked out. 
*/ +#ifdef FSSYNC_BUILD_CLIENT if (programType == volumeUtility && vp == NULL && mode != V_SECRETLY && mode != V_PEEK) { - FSYNC_askfs(iheader.id, partition, FSYNC_ON, 0); - } else if (programType == fileServer && vp) { + FSYNC_VolOp(iheader.id, partition, FSYNC_VOL_ON, 0, NULL); + } else +#endif + if (programType == fileServer && vp) { V_needsCallback(vp) = 0; #ifdef notdef if (VInit >= 2 && V_BreakVolumeCallbacks) { @@ -795,7 +1935,7 @@ VAttachVolumeByName_r(Error * ec, char *partition, char *name, int mode) (*V_BreakVolumeCallbacks) (V_id(vp)); } #endif - VUpdateVolume_r(ec, vp); + VUpdateVolume_r(ec, vp, 0); if (*ec) { Log("VAttachVolume: Error updating volume\n"); if (vp) @@ -803,7 +1943,8 @@ VAttachVolumeByName_r(Error * ec, char *partition, char *name, int mode) goto done; } if (VolumeWriteable(vp) && V_dontSalvage(vp) == 0) { - /* This is a hack: by temporarily settint the incore +#ifndef AFS_DEMAND_ATTACH_FS + /* This is a hack: by temporarily setting the incore * dontSalvage flag ON, the volume will be put back on the * Update list (with dontSalvage OFF again). 
It will then * come back in N minutes with DONT_SALVAGE eventually @@ -812,6 +1953,7 @@ VAttachVolumeByName_r(Error * ec, char *partition, char *name, int mode) * offline without DONT SALVAGE having been set also * eventually get it set */ V_dontSalvage(vp) = DONT_SALVAGE; +#endif /* !AFS_DEMAND_ATTACH_FS */ VAddToVolumeUpdateList_r(ec, vp); if (*ec) { Log("VAttachVolume: Error adding volume to update list\n"); @@ -828,25 +1970,196 @@ VAttachVolumeByName_r(Error * ec, char *partition, char *name, int mode) if (programType == volumeUtility) { VUnlockPartition_r(partition); } - if (*ec) + if (*ec) { +#ifdef AFS_DEMAND_ATTACH_FS + if (vp) { + V_attachState(vp) = VOL_STATE_ERROR; + assert(pthread_cond_broadcast(&V_attachCV(vp)) == 0); + } +#endif /* AFS_DEMAND_ATTACH_FS */ return NULL; - else + } else { return vp; + } } -private Volume * -attach2(Error * ec, char *path, register struct VolumeHeader * header, - struct DiskPartition * partp, int isbusy) +#ifdef AFS_DEMAND_ATTACH_FS +/* VAttachVolumeByVp_r + * + * finish attaching a volume that is + * in a less than fully attached state + */ +/* caller MUST hold a ref count on vp */ +static Volume * +VAttachVolumeByVp_r(Error * ec, Volume * vp, int mode) { - register Volume *vp; + char name[VMAXPATHLEN]; + int fd, n, reserve = 0; + struct afs_stat status; + struct VolumeDiskHeader diskHeader; + struct VolumeHeader iheader; + struct DiskPartition *partp; + char path[64]; + int isbusy = 0; + VolId volumeId; + Volume * nvp; + VolumeStats stats_save; + *ec = 0; + + /* volume utility should never call AttachByVp */ + assert(programType == fileServer); + + volumeId = vp->hashid; + partp = vp->partition; + VolumeExternalName_r(volumeId, name, sizeof(name)); + + + /* if another thread is performing a blocking op, wait */ + VWaitExclusiveState_r(vp); + + memcpy(&stats_save, &vp->stats, sizeof(VolumeStats)); + + /* if it's already attached, see if we can return it */ + if (V_attachState(vp) == VOL_STATE_ATTACHED) { + 
VGetVolumeByVp_r(ec, vp); + if (V_inUse(vp)) { + return vp; + } else { + if (vp->specialStatus == VBUSY) + isbusy = 1; + VDetachVolume_r(ec, vp); + if (*ec) { + Log("VAttachVolume: Error detaching volume (%s)\n", name); + } + vp = NULL; + } + } + + /* pre-attach volume if it hasn't been done yet */ + if (!vp || + (V_attachState(vp) == VOL_STATE_UNATTACHED) || + (V_attachState(vp) == VOL_STATE_ERROR)) { + nvp = VPreAttachVolumeById_r(ec, partp, vp, volumeId); + if (*ec) { + return NULL; + } + if (nvp != vp) { + reserve = 1; + VCreateReservation_r(nvp); + vp = nvp; + } + } + + assert(vp != NULL); + VChangeState_r(vp, VOL_STATE_ATTACHING); + + /* restore monotonically increasing stats */ + memcpy(&vp->stats, &stats_save, sizeof(VolumeStats)); + + *ec = 0; + + + /* compute path to disk header, + * read in header, + * and verify magic and version stamps */ + strcpy(path, VPartitionPath(partp)); VOL_UNLOCK; - vp = (Volume *) calloc(1, sizeof(Volume)); - assert(vp != NULL); + strcat(path, "/"); + strcat(path, name); + if ((fd = afs_open(path, O_RDONLY)) == -1 || afs_fstat(fd, &status) == -1) { + Log("VAttachVolume: Failed to open %s (errno %d)\n", path, errno); + if (fd > -1) + close(fd); + *ec = VNOVOL; + VOL_LOCK; + goto done; + } + n = read(fd, &diskHeader, sizeof(diskHeader)); + close(fd); + if (n != sizeof(diskHeader) + || diskHeader.stamp.magic != VOLUMEHEADERMAGIC) { + Log("VAttachVolume: Error reading volume header %s\n", path); + *ec = VSALVAGE; + VOL_LOCK; + goto done; + } + if (diskHeader.stamp.version != VOLUMEHEADERVERSION) { + Log("VAttachVolume: Volume %s, version number is incorrect; volume needs salvaged\n", path); + *ec = VSALVAGE; + VOL_LOCK; + goto done; + } + + /* convert on-disk header format to in-memory header format */ + DiskToVolumeHeader(&iheader, &diskHeader); + + /* do volume attach + * + * NOTE: attach2 is entered without any locks, and returns + * with vol_glock_mutex held */ + vp = attach2(ec, volumeId, path, &iheader, partp, vp, isbusy, 
mode); + + if (*ec || vp == NULL) { + goto done; + } + + V_needsCallback(vp) = 0; + VUpdateVolume_r(ec, vp, 0); + if (*ec) { + Log("VAttachVolume: Error updating volume %u\n", vp->hashid); + VPutVolume_r(vp); + goto done; + } + if (VolumeWriteable(vp) && V_dontSalvage(vp) == 0) { +#ifndef AFS_DEMAND_ATTACH_FS + /* This is a hack: by temporarily setting the incore + * dontSalvage flag ON, the volume will be put back on the + * Update list (with dontSalvage OFF again). It will then + * come back in N minutes with DONT_SALVAGE eventually + * set. This is the way that volumes that have never had + * it set get it set; or that volumes that have been + * offline without DONT SALVAGE having been set also + * eventually get it set */ + V_dontSalvage(vp) = DONT_SALVAGE; +#endif /* !AFS_DEMAND_ATTACH_FS */ + VAddToVolumeUpdateList_r(ec, vp); + if (*ec) { + Log("VAttachVolume: Error adding volume %u to update list\n", vp->hashid); + if (vp) + VPutVolume_r(vp); + goto done; + } + } + if (LogLevel) + Log("VOnline: volume %u (%s) attached and online\n", V_id(vp), + V_name(vp)); + done: + if (reserve) { + VCancelReservation_r(nvp); + reserve = 0; + } + if (*ec && (*ec != VOFFLINE) && (*ec != VSALVAGE)) { + if (vp && !IsErrorState(V_attachState(vp))) { + VChangeState_r(vp, VOL_STATE_ERROR); + } + return NULL; + } else { + return vp; + } +} +#endif /* AFS_DEMAND_ATTACH_FS */ + +/* + * called without any locks held + * returns with vol_glock_mutex held + */ +private Volume * +attach2(Error * ec, VolId volumeId, char *path, register struct VolumeHeader * header, + struct DiskPartition * partp, register Volume * vp, int isbusy, int mode) +{ vp->specialStatus = (byte) (isbusy ? 
VBUSY : 0); - vp->device = partp->device; - vp->partition = partp; IH_INIT(vp->vnodeIndex[vLarge].handle, partp->device, header->parent, header->largeVnodeIndex); IH_INIT(vp->vnodeIndex[vSmall].handle, partp->device, header->parent, @@ -857,8 +2170,15 @@ attach2(Error * ec, char *path, register struct VolumeHeader * header, vp->shuttingDown = 0; vp->goingOffline = 0; vp->nUsers = 1; +#ifdef AFS_DEMAND_ATTACH_FS + vp->stats.last_attach = FT_ApproxTime(); + vp->stats.attaches++; +#endif VOL_LOCK; +#ifdef AFS_DEMAND_ATTACH_FS + IncUInt64(&VStats.attaches); +#endif vp->cacheCheck = ++VolumeCacheCheck; /* just in case this ever rolls over */ if (!vp->cacheCheck) @@ -866,13 +2186,74 @@ attach2(Error * ec, char *path, register struct VolumeHeader * header, GetVolumeHeader(vp); VOL_UNLOCK; +#if defined(AFS_DEMAND_ATTACH_FS) && defined(FSSYNC_BUILD_CLIENT) + /* demand attach changes the V_PEEK mechanism + * + * we can now suck the current disk data structure over + * the fssync interface without going to disk + * + * (technically, we don't need to restrict this feature + * to demand attach fileservers. 
However, I'm trying + * to limit the number of common code changes) + */ + if (programType != fileServer && mode == V_PEEK) { + SYNC_response res; + res.payload.len = sizeof(VolumeDiskData); + res.payload.buf = &vp->header->diskstuff; + + if (FSYNC_VolOp(volumeId, + VPartitionPath(partp), + FSYNC_VOL_QUERY_HDR, + FSYNC_WHATEVER, + &res) == SYNC_OK) { + goto disk_header_loaded; + } + } +#endif /* AFS_DEMAND_ATTACH_FS && FSSYNC_BUILD_CLIENT */ (void)ReadHeader(ec, V_diskDataHandle(vp), (char *)&V_disk(vp), sizeof(V_disk(vp)), VOLUMEINFOMAGIC, VOLUMEINFOVERSION); +#ifdef AFS_DEMAND_ATTACH_FS + /* update stats */ VOL_LOCK; + IncUInt64(&VStats.hdr_loads); + IncUInt64(&vp->stats.hdr_loads); + VOL_UNLOCK; +#endif /* AFS_DEMAND_ATTACH_FS */ + if (*ec) { Log("VAttachVolume: Error reading diskDataHandle vol header %s; error=%u\n", path, *ec); } + + disk_header_loaded: + +#ifdef AFS_DEMAND_ATTACH_FS + if (!*ec) { + + /* check for pending volume operations */ + if (vp->pending_vol_op) { + /* see if the pending volume op requires exclusive access */ + if (!VVolOpLeaveOnline_r(vp, vp->pending_vol_op)) { + /* mark the volume down */ + *ec = VOFFLINE; + VChangeState_r(vp, VOL_STATE_UNATTACHED); + if (V_offlineMessage(vp)[0] == '\0') + strlcpy(V_offlineMessage(vp), + "A volume utility is running.", + sizeof(V_offlineMessage(vp))); + V_offlineMessage(vp)[sizeof(V_offlineMessage(vp)) - 1] = '\0'; + + /* check to see if we should set the specialStatus flag */ + if (VVolOpSetVBusy_r(vp, vp->pending_vol_op)) { + vp->specialStatus = VBUSY; + } + } + } + + V_attachFlags(vp) |= VOL_HDR_LOADED; + } +#endif /* AFS_DEMAND_ATTACH_FS */ + if (!*ec) { struct IndexFileHeader iHead; @@ -887,65 +2268,117 @@ attach2(Error * ec, char *path, register struct VolumeHeader * header, V_stat_initialized(vp) = 1; } #endif /* OPENAFS_VOL_STATS */ - VOL_UNLOCK; + (void)ReadHeader(ec, vp->vnodeIndex[vSmall].handle, (char *)&iHead, sizeof(iHead), SMALLINDEXMAGIC, SMALLINDEXVERSION); - VOL_LOCK; + if (*ec) { 
Log("VAttachVolume: Error reading smallVnode vol header %s; error=%u\n", path, *ec); } } + if (!*ec) { struct IndexFileHeader iHead; - VOL_UNLOCK; + (void)ReadHeader(ec, vp->vnodeIndex[vLarge].handle, (char *)&iHead, sizeof(iHead), LARGEINDEXMAGIC, LARGEINDEXVERSION); - VOL_LOCK; + if (*ec) { Log("VAttachVolume: Error reading largeVnode vol header %s; error=%u\n", path, *ec); } } + #ifdef AFS_NAMEI_ENV if (!*ec) { struct versionStamp stamp; - VOL_UNLOCK; + (void)ReadHeader(ec, V_linkHandle(vp), (char *)&stamp, sizeof(stamp), LINKTABLEMAGIC, LINKTABLEVERSION); - VOL_LOCK; + if (*ec) { Log("VAttachVolume: Error reading namei vol header %s; error=%u\n", path, *ec); } } -#endif +#endif /* AFS_NAMEI_ENV */ + +#if defined(AFS_DEMAND_ATTACH_FS) + if (*ec && ((*ec != VOFFLINE) || (V_attachState(vp) != VOL_STATE_UNATTACHED))) { + VOL_LOCK; + if (programType == fileServer) { + VRequestSalvage_r(vp, SALVSYNC_ERROR, VOL_SALVAGE_INVALIDATE_HEADER); + vp->nUsers = 0; + *ec = VSALVAGING; + } else { + Log("VAttachVolume: Error attaching volume %s; volume needs salvage; error=%u\n", path, *ec); + FreeVolume(vp); + *ec = VSALVAGE; + } + return NULL; + } else if (*ec) { + /* volume operation in progress */ + VOL_LOCK; + return NULL; + } +#else /* AFS_DEMAND_ATTACH_FS */ if (*ec) { Log("VAttachVolume: Error attaching volume %s; volume needs salvage; error=%u\n", path, *ec); + VOL_LOCK; FreeVolume(vp); return NULL; } +#endif /* AFS_DEMAND_ATTACH_FS */ + if (V_needsSalvaged(vp)) { if (vp->specialStatus) vp->specialStatus = 0; - Log("VAttachVolume: volume salvage flag is ON for %s; volume needs salvage\n", path); - *ec = VSALVAGE; + VOL_LOCK; +#if defined(AFS_DEMAND_ATTACH_FS) + if (programType == fileServer) { + VRequestSalvage_r(vp, SALVSYNC_NEEDED, VOL_SALVAGE_INVALIDATE_HEADER); + vp->nUsers = 0; + *ec = VSALVAGING; + } else { + Log("VAttachVolume: volume salvage flag is ON for %s; volume needs salvage\n", path); + FreeVolume(vp); + *ec = VSALVAGE; + } +#else /* AFS_DEMAND_ATTACH_FS 
*/ FreeVolume(vp); + *ec = VSALVAGE; +#endif /* AFS_DEMAND_ATTACH_FS */ return NULL; } + + VOL_LOCK; if (programType == fileServer) { #ifndef FAST_RESTART if (V_inUse(vp) && VolumeWriteable(vp)) { if (!V_needsSalvaged(vp)) { V_needsSalvaged(vp) = 1; - VUpdateVolume_r(ec, vp); + VUpdateVolume_r(ec, vp, 0); } - FreeVolume(vp); +#if defined(AFS_DEMAND_ATTACH_FS) + VRequestSalvage_r(vp, SALVSYNC_NEEDED, VOL_SALVAGE_INVALIDATE_HEADER); + vp->nUsers = 0; + *ec = VSALVAGING; +#else /* AFS_DEMAND_ATTACH_FS */ Log("VAttachVolume: volume %s needs to be salvaged; not attached.\n", path); + FreeVolume(vp); *ec = VSALVAGE; +#endif /* AFS_DEMAND_ATTACH_FS */ return NULL; } #endif /* FAST_RESTART */ + if (V_destroyMe(vp) == DESTROY_ME) { +#if defined(AFS_DEMAND_ATTACH_FS) + /* schedule a salvage so the volume goes away on disk */ + VRequestSalvage_r(vp, SALVSYNC_ERROR, VOL_SALVAGE_INVALIDATE_HEADER); + VChangeState_r(vp, VOL_STATE_ERROR); + vp->nUsers = 0; +#endif /* AFS_DEMAND_ATTACH_FS */ FreeVolume(vp); Log("VAttachVolume: volume %s is junk; it should be destroyed at next salvage\n", path); *ec = VNOVOL; @@ -953,18 +2386,21 @@ attach2(Error * ec, char *path, register struct VolumeHeader * header, } } - AddVolumeToHashTable(vp, V_id(vp)); vp->nextVnodeUnique = V_uniquifier(vp); vp->vnodeIndex[vSmall].bitmap = vp->vnodeIndex[vLarge].bitmap = NULL; #ifndef BITMAP_LATER if (programType == fileServer && VolumeWriteable(vp)) { int i; for (i = 0; i < nVNODECLASSES; i++) { - VOL_UNLOCK; - GetBitmap(ec, vp, i); - VOL_LOCK; + VGetBitmap_r(ec, vp, i); if (*ec) { +#ifdef AFS_DEMAND_ATTACH_FS + VRequestSalvage_r(vp, SALVSYNC_ERROR, VOL_SALVAGE_INVALIDATE_HEADER); + vp->nUsers = 0; + *ec = VSALVAGING; +#else /* AFS_DEMAND_ATTACH_FS */ FreeVolume(vp); +#endif /* AFS_DEMAND_ATTACH_FS */ Log("VAttachVolume: error getting bitmap for volume (%s)\n", path); return NULL; @@ -982,6 +2418,12 @@ attach2(Error * ec, char *path, register struct VolumeHeader * header, } } + AddVolumeToHashTable(vp, 
V_id(vp)); +#ifdef AFS_DEMAND_ATTACH_FS + AddVolumeToVByPList_r(vp); + VLRU_Add_r(vp); + VChangeState_r(vp, VOL_STATE_ATTACHED); +#endif return vp; } @@ -994,11 +2436,9 @@ Volume * VAttachVolume(Error * ec, VolumeId volumeId, int mode) { Volume *retVal; - VATTACH_LOCK; VOL_LOCK; retVal = VAttachVolume_r(ec, volumeId, mode); VOL_UNLOCK; - VATTACH_UNLOCK; return retVal; } @@ -1028,21 +2468,39 @@ VAttachVolume_r(Error * ec, VolumeId volumeId, int mode) * we still guarantee we won't context swap, but the ref count won't be * incremented (otherwise we'd violate the invariant). */ +/* NOTE: with the demand attach fileserver extensions, the global lock + * is dropped within VHold */ +#ifdef AFS_DEMAND_ATTACH_FS static int VHold_r(register Volume * vp) { Error error; - if (vp->nUsers == 0 && !GetVolumeHeader(vp)) { - VolumeReplacements++; - ReadHeader(&error, V_diskDataHandle(vp), (char *)&V_disk(vp), - sizeof(V_disk(vp)), VOLUMEINFOMAGIC, VOLUMEINFOVERSION); - if (error) - return error; + VCreateReservation_r(vp); + VWaitExclusiveState_r(vp); + + LoadVolumeHeader(&error, vp); + if (error) { + VCancelReservation_r(vp); + return error; } vp->nUsers++; + VCancelReservation_r(vp); + return 0; +} +#else /* AFS_DEMAND_ATTACH_FS */ +static int +VHold_r(register Volume * vp) +{ + Error error; + + LoadVolumeHeader(&error, vp); + if (error) + return error; + vp->nUsers++; return 0; } +#endif /* AFS_DEMAND_ATTACH_FS */ static int VHold(register Volume * vp) @@ -1054,59 +2512,26 @@ VHold(register Volume * vp) return retVal; } -void -VTakeOffline_r(register Volume * vp) -{ - assert(vp->nUsers > 0); - assert(programType == fileServer); - vp->goingOffline = 1; - V_needsSalvaged(vp) = 1; -} -void -VTakeOffline(register Volume * vp) -{ - VOL_LOCK; - VTakeOffline_r(vp); - VOL_UNLOCK; -} +/***************************************************/ +/* get and put volume routines */ +/***************************************************/ void VPutVolume_r(register Volume * vp) { assert(--vp->nUsers 
>= 0); if (vp->nUsers == 0) { + VCheckOffline(vp); ReleaseVolumeHeader(vp->header); - if (vp->goingOffline) { - Error error; - assert(programType == fileServer); - vp->goingOffline = 0; - V_inUse(vp) = 0; - VUpdateVolume_r(&error, vp); - VCloseVolumeHandles_r(vp); - if (LogLevel) { - Log("VOffline: Volume %u (%s) is now offline", V_id(vp), - V_name(vp)); - if (V_offlineMessage(vp)[0]) - Log(" (%s)", V_offlineMessage(vp)); - Log("\n"); - } -#ifdef AFS_PTHREAD_ENV - assert(pthread_cond_broadcast(&vol_put_volume_cond) == 0); -#else /* AFS_PTHREAD_ENV */ - LWP_NoYieldSignal(VPutVolume); -#endif /* AFS_PTHREAD_ENV */ - } - if (vp->shuttingDown) { - VReleaseVolumeHandles_r(vp); - FreeVolume(vp); - if (programType == fileServer) -#ifdef AFS_PTHREAD_ENV - assert(pthread_cond_broadcast(&vol_put_volume_cond) == 0); -#else /* AFS_PTHREAD_ENV */ - LWP_NoYieldSignal(VPutVolume); -#endif /* AFS_PTHREAD_ENV */ +#ifdef AFS_DEMAND_ATTACH_FS + if (!VCheckDetach(vp)) { + VCheckSalvage(vp); + VCheckFree(vp); } +#else /* AFS_DEMAND_ATTACH_FS */ + VCheckDetach(vp); +#endif /* AFS_DEMAND_ATTACH_FS */ } } @@ -1118,15 +2543,16 @@ VPutVolume(register Volume * vp) VOL_UNLOCK; } + /* Get a pointer to an attached volume. The pointer is returned regardless of whether or not the volume is in service or on/off line. 
An error code, however, is returned with an indication of the volume's status */ Volume * -VGetVolume(Error * ec, VolId volumeId) +VGetVolume(Error * ec, Error * client_ec, VolId volumeId) { Volume *retVal; VOL_LOCK; - retVal = VGetVolume_r(ec, volumeId); + retVal = GetVolume(ec, client_ec, volumeId, NULL, 0); VOL_UNLOCK; return retVal; } @@ -1134,22 +2560,69 @@ VGetVolume(Error * ec, VolId volumeId) Volume * VGetVolume_r(Error * ec, VolId volumeId) { - Volume *vp; - unsigned short V0 = 0, V1 = 0, V2 = 0, V3 = 0, V4 = 0, V5 = 0, V6 = + return GetVolume(ec, NULL, volumeId, NULL, 0); +} + +/* try to get a volume we've previously looked up */ +/* for demand attach fs, caller MUST NOT hold a ref count on vp */ +Volume * +VGetVolumeByVp_r(Error * ec, Volume * vp) +{ + return GetVolume(ec, NULL, vp->hashid, vp, 0); +} + +/* private interface for getting a volume handle + * volumeId must be provided. + * hint is an optional parameter to speed up hash lookups + * flags is not used at this time + */ +/* for demand attach fs, caller MUST NOT hold a ref count on hint */ +static Volume * +GetVolume(Error * ec, Error * client_ec, VolId volumeId, Volume * hint, int flags) +{ + Volume *vp = hint; + /* pull this profiling/debugging code out of regular builds */ +#ifdef notdef +#define VGET_CTR_INC(x) x++ + unsigned short V0 = 0, V1 = 0, V2 = 0, V3 = 0, V5 = 0, V6 = 0, V7 = 0, V8 = 0, V9 = 0; unsigned short V10 = 0, V11 = 0, V12 = 0, V13 = 0, V14 = 0, V15 = 0; +#else +#define VGET_CTR_INC(x) +#endif + +#ifdef AFS_DEMAND_ATTACH_FS + Volume *avp, * rvp = hint; + + if (rvp) { + VCreateReservation_r(rvp); + } +#endif /* AFS_DEMAND_ATTACH_FS */ for (;;) { *ec = 0; - V0++; - for (vp = VolumeHashTable[VOLUME_HASH(volumeId)]; - vp && vp->hashid != volumeId; vp = vp->hashNext) - Vlooks++; + if (client_ec) + *client_ec = 0; + VGET_CTR_INC(V0); + + vp = VLookupVolume_r(ec, volumeId, vp); + if (*ec) { + vp = NULL; + break; + } + +#ifdef AFS_DEMAND_ATTACH_FS + if (rvp && (rvp != vp)) { + /* 
break reservation on old vp */ + VCancelReservation_r(rvp); + rvp = NULL; + } +#endif /* AFS_DEMAND_ATTACH_FS */ if (!vp) { - V1++; + VGET_CTR_INC(V1); if (VInit < 2) { - V2++; + VGET_CTR_INC(V2); /* Until we have reached an initialization level of 2 * we don't know whether this volume exists or not. * We can't sleep and retry later because before a volume @@ -1164,99 +2637,255 @@ VGetVolume_r(Error * ec, VolId volumeId) break; } - V3++; - VolumeGets++; - if (vp->nUsers == 0 && !GetVolumeHeader(vp)) { - V5++; - VolumeReplacements++; - ReadHeader(ec, V_diskDataHandle(vp), (char *)&V_disk(vp), - sizeof(V_disk(vp)), VOLUMEINFOMAGIC, - VOLUMEINFOVERSION); - if (*ec) { - V6++; - /* Only log the error if it was a totally unexpected error. Simply - * a missing inode is likely to be caused by the volume being deleted */ - if (errno != ENXIO || LogLevel) - Log("Volume %u: couldn't reread volume header\n", - vp->hashid); - FreeVolume(vp); - vp = NULL; - break; - } + VGET_CTR_INC(V3); + IncUInt64(&VStats.hdr_gets); + +#ifdef AFS_DEMAND_ATTACH_FS + /* block if someone else is performing an exclusive op on this volume */ + if (rvp != vp) { + rvp = vp; + VCreateReservation_r(rvp); } - V7++; - if (vp->shuttingDown) { - V8++; + VWaitExclusiveState_r(vp); + + /* short circuit with VNOVOL in the following circumstances: + * + * VOL_STATE_ERROR + * VOL_STATE_SHUTTING_DOWN + */ + if ((V_attachState(vp) == VOL_STATE_ERROR) || + (V_attachState(vp) == VOL_STATE_SHUTTING_DOWN)) { *ec = VNOVOL; vp = NULL; break; } + + /* allowable states: + * UNATTACHED + * PREATTACHED + * ATTACHED + * GOING_OFFLINE + * SALVAGING + */ + + if (vp->salvage.requested) { + VUpdateSalvagePriority_r(vp); + } + + if (V_attachState(vp) == VOL_STATE_PREATTACHED) { + avp = VAttachVolumeByVp_r(ec, vp, 0); + if (avp) { + if (vp != avp) { + /* VAttachVolumeByVp_r can return a pointer + * != the vp passed to it under certain + * conditions; make sure we don't leak + * reservations if that happens */ + vp = avp; + 
VCancelReservation_r(rvp); + rvp = avp; + VCreateReservation_r(rvp); + } + VPutVolume_r(avp); + } + if (*ec) { + int endloop = 0; + switch (*ec) { + case VSALVAGING: + break; + case VOFFLINE: + if (!vp->pending_vol_op) { + endloop = 1; + } + break; + default: + *ec = VNOVOL; + endloop = 1; + } + if (endloop) { + vp = NULL; + break; + } + } + } + + if ((V_attachState(vp) == VOL_STATE_SALVAGING) || + (*ec == VSALVAGING)) { + if (client_ec) { + /* see CheckVnode() in afsfileprocs.c for an explanation + * of this error code logic */ + afs_uint32 now = FT_ApproxTime(); + if ((vp->stats.last_salvage + (10 * 60)) >= now) { + *client_ec = VBUSY; + } else { + *client_ec = VRESTARTING; + } + } + *ec = VSALVAGING; + vp = NULL; + break; + } + + if (vp->pending_vol_op && !VVolOpLeaveOnline_r(vp, vp->pending_vol_op)) { + if (client_ec) { + /* see CheckVnode() in afsfileprocs.c for an explanation + * of this error code logic */ + afs_uint32 now = FT_ApproxTime(); + if ((vp->stats.last_vol_op + (10 * 60)) >= now) { + *client_ec = VBUSY; + } else { + *client_ec = VRESTARTING; + } + } + *ec = VOFFLINE; + vp = NULL; + break; + } + + if (V_attachState(vp) == VOL_STATE_UNATTACHED) { + *ec = VOFFLINE; + vp = NULL; + break; + } +#endif /* AFS_DEMAND_ATTACH_FS */ + + LoadVolumeHeader(ec, vp); + if (*ec) { + VGET_CTR_INC(V6); + /* Only log the error if it was a totally unexpected error. 
Simply + * a missing inode is likely to be caused by the volume being deleted */ + if (errno != ENXIO || LogLevel) + Log("Volume %u: couldn't reread volume header\n", + vp->hashid); +#ifdef AFS_DEMAND_ATTACH_FS + if (programType == fileServer) { + VRequestSalvage_r(vp, SALVSYNC_ERROR, VOL_SALVAGE_INVALIDATE_HEADER); + *ec = VSALVAGING; + } else { + FreeVolume(vp); + vp = NULL; + } +#else /* AFS_DEMAND_ATTACH_FS */ + FreeVolume(vp); + vp = NULL; +#endif /* AFS_DEMAND_ATTACH_FS */ + break; + } + + VGET_CTR_INC(V7); + if (vp->shuttingDown) { + VGET_CTR_INC(V8); + *ec = VNOVOL; + vp = NULL; + break; + } + if (programType == fileServer) { - V9++; + VGET_CTR_INC(V9); if (vp->goingOffline) { - V10++; -#ifdef AFS_PTHREAD_ENV - pthread_cond_wait(&vol_put_volume_cond, &vol_glock_mutex); + VGET_CTR_INC(V10); +#ifdef AFS_DEMAND_ATTACH_FS + /* wait for the volume to go offline */ + if (V_attachState(vp) == VOL_STATE_GOING_OFFLINE) { + VWaitStateChange_r(vp); + } +#elif defined(AFS_PTHREAD_ENV) + assert(pthread_cond_wait(&vol_put_volume_cond, &vol_glock_mutex) == 0); #else /* AFS_PTHREAD_ENV */ LWP_WaitProcess(VPutVolume); #endif /* AFS_PTHREAD_ENV */ continue; } if (vp->specialStatus) { - V11++; + VGET_CTR_INC(V11); *ec = vp->specialStatus; } else if (V_inService(vp) == 0 || V_blessed(vp) == 0) { - V12++; + VGET_CTR_INC(V12); *ec = VNOVOL; } else if (V_inUse(vp) == 0) { - V13++; + VGET_CTR_INC(V13); *ec = VOFFLINE; } else { - V14++; + VGET_CTR_INC(V14); } } break; } - V15++; + VGET_CTR_INC(V15); + +#ifdef AFS_DEMAND_ATTACH_FS /* if no error, bump nUsers */ - if (vp) + if (vp) { vp->nUsers++; + VLRU_UpdateAccess_r(vp); + } + if (rvp) { + VCancelReservation_r(rvp); + rvp = NULL; + } + if (client_ec && !*client_ec) { + *client_ec = *ec; + } +#else /* AFS_DEMAND_ATTACH_FS */ + /* if no error, bump nUsers */ + if (vp) { + vp->nUsers++; + } + if (client_ec) { + *client_ec = *ec; + } +#endif /* AFS_DEMAND_ATTACH_FS */ assert(vp || *ec); return vp; } -/* For both VForceOffline and 
VOffline, we close all relevant handles. - * For VOffline, if we re-attach the volume, the files may possible be - * different than before. - */ -static void -VReleaseVolumeHandles_r(Volume * vp) +/***************************************************/ +/* Volume offline/detach routines */ +/***************************************************/ + +/* caller MUST hold a heavyweight ref on vp */ +#ifdef AFS_DEMAND_ATTACH_FS +void +VTakeOffline_r(register Volume * vp) { - DFlushVolume(V_id(vp)); - VReleaseVnodeFiles_r(vp); + assert(vp->nUsers > 0); + assert(programType == fileServer); - /* Too time consuming and unnecessary for the volserver */ - if (programType != volumeUtility) { - IH_CONDSYNC(vp->vnodeIndex[vLarge].handle); - IH_CONDSYNC(vp->vnodeIndex[vSmall].handle); - IH_CONDSYNC(vp->diskDataHandle); -#ifdef AFS_NT40_ENV - IH_CONDSYNC(vp->linkHandle); -#endif /* AFS_NT40_ENV */ - } + VCreateReservation_r(vp); + VWaitExclusiveState_r(vp); - IH_RELEASE(vp->vnodeIndex[vLarge].handle); - IH_RELEASE(vp->vnodeIndex[vSmall].handle); - IH_RELEASE(vp->diskDataHandle); - IH_RELEASE(vp->linkHandle); + vp->goingOffline = 1; + V_needsSalvaged(vp) = 1; + + VRequestSalvage_r(vp, SALVSYNC_ERROR, 0); + VCancelReservation_r(vp); +} +#else /* AFS_DEMAND_ATTACH_FS */ +void +VTakeOffline_r(register Volume * vp) +{ + assert(vp->nUsers > 0); + assert(programType == fileServer); + + vp->goingOffline = 1; + V_needsSalvaged(vp) = 1; +} +#endif /* AFS_DEMAND_ATTACH_FS */ + +void +VTakeOffline(register Volume * vp) +{ + VOL_LOCK; + VTakeOffline_r(vp); + VOL_UNLOCK; } /* Force the volume offline, set the salvage flag. No further references to * the volume through the volume package will be honored. 
*/ +/* for demand attach, caller MUST hold ref count on vp */ void -VForceOffline_r(Volume * vp) +VForceOffline_r(Volume * vp, int flags) { Error error; if (!V_inUse(vp)) @@ -1267,7 +2896,17 @@ VForceOffline_r(Volume * vp) V_inUse(vp) = 0; vp->goingOffline = 0; V_needsSalvaged(vp) = 1; - VUpdateVolume_r(&error, vp); + if (!(flags & VOL_FORCEOFF_NOUPDATE)) { + VUpdateVolume_r(&error, vp, VOL_UPDATE_WAIT | VOL_UPDATE_NOFORCEOFF); + } +#ifdef AFS_DEMAND_ATTACH_FS +#ifdef SALVSYNC_BUILD_CLIENT + if (programType == fileServer) { + VRequestSalvage_r(vp, SALVSYNC_ERROR, VOL_SALVAGE_INVALIDATE_HEADER); + } +#endif + VChangeState_r(vp, VOL_STATE_ERROR); +#endif /* AFS_DEMAND_ATTACH_FS */ #ifdef AFS_PTHREAD_ENV assert(pthread_cond_broadcast(&vol_put_volume_cond) == 0); #else /* AFS_PTHREAD_ENV */ @@ -1275,14 +2914,13 @@ VForceOffline_r(Volume * vp) #endif /* AFS_PTHREAD_ENV */ VReleaseVolumeHandles_r(vp); - } void VForceOffline(Volume * vp) { VOL_LOCK; - VForceOffline_r(vp); + VForceOffline_r(vp, 0); VOL_UNLOCK; } @@ -1295,6 +2933,7 @@ VOffline_r(Volume * vp, char *message) { Error error; VolumeId vid = V_id(vp); + assert(programType != volumeUtility); if (!V_inUse(vp)) { VPutVolume_r(vp); @@ -1303,11 +2942,24 @@ VOffline_r(Volume * vp, char *message) if (V_offlineMessage(vp)[0] == '\0') strncpy(V_offlineMessage(vp), message, sizeof(V_offlineMessage(vp))); V_offlineMessage(vp)[sizeof(V_offlineMessage(vp)) - 1] = '\0'; + vp->goingOffline = 1; +#ifdef AFS_DEMAND_ATTACH_FS + VChangeState_r(vp, VOL_STATE_GOING_OFFLINE); + VCreateReservation_r(vp); + VPutVolume_r(vp); + + /* wait for the volume to go offline */ + if (V_attachState(vp) == VOL_STATE_GOING_OFFLINE) { + VWaitStateChange_r(vp); + } + VCancelReservation_r(vp); +#else /* AFS_DEMAND_ATTACH_FS */ VPutVolume_r(vp); vp = VGetVolume_r(&error, vid); /* Wait for it to go offline */ if (vp) /* In case it was reattached... 
*/ VPutVolume_r(vp); +#endif /* AFS_DEMAND_ATTACH_FS */ } void @@ -1318,36 +2970,12 @@ VOffline(Volume * vp, char *message) VOL_UNLOCK; } -/* For VDetachVolume, we close all cached file descriptors, but keep - * the Inode handles in case we need to read from a busy volume. - */ -static void -VCloseVolumeHandles_r(Volume * vp) -{ - DFlushVolume(V_id(vp)); - VCloseVnodeFiles_r(vp); - - /* Too time consuming and unnecessary for the volserver */ - if (programType != volumeUtility) { - IH_CONDSYNC(vp->vnodeIndex[vLarge].handle); - IH_CONDSYNC(vp->vnodeIndex[vSmall].handle); - IH_CONDSYNC(vp->diskDataHandle); -#ifdef AFS_NT40_ENV - IH_CONDSYNC(vp->linkHandle); -#endif /* AFS_NT40_ENV */ - } - - IH_REALLYCLOSE(vp->vnodeIndex[vLarge].handle); - IH_REALLYCLOSE(vp->vnodeIndex[vSmall].handle); - IH_REALLYCLOSE(vp->diskDataHandle); - IH_REALLYCLOSE(vp->linkHandle); -} - /* This gets used for the most part by utility routines that don't want * to keep all the volume headers around. Generally, the file server won't * call this routine, because then the offline message in the volume header - * (or other information) will still be available to clients. For NAMEI, also - * close the file handles. + * (or other information) won't be available to clients. For NAMEI, also + * close the file handles. However, the fileserver does call this during + * an attach following a volume operation. 
*/ void VDetachVolume_r(Error * ec, Volume * vp) @@ -1365,9 +2993,18 @@ VDetachVolume_r(Error * ec, Volume * vp) volume = V_id(vp); DeleteVolumeFromHashTable(vp); vp->shuttingDown = 1; +#ifdef AFS_DEMAND_ATTACH_FS + DeleteVolumeFromVByPList_r(vp); + VLRU_Delete_r(vp); + VChangeState_r(vp, VOL_STATE_SHUTTING_DOWN); +#endif /* AFS_DEMAND_ATTACH_FS */ VPutVolume_r(vp); /* Will be detached sometime in the future--this is OK since volume is offline */ + /* XXX the following code should really be moved to VCheckDetach() since the volume + * is not technically detached until the refcounts reach zero + */ +#ifdef FSSYNC_BUILD_CLIENT if (programType == volumeUtility && notifyServer) { /* * Note: The server is not notified in the case of a bogus volume @@ -1378,19 +3015,26 @@ VDetachVolume_r(Error * ec, Volume * vp) * would be two instances of the same volume, one of them bogus, * which the file server would attempt to put on line */ - if (useDone) + if (useDone) { /* don't put online */ - FSYNC_askfs(volume, tpartp->name, FSYNC_DONE, 0); - else { + FSYNC_VolOp(volume, tpartp->name, FSYNC_VOL_DONE, 0, NULL); + } else { /* fs can use it again */ - FSYNC_askfs(volume, tpartp->name, FSYNC_ON, 0); + FSYNC_VolOp(volume, tpartp->name, FSYNC_VOL_ON, 0, NULL); + + /* XXX this code path is only hit by volume utilities, thus + * V_BreakVolumeCallbacks will always be NULL. 
if we really + * want to break callbacks in this path we need to use FSYNC_VolOp() */ +#ifdef notdef /* Dettaching it so break all callbacks on it */ if (V_BreakVolumeCallbacks) { Log("volume %u detached; breaking all call backs\n", volume); (*V_BreakVolumeCallbacks) (volume); } +#endif } } +#endif /* FSSYNC_BUILD_CLIENT */ } void @@ -1402,20 +3046,848 @@ VDetachVolume(Error * ec, Volume * vp) } -VnodeId -VAllocBitmapEntry_r(Error * ec, Volume * vp, register struct vnodeIndex - *index) +/***************************************************/ +/* Volume fd/inode handle closing routines */ +/***************************************************/ + +/* For VDetachVolume, we close all cached file descriptors, but keep + * the Inode handles in case we need to read from a busy volume. + */ +/* for demand attach, caller MUST hold ref count on vp */ +static void +VCloseVolumeHandles_r(Volume * vp) { - register byte *bp, *ep; +#ifdef AFS_DEMAND_ATTACH_FS + VolState state_save; + + state_save = VChangeState_r(vp, VOL_STATE_OFFLINING); +#endif + + /* demand attach fs + * + * XXX need to investigate whether we can perform + * DFlushVolume outside of vol_glock_mutex... + * + * VCloseVnodeFiles_r drops the glock internally */ + DFlushVolume(V_id(vp)); + VCloseVnodeFiles_r(vp); + +#ifdef AFS_DEMAND_ATTACH_FS + VOL_UNLOCK; +#endif + + /* Too time consuming and unnecessary for the volserver */ + if (programType != volumeUtility) { + IH_CONDSYNC(vp->vnodeIndex[vLarge].handle); + IH_CONDSYNC(vp->vnodeIndex[vSmall].handle); + IH_CONDSYNC(vp->diskDataHandle); +#ifdef AFS_NT40_ENV + IH_CONDSYNC(vp->linkHandle); +#endif /* AFS_NT40_ENV */ + } + + IH_REALLYCLOSE(vp->vnodeIndex[vLarge].handle); + IH_REALLYCLOSE(vp->vnodeIndex[vSmall].handle); + IH_REALLYCLOSE(vp->diskDataHandle); + IH_REALLYCLOSE(vp->linkHandle); + +#ifdef AFS_DEMAND_ATTACH_FS + VOL_LOCK; + VChangeState_r(vp, state_save); +#endif +} + +/* For both VForceOffline and VOffline, we close all relevant handles. 
+ * For VOffline, if we re-attach the volume, the files may possibly be + * different than before. + */ +/* for demand attach, caller MUST hold a ref count on vp */ +static void +VReleaseVolumeHandles_r(Volume * vp) +{ +#ifdef AFS_DEMAND_ATTACH_FS + VolState state_save; + + state_save = VChangeState_r(vp, VOL_STATE_DETACHING); +#endif + + /* XXX need to investigate whether we can perform + * DFlushVolume outside of vol_glock_mutex... */ + DFlushVolume(V_id(vp)); + + VReleaseVnodeFiles_r(vp); /* releases the glock internally */ + +#ifdef AFS_DEMAND_ATTACH_FS + VOL_UNLOCK; +#endif + + /* Too time consuming and unnecessary for the volserver */ + if (programType != volumeUtility) { + IH_CONDSYNC(vp->vnodeIndex[vLarge].handle); + IH_CONDSYNC(vp->vnodeIndex[vSmall].handle); + IH_CONDSYNC(vp->diskDataHandle); +#ifdef AFS_NT40_ENV + IH_CONDSYNC(vp->linkHandle); +#endif /* AFS_NT40_ENV */ + } + + IH_RELEASE(vp->vnodeIndex[vLarge].handle); + IH_RELEASE(vp->vnodeIndex[vSmall].handle); + IH_RELEASE(vp->diskDataHandle); + IH_RELEASE(vp->linkHandle); + +#ifdef AFS_DEMAND_ATTACH_FS + VOL_LOCK; + VChangeState_r(vp, state_save); +#endif +} + + +/***************************************************/ +/* Volume write and fsync routines */ +/***************************************************/ + +void +VUpdateVolume_r(Error * ec, Volume * vp, int flags) +{ +#ifdef AFS_DEMAND_ATTACH_FS + VolState state_save; + + if (flags & VOL_UPDATE_WAIT) { + VCreateReservation_r(vp); + VWaitExclusiveState_r(vp); + } +#endif + *ec = 0; + if (programType == fileServer) + V_uniquifier(vp) = + (V_inUse(vp) ? 
V_nextVnodeUnique(vp) + + 200 : V_nextVnodeUnique(vp)); + +#ifdef AFS_DEMAND_ATTACH_FS + state_save = VChangeState_r(vp, VOL_STATE_UPDATING); + VOL_UNLOCK; +#endif + + WriteVolumeHeader_r(ec, vp); + +#ifdef AFS_DEMAND_ATTACH_FS + VOL_LOCK; + VChangeState_r(vp, state_save); + if (flags & VOL_UPDATE_WAIT) { + VCancelReservation_r(vp); + } +#endif + + if (*ec) { + Log("VUpdateVolume: error updating volume header, volume %u (%s)\n", + V_id(vp), V_name(vp)); + /* try to update on-disk header, + * while preventing infinite recursion */ + if (!(flags & VOL_UPDATE_NOFORCEOFF)) { + VForceOffline_r(vp, VOL_FORCEOFF_NOUPDATE); + } + } +} + +void +VUpdateVolume(Error * ec, Volume * vp) +{ + VOL_LOCK; + VUpdateVolume_r(ec, vp, VOL_UPDATE_WAIT); + VOL_UNLOCK; +} + +void +VSyncVolume_r(Error * ec, Volume * vp, int flags) +{ + FdHandle_t *fdP; + int code; +#ifdef AFS_DEMAND_ATTACH_FS + VolState state_save; +#endif + + if (flags & VOL_SYNC_WAIT) { + VUpdateVolume_r(ec, vp, VOL_UPDATE_WAIT); + } else { + VUpdateVolume_r(ec, vp, 0); + } + if (!*ec) { +#ifdef AFS_DEMAND_ATTACH_FS + state_save = VChangeState_r(vp, VOL_STATE_UPDATING); + VOL_UNLOCK; +#endif + fdP = IH_OPEN(V_diskDataHandle(vp)); + assert(fdP != NULL); + code = FDH_SYNC(fdP); + assert(code == 0); + FDH_CLOSE(fdP); +#ifdef AFS_DEMAND_ATTACH_FS + VOL_LOCK; + VChangeState_r(vp, state_save); +#endif + } +} + +void +VSyncVolume(Error * ec, Volume * vp) +{ + VOL_LOCK; + VSyncVolume_r(ec, vp, VOL_SYNC_WAIT); + VOL_UNLOCK; +} + + +/***************************************************/ +/* Volume deallocation routines */ +/***************************************************/ + +#ifdef AFS_DEMAND_ATTACH_FS +static void +FreeVolume(Volume * vp) +{ + /* free the heap space, iff it's safe. 
+ * otherwise, pull it out of the hash table, so it + * will get deallocated when all refs to it go away */ + if (!VCheckFree(vp)) { + DeleteVolumeFromHashTable(vp); + DeleteVolumeFromVByPList_r(vp); + + /* make sure we invalidate the header cache entry */ + FreeVolumeHeader(vp); + } +} +#endif /* AFS_DEMAND_ATTACH_FS */ + +static void +ReallyFreeVolume(Volume * vp) +{ + int i; + if (!vp) + return; +#ifdef AFS_DEMAND_ATTACH_FS + /* debug */ + VChangeState_r(vp, VOL_STATE_FREED); + if (vp->pending_vol_op) + free(vp->pending_vol_op); +#endif /* AFS_DEMAND_ATTACH_FS */ + for (i = 0; i < nVNODECLASSES; i++) + if (vp->vnodeIndex[i].bitmap) + free(vp->vnodeIndex[i].bitmap); + FreeVolumeHeader(vp); +#ifndef AFS_DEMAND_ATTACH_FS + DeleteVolumeFromHashTable(vp); +#endif /* AFS_DEMAND_ATTACH_FS */ + free(vp); +} + +/* check to see if we should shutdown this volume + * returns 1 if volume was freed, 0 otherwise */ +#ifdef AFS_DEMAND_ATTACH_FS +static int +VCheckDetach(register Volume * vp) +{ + int ret = 0; + + if (vp->nUsers || vp->nWaiters) + return ret; + + if (vp->shuttingDown) { + ret = 1; + VReleaseVolumeHandles_r(vp); + VCheckSalvage(vp); + ReallyFreeVolume(vp); + if (programType == fileServer) { + assert(pthread_cond_broadcast(&vol_put_volume_cond) == 0); + } + } + return ret; +} +#else /* AFS_DEMAND_ATTACH_FS */ +static int +VCheckDetach(register Volume * vp) +{ + int ret = 0; + + if (vp->nUsers) + return ret; + + if (vp->shuttingDown) { + ret = 1; + VReleaseVolumeHandles_r(vp); + ReallyFreeVolume(vp); + if (programType == fileServer) { +#if defined(AFS_PTHREAD_ENV) + assert(pthread_cond_broadcast(&vol_put_volume_cond) == 0); +#else /* AFS_PTHREAD_ENV */ + LWP_NoYieldSignal(VPutVolume); +#endif /* AFS_PTHREAD_ENV */ + } + } + return ret; +} +#endif /* AFS_DEMAND_ATTACH_FS */ + +/* check to see if we should offline this volume + * return 1 if volume went offline, 0 otherwise */ +#ifdef AFS_DEMAND_ATTACH_FS +static int +VCheckOffline(register Volume * vp) +{ + Volume * 
rvp = NULL; + int ret = 0; + + if (vp->goingOffline && !vp->nUsers) { + Error error; + assert(programType == fileServer); + assert((V_attachState(vp) != VOL_STATE_ATTACHED) && + (V_attachState(vp) != VOL_STATE_FREED) && + (V_attachState(vp) != VOL_STATE_PREATTACHED) && + (V_attachState(vp) != VOL_STATE_UNATTACHED)); + + /* valid states: + * + * VOL_STATE_GOING_OFFLINE + * VOL_STATE_SHUTTING_DOWN + * IsErrorState(V_attachState(vp)) + * IsExclusiveState(V_attachState(vp)) + */ + + VCreateReservation_r(vp); + VChangeState_r(vp, VOL_STATE_OFFLINING); + + ret = 1; + /* must clear the goingOffline flag before we drop the glock */ + vp->goingOffline = 0; + V_inUse(vp) = 0; + + VLRU_Delete_r(vp); + + /* perform async operations */ + VUpdateVolume_r(&error, vp, 0); + VCloseVolumeHandles_r(vp); + + /* invalidate the volume header cache entry */ + FreeVolumeHeader(vp); + + if (LogLevel) { + Log("VOffline: Volume %u (%s) is now offline", V_id(vp), + V_name(vp)); + if (V_offlineMessage(vp)[0]) + Log(" (%s)", V_offlineMessage(vp)); + Log("\n"); + } + + /* if nothing changed state to error or salvaging, + * drop state to unattached */ + if (!IsErrorState(V_attachState(vp))) { + VChangeState_r(vp, VOL_STATE_UNATTACHED); + } + VCancelReservation_r(vp); + } + return ret; +} +#else /* AFS_DEMAND_ATTACH_FS */ +static int +VCheckOffline(register Volume * vp) +{ + Volume * rvp = NULL; + int ret = 0; + + if (vp->goingOffline && !vp->nUsers) { + Error error; + assert(programType == fileServer); + + ret = 1; + vp->goingOffline = 0; + V_inUse(vp) = 0; + VUpdateVolume_r(&error, vp, 0); + VCloseVolumeHandles_r(vp); + FreeVolumeHeader(vp); + if (LogLevel) { + Log("VOffline: Volume %u (%s) is now offline", V_id(vp), + V_name(vp)); + if (V_offlineMessage(vp)[0]) + Log(" (%s)", V_offlineMessage(vp)); + Log("\n"); + } +#ifdef AFS_PTHREAD_ENV + assert(pthread_cond_broadcast(&vol_put_volume_cond) == 0); +#else /* AFS_PTHREAD_ENV */ + LWP_NoYieldSignal(VPutVolume); +#endif /* AFS_PTHREAD_ENV */ + } + 
return ret; +} +#endif /* AFS_DEMAND_ATTACH_FS */ + +/***************************************************/ +/* demand attach fs ref counting routines */ +/***************************************************/ + +#ifdef AFS_DEMAND_ATTACH_FS +/* the following two functions handle reference counting for + * asynchronous operations on volume structs. + * + * their purpose is to prevent a VDetachVolume or VShutdown + * from free()ing the Volume struct during an async i/o op */ + +/* register with the async volume op ref counter */ +static void +VCreateReservation_r(Volume * vp) +{ + vp->nWaiters++; +} + +/* unregister with the async volume op ref counter */ +static void +VCancelReservation_r(Volume * vp) +{ + assert(--vp->nWaiters >= 0); + if (vp->nWaiters == 0) { + VCheckOffline(vp); + if (!VCheckDetach(vp)) { + VCheckSalvage(vp); + VCheckFree(vp); + } + } +} + +/* check to see if we should free this volume now + * return 1 if volume was freed, 0 otherwise */ +static int +VCheckFree(Volume * vp) +{ + int ret = 0; + if ((vp->nUsers == 0) && + (vp->nWaiters == 0) && + !(V_attachFlags(vp) & (VOL_IN_HASH | + VOL_ON_VBYP_LIST | + VOL_IS_BUSY | + VOL_ON_VLRU))) { + ReallyFreeVolume(vp); + ret = 1; + } + return ret; +} +#endif /* AFS_DEMAND_ATTACH_FS */ + + +/***************************************************/ +/* online volume operations routines */ +/***************************************************/ + +#ifdef AFS_DEMAND_ATTACH_FS +int +VRegisterVolOp_r(Volume * vp, FSSYNC_VolOp_info * vopinfo) +{ + FSSYNC_VolOp_info * info; + + /* attach a vol op info node to the volume struct */ + info = (FSSYNC_VolOp_info *) malloc(sizeof(FSSYNC_VolOp_info)); + assert(info != NULL); + memcpy(info, vopinfo, sizeof(FSSYNC_VolOp_info)); + vp->pending_vol_op = info; + + /* update stats */ + vp->stats.last_vol_op = FT_ApproxTime(); + vp->stats.vol_ops++; + IncUInt64(&VStats.vol_ops); + + return 0; +} + +int +VDeregisterVolOp_r(Volume * vp, FSSYNC_VolOp_info * vopinfo) +{ + if 
(vp->pending_vol_op) { + free(vp->pending_vol_op); + vp->pending_vol_op = NULL; + } + return 0; +} +#endif /* AFS_DEMAND_ATTACH_FS */ + +int +VVolOpLeaveOnline_r(Volume * vp, FSSYNC_VolOp_info * vopinfo) +{ + return (vopinfo->com.command == FSYNC_VOL_NEEDVOLUME && + (vopinfo->com.reason == V_READONLY || + (!VolumeWriteable(vp) && + (vopinfo->com.reason == V_CLONE || + vopinfo->com.reason == V_DUMP)))); +} + +int +VVolOpSetVBusy_r(Volume * vp, FSSYNC_VolOp_info * vopinfo) +{ + return (vopinfo->com.command == FSYNC_VOL_NEEDVOLUME && + (vopinfo->com.reason == V_CLONE || + vopinfo->com.reason == V_DUMP)); +} + + +/***************************************************/ +/* online salvager routines */ +/***************************************************/ +#if defined(AFS_DEMAND_ATTACH_FS) +#define SALVAGE_PRIO_UPDATE_INTERVAL 3 /* number of seconds between prio updates */ +#define SALVAGE_COUNT_MAX 16 /* number of online salvages we + * allow before moving the volume + * into a permanent error state + * + * once this threshold is reached, + * the operator will have to manually + * issue a 'bos salvage' to bring + * the volume back online + */ + +/* check to see if we should salvage this volume + * returns 1 if salvage scheduled, 0 otherwise */ +static int +VCheckSalvage(register Volume * vp) +{ + int ret = 0; +#ifdef SALVSYNC_BUILD_CLIENT + if (vp->nUsers || vp->nWaiters) + return ret; + if (vp->salvage.requested) { + VScheduleSalvage_r(vp); + ret = 1; + } +#endif /* SALVSYNC_BUILD_CLIENT */ + return ret; +} + +/* + * request that a salvage be performed once + * ref counts reach zero + */ +int +VRequestSalvage_r(Volume * vp, int reason, int flags) +{ +#ifdef SALVSYNC_BUILD_CLIENT + if (programType != fileServer) + return 1; + + if (!vp->salvage.requested) { + vp->salvage.requested = 1; + vp->salvage.reason = reason; + vp->stats.last_salvage = FT_ApproxTime(); + if (flags & VOL_SALVAGE_INVALIDATE_HEADER) { + ReleaseVolumeHeader(vp->header); + } + if (vp->stats.salvages < 
SALVAGE_COUNT_MAX) { + VChangeState_r(vp, VOL_STATE_SALVAGING); + } else { + Log("VRequestSalvage: volume %u online salvaged too many times; forced offline.\n", vp->hashid); + VChangeState_r(vp, VOL_STATE_ERROR); + } + } +#endif /* SALVSYNC_BUILD_CLIENT */ + return 0; +} + +/* + * update salvage priority + */ +static int +VUpdateSalvagePriority_r(Volume * vp) +{ + int code, ret=0; + afs_uint32 now; + +#ifdef SALVSYNC_BUILD_CLIENT + vp->salvage.prio++; + now = FT_ApproxTime(); + + /* update the salvageserver priority queue occasionally so that + * frequently requested volumes get moved to the head of the queue + */ + if ((vp->salvage.scheduled) && + (vp->stats.last_salvage_req < (now-SALVAGE_PRIO_UPDATE_INTERVAL))) { + code = SALVSYNC_SalvageVolume(vp->hashid, + VPartitionPath(vp->partition), + SALVSYNC_RAISEPRIO, + vp->salvage.reason, + vp->salvage.prio, + NULL); + vp->stats.last_salvage_req = now; + if (code != SYNC_OK) { + ret = 1; + } + } +#endif /* SALVSYNC_BUILD_CLIENT */ + return ret; +} + + +/* + * schedule a salvage with the salvage server + */ +static int +VScheduleSalvage_r(Volume * vp) +{ + int code, ret=0; +#ifdef SALVSYNC_BUILD_CLIENT + VolState state_save; + char partName[16]; + + if (vp->nWaiters || vp->nUsers) { + return 1; + } + + /* prevent endless salvage,attach,salvage,attach,... 
loops */ + if (vp->stats.salvages >= SALVAGE_COUNT_MAX) + return 1; + + if (!vp->salvage.scheduled) { + /* if we haven't previously scheduled a salvage, do so now + * + * set the volume to an exclusive state and drop the lock + * around the SALVSYNC call + */ + strlcpy(partName, VPartitionPath(vp->partition), sizeof(partName)); + state_save = VChangeState_r(vp, VOL_STATE_SALVSYNC_REQ); + V_attachFlags(vp) |= VOL_IS_BUSY; + VOL_UNLOCK; + + /* can't use V_id() since there's no guarantee + * we have the disk data header at this point */ + code = SALVSYNC_SalvageVolume(vp->hashid, + partName, + SALVSYNC_SALVAGE, + vp->salvage.reason, + vp->salvage.prio, + NULL); + VOL_LOCK; + VChangeState_r(vp, state_save); + V_attachFlags(vp) &= ~(VOL_IS_BUSY); + + if (code == SYNC_OK) { + vp->salvage.scheduled = 1; + vp->stats.salvages++; + vp->stats.last_salvage_req = FT_ApproxTime(); + IncUInt64(&VStats.salvages); + } else { + ret = 1; + switch(code) { + case SYNC_BAD_COMMAND: + case SYNC_COM_ERROR: + break; + case SYNC_DENIED: + Log("VScheduleSalvage_r: SALVSYNC request denied\n"); + break; + default: + Log("VScheduleSalvage_r: SALVSYNC unknown protocol error\n"); + break; + } + } + } +#endif /* SALVSYNC_BUILD_CLIENT */ + return ret; +} + +/* + * cancel a scheduled salvage operation + */ +static int +VCancelSalvage_r(Volume * vp, int reason) +{ + int code, ret = 0; + +#ifdef SALVSYNC_BUILD_CLIENT + if (vp->salvage.scheduled) { + code = SALVSYNC_SalvageVolume(vp->hashid, + VPartitionPath(vp->partition), + SALVSYNC_CANCEL, + reason, + 0, + NULL); + if (code == SYNC_OK) { + vp->salvage.scheduled = 0; + } else { + ret = 1; + } + } +#endif /* SALVSYNC_BUILD_CLIENT */ + return ret; +} + +/* This must be called by any volume utility which needs to run while the + file server is also running. 
This is separated from VInitVolumePackage so + that a utility can fork--and each of the children can independently + initialize communication with the file server */ +#ifdef SALVSYNC_BUILD_CLIENT +int +VConnectSALV(void) +{ + int retVal; + VOL_LOCK; + retVal = VConnectSALV_r(); + VOL_UNLOCK; + return retVal; +} + +int +VConnectSALV_r(void) +{ + assert((programType != salvageServer) && + (programType != volumeUtility)); + return SALVSYNC_clientInit(); +} + +int +VDisconnectSALV(void) +{ + int retVal; + VOL_LOCK; + retVal = VDisconnectSALV_r(); /* propagate the status; retVal was previously returned uninitialized */ + VOL_UNLOCK; + return retVal; +} + +int +VDisconnectSALV_r(void) +{ + assert((programType != salvageServer) && + (programType != volumeUtility)); + return SALVSYNC_clientFinis(); +} + +int +VReconnectSALV(void) +{ + int retVal; + VOL_LOCK; + retVal = VReconnectSALV_r(); + VOL_UNLOCK; + return retVal; +} + +int +VReconnectSALV_r(void) +{ + assert((programType != salvageServer) && + (programType != volumeUtility)); + return SALVSYNC_clientReconnect(); +} +#endif /* SALVSYNC_BUILD_CLIENT */ +#endif /* AFS_DEMAND_ATTACH_FS */ + + +/***************************************************/ +/* FSSYNC routines */ +/***************************************************/ + +/* This must be called by any volume utility which needs to run while the + file server is also running. 
This is separated from VInitVolumePackage so + that a utility can fork--and each of the children can independently + initialize communication with the file server */ +#ifdef FSSYNC_BUILD_CLIENT +int +VConnectFS(void) +{ + int retVal; + VOL_LOCK; + retVal = VConnectFS_r(); + VOL_UNLOCK; + return retVal; +} + +int +VConnectFS_r(void) +{ + int rc; + assert((VInit == 2) && + (programType != fileServer) && + (programType != salvager)); + rc = FSYNC_clientInit(); + if (rc) + VInit = 3; + return rc; +} + +void +VDisconnectFS_r(void) +{ + assert((programType != fileServer) && + (programType != salvager)); + FSYNC_clientFinis(); + VInit = 2; +} + +void +VDisconnectFS(void) +{ + VOL_LOCK; + VDisconnectFS_r(); + VOL_UNLOCK; +} + +static int +VChildProcReconnectFS_r(void) +{ + return FSYNC_clientChildProcReconnect(); +} + +int +VChildProcReconnectFS(void) +{ + int ret; + VOL_LOCK; + ret = VChildProcReconnectFS_r(); + VOL_UNLOCK; + return ret; +} +#endif /* FSSYNC_BUILD_CLIENT */ + + +/***************************************************/ +/* volume bitmap routines */ +/***************************************************/ + +/* + * For demand attach fs, flags parameter controls + * locking behavior. If (flags & VOL_ALLOC_BITMAP_WAIT) + * is set, then this function will create a reservation + * and block on any other exclusive operations. Otherwise, + * this function assumes the caller already has exclusive + * access to vp, and we just change the volume state. 
+ */ +VnodeId +VAllocBitmapEntry_r(Error * ec, Volume * vp, + struct vnodeIndex *index, int flags) +{ + VnodeId ret; + register byte *bp, *ep; +#ifdef AFS_DEMAND_ATTACH_FS + VolState state_save; +#endif /* AFS_DEMAND_ATTACH_FS */ + + *ec = 0; + /* This test is probably redundant */ if (!VolumeWriteable(vp)) { *ec = (bit32) VREADONLY; return 0; } + +#ifdef AFS_DEMAND_ATTACH_FS + if (flags & VOL_ALLOC_BITMAP_WAIT) { + VCreateReservation_r(vp); + VWaitExclusiveState_r(vp); + } + state_save = VChangeState_r(vp, VOL_STATE_GET_BITMAP); +#endif /* AFS_DEMAND_ATTACH_FS */ + #ifdef BITMAP_LATER if ((programType == fileServer) && !index->bitmap) { int i; +#ifndef AFS_DEMAND_ATTACH_FS + /* demand attach fs uses the volume state to avoid races. + * specialStatus field is not used at all */ int wasVBUSY = 0; if (vp->specialStatus == VBUSY) { if (vp->goingOffline) { /* vos dump waiting for the volume to @@ -1423,33 +3895,49 @@ VAllocBitmapEntry_r(Error * ec, Volume * vp, register struct vnodeIndex * from AddNewReadableResidency */ wasVBUSY = 1; } else { - VOL_UNLOCK; - while (vp->specialStatus == VBUSY) + while (vp->specialStatus == VBUSY) { #ifdef AFS_PTHREAD_ENV + VOL_UNLOCK; sleep(2); + VOL_LOCK; #else /* AFS_PTHREAD_ENV */ IOMGR_Sleep(2); -#endif /* AFS_PTHREAD_ENV */ - VOL_LOCK; - } - } - if (!index->bitmap) { - vp->specialStatus = VBUSY; /* Stop anyone else from using it. */ - for (i = 0; i < nVNODECLASSES; i++) { - VOL_UNLOCK; - GetBitmap(ec, vp, i); - VOL_LOCK; - if (*ec) { - vp->specialStatus = 0; - vp->shuttingDown = 1; /* Let who has it free it. */ - return NULL; +#endif /* AFS_PTHREAD_ENV */ } } + } +#endif /* !AFS_DEMAND_ATTACH_FS */ + + if (!index->bitmap) { +#ifndef AFS_DEMAND_ATTACH_FS + vp->specialStatus = VBUSY; /* Stop anyone else from using it. 
*/ +#endif /* AFS_DEMAND_ATTACH_FS */ + for (i = 0; i < nVNODECLASSES; i++) { + VGetBitmap_r(ec, vp, i); + if (*ec) { +#ifdef AFS_DEMAND_ATTACH_FS + VRequestSalvage_r(vp, SALVSYNC_ERROR, VOL_SALVAGE_INVALIDATE_HEADER); + *ec = VSALVAGING; +#else /* AFS_DEMAND_ATTACH_FS */ + DeleteVolumeFromHashTable(vp); + vp->shuttingDown = 1; /* Let who has it free it. */ + vp->specialStatus = 0; +#endif /* AFS_DEMAND_ATTACH_FS */ + ret = 0; /* VnodeId is an integer, not a pointer; 0 with *ec set reports failure */ + goto done; + } + } +#ifndef AFS_DEMAND_ATTACH_FS if (!wasVBUSY) vp->specialStatus = 0; /* Allow others to have access. */ +#endif /* AFS_DEMAND_ATTACH_FS */ } } #endif /* BITMAP_LATER */ + +#ifdef AFS_DEMAND_ATTACH_FS + VOL_UNLOCK; +#endif /* AFS_DEMAND_ATTACH_FS */ bp = index->bitmap + index->bitmapOffset; ep = index->bitmap + index->bitmapSize; while (bp < ep) { @@ -1460,7 +3948,11 @@ VAllocBitmapEntry_r(Error * ec, Volume * vp, register struct vnodeIndex bp++; o = ffs(~*bp) - 1; /* ffs is documented in BSTRING(3) */ *bp |= (1 << o); - return (VnodeId) ((bp - index->bitmap) * 8 + o); + ret = (VnodeId) ((bp - index->bitmap) * 8 + o); +#ifdef AFS_DEMAND_ATTACH_FS + VOL_LOCK; +#endif /* AFS_DEMAND_ATTACH_FS */ + goto done; } bp += sizeof(bit32) /* i.e. 
4 */ ; } @@ -1474,7 +3966,19 @@ VAllocBitmapEntry_r(Error * ec, Volume * vp, register struct vnodeIndex index->bitmapOffset = index->bitmapSize; index->bitmapSize += VOLUME_BITMAP_GROWSIZE; *bp = 1; - return index->bitmapOffset * 8; + ret = index->bitmapOffset * 8; +#ifdef AFS_DEMAND_ATTACH_FS + VOL_LOCK; +#endif /* AFS_DEMAND_ATTACH_FS */ + + done: +#ifdef AFS_DEMAND_ATTACH_FS + VChangeState_r(vp, state_save); + if (flags & VOL_ALLOC_BITMAP_WAIT) { + VCancelReservation_r(vp); + } +#endif /* AFS_DEMAND_ATTACH_FS */ + return ret; } VnodeId @@ -1482,7 +3986,7 @@ VAllocBitmapEntry(Error * ec, Volume * vp, register struct vnodeIndex * index) { VnodeId retVal; VOL_LOCK; - retVal = VAllocBitmapEntry_r(ec, vp, index); + retVal = VAllocBitmapEntry_r(ec, vp, index, VOL_ALLOC_BITMAP_WAIT); VOL_UNLOCK; return retVal; } @@ -1492,6 +3996,7 @@ VFreeBitMapEntry_r(Error * ec, register struct vnodeIndex *index, unsigned bitNumber) { unsigned int offset; + *ec = 0; #ifdef BITMAP_LATER if (!index->bitmap) @@ -1516,70 +4021,13 @@ VFreeBitMapEntry(Error * ec, register struct vnodeIndex *index, VOL_UNLOCK; } -void -VUpdateVolume_r(Error * ec, Volume * vp) -{ - *ec = 0; - if (programType == fileServer) - V_uniquifier(vp) = - (V_inUse(vp) ? 
V_nextVnodeUnique(vp) + - 200 : V_nextVnodeUnique(vp)); - /*printf("Writing volume header for '%s'\n", V_name(vp)); */ - WriteVolumeHeader_r(ec, vp); - if (*ec) { - Log("VUpdateVolume: error updating volume header, volume %u (%s)\n", - V_id(vp), V_name(vp)); - VForceOffline_r(vp); - } -} - -void -VUpdateVolume(Error * ec, Volume * vp) -{ - VOL_LOCK; - VUpdateVolume_r(ec, vp); - VOL_UNLOCK; -} - -void -VSyncVolume_r(Error * ec, Volume * vp) -{ - FdHandle_t *fdP; - VUpdateVolume_r(ec, vp); - if (!ec) { - int code; - fdP = IH_OPEN(V_diskDataHandle(vp)); - assert(fdP != NULL); - code = FDH_SYNC(fdP); - assert(code == 0); - FDH_CLOSE(fdP); - } -} - -void -VSyncVolume(Error * ec, Volume * vp) -{ - VOL_LOCK; - VSyncVolume_r(ec, vp); - VOL_UNLOCK; -} - +/* this function will drop the glock internally. + * for old pthread fileservers, this is safe thanks to vbusy. + * + * for demand attach fs, caller must have already called + * VCreateReservation_r and VWaitExclusiveState_r */ static void -FreeVolume(Volume * vp) -{ - int i; - if (!vp) - return; - for (i = 0; i < nVNODECLASSES; i++) - if (vp->vnodeIndex[i].bitmap) - free(vp->vnodeIndex[i].bitmap); - FreeVolumeHeader(vp); - DeleteVolumeFromHashTable(vp); - free(vp); -} - -static void -GetBitmap(Error * ec, Volume * vp, VnodeClass class) +VGetBitmap_r(Error * ec, Volume * vp, VnodeClass class) { StreamHandle_t *file; int nVnodes; @@ -1592,9 +4040,17 @@ GetBitmap(Error * ec, Volume * vp, VnodeClass class) #ifdef BITMAP_LATER byte *BitMap = 0; #endif /* BITMAP_LATER */ +#ifdef AFS_DEMAND_ATTACH_FS + VolState state_save; +#endif /* AFS_DEMAND_ATTACH_FS */ *ec = 0; +#ifdef AFS_DEMAND_ATTACH_FS + state_save = VChangeState_r(vp, VOL_STATE_GET_BITMAP); +#endif /* AFS_DEMAND_ATTACH_FS */ + VOL_UNLOCK; + fdP = IH_OPEN(vip->handle); assert(fdP != NULL); file = FDH_FDOPEN(fdP, "r"); @@ -1655,6 +4111,8 @@ GetBitmap(Error * ec, Volume * vp, VnodeClass class) STREAM_CLOSE(file); FDH_CLOSE(fdP); free(vnode); + + VOL_LOCK; #ifdef 
BITMAP_LATER /* There may have been a racing condition with some other thread, both * creating the bitmaps for this volume. If the other thread was faster @@ -1666,8 +4124,106 @@ GetBitmap(Error * ec, Volume * vp, VnodeClass class) } else free((byte *) BitMap); #endif /* BITMAP_LATER */ +#ifdef AFS_DEMAND_ATTACH_FS + VChangeState_r(vp, state_save); +#endif /* AFS_DEMAND_ATTACH_FS */ } + +/***************************************************/ +/* demand attach fs state machine routines */ +/***************************************************/ + +#ifdef AFS_DEMAND_ATTACH_FS +/* wait for the volume to change states */ +static void +VWaitStateChange_r(Volume * vp) +{ + VolState state_save = V_attachState(vp); + + assert(vp->nWaiters || vp->nUsers); + do { + assert(pthread_cond_wait(&V_attachCV(vp), &vol_glock_mutex) == 0); + } while (V_attachState(vp) == state_save); + assert(V_attachState(vp) != VOL_STATE_FREED); +} + +/* wait for blocking ops to end */ +static void +VWaitExclusiveState_r(Volume * vp) +{ + assert(vp->nWaiters || vp->nUsers); + while (IsExclusiveState(V_attachState(vp))) { + assert(pthread_cond_wait(&V_attachCV(vp), &vol_glock_mutex) == 0); + } + assert(V_attachState(vp) != VOL_STATE_FREED); +} + +/* change state, and notify other threads, + * return previous state to caller */ +VolState +VChangeState_r(Volume * vp, VolState new_state) +{ + VolState old_state = V_attachState(vp); + + /* XXX profiling need to make sure these counters + * don't kill performance... 
*/ + VStats.state_levels[old_state]--; + VStats.state_levels[new_state]++; + + V_attachState(vp) = new_state; + assert(pthread_cond_broadcast(&V_attachCV(vp)) == 0); + return old_state; +} + +/* tells caller whether or not the current state requires + * exclusive access without holding glock */ +static int +IsExclusiveState(VolState state) +{ + switch (state) { + case VOL_STATE_UPDATING: + case VOL_STATE_ATTACHING: + case VOL_STATE_GET_BITMAP: + case VOL_STATE_HDR_LOADING: + case VOL_STATE_HDR_ATTACHING: + case VOL_STATE_OFFLINING: + case VOL_STATE_DETACHING: + return 1; + } + return 0; +} + +/* tell caller whether V_attachState is an error condition */ +static int +IsErrorState(VolState state) +{ + switch (state) { + case VOL_STATE_ERROR: + case VOL_STATE_SALVAGING: + return 1; + } + return 0; +} + +/* tell caller whether V_attachState is valid */ +static int +IsValidState(VolState state) +{ + if ((state >= 0) && + (state < VOL_STATE_COUNT) && + (state != VOL_STATE_FREED)) { + return 1; + } + return 0; +} +#endif /* AFS_DEMAND_ATTACH_FS */ + + +/***************************************************/ +/* Volume Path and Volume Number utility routines */ +/***************************************************/ + static void GetVolumePath(Error * ec, VolId volumeId, char **partitionp, char **namep) { @@ -1714,6 +4270,17 @@ VolumeExternalName(VolumeId volumeId) return name; } +static int +VolumeExternalName_r(VolumeId volumeId, char * name, size_t len) +{ + return afs_snprintf(name, len, VFORMAT, volumeId); +} + + +/***************************************************/ +/* Volume Usage Statistics routines */ +/***************************************************/ + #if OPENAFS_VOL_STATS #define OneDay (86400) /* 24 hours' worth of seconds */ #else @@ -1750,7 +4317,7 @@ VAdjustVolumeStatistics_r(register Volume * vp) unsigned int now = FT_ApproxTime(); if (now - V_dayUseDate(vp) > OneDay) { - register ndays, i; + register int ndays, i; ndays = (now - V_dayUseDate(vp)) / 
OneDay; for (i = 6; i > ndays - 1; i--) @@ -1799,7 +4366,7 @@ VBumpVolumeUsage_r(register Volume * vp) */ if ((V_dayUse(vp)++ & 127) == 0) { Error error; - VUpdateVolume_r(&error, vp); + VUpdateVolume_r(&error, vp, VOL_UPDATE_WAIT); } } @@ -1814,7 +4381,9 @@ VBumpVolumeUsage(register Volume * vp) void VSetDiskUsage_r(void) { +#ifndef AFS_DEMAND_ATTACH_FS static int FifteenMinuteCounter = 0; +#endif while (VInit < 2) { /* NOTE: Don't attempt to access the partitions list until the @@ -1828,10 +4397,13 @@ VSetDiskUsage_r(void) } VResetDiskUsage_r(); + +#ifndef AFS_DEMAND_ATTACH_FS if (++FifteenMinuteCounter == 3) { FifteenMinuteCounter = 0; VScanUpdateList(); } +#endif /* !AFS_DEMAND_ATTACH_FS */ } void @@ -1842,14 +4414,28 @@ VSetDiskUsage(void) VOL_UNLOCK; } + +/***************************************************/ +/* Volume Update List routines */ +/***************************************************/ + /* The number of minutes that a volume hasn't been updated before the * "Dont salvage" flag in the volume header will be turned on */ #define SALVAGE_INTERVAL (10*60) -static VolumeId *UpdateList; /* Pointer to array of Volume ID's */ -static int nUpdatedVolumes; /* Updated with entry in UpdateList, salvage after crash flag on */ -static int updateSize; /* number of entries possible */ -#define UPDATE_LIST_SIZE 100 /* size increment */ +/* + * demand attach fs + * + * volume update list functionality has been moved into the VLRU + * the DONT_SALVAGE flag is now set during VLRU demotion + */ + +#ifndef AFS_DEMAND_ATTACH_FS +static VolumeId *UpdateList = NULL; /* Pointer to array of Volume ID's */ +static int nUpdatedVolumes = 0; /* Updated with entry in UpdateList, salvage after crash flag on */ +static int updateSize = 0; /* number of entries possible */ +#define UPDATE_LIST_SIZE 128 /* initial size increment (must be a power of 2!) 
*/ +#endif /* !AFS_DEMAND_ATTACH_FS */ void VAddToVolumeUpdateList_r(Error * ec, Volume * vp) @@ -1859,15 +4445,22 @@ VAddToVolumeUpdateList_r(Error * ec, Volume * vp) if (V_dontSalvage(vp) == 0) return; V_dontSalvage(vp) = 0; - VSyncVolume_r(ec, vp); + VSyncVolume_r(ec, vp, 0); +#ifdef AFS_DEMAND_ATTACH_FS + V_attachFlags(vp) &= ~(VOL_HDR_DONTSALV); +#else /* !AFS_DEMAND_ATTACH_FS */ if (*ec) return; - if (!UpdateList) { + if (UpdateList == NULL) { updateSize = UPDATE_LIST_SIZE; UpdateList = (VolumeId *) malloc(sizeof(VolumeId) * updateSize); } else { if (nUpdatedVolumes == updateSize) { - updateSize += UPDATE_LIST_SIZE; + if (updateSize >= 524288) { /* refuse to grow past 524288 entries; updateSize is left unchanged so later calls cannot overrun the list */ + Log("warning: there is likely a bug in the volume update scanner\n"); + return; + } + updateSize <<= 1; /* list is full: double its capacity before the realloc below */ UpdateList = (VolumeId *) realloc(UpdateList, sizeof(VolumeId) * updateSize); @@ -1875,8 +4468,10 @@ VAddToVolumeUpdateList_r(Error * ec, Volume * vp) } assert(UpdateList != NULL); UpdateList[nUpdatedVolumes++] = V_id(vp); +#endif /* !AFS_DEMAND_ATTACH_FS */ } +#ifndef AFS_DEMAND_ATTACH_FS static void VScanUpdateList(void) { @@ -1886,41 +4481,994 @@ VScanUpdateList(void) afs_uint32 now = FT_ApproxTime(); /* Be careful with this code, since it works with interleaved calls to AddToVolumeUpdateList */ for (i = gap = 0; i < nUpdatedVolumes; i++) { + if (gap) + UpdateList[i - gap] = UpdateList[i]; + + /* XXX this routine needlessly messes up the Volume LRU by + * breaking the LRU temporal-locality assumptions..... 
+ * we should use a special volume header allocator here */ vp = VGetVolume_r(&error, UpdateList[i - gap] = UpdateList[i]); if (error) { gap++; } else if (vp->nUsers == 1 && now - vp->updateTime > SALVAGE_INTERVAL) { V_dontSalvage(vp) = DONT_SALVAGE; - VUpdateVolume_r(&error, vp); /* No need to fsync--not critical */ + VUpdateVolume_r(&error, vp, 0); /* No need to fsync--not critical */ gap++; } - if (vp) + + if (vp) { VPutVolume_r(vp); + } + #ifndef AFS_PTHREAD_ENV IOMGR_Poll(); #endif /* !AFS_PTHREAD_ENV */ } nUpdatedVolumes -= gap; } +#endif /* !AFS_DEMAND_ATTACH_FS */ + /***************************************************/ -/* Add on routines to manage a volume header cache */ +/* Volume LRU routines */ /***************************************************/ -static struct volHeader *volumeLRU; +/* demand attach fs + * volume LRU + * + * with demand attach fs, we attempt to soft detach(1) + * volumes which have not been accessed in a long time + * in order to speed up fileserver shutdown + * + * (1) by soft detach we mean a process very similar + * to VOffline, except the final state of the + * Volume will be VOL_STATE_PREATTACHED, instead + * of the usual VOL_STATE_UNATTACHED + */ +#ifdef AFS_DEMAND_ATTACH_FS + +/* implementation is reminiscent of a generational GC + * + * queue 0 is newly attached volumes. this queue is + * sorted by attach timestamp + * + * queue 1 is volumes that have been around a bit + * longer than queue 0. this queue is sorted by + * attach timestamp + * + * queue 2 is volumes tha have been around the longest. + * this queue is unsorted + * + * queue 3 is volumes that have been marked as + * candidates for soft detachment. 
this queue is + * unsorted + */ +#define VLRU_GENERATIONS 3 /* number of generations in VLRU */ +#define VLRU_QUEUES 5 /* total number of VLRU queues */ +struct VLRU_q { + volatile struct rx_queue q; + volatile int len; + volatile int busy; + pthread_cond_t cv; +}; +struct VLRU { + struct VLRU_q q[VLRU_QUEUES]; + + /* VLRU config */ + afs_uint32 promotion_interval[VLRU_GENERATIONS-1]; /* interval between promotions */ + afs_uint32 scan_interval[VLRU_GENERATIONS+1]; /* interval between scans for candidates */ + + /* state */ + int next_idx; + afs_uint32 last_promotion[VLRU_GENERATIONS-1]; /* timestamp of last promotion scan */ + afs_uint32 last_scan[VLRU_GENERATIONS+1]; /* timestamp of last detach scan */ + + int scanner_state; /* state of scanner thread */ + pthread_cond_t cv; /* state transition CV */ +}; + +static struct VLRU volume_LRU; + +/* valid scanner states */ +#define VLRU_SCANNER_STATE_OFFLINE 0 +#define VLRU_SCANNER_STATE_ONLINE 1 +#define VLRU_SCANNER_STATE_SHUTTING_DOWN 2 +#define VLRU_SCANNER_STATE_PAUSING 3 +#define VLRU_SCANNER_STATE_PAUSED 4 + +/* vlru disk data header stuff */ +#define VLRU_DISK_MAGIC 0x7a8b9cad +#define VLRU_DISK_VERSION 1 + +/* vlru default expiration time (for eventual fs state serialization of vlru data) */ +#define VLRU_DUMP_EXPIRATION_TIME (60*60*24*7) /* expire vlru data after 1 week */ + + +static afs_uint32 VLRU_offline_thresh = VLRU_DEFAULT_OFFLINE_THRESH; +static afs_uint32 VLRU_offline_interval = VLRU_DEFAULT_OFFLINE_INTERVAL; +static afs_uint32 VLRU_offline_max = VLRU_DEFAULT_OFFLINE_MAX; +static afs_uint32 VLRU_enabled = 1; + +/* queue synchronization routines */ +static void VLRU_BeginExclusive_r(struct VLRU_q * q); +static void VLRU_EndExclusive_r(struct VLRU_q * q); +static void VLRU_Wait_r(struct VLRU_q * q); + +/* set the VLRU parameters + * + * valid options are: + * VLRU_SET_THRESH -- set the period of inactivity after + * which volumes are eligible for being detached + * VLRU_SET_INTERVAL -- the time 
interval between calls + * to the volume LRU "garbage collector" + * VLRU_SET_MAX -- the max number of volumes to deallocate + * in one GC pass + */ +void +VLRU_SetOptions(int option, afs_uint32 val) +{ + if (option == VLRU_SET_THRESH) { + VLRU_offline_thresh = val; + } else if (option == VLRU_SET_INTERVAL) { + VLRU_offline_interval = val; + } else if (option == VLRU_SET_MAX) { + VLRU_offline_max = val; + } else if (option == VLRU_SET_ENABLED) { + VLRU_enabled = val; + } + VLRU_ComputeConstants(); +} + +/* compute the VLRU internal timing parameters based upon the user's inputs */ +static void +VLRU_ComputeConstants(void) +{ + afs_uint32 factor = VLRU_offline_thresh / VLRU_offline_interval; + + /* compute the candidate scan interval */ + volume_LRU.scan_interval[VLRU_QUEUE_CANDIDATE] = VLRU_offline_interval; + + /* compute the promotion intervals */ + volume_LRU.promotion_interval[VLRU_QUEUE_NEW] = VLRU_offline_thresh * 2; + volume_LRU.promotion_interval[VLRU_QUEUE_MID] = VLRU_offline_thresh * 4; + + if (factor > 16) { + /* compute the gen 0 scan interval */ + volume_LRU.scan_interval[VLRU_QUEUE_NEW] = VLRU_offline_thresh / 8; + } else { + /* compute the gen 0 scan interval */ + volume_LRU.scan_interval[VLRU_QUEUE_NEW] = VLRU_offline_interval * 2; + } +} + +/* initialize VLRU */ +static void +VInitVLRU(void) +{ + pthread_t tid; + pthread_attr_t attrs; + int i; + + if (!VLRU_enabled) { + Log("VLRU: disabled\n"); + return; + } + + /* initialize each of the VLRU queues */ + for (i = 0; i < VLRU_QUEUES; i++) { + queue_Init(&volume_LRU.q[i]); + volume_LRU.q[i].len = 0; + volume_LRU.q[i].busy = 0; + assert(pthread_cond_init(&volume_LRU.q[i].cv, NULL) == 0); + } + + /* setup the timing constants */ + VLRU_ComputeConstants(); + + /* XXX put inside LogLevel check? 
*/ + Log("VLRU: starting scanner with the following configuration parameters:\n"); + Log("VLRU: offlining volumes after minimum of %d seconds of inactivity\n", VLRU_offline_thresh); + Log("VLRU: running VLRU soft detach pass every %d seconds\n", VLRU_offline_interval); + Log("VLRU: taking up to %d volumes offline per pass\n", VLRU_offline_max); + Log("VLRU: scanning generation 0 for inactive volumes every %d seconds\n", volume_LRU.scan_interval[0]); + Log("VLRU: scanning for promotion/demotion between generations 0 and 1 every %d seconds\n", volume_LRU.promotion_interval[0]); + Log("VLRU: scanning for promotion/demotion between generations 1 and 2 every %d seconds\n", volume_LRU.promotion_interval[1]); + + /* start up the VLRU scanner */ + volume_LRU.scanner_state = VLRU_SCANNER_STATE_OFFLINE; + if (programType == fileServer) { + assert(pthread_cond_init(&volume_LRU.cv, NULL) == 0); + assert(pthread_attr_init(&attrs) == 0); + assert(pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED) == 0); + assert(pthread_create(&tid, &attrs, &VLRU_ScannerThread, NULL) == 0); + } +} + +/* initialize LRU support for a volume */ +static void +VLRU_Init_Node_r(volatile Volume * vp) +{ + if (!VLRU_enabled) + return; + + assert(queue_IsNotOnQueue(&vp->vlru)); + vp->vlru.idx = VLRU_QUEUE_INVALID; +} + +/* add volume to VLRU + * now supports adding to queues other + * than new for vlru state restore + * caller MUST hold a ref count on vp */ +static void +VLRU_Add_r(volatile Volume * vp) +{ + int idx; + + if (!VLRU_enabled) + return; + + if (queue_IsOnQueue(&vp->vlru)) + return; + + VLRU_Wait_r(&volume_LRU.q[VLRU_QUEUE_NEW]); + + /* repeat check since VLRU_Wait_r may have dropped + * the glock */ + if (queue_IsNotOnQueue(&vp->vlru)) { + idx = vp->vlru.idx; + if ((idx < 0) || (idx >= VLRU_QUEUE_INVALID)) { + idx = vp->vlru.idx = VLRU_QUEUE_NEW; + } + queue_Prepend(&volume_LRU.q[idx], &vp->vlru); + volume_LRU.q[idx].len++; + V_attachFlags(vp) |= VOL_ON_VLRU; + 
vp->stats.last_promote = FT_ApproxTime(); + } +} + +/* delete volume from VLRU + * caller MUST hold a ref count on vp */ +static void +VLRU_Delete_r(volatile Volume * vp) +{ + int idx; + + if (!VLRU_enabled) + return; + + if (queue_IsNotOnQueue(&vp->vlru)) + return; + + /* handle races */ + do { + idx = vp->vlru.idx; + if (idx == VLRU_QUEUE_INVALID) + return; + VLRU_Wait_r(&volume_LRU.q[idx]); + } while (idx != vp->vlru.idx); + + /* now remove from the VLRU and update + * the appropriate counter */ + queue_Remove(&vp->vlru); + volume_LRU.q[idx].len--; + vp->vlru.idx = VLRU_QUEUE_INVALID; + V_attachFlags(vp) &= ~(VOL_ON_VLRU); +} + +/* signal that volume was just accessed. + * caller MUST hold a ref count on vp */ +static void +VLRU_UpdateAccess_r(volatile Volume * vp) +{ + afs_uint32 live_interval; + Volume * rvp = NULL; + + if (!VLRU_enabled) + return; + + if (queue_IsNotOnQueue(&vp->vlru)) + return; + + assert(V_attachFlags(vp) & VOL_ON_VLRU); + + /* update the access timestamp */ + vp->stats.last_get = FT_ApproxTime(); + + /* + * if the volume is on the soft detach candidate + * list, we need to safely move it back to a + * regular generation. this has to be done + * carefully so we don't race against the scanner + * thread. 
+ */ + + /* if this volume is on the soft detach candidate queue, + * then grab exclusive access to the necessary queues */ + if (vp->vlru.idx == VLRU_QUEUE_CANDIDATE) { + rvp = vp; + VCreateReservation_r(rvp); + + VLRU_Wait_r(&volume_LRU.q[VLRU_QUEUE_NEW]); + VLRU_BeginExclusive_r(&volume_LRU.q[VLRU_QUEUE_NEW]); + VLRU_Wait_r(&volume_LRU.q[VLRU_QUEUE_CANDIDATE]); + VLRU_BeginExclusive_r(&volume_LRU.q[VLRU_QUEUE_CANDIDATE]); + } + + /* make sure multiple threads don't race to update */ + if (vp->vlru.idx == VLRU_QUEUE_CANDIDATE) { + VLRU_SwitchQueues(vp, VLRU_QUEUE_NEW, 1); + } + + if (rvp) { + VLRU_EndExclusive_r(&volume_LRU.q[VLRU_QUEUE_CANDIDATE]); + VLRU_EndExclusive_r(&volume_LRU.q[VLRU_QUEUE_NEW]); + VCancelReservation_r(rvp); + } +} + +/* switch a volume between two VLRU queues */ +static void +VLRU_SwitchQueues(volatile Volume * vp, int new_idx, int append) +{ + if (queue_IsNotOnQueue(&vp->vlru)) + return; + + queue_Remove(&vp->vlru); + volume_LRU.q[vp->vlru.idx].len--; + + /* put the volume back on the correct generational queue */ + if (append) { + queue_Append(&volume_LRU.q[new_idx], &vp->vlru); + } else { + queue_Prepend(&volume_LRU.q[new_idx], &vp->vlru); + } + + volume_LRU.q[new_idx].len++; + vp->vlru.idx = new_idx; +} + +/* VLRU GC thread */ +static void * +VLRU_ScannerThread(void * args) +{ + afs_uint32 now, min_delay, delay; + afs_uint32 next_scan[VLRU_GENERATIONS]; + afs_uint32 next_promotion[VLRU_GENERATIONS]; + int i, min_idx, min_op, overdue, state; + + /* set t=0 for promotion cycle to be + * fileserver startup */ + now = FT_ApproxTime(); + for (i=0; i < VLRU_GENERATIONS-1; i++) { + volume_LRU.last_promotion[i] = now; + } + + /* don't start the scanner until VLRU_offline_thresh + * plus a small delay for VInitVolumePackage to finish + * has gone by */ + + sleep(VLRU_offline_thresh + 60); + + /* set t=0 for scan cycle to be now */ + now = FT_ApproxTime(); + for (i=0; i < VLRU_GENERATIONS+1; i++) { + volume_LRU.last_scan[i] = now; + } + + 
VOL_LOCK; + if (volume_LRU.scanner_state == VLRU_SCANNER_STATE_OFFLINE) { + volume_LRU.scanner_state = VLRU_SCANNER_STATE_ONLINE; + } + + while ((state = volume_LRU.scanner_state) != VLRU_SCANNER_STATE_SHUTTING_DOWN) { + /* check to see if we've been asked to pause */ + if (volume_LRU.scanner_state == VLRU_SCANNER_STATE_PAUSING) { + volume_LRU.scanner_state = VLRU_SCANNER_STATE_PAUSED; + assert(pthread_cond_broadcast(&volume_LRU.cv) == 0); + do { + assert(pthread_cond_wait(&volume_LRU.cv, &vol_glock_mutex) == 0); + } while (volume_LRU.scanner_state == VLRU_SCANNER_STATE_PAUSED); + } + + /* scheduling can happen outside the glock */ + VOL_UNLOCK; + + /* figure out what is next on the schedule */ + + /* figure out a potential schedule for the new generation first */ + overdue = 0; + min_delay = volume_LRU.scan_interval[0] + volume_LRU.last_scan[0] - now; + min_idx = 0; + min_op = 0; + if (min_delay > volume_LRU.scan_interval[0]) { + /* unsigned overflow -- we're overdue to run this scan */ + min_delay = 0; + overdue = 1; + } + + /* if we're not overdue for gen 0, figure out schedule for candidate gen */ + if (!overdue) { + i = VLRU_QUEUE_CANDIDATE; + delay = volume_LRU.scan_interval[i] + volume_LRU.last_scan[i] - now; + if (delay < min_delay) { + min_delay = delay; + min_idx = i; + } + if (delay > volume_LRU.scan_interval[i]) { + /* unsigned overflow -- we're overdue to run this scan */ + min_delay = 0; + min_idx = i; + overdue = 1; + /* no break: this is not inside a for loop, so a break would exit the scanner's while loop with the glock dropped */ + } + } + + /* if we're still not overdue for something, figure out schedules for promotions */ + for (i=0; !overdue && i < VLRU_GENERATIONS-1; i++) { + delay = volume_LRU.promotion_interval[i] + volume_LRU.last_promotion[i] - now; + if (delay < min_delay) { + min_delay = delay; + min_idx = i; + min_op = 1; + } + if (delay > volume_LRU.promotion_interval[i]) { + /* unsigned overflow -- we're overdue to run this promotion */ + min_delay = 0; + min_idx = i; + min_op = 1; + overdue = 1; + break; + } + } + + /* sleep as needed */ + 
if (min_delay) { + sleep(min_delay); + } + + /* do whatever is next */ + VOL_LOCK; + if (min_op) { + VLRU_Promote_r(min_idx); + VLRU_Demote_r(min_idx+1); + } else { + VLRU_Scan_r(min_idx); + } + now = FT_ApproxTime(); + } + + Log("VLRU scanner asked to go offline (scanner_state=%d)\n", state); + + /* signal that scanner is down */ + volume_LRU.scanner_state = VLRU_SCANNER_STATE_OFFLINE; + assert(pthread_cond_broadcast(&volume_LRU.cv) == 0); + VOL_UNLOCK; + return NULL; +} + +/* run the promotions */ +static void +VLRU_Promote_r(int idx) +{ + int len, chaining, promote; + afs_uint32 now, thresh; + struct rx_queue *qp, *nqp; + Volume * vp, *start, *end; + + /* get exclusive access to two chains, and drop the glock */ + VLRU_Wait_r(&volume_LRU.q[idx]); + VLRU_BeginExclusive_r(&volume_LRU.q[idx]); + VLRU_Wait_r(&volume_LRU.q[idx+1]); + VLRU_BeginExclusive_r(&volume_LRU.q[idx+1]); + VOL_UNLOCK; + + thresh = volume_LRU.promotion_interval[idx]; + now = FT_ApproxTime(); + + len = chaining = 0; + for (queue_ScanBackwards(&volume_LRU.q[idx], qp, nqp, rx_queue)) { + vp = (Volume *)((char *)qp - offsetof(Volume, vlru)); + promote = (((vp->stats.last_promote + thresh) <= now) && + (vp->stats.last_get >= vp->stats.last_promote)); + + if (chaining) { + if (promote) { + vp->vlru.idx++; + len++; + start = vp; + } else { + /* promote and prepend chain */ + queue_MoveChainAfter(&volume_LRU.q[idx+1], &start->vlru, &end->vlru); + chaining = 0; + } + } else { + if (promote) { + vp->vlru.idx++; + len++; + chaining = 1; + start = end = vp; + } + } + } + + if (chaining) { + /* promote and prepend */ + queue_MoveChainAfter(&volume_LRU.q[idx+1], &start->vlru, &end->vlru); + } + + if (len) { + volume_LRU.q[idx].len -= len; + volume_LRU.q[idx+1].len += len; + } + + /* release exclusive access to the two chains */ + VOL_LOCK; + volume_LRU.last_promotion[idx] = now; + VLRU_EndExclusive_r(&volume_LRU.q[idx+1]); + VLRU_EndExclusive_r(&volume_LRU.q[idx]); +} + +/* run the demotions */ +static void 
+VLRU_Demote_r(int idx) +{ + Error ec; + int len, chaining, demote; + afs_uint32 now, thresh; + struct rx_queue *qp, *nqp; + Volume * vp, *start, *end; + Volume ** salv_flag_vec = NULL; + int salv_vec_offset = 0; + + assert(idx == VLRU_QUEUE_MID || idx == VLRU_QUEUE_OLD); + + /* get exclusive access to two chains, and drop the glock */ + VLRU_Wait_r(&volume_LRU.q[idx-1]); + VLRU_BeginExclusive_r(&volume_LRU.q[idx-1]); + VLRU_Wait_r(&volume_LRU.q[idx]); + VLRU_BeginExclusive_r(&volume_LRU.q[idx]); + VOL_UNLOCK; + + /* no big deal if this allocation fails */ + if (volume_LRU.q[idx].len) { + salv_flag_vec = (Volume **) malloc(volume_LRU.q[idx].len * sizeof(Volume *)); + } + + now = FT_ApproxTime(); + thresh = volume_LRU.promotion_interval[idx-1]; + + len = chaining = 0; + for (queue_ScanBackwards(&volume_LRU.q[idx], qp, nqp, rx_queue)) { + vp = (Volume *)((char *)qp - offsetof(Volume, vlru)); + demote = (((vp->stats.last_promote + thresh) <= now) && + (vp->stats.last_get < (now - thresh))); + + /* we now do volume update list DONT_SALVAGE flag setting during + * demotion passes */ + if (salv_flag_vec && + !(V_attachFlags(vp) & VOL_HDR_DONTSALV) && + demote && + (vp->updateTime < (now - SALVAGE_INTERVAL)) && + (V_attachState(vp) == VOL_STATE_ATTACHED)) { + salv_flag_vec[salv_vec_offset++] = vp; + VCreateReservation_r(vp); + } + + if (chaining) { + if (demote) { + vp->vlru.idx--; + len++; + start = vp; + } else { + /* demote and append chain */ + queue_MoveChainBefore(&volume_LRU.q[idx-1], &start->vlru, &end->vlru); + chaining = 0; + } + } else { + if (demote) { + vp->vlru.idx--; + len++; + chaining = 1; + start = end = vp; + } + } + } + + if (chaining) { + queue_MoveChainBefore(&volume_LRU.q[idx-1], &start->vlru, &end->vlru); + } + + if (len) { + volume_LRU.q[idx].len -= len; + volume_LRU.q[idx-1].len += len; + } + + /* release exclusive access to the two chains */ + VOL_LOCK; + VLRU_EndExclusive_r(&volume_LRU.q[idx]); + VLRU_EndExclusive_r(&volume_LRU.q[idx-1]); + + 
/* now go back and set the DONT_SALVAGE flags as appropriate */ + if (salv_flag_vec) { + int i; + for (i = 0; i < salv_vec_offset; i++) { + vp = salv_flag_vec[i]; + if (!(V_attachFlags(vp) & VOL_HDR_DONTSALV) && + (vp->updateTime < (now - SALVAGE_INTERVAL)) && + (V_attachState(vp) == VOL_STATE_ATTACHED)) { + ec = VHold_r(vp); + if (!ec) { + V_attachFlags(vp) |= VOL_HDR_DONTSALV; + V_dontSalvage(vp) = DONT_SALVAGE; + VUpdateVolume_r(&ec, vp, 0); + VPutVolume_r(vp); + } + } + VCancelReservation_r(vp); + } + free(salv_flag_vec); + } +} + +/* run a pass of the VLRU GC scanner */ +static void +VLRU_Scan_r(int idx) +{ + afs_uint32 now, thresh; + struct rx_queue *qp, *nqp; + volatile Volume * vp; + int i, locked = 1; + + assert(idx == VLRU_QUEUE_NEW || idx == VLRU_QUEUE_CANDIDATE); + + /* gain exclusive access to the idx VLRU */ + VLRU_Wait_r(&volume_LRU.q[idx]); + VLRU_BeginExclusive_r(&volume_LRU.q[idx]); + + if (idx != VLRU_QUEUE_CANDIDATE) { + /* gain exclusive access to the candidate VLRU */ + VLRU_Wait_r(&volume_LRU.q[VLRU_QUEUE_CANDIDATE]); + VLRU_BeginExclusive_r(&volume_LRU.q[VLRU_QUEUE_CANDIDATE]); + } + + now = FT_ApproxTime(); + thresh = now - VLRU_offline_thresh; + + /* perform candidate selection and soft detaching */ + if (idx == VLRU_QUEUE_CANDIDATE) { + /* soft detach some volumes from the candidate pool */ + VOL_UNLOCK; + locked = 0; + + for (i=0,queue_ScanBackwards(&volume_LRU.q[idx], qp, nqp, rx_queue)) { + vp = (Volume *)((char *)qp - offsetof(Volume, vlru)); + if (i >= VLRU_offline_max) { + break; + } + /* check timestamp to see if it's a candidate for soft detaching */ + if (vp->stats.last_get <= thresh) { + VOL_LOCK; + if (VCheckSoftDetach(vp, thresh)) + i++; + VOL_UNLOCK; + } + } + } else { + /* scan for volumes to become soft detach candidates */ + for (i=1,queue_ScanBackwards(&volume_LRU.q[idx], qp, nqp, rx_queue),i++) { + vp = (Volume *)((char *)qp - offsetof(Volume, vlru)); + + /* check timestamp to see if it's a candidate for soft detaching 
*/ + if (vp->stats.last_get <= thresh) { + VCheckSoftDetachCandidate(vp, thresh); + } + + if (!(i&0x7f)) { /* lock coarsening optimization */ + VOL_UNLOCK; + pthread_yield(); + VOL_LOCK; + } + } + } + + /* relinquish exclusive access to the VLRU chains */ + if (!locked) { + VOL_LOCK; + } + volume_LRU.last_scan[idx] = now; + if (idx != VLRU_QUEUE_CANDIDATE) { + VLRU_EndExclusive_r(&volume_LRU.q[VLRU_QUEUE_CANDIDATE]); + } + VLRU_EndExclusive_r(&volume_LRU.q[idx]); +} + +/* check whether volume is safe to soft detach + * caller MUST NOT hold a ref count on vp */ +static int +VCheckSoftDetach(volatile Volume * vp, afs_uint32 thresh) +{ + int ret=0; + + if (vp->nUsers || vp->nWaiters) + return 0; + + if (vp->stats.last_get <= thresh) { + ret = VSoftDetachVolume_r(vp, thresh); + } + + return ret; +} + +/* check whether volume should be made a + * soft detach candidate */ +static int +VCheckSoftDetachCandidate(volatile Volume * vp, afs_uint32 thresh) +{ + int idx, ret = 0; + if (vp->nUsers || vp->nWaiters) + return 0; + + idx = vp->vlru.idx; + + assert(idx == VLRU_QUEUE_NEW); + + if (vp->stats.last_get <= thresh) { + /* move to candidate pool */ + queue_Remove(&vp->vlru); + volume_LRU.q[VLRU_QUEUE_NEW].len--; + queue_Prepend(&volume_LRU.q[VLRU_QUEUE_CANDIDATE], &vp->vlru); + vp->vlru.idx = VLRU_QUEUE_CANDIDATE; + volume_LRU.q[VLRU_QUEUE_CANDIDATE].len++; + ret = 1; + } + + return ret; +} + + +/* begin exclusive access on VLRU */ +static void +VLRU_BeginExclusive_r(struct VLRU_q * q) +{ + assert(q->busy == 0); + q->busy = 1; +} + +/* end exclusive access on VLRU */ +static void +VLRU_EndExclusive_r(struct VLRU_q * q) +{ + assert(q->busy); + q->busy = 0; + assert(pthread_cond_broadcast(&q->cv) == 0); +} + +/* wait for another thread to end exclusive access on VLRU */ +static void +VLRU_Wait_r(struct VLRU_q * q) +{ + while(q->busy) { + assert(pthread_cond_wait(&q->cv, &vol_glock_mutex) == 0); + } +} + +/* demand attach fs + * volume soft detach + * + * caller MUST NOT hold 
a ref count on vp */ +static int +VSoftDetachVolume_r(volatile Volume * vp, afs_uint32 thresh) +{ + afs_uint32 ts_save; + int ret = 0; + + assert(vp->vlru.idx == VLRU_QUEUE_CANDIDATE); + + ts_save = vp->stats.last_get; + if (ts_save > thresh) + return 0; + + if (vp->nUsers || vp->nWaiters) + return 0; + + if (IsExclusiveState(V_attachState(vp))) { + return 0; + } + + switch (V_attachState(vp)) { + case VOL_STATE_UNATTACHED: + case VOL_STATE_PREATTACHED: + case VOL_STATE_ERROR: + case VOL_STATE_GOING_OFFLINE: + case VOL_STATE_SHUTTING_DOWN: + case VOL_STATE_SALVAGING: + volume_LRU.q[vp->vlru.idx].len--; + + /* create and cancel a reservation to + * give the volume an opportunity to + * be deallocated */ + VCreateReservation_r(vp); + queue_Remove(&vp->vlru); + vp->vlru.idx = VLRU_QUEUE_INVALID; + V_attachFlags(vp) &= ~(VOL_ON_VLRU); + VCancelReservation_r(vp); + return 0; + } + + /* hold the volume and take it offline. + * no need for reservations, as VHold_r + * takes care of that internally. */ + if (VHold_r(vp) == 0) { + /* vhold drops the glock, so now we should + * check to make sure we aren't racing against + * other threads. if we are racing, offlining vp + * would be wasteful, and block the scanner for a while + */ + if (vp->nWaiters || + (vp->nUsers > 1) || + (vp->shuttingDown) || + (vp->goingOffline) || + (vp->stats.last_get != ts_save)) { + /* looks like we're racing someone else. 
bail */ + VPutVolume_r(vp); + vp = NULL; + } else { + /* pull it off the VLRU */ + assert(vp->vlru.idx == VLRU_QUEUE_CANDIDATE); + volume_LRU.q[VLRU_QUEUE_CANDIDATE].len--; + queue_Remove(&vp->vlru); + vp->vlru.idx = VLRU_QUEUE_INVALID; + V_attachFlags(vp) &= ~(VOL_ON_VLRU); + + /* take if offline */ + VOffline_r(vp, "volume has been soft detached"); + + /* invalidate the volume header cache */ + FreeVolumeHeader(vp); + + /* update stats */ + IncUInt64(&VStats.soft_detaches); + vp->stats.soft_detaches++; + + /* put in pre-attached state so demand + * attacher can work on it */ + VChangeState_r(vp, VOL_STATE_PREATTACHED); + ret = 1; + } + } + return ret; +} +#endif /* AFS_DEMAND_ATTACH_FS */ + + +/***************************************************/ +/* Volume Header Cache routines */ +/***************************************************/ + +struct volume_hdr_LRU_t volume_hdr_LRU; /* Allocate a bunch of headers; string them together */ static void -InitLRU(int howMany) +VInitVolumeHeaderCache(afs_uint32 howMany) { register struct volHeader *hp; if (programType != fileServer) return; + queue_Init(&volume_hdr_LRU); +#ifdef AFS_DEMAND_ATTACH_FS + volume_hdr_LRU.stats.free = 0; + volume_hdr_LRU.stats.used = howMany; + volume_hdr_LRU.stats.attached = 0; +#endif hp = (struct volHeader *)(calloc(howMany, sizeof(struct volHeader))); while (howMany--) ReleaseVolumeHeader(hp++); } +#ifdef AFS_DEMAND_ATTACH_FS +/* Get a volume header from the LRU list; update the old one if necessary */ +/* Returns 1 if there was already a header, which is removed from the LRU list */ +/* caller MUST has a ref count on vp */ +static int +GetVolumeHeader(register Volume * vp) +{ + Error error; + register struct volHeader *hd; + int old; + static int everLogged = 0; + + /* XXX debug 9/19/05 we've apparently got + * a ref counting bug somewhere that's + * breaking the nUsers == 0 => header on LRU + * assumption */ + if (vp->header && queue_IsNotOnQueue(vp->header)) { + Log("nUsers == 0, but 
header not on LRU\n"); + return 1; + } + + old = (vp->header != NULL); /* old == volume already has a header */ + + if (programType != fileServer) { + /* for volume utilities, we allocate volHeaders as needed */ + if (!vp->header) { + hd = (struct volHeader *)calloc(1, sizeof(*vp->header)); + assert(hd != NULL); + vp->header = hd; + hd->back = vp; + V_attachFlags(vp) |= VOL_HDR_ATTACHED; + } + } else { + if (old) { + /* the header we previously dropped in the lru is + * still available. pull it off the lru and return */ + hd = vp->header; + queue_Remove(hd); + assert(hd->back == vp); + } else { + /* we need to grab a new element off the LRU */ + if (queue_IsNotEmpty(&volume_hdr_LRU)) { + /* grab an element and pull off of LRU */ + hd = queue_First(&volume_hdr_LRU, volHeader); + queue_Remove(hd); + } else { + /* LRU is empty, so allocate a new volHeader + * this is probably indicative of a leak, so let the user know */ + hd = (struct volHeader *)calloc(1, sizeof(struct volHeader)); + assert(hd != NULL); + if (!everLogged) { + Log("****Allocated more volume headers, probably leak****\n"); + everLogged = 1; + } + volume_hdr_LRU.stats.free++; + } + if (hd->back) { + VolState vp_save, back_save; + /* this header used to belong to someone else. + * we'll need to check if the header needs to + * be sync'd out to disk */ + + /* if hd->back were in an exclusive state, then + * its volHeader would not be on the LRU... 
*/ + assert(!IsExclusiveState(V_attachState(hd->back))); + + if (hd->diskstuff.inUse) { + /* volume was in use, so we'll need to sync + * its header to disk */ + back_save = VChangeState_r(hd->back, VOL_STATE_UPDATING); + vp_save = VChangeState_r(vp, VOL_STATE_HDR_ATTACHING); + VCreateReservation_r(hd->back); + VOL_UNLOCK; + + WriteVolumeHeader_r(&error, hd->back); + /* Ignore errors; catch them later */ + + VOL_LOCK; + } + + V_attachFlags(hd->back) &= ~(VOL_HDR_ATTACHED | VOL_HDR_LOADED | VOL_HDR_IN_LRU); + hd->back->header = NULL; + + if (hd->diskstuff.inUse) { + VChangeState_r(hd->back, back_save); + VCancelReservation_r(hd->back); + VChangeState_r(vp, vp_save); + } + } else { + volume_hdr_LRU.stats.attached++; + } + hd->back = vp; + vp->header = hd; + V_attachFlags(vp) |= VOL_HDR_ATTACHED; + } + volume_hdr_LRU.stats.free--; + volume_hdr_LRU.stats.used++; + } + IncUInt64(&VStats.hdr_gets); + IncUInt64(&vp->stats.hdr_gets); + vp->stats.last_hdr_get = FT_ApproxTime(); + return old; +} +#else /* AFS_DEMAND_ATTACH_FS */ /* Get a volume header from the LRU list; update the old one if necessary */ /* Returns 1 if there was already a header, which is removed from the LRU list */ static int @@ -1932,7 +5480,9 @@ GetVolumeHeader(register Volume * vp) static int everLogged = 0; old = (vp->header != NULL); /* old == volume already has a header */ + if (programType != fileServer) { + /* for volume utilities, we allocate volHeaders as needed */ if (!vp->header) { hd = (struct volHeader *)calloc(1, sizeof(*vp->header)); assert(hd != NULL); @@ -1940,45 +5490,98 @@ GetVolumeHeader(register Volume * vp) hd->back = vp; } } else { + /* for the fileserver, we keep a volume header cache */ if (old) { + /* the header we previously dropped in the lru is + * still available. 
pull it off the lru and return */ hd = vp->header; - if (volumeLRU == hd) - volumeLRU = hd->next; + queue_Remove(hd); assert(hd->back == vp); } else { - if (volumeLRU) - /* not currently in use and least recently used */ - hd = volumeLRU->prev; - else { - hd = (struct volHeader *)calloc(1, sizeof(*vp->header)); - /* make it look like single elt LRU */ - hd->prev = hd->next = hd; + /* we need to grab a new element off the LRU */ + if (queue_IsNotEmpty(&volume_hdr_LRU)) { + /* grab an element */ + hd = queue_First(&volume_hdr_LRU, volHeader); + queue_Remove(hd); + } else { + /* LRU is empty, so allocate a new volHeader + * this is probably indicative of a leak, so let the user know */ + hd = (struct volHeader *)calloc(1, sizeof(struct volHeader)); + assert(hd != NULL); if (!everLogged) { Log("****Allocated more volume headers, probably leak****\n"); everLogged = 1; } } if (hd->back) { + /* this header used to belong to someone else. + * we'll need to check if the header needs to + * be sync'd out to disk */ + if (hd->diskstuff.inUse) { WriteVolumeHeader_r(&error, hd->back); /* Ignore errors; catch them later */ } - hd->back->header = 0; + hd->back->header = NULL; } hd->back = vp; vp->header = hd; } - if (hd->next) { /* hd->next != 0 --> in LRU chain (we zero it later) */ - hd->prev->next = hd->next; /* pull hd out of LRU list */ - hd->next->prev = hd->prev; /* if hd only element, this is noop */ - } - hd->next = hd->prev = 0; - /* if not in LRU chain, next test won't be true */ - if (hd == volumeLRU) /* last header item, turn into empty list */ - volumeLRU = NULL; } return old; } +#endif /* AFS_DEMAND_ATTACH_FS */ + + +/* make sure a volume header is attached to + * vp, and has the correct data loaded from + * disk. 
*/ +#ifdef AFS_DEMAND_ATTACH_FS +/* caller MUST hold a ref count on vp */ +static void +LoadVolumeHeader(Error * ec, Volume * vp) +{ + VolState state_save; + *ec = 0; + + if (vp->nUsers == 0 && !GetVolumeHeader(vp)) { + IncUInt64(&VStats.hdr_loads); + state_save = VChangeState_r(vp, VOL_STATE_HDR_LOADING); + VOL_UNLOCK; + + ReadHeader(ec, V_diskDataHandle(vp), (char *)&V_disk(vp), + sizeof(V_disk(vp)), VOLUMEINFOMAGIC, + VOLUMEINFOVERSION); + IncUInt64(&vp->stats.hdr_loads); + + VOL_LOCK; + if (!*ec) + V_attachFlags(vp) |= VOL_HDR_LOADED; + VChangeState_r(vp, state_save); + } + if (*ec) { + /* maintain (nUsers==0) => header in LRU invariant */ + ReleaseVolumeHeader(vp->header); + } +} +#else /* AFS_DEMAND_ATTACH_FS */ +static void +LoadVolumeHeader(Error * ec, Volume * vp) +{ + *ec = 0; + if (vp->nUsers == 0 && !GetVolumeHeader(vp)) { + IncUInt64(&VStats.hdr_loads); + + ReadHeader(ec, V_diskDataHandle(vp), (char *)&V_disk(vp), + sizeof(V_disk(vp)), VOLUMEINFOMAGIC, + VOLUMEINFOVERSION); + } + if (*ec) { + /* maintain (nUsers==0) => header in LRU invariant */ + ReleaseVolumeHeader(vp->header); + } +} +#endif /* AFS_DEMAND_ATTACH_FS */ /* Put it at the top of the LRU chain */ static void @@ -1986,18 +5589,22 @@ ReleaseVolumeHeader(register struct volHeader *hd) { if (programType != fileServer) return; - if (!hd || hd->next) /* no header, or header already released */ + if (!hd || queue_IsOnQueue(hd)) /* no header, or header already released */ return; - if (!volumeLRU) { - hd->next = hd->prev = hd; - } else { - hd->prev = volumeLRU->prev; - hd->next = volumeLRU; - hd->prev->next = hd->next->prev = hd; + queue_Append(&volume_hdr_LRU, hd); +#ifdef AFS_DEMAND_ATTACH_FS + if (hd->back) { + V_attachFlags(hd->back) |= VOL_HDR_IN_LRU; } - volumeLRU = hd; + volume_hdr_LRU.stats.free++; + volume_hdr_LRU.stats.used--; +#endif } +/* for fileserver, return header to LRU, and + * invalidate it as a cache entry. 
+ * + * for volume utilities, free the heap space */ static void FreeVolumeHeader(register Volume * vp) { @@ -2006,57 +5613,349 @@ FreeVolumeHeader(register Volume * vp) return; if (programType == fileServer) { ReleaseVolumeHeader(hd); - hd->back = 0; + hd->back = NULL; } else { free(hd); } - vp->header = 0; +#ifdef AFS_DEMAND_ATTACH_FS + V_attachFlags(vp) &= ~(VOL_HDR_ATTACHED | VOL_HDR_IN_LRU | VOL_HDR_LOADED); + volume_hdr_LRU.stats.attached--; +#endif + vp->header = NULL; } /***************************************************/ -/* Routines to add volume to hash chain, delete it */ +/* Volume Hash Table routines */ /***************************************************/ +int +VSetVolHashSize(int logsize) +{ + /* 64 to 16384 hash buckets seems like a reasonable range */ + if ((logsize < 6 ) || (logsize > 14)) { + return -1; + } + + if (!VInit) { + VolumeHashTable.Size = 1 << logsize; + VolumeHashTable.Mask = VolumeHashTable.Size - 1; + } else { + /* we can't yet support runtime modification of this + * parameter. we'll need a configuration rwlock to + * make runtime modification feasible.... 
*/ + return -1; + } + return 0; +} + +static void +VInitVolumeHash(void) +{ + register int i; + + VolumeHashTable.Table = (VolumeHashChainHead *) calloc(VolumeHashTable.Size, + sizeof(VolumeHashChainHead)); + assert(VolumeHashTable.Table != NULL); + + for (i=0; i < VolumeHashTable.Size; i++) { + queue_Init(&VolumeHashTable.Table[i]); +#ifdef AFS_DEMAND_ATTACH_FS + assert(pthread_cond_init(&VolumeHashTable.Table[i].chain_busy_cv, NULL) == 0); +#endif /* AFS_DEMAND_ATTACH_FS */ + } +} + +/* for demand-attach, caller MUST hold a ref count on vp */ static void AddVolumeToHashTable(register Volume * vp, int hashid) { - int hash = VOLUME_HASH(hashid); + VolumeHashChainHead * head; + + if (queue_IsOnQueue(vp)) + return; + + head = &VolumeHashTable.Table[VOLUME_HASH(hashid)]; + +#ifdef AFS_DEMAND_ATTACH_FS + /* wait for the hash chain to become available */ + VHashWait_r(head); + + V_attachFlags(vp) |= VOL_IN_HASH; + vp->chainCacheCheck = ++head->cacheCheck; +#endif /* AFS_DEMAND_ATTACH_FS */ + + head->len++; vp->hashid = hashid; - vp->hashNext = VolumeHashTable[hash]; - VolumeHashTable[hash] = vp; + queue_Append(head, vp); vp->vnodeHashOffset = VolumeHashOffset_r(); } +/* for demand-attach, caller MUST hold a ref count on vp */ static void DeleteVolumeFromHashTable(register Volume * vp) { - int hash = VOLUME_HASH(vp->hashid); - if (VolumeHashTable[hash] == vp) - VolumeHashTable[hash] = vp->hashNext; - else { - Volume *tvp = VolumeHashTable[hash]; - if (tvp == NULL) - return; - while (tvp->hashNext && tvp->hashNext != vp) - tvp = tvp->hashNext; - if (tvp->hashNext == NULL) - return; - tvp->hashNext = vp->hashNext; - } - vp->hashid = 0; + VolumeHashChainHead * head; + + if (!queue_IsOnQueue(vp)) + return; + + head = &VolumeHashTable.Table[VOLUME_HASH(vp->hashid)]; + +#ifdef AFS_DEMAND_ATTACH_FS + /* wait for the hash chain to become available */ + VHashWait_r(head); + + V_attachFlags(vp) &= ~(VOL_IN_HASH); + head->cacheCheck++; +#endif /* AFS_DEMAND_ATTACH_FS */ + + 
head->len--; + queue_Remove(vp); + /* do NOT reset hashid to zero, as the online + * salvager package may need to know the volume id + * after the volume is removed from the hash */ } +/* - look up a volume id in the hash table + * - occasionally rebalance hash chains + * - update lookup statistics accordingly + */ +/* the hint parameter allows us to short-circuit on + * DEMAND_ATTACH_FS if the cacheChecks match between + * the hash chain head and hint + * caller MUST hold a refcount on hint */ +Volume * +VLookupVolume_r(Error * ec, VolId volumeId, Volume * hint) +{ + register int looks = 0; + Volume * vp, *np, *pp; + VolumeHashChainHead * head; + *ec = 0; + + head = &VolumeHashTable.Table[VOLUME_HASH(volumeId)]; + +#ifdef AFS_DEMAND_ATTACH_FS + /* wait for the hash chain to become available */ + VHashWait_r(head); + + /* check to see if we can short circuit without walking the hash chain */ + if (hint && (hint->chainCacheCheck == head->cacheCheck)) { + IncUInt64(&hint->stats.hash_short_circuits); + return hint; + } +#endif /* AFS_DEMAND_ATTACH_FS */ + + /* someday we need to either do per-chain locks, RWlocks, + * or both for volhash access. 
+ * (and move to a data structure with better cache locality) */ + + /* search the chain for this volume id */ + for(queue_Scan(head, vp, np, Volume)) { + looks++; + if ((vp->hashid == volumeId)) { + break; + } + } + + if (queue_IsEnd(head, vp)) { + vp = NULL; + } + +#ifdef AFS_DEMAND_ATTACH_FS + /* update hash chain statistics */ + { + afs_uint64 lks; + FillInt64(lks, 0, looks); + AddUInt64(head->looks, lks, &head->looks); + AddUInt64(VStats.hash_looks, lks, &VStats.hash_looks); + IncUInt64(&head->gets); + } + + if (vp) { + afs_uint64 thresh; + IncUInt64(&vp->stats.hash_lookups); + + /* for demand attach fileserver, we permit occasional hash chain reordering + * so that frequently looked up volumes move towards the head of the chain */ + pp = queue_Prev(vp, Volume); + if (!queue_IsEnd(head, pp)) { + FillInt64(thresh, 0, VOLUME_HASH_REORDER_THRESHOLD); + AddUInt64(thresh, pp->stats.hash_lookups, &thresh); + if (GEInt64(vp->stats.hash_lookups, thresh)) { + VReorderHash_r(head, pp, vp); + } + } + + /* update the short-circuit cache check */ + vp->chainCacheCheck = head->cacheCheck; + } +#endif /* AFS_DEMAND_ATTACH_FS */ + + return vp; +} + +#ifdef AFS_DEMAND_ATTACH_FS +/* perform volume hash chain reordering. 
+ * + * advance a subchain beginning at vp ahead of + * the adjacent subchain ending at pp */ +static void +VReorderHash_r(VolumeHashChainHead * head, Volume * pp, Volume * vp) +{ + Volume *tp, *np, *lp; + afs_uint64 move_thresh; + + /* this should never be called if the chain is already busy, so + * no need to wait for other exclusive chain ops to finish */ + + /* this is a rather heavy set of operations, + * so let's set the chain busy flag and drop + * the vol_glock */ + VHashBeginExclusive_r(head); + VOL_UNLOCK; + + /* scan forward in the chain from vp looking for the last element + * in the chain we want to advance */ + FillInt64(move_thresh, 0, VOLUME_HASH_REORDER_CHAIN_THRESH); + AddUInt64(move_thresh, pp->stats.hash_lookups, &move_thresh); + for(queue_ScanFrom(head, vp, tp, np, Volume)) { + if (LTInt64(tp->stats.hash_lookups, move_thresh)) { + break; + } + } + lp = queue_Prev(tp, Volume); + + /* scan backwards from pp to determine where to splice and + * insert the subchain we're advancing */ + for(queue_ScanBackwardsFrom(head, pp, tp, np, Volume)) { + if (GTInt64(tp->stats.hash_lookups, move_thresh)) { + break; + } + } + tp = queue_Next(tp, Volume); + + /* rebalance chain(vp,...,lp) ahead of chain(tp,...,pp) */ + queue_MoveChainBefore(tp,vp,lp); + + VOL_LOCK; + IncUInt64(&VStats.hash_reorders); + head->cacheCheck++; + IncUInt64(&head->reorders); + + /* wake up any threads waiting for the hash chain */ + VHashEndExclusive_r(head); +} + + +/* demand-attach fs volume hash + * asynchronous exclusive operations */ + +/* take exclusive control over the hash chain */ +static void +VHashBeginExclusive_r(VolumeHashChainHead * head) +{ + assert(head->busy == 0); + head->busy = 1; +} + +/* relinquish exclusive control over the hash chain */ +static void +VHashEndExclusive_r(VolumeHashChainHead * head) +{ + assert(head->busy); + head->busy = 0; + assert(pthread_cond_broadcast(&head->chain_busy_cv) == 0); +} + +/* wait for another thread to finish its exclusive ops */ 
+static void +VHashWait_r(VolumeHashChainHead * head) +{ + while (head->busy) { + assert(pthread_cond_wait(&head->chain_busy_cv, &vol_glock_mutex) == 0); + } +} +#endif /* AFS_DEMAND_ATTACH_FS */ + + +/***************************************************/ +/* Volume by Partition List routines */ +/***************************************************/ + +/* + * demand attach fileserver adds a + * linked list of volumes to each + * partition object, thus allowing + * for quick enumeration of all + * volumes on a partition + */ + +#ifdef AFS_DEMAND_ATTACH_FS +static void +AddVolumeToVByPList_r(Volume * vp) +{ + if (queue_IsNotOnQueue(&vp->vol_list)) { + queue_Append(&vp->partition->vol_list, &vp->vol_list); + V_attachFlags(vp) |= VOL_ON_VBYP_LIST; + vp->partition->vol_list.len++; + } +} + +static void +DeleteVolumeFromVByPList_r(Volume * vp) +{ + if (queue_IsOnQueue(&vp->vol_list)) { + queue_Remove(&vp->vol_list); + V_attachFlags(vp) &= ~(VOL_ON_VBYP_LIST); + vp->partition->vol_list.len--; + } +} + +/* take exclusive control over the list */ +static void +VVByPListBeginExclusive_r(struct DiskPartition * dp) +{ + assert(dp->vol_list.busy == 0); + dp->vol_list.busy = 1; +} + +/* relinquish exclusive control over the list */ +static void +VVByPListEndExclusive_r(struct DiskPartition * dp) +{ + assert(dp->vol_list.busy); + dp->vol_list.busy = 0; + assert(pthread_cond_broadcast(&dp->vol_list.cv) == 0); +} + +/* wait for another thread to finish its exclusive ops */ +static void +VVByPListWait_r(struct DiskPartition * dp) +{ + while (dp->vol_list.busy) { + assert(pthread_cond_wait(&dp->vol_list.cv, &vol_glock_mutex) == 0); + } +} +#endif /* AFS_DEMAND_ATTACH_FS */ + +/***************************************************/ +/* Volume Cache Statistics routines */ +/***************************************************/ + void VPrintCacheStats_r(void) { + afs_uint32 get_hi, get_lo, load_hi, load_lo; register struct VnodeClassInfo *vcp; vcp = &VnodeClassInfo[vLarge]; Log("Large vnode 
cache, %d entries, %d allocs, %d gets (%d reads), %d writes\n", vcp->cacheSize, vcp->allocs, vcp->gets, vcp->reads, vcp->writes); vcp = &VnodeClassInfo[vSmall]; Log("Small vnode cache,%d entries, %d allocs, %d gets (%d reads), %d writes\n", vcp->cacheSize, vcp->allocs, vcp->gets, vcp->reads, vcp->writes); + SplitInt64(VStats.hdr_gets, get_hi, get_lo); + SplitInt64(VStats.hdr_loads, load_hi, load_lo); Log("Volume header cache, %d entries, %d gets, %d replacements\n", - VolumeCacheSize, VolumeGets, VolumeReplacements); + VStats.hdr_cache_size, get_lo, load_lo); } void @@ -2067,3 +5966,259 @@ VPrintCacheStats(void) VOL_UNLOCK; } +#ifdef AFS_DEMAND_ATTACH_FS +static double +UInt64ToDouble(afs_uint64 * x) +{ + static double c32 = 4.0 * 1.073741824 * 1000000000.0; + afs_uint32 h, l; + SplitInt64(*x, h, l); + return (((double)h) * c32) + ((double) l); +} + +static char * +DoubleToPrintable(double x, char * buf, int len) +{ + static double billion = 1000000000.0; + afs_uint32 y[3]; + + y[0] = (afs_uint32) (x / (billion * billion)); + y[1] = (afs_uint32) ((x - (((double)y[0]) * billion * billion)) / billion); + y[2] = (afs_uint32) (x - ((((double)y[0]) * billion * billion) + (((double)y[1]) * billion))); + + if (y[0]) { + snprintf(buf, len, "%d%09d%09d", y[0], y[1], y[2]); + } else if (y[1]) { + snprintf(buf, len, "%d%09d", y[1], y[2]); + } else { + snprintf(buf, len, "%d", y[2]); + } + buf[len-1] = '\0'; + return buf; +} + +static void +VPrintExtendedCacheStats_r(int flags) +{ + int i, j; + struct stats { + double min; + double max; + double sum; + double avg; + }; + struct stats looks, gets, reorders, len; + struct stats ch_looks, ch_gets, ch_reorders; + char pr_buf[4][32]; + VolumeHashChainHead *head; + Volume *vp, *np; + + /* zero out stats */ + memset(&looks, 0, sizeof(struct stats)); + memset(&gets, 0, sizeof(struct stats)); + memset(&reorders, 0, sizeof(struct stats)); + memset(&len, 0, sizeof(struct stats)); + memset(&ch_looks, 0, sizeof(struct stats)); + 
memset(&ch_gets, 0, sizeof(struct stats)); + memset(&ch_reorders, 0, sizeof(struct stats)); + + for (i = 0; i < VolumeHashTable.Size; i++) { + head = &VolumeHashTable.Table[i]; + + VHashWait_r(head); + VHashBeginExclusive_r(head); + VOL_UNLOCK; + + ch_looks.sum = UInt64ToDouble(&head->looks); + ch_gets.sum = UInt64ToDouble(&head->gets); + ch_reorders.sum = UInt64ToDouble(&head->reorders); + + /* update global statistics */ + { + looks.sum += ch_looks.sum; + gets.sum += ch_gets.sum; + reorders.sum += ch_reorders.sum; + len.sum += (double)head->len; + + if (i == 0) { + len.min = (double) head->len; + len.max = (double) head->len; + looks.min = ch_looks.sum; + looks.max = ch_looks.sum; + gets.min = ch_gets.sum; + gets.max = ch_gets.sum; + reorders.min = ch_reorders.sum; + reorders.max = ch_reorders.sum; + } else { + if (((double)head->len) < len.min) + len.min = (double) head->len; + if (((double)head->len) > len.max) + len.max = (double) head->len; + if (ch_looks.sum < looks.min) + looks.min = ch_looks.sum; + else if (ch_looks.sum > looks.max) + looks.max = ch_looks.sum; + if (ch_gets.sum < gets.min) + gets.min = ch_gets.sum; + else if (ch_gets.sum > gets.max) + gets.max = ch_gets.sum; + if (ch_reorders.sum < reorders.min) + reorders.min = ch_reorders.sum; + else if (ch_reorders.sum > reorders.max) + reorders.max = ch_reorders.sum; + } + } + + if ((flags & VOL_STATS_PER_CHAIN2) && queue_IsNotEmpty(head)) { + /* compute detailed per-chain stats */ + struct stats hdr_loads, hdr_gets; + double v_looks, v_loads, v_gets; + + /* initialize stats with data from first element in chain */ + vp = queue_First(head, Volume); + v_looks = UInt64ToDouble(&vp->stats.hash_lookups); + v_loads = UInt64ToDouble(&vp->stats.hdr_loads); + v_gets = UInt64ToDouble(&vp->stats.hdr_gets); + ch_gets.min = ch_gets.max = v_looks; + hdr_loads.min = hdr_loads.max = v_loads; + hdr_gets.min = hdr_gets.max = v_gets; + hdr_loads.sum = hdr_gets.sum = 0; + + vp = queue_Next(vp, Volume); + + /* pull in 
stats from remaining elements in chain */ + for (queue_ScanFrom(head, vp, vp, np, Volume)) { + v_looks = UInt64ToDouble(&vp->stats.hash_lookups); + v_loads = UInt64ToDouble(&vp->stats.hdr_loads); + v_gets = UInt64ToDouble(&vp->stats.hdr_gets); + + hdr_loads.sum += v_loads; + hdr_gets.sum += v_gets; + + if (v_looks < ch_gets.min) + ch_gets.min = v_looks; + else if (v_looks > ch_gets.max) + ch_gets.max = v_looks; + + if (v_loads < hdr_loads.min) + hdr_loads.min = v_loads; + else if (v_loads > hdr_loads.max) + hdr_loads.max = v_loads; + + if (v_gets < hdr_gets.min) + hdr_gets.min = v_gets; + else if (v_gets > hdr_gets.max) + hdr_gets.max = v_gets; + } + + /* compute per-chain averages */ + ch_gets.avg = ch_gets.sum / ((double)head->len); + hdr_loads.avg = hdr_loads.sum / ((double)head->len); + hdr_gets.avg = hdr_gets.sum / ((double)head->len); + + /* dump per-chain stats */ + Log("Volume hash chain %d : len=%d, looks=%s, reorders=%s\n", + i, head->len, + DoubleToPrintable(ch_looks.sum, pr_buf[0], sizeof(pr_buf[0])), + DoubleToPrintable(ch_reorders.sum, pr_buf[1], sizeof(pr_buf[1]))); + Log("\tVolume gets : min=%s, max=%s, avg=%s, total=%s\n", + DoubleToPrintable(ch_gets.min, pr_buf[0], sizeof(pr_buf[0])), + DoubleToPrintable(ch_gets.max, pr_buf[1], sizeof(pr_buf[1])), + DoubleToPrintable(ch_gets.avg, pr_buf[2], sizeof(pr_buf[2])), + DoubleToPrintable(ch_gets.sum, pr_buf[3], sizeof(pr_buf[3]))); + Log("\tHDR gets : min=%s, max=%s, avg=%s, total=%s\n", + DoubleToPrintable(hdr_gets.min, pr_buf[0], sizeof(pr_buf[0])), + DoubleToPrintable(hdr_gets.max, pr_buf[1], sizeof(pr_buf[1])), + DoubleToPrintable(hdr_gets.avg, pr_buf[2], sizeof(pr_buf[2])), + DoubleToPrintable(hdr_gets.sum, pr_buf[3], sizeof(pr_buf[3]))); + Log("\tHDR loads : min=%s, max=%s, avg=%s, total=%s\n", + DoubleToPrintable(hdr_loads.min, pr_buf[0], sizeof(pr_buf[0])), + DoubleToPrintable(hdr_loads.max, pr_buf[1], sizeof(pr_buf[1])), + DoubleToPrintable(hdr_loads.avg, pr_buf[2], sizeof(pr_buf[2])), + 
DoubleToPrintable(hdr_loads.sum, pr_buf[3], sizeof(pr_buf[3]))); + } else if (flags & VOL_STATS_PER_CHAIN) { + /* dump simple per-chain stats */ + Log("Volume hash chain %d : len=%d, looks=%s, gets=%s, reorders=%s\n", + i, head->len, + DoubleToPrintable(ch_looks.sum, pr_buf[0], sizeof(pr_buf[0])), + DoubleToPrintable(ch_gets.sum, pr_buf[1], sizeof(pr_buf[1])), + DoubleToPrintable(ch_reorders.sum, pr_buf[2], sizeof(pr_buf[2]))); + } + + VOL_LOCK; + VHashEndExclusive_r(head); + } + + VOL_UNLOCK; + + /* compute global averages */ + len.avg = len.sum / ((double)VolumeHashTable.Size); + looks.avg = looks.sum / ((double)VolumeHashTable.Size); + gets.avg = gets.sum / ((double)VolumeHashTable.Size); + reorders.avg = reorders.sum / ((double)VolumeHashTable.Size); + + /* dump global stats */ + Log("Volume hash summary: %d buckets\n", VolumeHashTable.Size); + Log(" chain length : min=%s, max=%s, avg=%s, total=%s\n", + DoubleToPrintable(len.min, pr_buf[0], sizeof(pr_buf[0])), + DoubleToPrintable(len.max, pr_buf[1], sizeof(pr_buf[1])), + DoubleToPrintable(len.avg, pr_buf[2], sizeof(pr_buf[2])), + DoubleToPrintable(len.sum, pr_buf[3], sizeof(pr_buf[3]))); + Log(" looks : min=%s, max=%s, avg=%s, total=%s\n", + DoubleToPrintable(looks.min, pr_buf[0], sizeof(pr_buf[0])), + DoubleToPrintable(looks.max, pr_buf[1], sizeof(pr_buf[1])), + DoubleToPrintable(looks.avg, pr_buf[2], sizeof(pr_buf[2])), + DoubleToPrintable(looks.sum, pr_buf[3], sizeof(pr_buf[3]))); + Log(" gets : min=%s, max=%s, avg=%s, total=%s\n", + DoubleToPrintable(gets.min, pr_buf[0], sizeof(pr_buf[0])), + DoubleToPrintable(gets.max, pr_buf[1], sizeof(pr_buf[1])), + DoubleToPrintable(gets.avg, pr_buf[2], sizeof(pr_buf[2])), + DoubleToPrintable(gets.sum, pr_buf[3], sizeof(pr_buf[3]))); + Log(" reorders : min=%s, max=%s, avg=%s, total=%s\n", + DoubleToPrintable(reorders.min, pr_buf[0], sizeof(pr_buf[0])), + DoubleToPrintable(reorders.max, pr_buf[1], sizeof(pr_buf[1])), + DoubleToPrintable(reorders.avg, pr_buf[2], 
sizeof(pr_buf[2])), + DoubleToPrintable(reorders.sum, pr_buf[3], sizeof(pr_buf[3]))); + + /* print extended disk related statistics */ + { + struct DiskPartition * diskP; + afs_uint32 vol_count[VOLMAXPARTS+1]; + byte part_exists[VOLMAXPARTS+1]; + Device id; + int i; + + memset(vol_count, 0, sizeof(vol_count)); + memset(part_exists, 0, sizeof(part_exists)); + + VOL_LOCK; + + for (diskP = DiskPartitionList; diskP; diskP = diskP->next) { + id = diskP->device; + vol_count[id] = diskP->vol_list.len; + part_exists[id] = 1; + } + + VOL_UNLOCK; + for (i = 0; i <= VOLMAXPARTS; i++) { + if (part_exists[i]) { + diskP = VGetPartitionById_r(i, 0); + if (diskP) { + Log("Partition %s has %d online volumes\n", + VPartitionPath(diskP), diskP->vol_list.len); + } + } + } + VOL_LOCK; + } + +} + +void +VPrintExtendedCacheStats(int flags) +{ + VOL_LOCK; + VPrintExtendedCacheStats_r(flags); + VOL_UNLOCK; +} +#endif /* AFS_DEMAND_ATTACH_FS */ diff --git a/src/vol/volume.h b/src/vol/volume.h index c66a09b7c5..09190bc310 100644 --- a/src/vol/volume.h +++ b/src/vol/volume.h @@ -5,6 +5,8 @@ * This software has been released under the terms of the IBM Public * License. 
For details, see the LICENSE file in the top-level source * directory or online at http://www.openafs.org/dl/license10.html + * + * Portions Copyright (c) 2006 Sine Nomine Associates */ /* @@ -24,44 +26,44 @@ #define VolumeWriteable2(vol) (vol.type == readwriteVolume) typedef bit32 FileOffset; /* Offset in this file */ #define Date afs_uint32 +#include "daemon_com.h" +#include "fssync.h" #ifdef AFS_PTHREAD_ENV #include #include extern pthread_mutex_t vol_glock_mutex; -extern pthread_mutex_t vol_attach_mutex; -extern pthread_mutex_t vol_fsync_mutex; extern pthread_mutex_t vol_trans_mutex; extern pthread_cond_t vol_put_volume_cond; extern pthread_cond_t vol_sleep_cond; extern int vol_attach_threads; -/* this lock has been deprecated */ -#define VATTACH_LOCK -#define VATTACH_UNLOCK #define VOL_LOCK \ assert(pthread_mutex_lock(&vol_glock_mutex) == 0) #define VOL_UNLOCK \ assert(pthread_mutex_unlock(&vol_glock_mutex) == 0) -#define VFSYNC_LOCK \ - assert(pthread_mutex_lock(&vol_fsync_mutex) == 0) -#define VFSYNC_UNLOCK \ - assert(pthread_mutex_unlock(&vol_fsync_mutex) == 0) +#define VSALVSYNC_LOCK \ + assert(pthread_mutex_lock(&vol_salvsync_mutex) == 0) +#define VSALVSYNC_UNLOCK \ + assert(pthread_mutex_unlock(&vol_salvsync_mutex) == 0) #define VTRANS_LOCK \ assert(pthread_mutex_lock(&vol_trans_mutex) == 0) #define VTRANS_UNLOCK \ assert(pthread_mutex_unlock(&vol_trans_mutex) == 0) #else /* AFS_PTHREAD_ENV */ -#define VATTACH_LOCK -#define VATTACH_UNLOCK #define VOL_LOCK #define VOL_UNLOCK -#define VFSYNC_LOCK -#define VFSYNC_UNLOCK +#define VSALVSYNC_LOCK +#define VSALVSYNC_UNLOCK #define VTRANS_LOCK #define VTRANS_UNLOCK #endif /* AFS_PTHREAD_ENV */ -typedef enum { fileServer, volumeUtility, salvager } ProgramType; +typedef enum { fileServer, /* the fileserver process */ + volumeUtility, /* volserver, or a single volume salvager (non-dafs) */ + salvager, /* standalone whole-partition salvager */ + salvageServer, /* dafs online salvager */ + debugUtility /* 
fssync-debug or similar utility */ +} ProgramType; extern ProgramType programType; /* The type of program using the package */ /* Some initialization parameters for the volume package */ @@ -76,6 +78,70 @@ struct versionStamp { /* Version stamp for critical volume files */ * that created this file */ }; +#ifdef AFS_DEMAND_ATTACH_FS +/* + * demand attach fs + * volume state machine + * + * these must be contiguous in order for IsValidState() to work correctly + */ +#define VOL_STATE_UNATTACHED 0 /* volume is unattached */ +#define VOL_STATE_PREATTACHED 1 /* volume has been pre-attached */ +#define VOL_STATE_ATTACHING 2 /* volume is transitioning to fully attached */ +#define VOL_STATE_ATTACHED 3 /* volume has been fully attached */ +#define VOL_STATE_UPDATING 4 /* volume is updating on-disk structures */ +#define VOL_STATE_GET_BITMAP 5 /* volume is getting bitmap entries */ +#define VOL_STATE_HDR_LOADING 6 /* volume is loading disk header */ +#define VOL_STATE_HDR_ATTACHING 7 /* volume is getting a header from the LRU */ +#define VOL_STATE_SHUTTING_DOWN 8 /* volume is shutting down */ +#define VOL_STATE_GOING_OFFLINE 9 /* volume is going offline */ +#define VOL_STATE_OFFLINING 10 /* volume is transitioning to offline */ +#define VOL_STATE_DETACHING 11 /* volume is transitioning to detached */ +#define VOL_STATE_SALVSYNC_REQ 12 /* volume is blocked on a salvsync request */ +#define VOL_STATE_SALVAGING 13 /* volume is being salvaged */ +#define VOL_STATE_ERROR 14 /* volume is in an error state */ +#define VOL_STATE_FREED 15 /* debugging aid */ + +#define VOL_STATE_COUNT 16 /* total number of valid states */ + +/* V_attachFlags bits */ +#define VOL_HDR_ATTACHED 0x1 /* volume header is attached to Volume struct */ +#define VOL_HDR_LOADED 0x2 /* volume header contents are valid */ +#define VOL_HDR_IN_LRU 0x4 /* volume header is in LRU */ +#define VOL_IN_HASH 0x8 /* volume is in hash table */ +#define VOL_ON_VBYP_LIST 0x10 /* volume is on VByP list */ +#define VOL_IS_BUSY 
0x20 /* volume is not to be free()d */ +#define VOL_ON_VLRU 0x40 /* volume is on the VLRU */ +#define VOL_HDR_DONTSALV 0x80 /* volume header DONTSALVAGE flag is set */ + +/* VPrintExtendedCacheStats flags */ +#define VOL_STATS_PER_CHAIN 0x1 /* compute simple per-chain stats */ +#define VOL_STATS_PER_CHAIN2 0x2 /* compute per-chain stats that require scanning + * every element of the chain */ + +/* VLRU_SetOptions options */ +#define VLRU_SET_THRESH 1 +#define VLRU_SET_INTERVAL 2 +#define VLRU_SET_MAX 3 +#define VLRU_SET_ENABLED 4 + +/* valid VLRU queue names */ +#define VLRU_QUEUE_NEW 0 /* LRU queue for new volumes */ +#define VLRU_QUEUE_MID 1 /* survivor generation */ +#define VLRU_QUEUE_OLD 2 /* old generation */ +#define VLRU_QUEUE_CANDIDATE 3 /* soft detach candidate pool */ +#define VLRU_QUEUE_HELD 4 /* volumes which are not allowed + * to be soft detached */ +#define VLRU_QUEUE_INVALID 5 /* invalid queue id */ + +/* default scanner timing parameters */ +#define VLRU_DEFAULT_OFFLINE_THRESH (60*60*2) /* 2 hours */ +#define VLRU_DEFAULT_OFFLINE_INTERVAL (60*2) /* 2 minutes */ +#define VLRU_DEFAULT_OFFLINE_MAX 8 /* 8 volumes */ + +#endif /* AFS_DEMAND_ATTACH_FS */ + + /* Magic numbers and version stamps for each type of file */ #define VOLUMEHEADERMAGIC ((bit32)0x88a1bb3c) #define VOLUMEINFOMAGIC ((bit32)0x78a1b2c5) @@ -297,8 +363,144 @@ typedef struct VolumeDiskData { /**************************************/ /* Memory resident volume information */ /**************************************/ + +/* global volume package stats */ +typedef struct VolPkgStats { +#ifdef AFS_DEMAND_ATTACH_FS + /* + * demand attach fs + * extended volume package statistics + */ + + /* levels */ + afs_uint32 state_levels[VOL_STATE_COUNT]; + + /* counters */ + afs_uint64 hash_looks; /* number of hash chain element traversals */ + afs_uint64 hash_reorders; /* number of hash chain reorders */ + afs_uint64 salvages; /* online salvages since fileserver start */ + afs_uint64 vol_ops; /* volume 
operations since fileserver start */ +#endif /* AFS_DEMAND_ATTACH_FS */ + + afs_uint64 hdr_loads; /* header loads from disk */ + afs_uint64 hdr_gets; /* header pulls out of LRU */ + afs_uint64 attaches; /* volume attaches since fileserver start */ + afs_uint64 soft_detaches; /* soft detach ops since fileserver start */ + + /* configuration parameters */ + afs_uint32 hdr_cache_size; /* size of volume header cache */ +} VolPkgStats; +extern VolPkgStats VStats; + +/* + * volume header cache supporting structures + */ +#ifdef AFS_DEMAND_ATTACH_FS +struct volume_hdr_LRU_stats { + afs_uint32 free; + afs_uint32 used; + afs_uint32 attached; +}; +#endif + +struct volume_hdr_LRU_t { + struct rx_queue lru; +#ifdef AFS_DEMAND_ATTACH_FS + struct volume_hdr_LRU_stats stats; +#endif +}; +extern struct volume_hdr_LRU_t volume_hdr_LRU; + +/* + * volume hash chain supporting structures + */ +typedef struct VolumeHashChainHead { + struct rx_queue queue; + int len; + /* someday we could put a per-chain lock here... 
*/ +#ifdef AFS_DEMAND_ATTACH_FS + int busy; + int cacheCheck; + + /* per-chain statistics */ + afs_uint64 looks; + afs_uint64 gets; + afs_uint64 reorders; + + pthread_cond_t chain_busy_cv; +#endif /* AFS_DEMAND_ATTACH_FS */ +} VolumeHashChainHead; + +typedef struct VolumeHashTable { + int Size; + int Mask; + VolumeHashChainHead * Table; +} VolumeHashTable_t; +extern VolumeHashTable_t VolumeHashTable; + +struct VolumeHashChainStats { + afs_int32 table_size; + afs_int32 chain_len; +#ifdef AFS_DEMAND_ATTACH_FS + afs_int32 chain_cacheCheck; + afs_int32 chain_busy; + afs_uint64 chain_looks; + afs_uint64 chain_gets; + afs_uint64 chain_reorders; +#endif +}; + + +#ifdef AFS_DEMAND_ATTACH_FS +/* demand attach fs + * extended per-volume statistics + * + * please note that this structure lives across the entire + * lifetime of the fileserver process + */ +typedef struct VolumeStats { + /* counters */ + afs_uint64 hash_lookups; /* hash table lookups */ + afs_uint64 hash_short_circuits; /* short circuited hash lookups (due to cacheCheck) */ + afs_uint64 hdr_loads; /* header loads from disk */ + afs_uint64 hdr_gets; /* header pulls out of LRU */ + afs_uint16 attaches; /* attaches of this volume since fileserver start */ + afs_uint16 soft_detaches; /* soft detaches of this volume */ + afs_uint16 salvages; /* online salvages since fileserver start */ + afs_uint16 vol_ops; /* volume operations since fileserver start */ + + /* timestamps */ + afs_uint32 last_attach; /* unix timestamp of last VAttach */ + afs_uint32 last_get; /* unix timestamp of last VGet/VHold */ + afs_uint32 last_promote; /* unix timestamp of last VLRU promote/demote */ + afs_uint32 last_hdr_get; /* unix timestamp of last GetVolumeHeader() */ + afs_uint32 last_salvage; /* unix timestamp of last initiation of an online salvage */ + afs_uint32 last_salvage_req; /* unix timestamp of last SALVSYNC request */ + afs_uint32 last_vol_op; /* unix timestamp of last volume operation */ +} VolumeStats; + +/* demand attach fs 
+ * online salvager state */ +typedef struct VolumeOnlineSalvage { + afs_uint32 prio; /* number of VGetVolume's since salvage requested */ + int reason; /* reason for requesting online salvage */ + byte requested; /* flag specifying that salvage should be scheduled */ + byte scheduled; /* flag specifying whether online salvage scheduled */ + byte reserved[2]; /* padding */ +} VolumeOnlineSalvage; + +/* demand attach fs + * volume LRU state */ +typedef struct VolumeVLRUState { + struct rx_queue lru; /* VLRU queue pointers */ + int idx; /* VLRU generation index */ +} VolumeVLRUState; + +typedef afs_uint16 VolState; /* attachment state type */ +#endif /* AFS_DEMAND_ATTACH_FS */ + typedef struct Volume { - struct Volume *hashNext; /* Next in hash resolution table */ + struct rx_queue q; /* Volume hash chain pointers */ VolumeId hashid; /* Volume number -- for hash table lookup */ struct volHeader *header; /* Cached disk data */ Device device; /* Unix device for the volume */ @@ -339,10 +541,23 @@ typedef struct Volume { afs_uint32 updateTime; /* Time that this volume was put on the updated * volume list--the list of volumes that will be * salvaged should the file server crash */ +#ifdef AFS_DEMAND_ATTACH_FS + VolState attach_state; /* what stage of attachment has been completed */ + afs_uint16 attach_flags; /* flags related to attachment state */ + pthread_cond_t attach_cv; /* state change condition variable */ + short nWaiters; /* volume package internal ref count */ + int chainCacheCheck; /* Volume hash chain cache check */ + struct rx_queue vol_list; /* per-partition volume list (VByPList) */ + + VolumeOnlineSalvage salvage; /* online salvager state */ + VolumeStats stats; /* per-volume statistics */ + VolumeVLRUState vlru; /* state specific to the VLRU */ + FSSYNC_VolOp_info * pending_vol_op; /* fssync command info for any pending vol ops */ +#endif /* AFS_DEMAND_ATTACH_FS */ } Volume; struct volHeader { - struct volHeader *prev, *next; /* LRU pointers */ + struct 
rx_queue lru; VolumeDiskData diskstuff; /* General volume info read from disk */ Volume *back; /* back pointer to current volume structure */ }; @@ -356,6 +571,11 @@ struct volHeader { #define V_vnodeIndex(vp) ((vp)->vnodeIndex) #define V_nextVnodeUnique(vp) ((vp)->nextVnodeUnique) #define V_linkHandle(vp) ((vp)->linkHandle) +#ifdef AFS_DEMAND_ATTACH_FS +#define V_attachState(vp) ((vp)->attach_state) +#define V_attachFlags(vp) ((vp)->attach_flags) +#define V_attachCV(vp) ((vp)->attach_cv) +#endif /* AFS_DEMAND_ATTACH_FS */ /* N.B. V_id must be this, rather than vp->id, or some programs will break, probably */ #define V_stamp(vp) ((vp)->header->diskstuff.stamp) @@ -414,7 +634,7 @@ struct volHeader { extern char *VSalvageMessage; /* Canonical message when a volume is forced * offline */ -extern Volume *VGetVolume(Error * ec, VolId volumeId); +extern Volume *VGetVolume(Error * ec, Error * client_ec, VolId volumeId); extern Volume *VGetVolume_r(Error * ec, VolId volumeId); extern void VPutVolume(Volume *); extern void VPutVolume_r(Volume *); @@ -422,6 +642,9 @@ extern void VOffline(Volume * vp, char *message); extern void VOffline_r(Volume * vp, char *message); extern int VConnectFS(void); extern int VConnectFS_r(void); +extern void VDisconnectFS(void); +extern void VDisconnectFS_r(void); +extern int VChildProcReconnectFS(void); extern Volume *VAttachVolume(Error * ec, VolumeId volumeId, int mode); extern Volume *VAttachVolume_r(Error * ec, VolumeId volumeId, int mode); extern Volume *VCreateVolume(Error * ec, char *partname, VolId volumeId, @@ -431,7 +654,7 @@ extern Volume *VCreateVolume_r(Error * ec, char *partname, VolId volumeId, extern VnodeId VAllocBitmapEntry(Error * ec, Volume * vp, struct vnodeIndex *index); extern VnodeId VAllocBitmapEntry_r(Error * ec, Volume * vp, - struct vnodeIndex *index); + struct vnodeIndex *index, int flags); extern void VFreeBitMapEntry(Error * ec, register struct vnodeIndex *index, unsigned bitNumber); extern void 
VFreeBitMapEntry_r(Error * ec, register struct vnodeIndex *index, @@ -444,13 +667,13 @@ extern Volume *VAttachVolumeByName_r(Error * ec, char *partition, char *name, int mode); extern void VShutdown(void); extern void VUpdateVolume(Error * ec, Volume * vp); -extern void VUpdateVolume_r(Error * ec, Volume * vp); +extern void VUpdateVolume_r(Error * ec, Volume * vp, int flags); extern void VAddToVolumeUpdateList(Error * ec, Volume * vp); extern void VAddToVolumeUpdateList_r(Error * ec, Volume * vp); extern void VDetachVolume(Error * ec, Volume * vp); extern void VDetachVolume_r(Error * ec, Volume * vp); extern void VForceOffline(Volume * vp); -extern void VForceOffline_r(Volume * vp); +extern void VForceOffline_r(Volume * vp, int flags); extern void VBumpVolumeUsage(register Volume * vp); extern void VBumpVolumeUsage_r(register Volume * vp); extern void VSetDiskUsage(void); @@ -459,12 +682,41 @@ extern void VReleaseVnodeFiles_r(Volume * vp); extern void VCloseVnodeFiles_r(Volume * vp); extern struct DiskPartition *VGetPartition(char *name, int abortp); extern struct DiskPartition *VGetPartition_r(char *name, int abortp); -extern int VInitVolumePackage(ProgramType pt, int nLargeVnodes, - int nSmallVnodes, int connect, int volcache); +extern int VInitVolumePackage(ProgramType pt, afs_uint32 nLargeVnodes, + afs_uint32 nSmallVnodes, int connect, afs_uint32 volcache); extern void DiskToVolumeHeader(VolumeHeader_t * h, VolumeDiskHeader_t * dh); extern void VolumeHeaderToDisk(VolumeDiskHeader_t * dh, VolumeHeader_t * h); extern void VTakeOffline_r(register Volume * vp); extern void VTakeOffline(register Volume * vp); +extern Volume * VLookupVolume_r(Error * ec, VolId volumeId, Volume * hint); + +#ifdef AFS_DEMAND_ATTACH_FS +extern Volume *VPreAttachVolumeByName(Error * ec, char *partition, char *name, + int mode); +extern Volume *VPreAttachVolumeByName_r(Error * ec, char *partition, char *name, + int mode); +extern Volume *VPreAttachVolumeById_r(Error * ec, struct 
DiskPartition * partp, + Volume * vp, int volume_id); +extern Volume *VGetVolumeByVp_r(Error * ec, Volume * vp); +extern int VShutdownByPartition_r(struct DiskPartition * dp); +extern int VShutdownVolume_r(Volume * vp); +extern int VConnectSALV(void); +extern int VConnectSALV_r(void); +extern int VReconnectSALV(void); +extern int VReconnectSALV_r(void); +extern int VDisconnectSALV(void); +extern int VDisconnectSALV_r(void); +extern void VPrintExtendedCacheStats(int flags); +extern void VPrintExtendedCacheStats_r(int flags); +extern VolState VChangeState_r(Volume * vp, VolState new_state); +extern void VLRU_SetOptions(int option, afs_uint32 val); +extern int VSetVolHashSize(int logsize); +extern int VRequestSalvage_r(Volume * vp, int reason, int flags); +extern int VRegisterVolOp_r(Volume * vp, FSSYNC_VolOp_info * vopinfo); +extern int VDeregisterVolOp_r(Volume * vp, FSSYNC_VolOp_info * vopinfo); +#endif /* AFS_DEMAND_ATTACH_FS */ +extern int VVolOpLeaveOnline_r(Volume * vp, FSSYNC_VolOp_info * vopinfo); +extern int VVolOpSetVBusy_r(Volume * vp, FSSYNC_VolOp_info * vopinfo); /* Naive formula relating number of file size to number of 1K blocks in file */ @@ -500,6 +752,26 @@ extern void VTakeOffline(register Volume * vp); * getting the most recent data. */ + +/* VUpdateVolume_r flags */ +#define VOL_UPDATE_WAIT 0x1 /* for demand attach, wait for other exclusive ops to end */ +#define VOL_UPDATE_NOFORCEOFF 0x2 /* don't force offline on failure. this is to prevent + * infinite recursion between vupdate and vforceoff */ + +/* VForceOffline_r flags */ +#define VOL_FORCEOFF_NOUPDATE 0x1 /* don't force update on forceoff. 
this is to prevent + * infinite recursion between vupdate and vforceoff */ + +/* VSyncVolume_r flags */ +#define VOL_SYNC_WAIT 0x1 /* for demand attach, wait for other exclusive ops to end */ + +/* VAllocBitmapEntry_r flags */ +#define VOL_ALLOC_BITMAP_WAIT 0x1 /* for demand attach, wait for other exclusive ops to end */ + +/* VRequestSalvage_r flags */ +#define VOL_SALVAGE_INVALIDATE_HEADER 0x1 /* for demand attach fs, invalidate volume header cache */ + + #if defined(NEARINODE_HINT) #define V_pref(vp,nearInode) nearInodeHash(V_id(vp),(nearInode)); (nearInode) %= V_partition(vp)->f_files #else diff --git a/src/volser/NTMakefile b/src/volser/NTMakefile index 5e6fa35e93..ded4d73634 100644 --- a/src/volser/NTMakefile +++ b/src/volser/NTMakefile @@ -5,6 +5,8 @@ # License. For details, see the LICENSE file in the top-level source # directory or online at http://www.openafs.org/dl/license10.html +AFSDEV_AUXCDEFINES = -DFSSYNC_BUILD_CLIENT + RELDIR=volser !INCLUDE ..\config\NTMakefile.$(SYS_NAME) !INCLUDE ..\config\NTMakefile.version diff --git a/src/volser/dumpstuff.c b/src/volser/dumpstuff.c index 911c35ae44..fc16c52793 100644 --- a/src/volser/dumpstuff.c +++ b/src/volser/dumpstuff.c @@ -51,6 +51,7 @@ RCSID #include #include #include "dump.h" +#include #include #include #include "volser.h" diff --git a/src/volser/volprocs.c b/src/volser/volprocs.c index ae1664fd27..5bba7c10f5 100644 --- a/src/volser/volprocs.c +++ b/src/volser/volprocs.c @@ -61,6 +61,7 @@ RCSID #include #include #include "vol.h" +#include #include #include #include "afs/audit.h" @@ -844,7 +845,7 @@ VolReClone(struct rx_call *acid, afs_int32 atrans, afs_int32 cloneId) { struct DiskPartition *tpartp = originalvp->partition; - FSYNC_askfs(cloneId, tpartp->name, FSYNC_RESTOREVOLUME, 0); + FSYNC_VolOp(cloneId, tpartp->name, FSYNC_VOL_BREAKCBKS, 0, NULL); } return 0; @@ -1355,8 +1356,7 @@ VolRestore(struct rx_call *acid, afs_int32 atrans, afs_int32 aflags, DFlushVolume(V_parentId(tt->volume)); /* Ensure dir 
buffers get dropped */ code = RestoreVolume(acid, tt->volume, (aflags & 1), cookie); /* last is incrementalp */ - FSYNC_askfs(tt->volid, NULL, FSYNC_RESTOREVOLUME, 0l); /*break call backs on the - * restored volume */ + FSYNC_VolOp(tt->volid, NULL, FSYNC_VOL_BREAKCBKS, 0l, NULL); tt->rxCallPtr = (struct rx_call *)0; tcode = TRELE(tt); @@ -1422,7 +1422,7 @@ VolSetForwarding(struct rx_call *acid, afs_int32 atid, afs_int32 anewsite) } strcpy(tt->lastProcName, "SetForwarding"); tt->rxCallPtr = acid; - FSYNC_askfs(tt->volid, NULL, FSYNC_MOVEVOLUME, anewsite); + FSYNC_VolOp(tt->volid, NULL, FSYNC_VOL_MOVE, anewsite, NULL); tt->rxCallPtr = (struct rx_call *)0; if (TRELE(tt)) return VOLSERTRELE_ERROR; @@ -1672,6 +1672,9 @@ XVolListPartitions(struct rx_call *acid, struct partEntries *pEntries) /* Only report attached partitions */ for (i = 0; i < VOLMAXPARTS; i++) { +#ifdef AFS_DEMAND_ATTACH_FS + dp = VGetPartitionById(i, 0); +#else if (i < 26) { namehead[6] = i + 'a'; namehead[7] = '\0'; @@ -1682,6 +1685,7 @@ XVolListPartitions(struct rx_call *acid, struct partEntries *pEntries) namehead[8] = '\0'; } dp = VGetPartition(namehead, 0); +#endif if (dp) partList.partId[j++] = i; } @@ -1792,7 +1796,7 @@ VolListOneVolume(struct rx_call *acid, afs_int32 partid, afs_int32 pntr->volid = volid; goto drop; } - tv = VAttachVolumeByName(&error, pname, volname, V_READONLY); + tv = VAttachVolumeByName(&error, pname, volname, V_PEEK); if (error) { pntr->status = 0; /*things are messed up */ strcpy(pntr->name, volname); @@ -2007,7 +2011,7 @@ VolXListOneVolume(struct rx_call *a_rxCidP, afs_int32 a_partID, /* * Attach the volume, give up on the volume if we can't. 
*/ - tv = VAttachVolumeByName(&error, pname, volname, V_READONLY); + tv = VAttachVolumeByName(&error, pname, volname, V_PEEK); if (error) { xInfoP->status = 0; /*things are messed up */ strcpy(xInfoP->name, volname); @@ -2819,7 +2823,7 @@ SAFSVolConvertROtoRWvolume(struct rx_call *acid, afs_int32 partId, return EIO; } close(fd); - FSYNC_askfs(volumeId, pname, FSYNC_RESTOREVOLUME, 0); + FSYNC_VolOp(volumeId, pname, FSYNC_VOL_BREAKCBKS, 0, NULL); for (dp = DiskPartitionList; dp && strcmp(dp->name, pname); dp = dp->next); @@ -2854,8 +2858,8 @@ SAFSVolConvertROtoRWvolume(struct rx_call *acid, afs_int32 partId, if (unlink(opath) < 0) { Log("1 SAFS_VolConvertROtoRWvolume: Couldn't unlink RO header, error = %d\n", error); } - FSYNC_askfs(volumeId, pname, FSYNC_DONE, 0); - FSYNC_askfs(h.id, pname, FSYNC_ON, 0); + FSYNC_VolOp(volumeId, pname, FSYNC_VOL_DONE, 0, NULL); + FSYNC_VolOp(h.id, pname, FSYNC_VOL_ON, 0, NULL); return 0; #else /* AFS_NAMEI_ENV */ return EINVAL; diff --git a/src/volser/volser.p.h b/src/volser/volser.p.h index 9e5b015c7c..e0111f0e4a 100644 --- a/src/volser/volser.p.h +++ b/src/volser/volser.p.h @@ -15,6 +15,8 @@ #include #endif +#include + /* vflags, representing state of the volume */ #define VTDeleteOnSalvage 1 /* delete on next salvage */ #define VTOutOfService 2 /* never put this volume online */ @@ -110,7 +112,6 @@ extern struct volser_trans *QI_GlobalWriteTrans; #define INVALID_BID 0 #define VOLSER_MAXVOLNAME 65 #define VOLSER_OLDMAXVOLNAME 32 -#define VOLMAXPARTS 255 /*flags used for interfacing with the backup system */ struct volDescription { /*used for interfacing with the backup system */